diff --git a/Makefile.am b/Makefile.am index c18d2fa26b6..e38497b3811 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2017-2018 Intel Corporation, Inc. All right reserved. # Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved. -# (C) Copyright 2020 Hewlett Packard Enterprise Development LP +# (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP # # Makefile.am for libfabric @@ -458,6 +458,7 @@ include prov/verbs/Makefile.include include prov/efa/Makefile.include include prov/psm2/Makefile.include include prov/psm3/Makefile.include +include prov/cxi/Makefile.include include prov/rxm/Makefile.include include prov/mrail/Makefile.include include prov/rxd/Makefile.include diff --git a/configure.ac b/configure.ac index 3e9c31c53e5..db61225cced 100644 --- a/configure.ac +++ b/configure.ac @@ -1004,6 +1004,7 @@ FI_PROVIDER_SETUP([psm3]) FI_PROVIDER_SETUP([sockets]) FI_PROVIDER_SETUP([verbs]) FI_PROVIDER_SETUP([efa]) +FI_PROVIDER_SETUP([cxi]) FI_PROVIDER_SETUP([udp]) FI_PROVIDER_SETUP([tcp]) FI_PROVIDER_SETUP([rxm]) diff --git a/include/ofi_prov.h b/include/ofi_prov.h index 506c1fd8f08..aabce7fc283 100644 --- a/include/ofi_prov.h +++ b/include/ofi_prov.h @@ -48,6 +48,17 @@ * not built: no-op call for ctor */ +#if (HAVE_CXI) && (HAVE_CXI_DL) +# define CXI_INI FI_EXT_INI +# define CXI_INIT NULL +#elif (HAVE_CXI) +# define CXI_INI INI_SIG(fi_cxi_ini) +# define CXI_INIT fi_cxi_ini() +CXI_INI ; +#else +# define CXI_INIT NULL +#endif + /* If HAVE_EFA is defined on Windows, then the VisualStudio project configures * MSBuild to include the efa related files and exclude the verbs related files. * With the verbs related files excluded from the build, we need only ensure diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md new file mode 100644 index 00000000000..7cc3d288675 --- /dev/null +++ b/man/fi_cxi.7.md @@ -0,0 +1,1781 @@ +--- +layout: page +title: fi_cxi(7) +tagline: Libfabric Programmer's Manual +--- +{% include JB/setup %} + +# NAME + +fi_cxi \- The CXI Fabric Provider + +# OVERVIEW + +The CXI provider enables libfabric on Cray's Slingshot network. Slingshot is +comprised of the Rosetta switch and Cassini NIC. Slingshot is an +Ethernet-compliant network. However, The provider takes advantage of proprietary +extensions to support HPC applications. + +The CXI provider supports reliable, connection-less endpoint semantics. It +supports two-sided messaging interfaces with message matching offloaded by the +Cassini NIC. It also supports one-sided RMA and AMO interfaces, light-weight +counting events, triggered operations (via the deferred work API), and +fabric-accelerated small reductions. + +# REQUIREMENTS + +The CXI Provider requires Cassini's optimized HPC protocol which is only +supported in combination with the Rosetta switch. + +The provider uses the libCXI library for control operations and a set of +Cassini-specific header files to enable direct hardware access in the data path. + +# SUPPORTED FEATURES + +The CXI provider supports a subset of OFI features. + +## Endpoint types + +The provider supports the *FI_EP_RDM* endpoint type. + +## Memory registration modes + +The provider implements scalable memory registration. The provider requires +*FI_MR_ENDPOINT*. *FI_MR_ALLOCATED* is required if ODP in not enabled or not +desired. Client specified 32-bit MR keys are the default unless *FI_MR_PROV_KEY* +is specified. For *FI_MR_PROV_KEY* provider generated 64-bit MR keys are used. 
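For illustration only, the sketch below shows one way a registration might be set up when *FI_MR_ENDPOINT* is in effect; the names `domain`, `ep`, `buf`, `buf_len`, and `requested_key` are assumptions for the example and are not taken from this patch, and error handling is omitted.

```c
#include <rdma/fi_domain.h>

/* Sketch: register a remote-access window under FI_MR_ENDPOINT.
 * 'domain', 'ep', 'buf', 'buf_len', and 'requested_key' are assumed
 * to exist already; error handling is omitted. */
struct fid_mr *mr;

/* Client-selected key; with FI_MR_PROV_KEY the requested key is ignored
 * and fi_mr_key() must be used to retrieve the provider-generated key. */
fi_mr_reg(domain, buf, buf_len, FI_REMOTE_WRITE | FI_REMOTE_READ,
	  0, requested_key, 0, &mr, NULL);

/* FI_MR_ENDPOINT requires binding the MR to an enabled endpoint and
 * enabling the MR before its key is exchanged with peers. */
fi_mr_bind(mr, &ep->fid, 0);
fi_mr_enable(mr);

uint64_t rkey = fi_mr_key(mr);   /* advertise this key to remote peers */
```
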
+An RMA initiator can work concurrently with client and provider generated keys. + +In client/server environments, if concerns with stale MR key usage exists, then +*FI_MR_PROV_KEY* generated keys should be used along with *FI_CXI_MR_MATCH_EVENTS=1* +and *FI_CXI_OPTIMIZED_MRS=0*. The former speeds up MR close, allowing non-remote +MR cached keys to be used that enable full remote memory access protection +after an MR is closed, even if that memory remains in the libfabric MR cache. +The latter uses only standard MR which use matching to enable robust key +usage, protecting against a stale MR key matching a newly generated MR keys. + +## Data transfer operations + +The following data transfer interfaces are supported: *FI_ATOMIC*, *FI_MSG*, +*FI_RMA*, *FI_TAGGED*. See DATA TRANSFER OPERATIONS below for more details. + +## Completion events + +The CXI provider supports all CQ event formats. + +## Modes + +The CXI provider does not require any operation modes. + +## Progress + +The CXI provider currently supports *FI_PROGRESS_MANUAL* data and control +progress modes. + +## Multi-threading + +The CXI provider supports FI_THREAD_SAFE and FI_THREAD_DOMAIN threading models. + +## Wait Objects + +The CXI provider supports FI_WAIT_FD and FI_WAIT_POLLFD CQ wait object types. +FI_WAIT_UNSPEC will default to FI_WAIT_FD. However FI_WAIT_NONE should achieve +the lowest latency and reduce interrupt overhead. + +## Additional Features + +The CXI provider also supports the following capabilities and features: + +* *FI_MULTI_RECV* +* *FI_SOURCE* +* *FI_NAMED_RX_CTX* +* *FI_RM_ENABLED* +* *FI_RMA_EVENT* +* *FI_REMOTE_CQ_DATA* +* *FI_MORE* +* *FI_FENCE* + +## Addressing Format + +The CXI provider uses a proprietary address format. This format includes fields +for NIC Address and PID. NIC Address is the topological address of the NIC +endpoint on the fabric. All OFI Endpoints sharing a Domain share the same NIC +Address. PID (for Port ID or Process ID, adopted from the Portals 4 +specification), is analogous to an IP socket port number. Valid PIDs are in the +range [0-510]. + +A third component of Slingshot network addressing is the Virtual Network ID +(VNI). VNI is a protection key used by the Slingshot network to provide +isolation between applications. A VNI defines an isolated PID space for a given +NIC. Therefore, Endpoints must use the same VNI in order to communicate. Note +that VNI is not a field of the CXI address, but rather is specified as part of +the OFI Endpoint auth_key. The combination of NIC Address, VNI, and PID is +unique to a single OFI Endpoint within a Slingshot fabric. + +The NIC Address of an OFI Endpoint is inherited from the Domain. By default, a +PID is automatically assigned to an Endpoint when it is enabled. The address of +an Endpoint can be queried using fi_getname. The address received from +fi_getname may then be inserted into a peer's Address Vector. The resulting FI +address may then be used to perform an RDMA operation. + +Alternatively, a client may manage PID assignment. fi_getinfo may be used to +create an fi_info structure that can be used to create an Endpoint with a +client-specified address. To achieve this, use fi_getinfo with the *FI_SOURCE* +flag set and set node and service strings to represent the local NIC interface +and PID to be assigned to the Endpoint. The NIC interface string should match +the name of an available CXI domain (in the format cxi[0-9]). The PID string +will be interpreted as a 9-bit integer. 
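The following sketch illustrates this client-managed addressing flow; the interface name `cxi0` and PID `120` are illustrative assumptions rather than values prescribed by this document, and error handling is omitted.

```c
#include <string.h>
#include <rdma/fabric.h>

/* Sketch: request a client-managed address on local interface "cxi0"
 * with PID 120. Both values are assumptions for the example. */
struct fi_info *hints, *info;

hints = fi_allocinfo();
hints->fabric_attr->prov_name = strdup("cxi");
hints->ep_attr->type = FI_EP_RDM;

/* With FI_SOURCE, node names the local CXI interface and service names
 * the PID to assign to the Endpoint. */
fi_getinfo(FI_VERSION(1, 15), "cxi0", "120", FI_SOURCE, hints, &info);

/* info may now be passed to fi_fabric()/fi_domain()/fi_endpoint() to
 * create an Endpoint bound to the requested address. */
```
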
Address conflicts will be detected when +the Endpoint is enabled. + +## Authorization Keys + +The CXI authorization key format is defined by struct cxi_auth_key. This +structure is defined in fi_cxi_ext.h. + +```c +struct cxi_auth_key { + uint32_t svc_id; + uint16_t vni; +}; +``` + +The CXI authorization key format includes a VNI and CXI service ID. VNI is a +component of the CXI Endpoint address that provides isolation. A CXI service is +a software container which defines a set of local CXI resources, VNIs, and +Traffic Classes which a libfabric user can access. + +Two endpoints must use the same VNI in order to communicate. Generally, a +parallel application should be assigned to a unique VNI on the fabric in order +to achieve network traffic and address isolation. Typically a privileged +entity, like a job launcher, will allocate one or more VNIs for use by the +libfabric user. + +The CXI service API is provided by libCXI. It enables a privileged entity, like +an application launcher, to control an unprivileged process's access to NIC +resources. Generally, a parallel application should be assigned to a unique CXI +service in order to control access to local resources, VNIs, and Traffic +Classes. + +While a libfabric user provided authorization key is optional, it is highly +encouraged that libfabric users provide an authorization key through the domain +attribute hints during `fi_getinfo()`. How libfabric users acquire the +authorization key may vary between the users and is outside the scope of this +document. + +If an authorization key is not provided by the libfabric user, the CXI provider +will attempt to generate an authorization key on behalf of the user. The +following outlines how the CXI provider will attempt to generate an +authorization key. + +1. Query for the following environment variables and generate an authorization +key using them. + * *SLINGSHOT_VNIS*: Comma separated list of VNIs. The CXI provider will only + use the first VNI if multiple are provide. Example: `SLINGSHOT_VNIS=234`. + * *SLINGSHOT_DEVICES*: Comma separated list of device names. Each device index + will use the same index to lookup the service ID in *SLINGSHOT_SVC_IDS*. + Example: `SLINGSHOT_DEVICES=cxi0,cxi1`. + * *SLINGSHOT_SVC_IDS*: Comma separated list of pre-configured CXI service IDs. + Each service ID index will use the same index to lookup the CXI device in + *SLINGSHOT_DEVICES*. Example: `SLINGSHOT_SVC_IDS=5,6`. + + **Note:** How valid VNIs and device services are configured is outside + the responsibility of the CXI provider. + +2. Query pre-configured device services and find first entry with same UID as +the libfabric user. + +3. Query pre-configured device services and find first entry with same GID as +the libfabric user. + +4. Query pre-configured device services and find first entry which does not +restrict member access. If enabled, the default service is an example of an +unrestricted service. + + **Note:** There is a security concern with such services since it allows + for multiple independent libfabric users to use the same service. + +**Note:** For above entries 2-4, it is possible the found device service does +not restrict VNI access. For such cases, the CXI provider will query +*FI_CXI_DEFAULT_VNI* to assign a VNI. + +During Domain allocation, if the domain auth_key attribute is NULL, the CXI +provider will attempt to generate a valid authorization key. If the domain +auth_key attribute is valid (i.e. 
not NULL and encoded authorization key has +been verified), the CXI provider will use the encoded VNI and service ID. +Failure to generate a valid authorization key will result in Domain allocation +failure. + +During Endpoint allocation, if the endpoint auth_key attribute is NULL, the +Endpoint with inherit the parent Domain's VNI and service ID. If the Endpoint +auth_key attribute is valid, the encoded VNI and service ID must match the +parent Domain's VNI and service ID. Allocating an Endpoint with a different VNI +and service from the parent Domain is not supported. + +The following is the expected parallel application launch workflow with +CXI integrated launcher and CXI authorization key aware libfabric user: + +1. A parallel application is launched. +2. The launcher allocates one or more VNIs for use by the application. +3. The launcher communicates with compute node daemons where the application + will be run. +4. The launcher compute node daemon configures local CXI interfaces. libCXI is + used to allocate one or more services for the application. The service will + define the local resources, VNIs, and Traffic Classes that the application + may access. Service allocation policies must be defined by the launcher. + libCXI returns an ID to represent a service. +5. The launcher forks application processes. +6. The launcher provides one or more service IDs and VNI values to the + application processes. +7. Application processes select from the list of available service IDs and VNIs + to form an authorization key to use for Endpoint allocation. + +## Address Vectors + +The CXI provider supports both *FI_AV_TABLE* and *FI_AV_MAP* with the same +internal implementation. + +The CXI provider uses the *FI_SYMMETRIC* AV flag for optimization. When used +with *FI_AV_TABLE*, the CXI provider can use the fi_addr_t index as an endpoint +identifier instead of a network address. The benefit of this is when running +with FI_SOURCE, a reverse lookup is not needed to generate the source fi_addr_t +for target CQ events. Note: FI_SOURCE_ERR should not be used for this +configuration. + +If the AV is not configured with *FI_SYMMETRIC*, *FI_AV_USER_ID* is supported +as a flag which can be passed into AV insert. + +Since scalable EPs are not supported, fi_av_attr::rx_ctx_bits must be zero. + +The following AV capabilities and flags are not supported: FI_SHARED_AV, +FI_SYNC_ERR, FI_EVENT, and FI_READ. + +## Operation flags + +The CXI provider supports the following Operation flags: + +*FI_MORE* +: When *FI_MORE* is specified in a data transfer operation, the provider will + defer submission of RDMA commands to hardware. When one or more data + transfer operations is performed using *FI_MORE*, followed by an operation + without *FI_MORE*, the provider will submit the entire batch of queued + operations to hardware using a single PCIe transaction, improving PCIe + efficiency. + + When *FI_MORE* is used, queued commands will not be submitted to hardware + until another data transfer operation is performed without *FI_MORE*. + +*FI_TRANSMIT_COMPLETE* +: By default, all CXI provider completion events satisfy the requirements of + the 'transmit complete' completion level. Transmit complete events are + generated when the intiator receives an Ack from the target NIC. The Ack is + generated once all data has been received by the target NIC. Transmit + complete events do not guarantee that data is visibile to the target + process. 
+ +*FI_DELIVERY_COMPLETE* +: When the 'delivery complete' completion level is used, the event guarantees + that data is visible to the target process. To support this, hardware at + the target performs a zero-byte read operation to flush data across the + PCIe bus before generating an Ack. Flushing reads are performed + unconditionally and will lead to higher latency. + +*FI_MATCH_COMPLETE* +: When the 'match complete' completion level is used, the event guarantees + that the message has been matched to a client-provided buffer. All messages + longer than the eager threshold support this guarantee. When 'match + complete' is used with a Send that is shorter than the eager threshold, an + additional handshake may be performed by the provider to notify the + initiator that the Send has been matched. + +The CXI provider also supports the following operation flags: + +* *FI_INJECT* +* *FI_FENCE* +* *FI_COMPLETION* +* *FI_REMOTE_CQ_DATA* + +## Scalable Endpoints + +Scalable Endpoints (SEPs) support is not enabled in the CXI provider. Future +releases of the provider will re-introduce SEP support. + +## Messaging + +The CXI provider supports both tagged (*FI_TAGGED*) and untagged (*FI_MSG*) +two-sided messaging interfaces. In the normal case, message matching is +performed by hardware. In certain low resource conditions, the responsibility to +perform message matching may be transferred to software. Specification +of the receive message matching mode in the environment (*FI_CXI_RX_MATCH_MODE*) +controls the initial matching mode and whether hardware matching can +transparently transition matching to software where a hybrid of hardware +and software receive matching is done. + +If a Send operation arrives at a node where there is no matching Receive +operation posted, it is considered unexpected. Unexpected messages are +supported. The provider manages buffers to hold unexpected message data. + +Unexpected message handling is transparent to clients. Despite that, clients +should take care to avoid excessive use of unexpected messages by pre-posting +Receive operations. An unexpected message ties up hardware and memory resources +until it is matched with a user buffer. + +The CXI provider implements several message protocols internally. A message +protocol is selected based on payload length. Short messages are transferred +using the eager protocol. In the eager protocol, the entire message payload is +sent along with the message header. If an eager message arrives unexpectedly, +the entire message is buffered at the target until it is matched to a Receive +operation. + +Long messages are transferred using a rendezvous protocol. The threshold at +which the rendezvous protocol is used is controlled with the +*FI_CXI_RDZV_THRESHOLD* and *FI_CXI_RDZV_GET_MIN* environment variables. + +In the rendezvous protocol, a portion of the message payload is sent +along with the message header. Once the header is matched to a Receive +operation, the remainder of the payload is pulled from the source using an RDMA +Get operation. If the message arrives unexpectedly, the eager portion of the +payload is buffered at the target until it is matched to a Receive operation. +In the normal case, the Get is performed by hardware and the operation +completes without software progress. + +Unexpected rendezvous protocol messages can not complete and release source side +buffer resources until a matching receive is posted at the destination and the +non-eager data is read from the source with a rendezvous get DMA. 
The number of +rendezvous messages that may be outstanding is limited by the minimum of the +hints->tx_attr->size value specified and the number of rendezvous operation ID +mappings available. FI_TAGGED rendezvous messages have 32K-256 ID mappings, +FI_MSG rendezvous messages are limited to 256 ID mappings. While this +works well with MPI, care should be taken that this minimum is large enough to +ensure applications written in a manner that assumes unlimited resources and +use FI_MSG rendezvous messaging do not induce a software deadlock. If FI_MSG +rendezvous messaging is done in a unexpected manner that may exceed the FI_MSG +ID mappings available, it may be sufficient to reduce the number of rendezvous +operations by increasing the rendezvous threshold. See *FI_CXI_RDZV_THRESHOLD* +for information. + +Message flow-control is triggered when hardware message matching resources +become exhausted. Messages may be dropped and retransmitted in order to +recover; impacting performance significantly. Programs should be careful to avoid +posting large numbers of unmatched receive operations and to minimize the +number of outstanding unexpected messages to prevent message flow-control. +If the RX message matching mode is configured to support hybrid mode, when +resources are exhausted, hardware will transition to hybrid operation where +hardware and software share matching responsibility. + +To help avoid this condition, increase Overflow buffer space using environment +variables *FI_CXI_OFLOW_\**, and for software and hybrid RX match modes +increase Request buffer space using the variables *FI_CXI_REQ_\**. + +## Message Ordering + +The CXI provider supports the following ordering rules: + +* All message Send operations are always ordered. +* RMA Writes may be ordered by specifying *FI_ORDER_RMA_WAW*. +* AMOs may be ordered by specifying *FI_ORDER_AMO_{WAW|WAR|RAW|RAR}*. +* RMA Writes may be ordered with respect to AMOs by specifying *FI_ORDER_WAW*. + Fetching AMOs may be used to perform short reads that are ordered with + respect to RMA Writes. + +Ordered RMA size limits are set as follows: + +* *max_order_waw_size* is -1. RMA Writes and non-fetching AMOs of any size are + ordered with respect to each other. +* *max_order_raw_size* is -1. Fetching AMOs of any size are ordered with + respect to RMA Writes and non-fetching AMOs. +* *max_order_war_size* is -1. RMA Writes and non-fetching AMOs of any size are + ordered with respect to fetching AMOs. + +## PCIe Ordering + +Generally, PCIe writes are strictly ordered. As an optimization, PCIe TLPs may +have the Relaxed Order (RO) bit set to allow writes to be reordered. Cassini +sets the RO bit in PCIe TLPs when possible. Cassini sets PCIe RO as follows: + +* Ordering of messaging operations is established using completion events. + Therefore, all PCIe TLPs related to two-sided message payloads will have RO + set. +* Every PCIe TLP associated with an unordered RMA or AMO operation will have RO + cleared. +* PCIe TLPs associated with the last packet of an ordered RMA or AMO operation + will have RO cleared. +* PCIe TLPs associated with the body packets (all except the last packet of an + operation) of an ordered RMA operation will have RO set. + +## Translation + +The CXI provider supports two translation mechanisms: Address Translation +Services (ATS) and NIC Translation Agent (NTA). Use the environment variable +*FI_CXI_ATS* to select between translation mechanisms. + +ATS refers to NIC support for PCIe rev. 4 ATS, PRI and PASID features. 
ATS +enables the NIC to efficiently access the entire virtual address space of a +process. ATS mode currently supports AMD hosts using the iommu_v2 API. + +The NTA is an on-NIC translation unit. The NTA supports two-level page tables +and additional hugepage sizes. Most CPUs support 2MB and 1GB hugepage sizes. +Other hugepage sizes may be supported by SW to enable the NIC to cache more +address space. + +ATS and NTA both support on-demand paging (ODP) in the event of a page fault. +Use the environment variable *FI_CXI_ODP* to enable ODP. + +With ODP enabled, buffers used for data transfers are not required to be backed +by physical memory. An un-populated buffer that is referenced by the NIC will +incur a network page fault. Network page faults will significantly impact +application performance. Clients should take care to pre-populate buffers used +for data-tranfer operations to avoid network page faults. Copy-on-write +semantics work as expected with ODP. + +With ODP disabled, all buffers used for data transfers are backed by pinned +physical memory. Using Pinned mode avoids any overhead due to network page +faults but requires all buffers to be backed by physical memory. Copy-on-write +semantics are broken when using pinned memory. See the Fork section for more +information. + +## Translation Cache + +Mapping a buffer for use by the NIC is an expensive operation. To avoid this +penalty for each data transfer operation, the CXI provider maintains an internal +translation cache. + +When using the ATS translation mode, the provider does not maintain translations +for individual buffers. It follows that translation caching is not required. + +## Triggered Operation + +The CXI provider supports triggered operations through the deferred work queue +API. The following deferred work queue operations are supported: FI_OP_SEND, +FI_OP_TSEND, FI_OP_READ, FI_OP_WRITE, FI_OP_ATOMIC, FI_OP_FETCH_ATOMIC, and +FI_OP_COMPARE_ATOMIC. FI_OP_RECV and FI_OP_TRECV are also supported, but with +only a threshold of zero. + +The CXI provider backs each triggered operation by hardware resources. +Exhausting triggered operation resources leads to indeterminate behavior and +should be prevented. + +The CXI provider offers two methods to prevent triggered operation resource +exhaustion. + +### Experimental FI_CXI_ENABLE_TRIG_OP_LIMIT Environment Variable + +When FI_CXI_ENABLE_TRIG_OP_LIMIT is enabled, the CXI provider will use +semaphores to coordinate triggered operation usage between threads and across +processes using the same service ID. When triggered operation resources are +exhausted, fi_control(FI_QUEUE_WORK) will return -FI_ENOSPC. It is up to the +libfabric user to recover from this situation. + +**Note:** Preventing triggered operation resource exhaustion with this method +may be expensive and result in a negative performance impact. It is encouraged +libfabric users avoid method unless absolutely needed. By default, +FI_CXI_ENABLE_TRIG_OP_LIMIT is disabled. + +**Note:** Named semaphores are used to coordinated triggered operation resource +usage across multiple processes. System/node software may need to be implemented +to ensure all semaphores are unlinked during unexpected application termination. + +**Note:** This feature is considered experimental and implementation may be +subjected to changed. + +### CXI Domain get_dwq_depth Extension + +The CXI domain get_dwq_depth extension returns the deferred work queue queue +depth (i.e. 
the number of triggered operation resources assigned to the service +ID used by the fi_domain). Libfabric users can use the returned queue depth to +coordinate resource usage. + +For example, suppose the job launcher has configured a service ID with for 512 +triggered operation resources. Since the CXI provider needs to consume 8 per +service ID, 504 should be usable by libfabric users. If the libfabric user knows +there are *N* processes using a given service ID and NIC, it can divide the 504 +triggered operation resource among all *N* processes. + +**Note:** This is the preferred method to prevent triggered operation resource +exhaustion since it does not introduce semaphores into the +fi_control(FI_QUEUE_WORK) critical path. + +## Fork Support + +The following subsections outline the CXI provider fork support. + +### RDMA and Fork Overview + +Under Linux, `fork()` is implemented using copy-on-write (COW) pages, so the +only penalty that it incurs is the time and memory required to duplicate the +parent's page tables, mark all of the process’s page structs as read only and +COW, and create a unique task structure for the child. + +Due to the Linux COW fork policy, both parent and child processes’ virtual +addresses are mapped to the same physical address. The first process to write +to the virtual address will get a new physical page, and thus a new physical +address, with the same content as the previous physical page. + +The Linux COW fork policy is problematic for RDMA NICs. RDMA NICs require +memory to be registered with the NIC prior to executing any RDMA operations. In +user-space, memory registration results in establishing a virtual address to +physical address mapping with the RDMA NIC. This resulting RDMA NIC +mapping/memory region does not get updated when the Linux COW fork policy is +executed. + +Consider the following example: +- Process A is planning to perform RDMA with virtual address 0xffff0000 and a +size of 4096. This virtual address maps to physical address 0x1000. +- Process A registers this virtual address range with the RDMA NIC. The RDMA +NIC device driver programs its page tables to establish the virtual address +0xffff0000 to physical address 0x1000 mapping. +- Process A decides to fork Process B. Virtual address 0xffff0000 will now be +subjected to COW. +- Process A decides to write to virtual address 0xffff0000 before doing the +RDMA operation. This will trigger the Linux COW fork policy resulting in the +following: + - Process A: Virtual address 0xffff0000 maps to new physical address + 0x2000 + - Process B: Virtual address 0xffff0000 maps to previous physical address + 0x1000 +- Process A now executes an RDMA operation using the mapping/memory region +associated with virtual address 0xffff0000. Since COW occurred, the RDMA NIC +executes the RDMA operation using physical address 0x1000 which belongs to +Process B. This results in data corruption. + +The crux of the issue is the parent issuing forks while trying to do RDMA +operations to registered memory regions. Excluding software RDMA emulation, two +options exist for RDMA NIC vendors to resolve this data corruption issue. +- Linux `madvise()` MADV_DONTFORK and MADV_DOFORK +- RDMA NIC support for on-demand paging (ODP) + +#### Linux madvise() MADV_DONTFORK and MADV_DOFORK + +The generic (i.e. 
non-vendor specific) RDMA NIC solution to the Linux COW fork +policy and RDMA problem is to use the following `madvise()` operations during +memory registration and deregistration: +- MADV_DONTFORK: Do not make the pages in this range available to the child +after a `fork()`. This is useful to prevent copy-on-write semantics from +changing the physical location of a page if the parent writes to it after a +`fork()`. (Such page relocations cause problems for hardware that DMAs into the +page.) +- MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the default +behavior, whereby a mapping is inherited across `fork()`. + +In the Linux kernel, MADV_DONTFORK will result in the virtual memory area struct +(VMA) being marked with the VM_DONTCOPY flag. VM_DONTCOPY signals to the Linux +kernel to not duplicate this VMA on fork. This effectively leaves a hole in +child process address space. Should the child reference the virtual address +corresponding to the VMA which was not duplicated, it will segfault. + +In the previous example, if Process A issued `madvise(0xffff0000, 4096, +MADV_DONTFORK)` before performing RDMA memory registration, the physical address +0x1000 would have remained with Process A. This would prevent the Process A data +corruption as well. If Process B were to reference virtual address 0xffff0000, it +will segfault due to the hole in the virtual address space. + +Using `madvise()` with MADV_DONTFORK may be problematic for applications +performing RDMA and page aliasing. Paging aliasing is where the parent process +uses part or all of a page to share information with the child process. If RDMA is +also being used for a separate portion of this page, the child process will +segfault when an access causes page aliasing. + +#### RDMA NIC Support for ODP + +An RDMA NIC vendor specific solution to the Linux COW fork policy and RDMA +problem is to use ODP. ODP allows for the RDMA NIC to generate page requests for +translations it does not have a physical address for. The following is an +updated example with ODP: +- Process A is planning to perform RDMA with virtual address 0xffff0000 and a +size of 4096. This virtual address maps to physical address 0x1000. +- Process A registers this virtual address range with the RDMA NIC. The RDMA NIC +device driver may optionally program its page tables to establish the virtual +address 0xffff0000 to physical address 0x1000 mapping. +- Process A decides to fork Process B. Virtual address 0xffff0000 will now be +subjected to COW. +- Process A decides to write to virtual address 0xffff0000 before doing the RDMA +operation. This will trigger the Linux COW fork policy resulting in the +following: + - Process A: Virtual address 0xffff0000 maps to new physical address 0x2000 + - Process B: Virtual address 0xffff0000 maps to previous physical address + 0x1000 + - RDMA NIC device driver: Receives MMU invalidation event for Process A + virtual address range 0xffff0000 through 0xffff0ffe. The device driver + updates the corresponding memory region to no longer reference physical + address 0x1000. +- Process A now executes an RDMA operation using the memory region associated +with 0xffff0000. The RDMA NIC will recognize the corresponding memory region as +no longer having a valid physical address. The RDMA NIC will then signal to the +device driver to fault in the corresponding address, if necessary, and update +the physical address associated with the memory region. In this case, the memory +region will be updated with physical address 0x2000. 
Once completed, the device +driver signals to the RDMA NIC to continue the RDMA operation. Data corruption +does not occur since RDMA occurred to the correct physical address. + +A RDMA NIC vendor specific solution to the Linux COW fork policy and RDMA +problem is to use ODP. ODP allows for the RDMA NIC to generate page requests +for translations it does not have a physical address for. + +### CXI Provider Fork Support + +The CXI provider is subjected to the Linux COW fork policy and RDMA issues +described in section *RDMA and Fork Overview*. To prevent data corruption with +fork, the CXI provider supports the following options: +- CXI specific fork environment variables to enable `madvise()` MADV_DONTFORK +and MADV_DOFORK +- ODP Support* + +**Formal ODP support pending.* + +#### CXI Specific Fork Environment Variables + +The CXI software stack has two environment variables related to fork: +0 CXI_FORK_SAFE: Enables base fork safe support. With this environment variable +set, regardless of value, libcxi will issue `madvise()` with MADV_DONTFORK on +the virtual address range being registered for RDMA. In addition, libcxi always +align the `madvise()` to the system default page size. On x86, this is 4 KiB. To +prevent redundant `madvise()` calls with MADV_DONTFORK against the same virtual +address region, reference counting is used against each tracked `madvise()` +region. In addition, libcxi will spilt and merge tracked `madvise()` regions if +needed. Once the reference count reaches zero, libcxi will call `madvise()` with +MADV_DOFORK, and no longer track the region. +- CXI_FORK_SAFE_HP: With this environment variable set, in conjunction with +CXI_FORK_SAFE, libcxi will not assume the page size is system default page size. +Instead, libcxi will walk `/proc//smaps` to determine the correct page size +and align the `madvise()` calls accordingly. This environment variable should be +set if huge pages are being used for RDMA. To amortize the per memory +registration walk of `/proc//smaps`, the libfabric MR cache should be used. + +Setting these environment variables will prevent data corruption when the parent +issues a fork. But it may result in the child process experiencing a segfault if +it references a virtual address being used for RDMA in the parent process. + +#### ODP Support and Fork + +CXI provider ODP support would allow for applications to not have to set +CXI_FORK_SAFE and CXI_FORK_SAFE_HP to prevent parent process data corruption. +Enabling ODP to resolve the RDMA and fork issue may or may not result in a +performance impact. The concern with ODP is if the rate of invalidations and ODP +page requests are relatively high and occur at the same time, ODP timeouts may +occur. This would result in application libfabric data transfer operations +failing. + +Please refer to the *CXI Provider ODP Support* for more information on how to +enable/disable ODP. + +#### CXI Provider Fork Support Guidance + +Since the CXI provider offloads the majority of the libfabric data transfer +operations to the NIC, thus enabling end-to-end RDMA between libfabric user +buffers, it is subjected to the issue described in section *RDMA and Fork +Overview*. For comparison, software emulated RDMA libfabric providers may not +have these issues since they rely on bounce buffers to facilitate data transfer. + +The following is the CXI provider fork support guidance: +- Enable CXI_FORK_SAFE. If huge pages are also used, CXI_FORK_SAFE_HP should be +enabled as well. 
Since enabling this will result in `madvice()` with +MADV_DONTFORK, the following steps should be taken to prevent a child process +segfault: + - Avoid using stack memory for RDMA + - Avoid child process having to access a virtual address range the parent + process is performing RDMA against + - Use page-aligned heap allocations for RDMA +- Enable ODP and run without CXI_FORK_SAFE and CXI_FORK_SAFE_HP. The +functionality and performance of ODP with fork may be application specific. +Currently, ODP is not formally supported. + +The CXI provider preferred approach is to use CXI_FORK_SAFE and +CXI_FORK_SAFE_HP. While it may require the application to take certain +precautions, it will result in a more portable application regardless of RDMA +NIC. + +## Heterogenous Memory (HMEM) Supported Interfaces + +The CXI provider supports the following OFI iface types: FI_HMEM_CUDA, FI_HMEM_ROCR, and FI_HMEM_ZE. + +### FI_HMEM_ZE Limitations + +The CXI provider only supports GPU direct RDMA with ZE device buffers if implicit scaling +is disabled. The following ZE environment variables disable implicit scaling: +EnableImplicitScaling=0 NEOReadDebugKeys=1. + +For testing purposes only, the implicit scaling check can be disabled by setting the +following environment variable: FI_CXI_FORCE_ZE_HMEM_SUPPORT=1. This may need to be +combined with the following environment variable to get CXI provider memory registration +to work: FI_CXI_DISABLE_HMEM_DEV_REGISTER=1. + +## Collectives (accelerated) + +The CXI provider supports a limited set of collective operations specifically +intended to support use of the hardware-accelerated reduction features of the +CXI-supported NIC and fabric hardware. + +These features are implemented using the (experimental) OFI collectives API. The +implementation supports the following collective functions: + +* **fi_query_collective**() +* **fi_join_collective**() +* **fi_barrier**() +* **fi_broadcast**() +* **fi_reduce**() +* **fi_allreduce**() + +### **fi_query_collective**() + +Standard implementation that exposes the features described below. + +### **fi_join_collective**() + +The **fi_join_collective**() implementation is provider-managed. However, the +*coll_addr* parameter is not useful to the implementation, and must be +specified as FI_ADDR_NOTAVAIL. The *set* parameter must contain fi_addr_t +values that resolve to meaningful CXI addresses in the endpoint *fi_av* +structure. **fi_join_collective**() must be called for every address in the +*set* list, and must be progressed until the join operation is complete. There +is no inherent limit on join concurrency. + +The join will create a multicast tree in the fabric to manage the collective +operations. This operation requires access to a secure Fabric Manager REST API +that constructs this tree, so any application that attempts to use accelerated +collectives will bind to libcurl and associated security libraries, which must +be available on the system. + +There are hard limits to the number of multicast addresses available on a +system, and administrators may impose additional limits on the number of +multicast addresses available to any given collective job. + +### fi_reduction operations + +Payloads are limited to 32-byte data structures, and because they all use the +same underlying hardware model, they are all synchronizing calls. Specifically, +the supported functions are all variants of fi_allreduce(). + +* **fi_barrier** is **fi_allreduce** using an optimized no-data operator. 
+* **fi_broadcast** is **fi_allreduce** using FI_BOR, with data forced to zero for all but the root rank. +* **fi_reduce** is **fi_allreduce** with a result pointer ignored by all but the root rank. + +All functions must be progressed to completion on all ranks participating in +the collective group. There is a hard limit of eight concurrent reductions on +each collective group, and attempts to launch more operations will return +-FI_EAGAIN. + +**allreduce** supports the following hardware-accelerated reduction operators: + +| Operator | Supported Datatypes | +| -------- | --------- | +| FI_BOR | FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 | +| FI_BAND | FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 | +| FI_BXOR | FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 | +| FI_MIN | FI_INT64, FI_DOUBLE | +| FI_MAX | FI_INT64, FI_DOUBLE | +| FI_SUM | FI_INT64, FI_DOUBLE | +| FI_CXI_MINMAXLOC | FI_INT64, FI_DOUBLE | +| FI_CXI_REPSUM | FI_DOUBLE | + +Data space is limited to 32 bytes in all cases except REPSUM, which supports +only a single FI_DOUBLE. + +Only unsigned bitwise operators are supported. + +Only signed integer arithmetic operations are are supported. + +The MINMAXLOC operators are a mixed data representation consisting of two +values, and two indices. Each rank reports its minimum value and rank index, +and its maximum value and rank index. The collective result is the global +minimum value and rank index, and the global maximum value and rank index. Data +structures for these functions can be found int the fi_cxi_ext.h file. The +*datatype* should represent the type of the minimum/maximum values, and the +*count* must be 1. + +The double-precision operators provide an associative (NUM) variant for MIN, +MAX, and MINMAXLOC. Default IEEE behavior is to treat any operation with NaN as +invalid, including comparison, which has the interesting property of causing: + + MIN(NaN, value) => NaN + MAX(NaN, value) => NaN + +This means that if NaN creeps into a MIN/MAX reduction in any rank, it tends to +poison the entire result. The associative variants instead effectively ignore +the NaN, such that: + + MIN(NaN, value) => value + MAX(NaN, value) => value + +The REPSUM operator implements a reproducible (associative) sum of +double-precision values. The payload can accommodate only a single +double-precision value per reduction, so *count* must be 1. + +See: [Berkeley reproducible sum algorithm](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-121.pdf) +https://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-121.pdf + +### double precision rounding + +C99 defines four rounding modes for double-precision SUM, and some systems may +support a "flush-to-zero" mode for each of these, resulting in a total of eight +different modes for double-precision sum. + +The fabric hardware supports all eight modes transparently. + +Although the rounding modes have thread scope, all threads, processes, and +nodes should use the same rounding mode for any single reduction. + +### reduction flags + +The reduction operations supports two flags: + +* **FI_MORE** +* **FI_CXI_PRE_REDUCED** (overloads **FI_SOURCE**) + +The **FI_MORE** flag advises that the *result* data pointer represents an +opaque, local reduction accumulator, and will be used as the destination of the +reduction. This operation can be repeated any number of times to accumulate +results locally, and spans the full set of all supported reduction operators. +The *op*, *count*, and *datatype* values must be consistent for all calls. 
The +operation ignores all global or static variables — it can be treated as a +*pure* function call — and returns immediately. The caller is responsible +for protecting the accumulator memory if it is used by multiple threads or +processes on a compute node. + +If **FI_MORE** is omitted, the destination is the fabric, and this will +initiate a fabric reduction through the associated endpoint. The reduction must +be progressed, and upon successful completion, the *result* data pointer will +be filled with the final reduction result of *count* elements of type +*datatype*. + +The **FI_CXI_PRE_REDUCED** flag advises that the source data pointer represents +an opaque reduction accumulator containing pre-reduced data. The *count* and +*datatype* arguments are ignored. + +if **FI_CXI_PRE_REDUCED** is omitted, the source is taken to be user data with +*count* elements of type *datatype*. + +The opaque reduction accumulator is exposed as **struct cxip_coll_accumulator** +in the fi_cxi_ext.h file. + +**Note**: The opaque reduction accumulator provides extra space for the +expanded form of the reproducible sum, which carries the extra data required to +make the operation reproducible in software. + +# OPTIMIZATION + +## Optimized MRs + +The CXI provider has two separate MR implementations: standard and optimized. +Standard MRs are designed to support applications which require a large number +of remote memory regions. Optimized MRs are designed to support one-sided +programming models that allocate a small number of large remote memory windows. +The CXI provider can achieve higher RMA Write rates when targeting an optimized +MR. + +Both types of MRs are allocated using fi_mr_reg. MRs with client-provided key in +the range [0-99] are optimized MRs. MRs with key greater or equal to 100 are +standard MRs. An application may create a mix of standard and optimized MRs. To +disable the use of optimized MRs, set environment variable +*FI_CXI_OPTIMIZED_MRS=false*. When disabled, all MR keys are available and all MRs +are implemented as standard MRs. All communicating processes must agree on the +use of optimized MRs. + +When FI_MR_PROV_KEY mr_mode is specified caching of remote access MRs is enabled, +which can improve registration/de-registration performance in RPC type applications, +that wrap RMA operations within a message RPC protocol. Optimized MRs will be +preferred, but will fallback to standard MRs if insufficient hardware resources are +available. + +## Optimized RMA + +Optimized MRs are one requirement for the use of low overhead packet formats +which enable higher RMA Write rates. An RMA Write will use the low overhead +format when all the following requirements are met: + +* The Write targets an optimized MR +* The target MR does not require remote completion notifications (no + *FI_RMA_EVENT*) +* The Write does not have ordering requirements (no *FI_RMA_WAW*) + +Theoretically, Cassini has resources to support 64k standard MRs or 2k optimized +MRs. Practically, the limits are much lower and depend greatly on application +behavior. + +Hardware counters can be used to validate the use of the low overhead packets. +The counter C_CNTR_IXE_RX_PTL_RESTRICTED_PKT counts the number of low overhead +packets received at the target NIC. Counter C_CNTR_IXE_RX_PTL_UNRESTRICTED_PKT +counts the number of ordered RDMA packets received at the target NIC. + +Message rate performance may be further optimized by avoiding target counting +events. To avoid counting events, do not bind a counter to the MR. 
To validate +optimal writes without target counting events, monitor the counter: +C_CNTR_LPE_PLEC_HITS. + +## Unreliable AMOs + +By default, all AMOs are resilient to intermittent packet loss in the network. +Cassini implements a connection-based reliability model to support reliable +execution of AMOs. + +The connection-based reliability model may be disabled for AMOs in order to +increase message rate. With reliability disabled, a lost AMO packet will result +in operation failure. A failed AMO will be reported to the client in a +completion event as usual. Unreliable AMOs may be useful for applications that +can tolerate intermittent AMO failures or those where the benefit of increased +message rate outweighs by the cost of restarting after a failure. + +Unreliable, non-fetching AMOs may be performed by specifying the +*FI_CXI_UNRELIABLE* flag. Unreliable, fetching AMOs are not supported. Unreliable +AMOs must target an optimized MR and cannot use remote completion notification. +Unreliable AMOs are not ordered. + +## High Rate Put + +High Rate Put (HRP) is a feature that increases message rate performance of RMA +and unreliable non-fetching AMO operations at the expense of global ordering +guarantees. + +HRP responses are generated by the fabric egress port. Responses are coalesced +by the fabric to achieve higher message rates. The completion event for an HRP +operation guarantees delivery but does not guarantee global ordering. If global +ordering is needed following an HRP operation, the source may follow the +operation with a normal, fenced Put. + +HRP RMA and unreliable AMO operations may be performed by specifying the +*FI_CXI_HRP* flag. HRP AMOs must also use the *FI_CXI_UNRELIABLE* flag. Monitor the +hardware counter C_CNTR_HNI_HRP_ACK at the initiator to validate that HRP is in +use. + +## Counters + +Cassini offloads light-weight counting events for certain types of operations. +The rules for offloading are: + +* Counting events for RMA and AMO source events are always offloaded. +* Counting events for RMA and AMO target events are always offloaded. +* Counting events for Sends are offloaded when message size is less than the + rendezvous threshold. +* Counting events for message Receives are never offloaded by default. + +Software progress is required to update counters unless the criteria for +offloading are met. + +# RUNTIME PARAMETERS + +The CXI provider checks for the following environment variables: + +*FI_CXI_ODP* +: Enables on-demand paging. If disabled, all DMA buffers are pinned. + If enabled and mr_mode bits in the hints exclude FI_MR_ALLOCATED, + then ODP mode will be used. + +*FI_CXI_FORCE_ODP* +: Experimental value that can be used to force the use of ODP mode + even if FI_MR_ALLOCATED is set in the mr_mode hint bits. This is + intended to be used primarily for testing. + +*FI_CXI_ATS* +: Enables PCIe ATS. If disabled, the NTA mechanism is used. + +*FI_CXI_ATS_MLOCK_MODE* +: Sets ATS mlock mode. The mlock() system call may be used in conjunction + with ATS to help avoid network page faults. Valid values are "off" and + "all". When mlock mode is "off", the provider does not use mlock(). An + application using ATS without mlock() may experience network page faults, + reducing network performance. When ats_mlock_mode is set to "all", the + provider uses mlockall() during initialization with ATS. mlockall() causes + all mapped addresses to be locked in RAM at all times. This helps to avoid + most network page faults. 
Using mlockall() may increase pressure on + physical memory. Ignored when ODP is disabled. + +*FI_CXI_RDZV_THRESHOLD* +: Message size threshold for rendezvous protocol. + +*FI_CXI_RDZV_GET_MIN* +: Minimum rendezvous Get payload size. A Send with length less than or equal + to *FI_CXI_RDZV_THRESHOLD* plus *FI_CXI_RDZV_GET_MIN* will be performed + using the eager protocol. Larger Sends will be performed using the + rendezvous protocol with *FI_CXI_RDZV_THRESHOLD* bytes of payload sent + eagerly and the remainder of the payload read from the source using a Get. + *FI_CXI_RDZV_THRESHOLD* plus *FI_CXI_RDZV_GET_MIN* must be less than or + equal to *FI_CXI_OFLOW_BUF_SIZE*. + +*FI_CXI_RDZV_EAGER_SIZE* +: Eager data size for rendezvous protocol. + +*FI_CXI_RDZV_PROTO* +: Direct the provider to use a preferred protocol to transfer non-eager + rendezvous data. + *FI_CXI_RDZV_PROTO*= default | alt_read + + To use an alternate protocol, the CXI driver property rdzv_get_en should be + set to "0". The "alt_read" rendezvous protocol may help improve collective + operation performance. Note that all rendezvous protocol use RDMA to transfer + eager and non-eager rendezvous data. + +*FI_CXI_DISABLE_NON_INJECT_MSG_IDC* +: Experimental option to disable favoring IDC for transmit of small messages + when FI_INJECT is not specified. This can be useful with GPU source buffers + to avoid the host copy in cases a performant copy can not be used. The default + is to use IDC for all messages less than IDC size. + +*FI_CXI_DISABLE_HOST_REGISTER* +: Disable registration of host buffers (overflow and request) with GPU. There + are scenarios where using a large number of processes per GPU results in page + locking excessive amounts of memory degrading performance and/or restricting + process counts. The default is to register buffers with the GPU. + +*FI_CXI_OFLOW_BUF_SIZE* +: Size of overflow buffers. Increasing the overflow buffer size allows for + more unexpected message eager data to be held in single overflow buffer. + The default size is 2MB. + +*FI_CXI_OFLOW_BUF_MIN_POSTED/FI_CXI_OFLOW_BUF_COUNT* +: The minimum number of overflow buffers that should be posted. The default + minimum posted count is 3. Buffers will grow unbounded to support + outstanding unexpected messages. Care should be taken to size appropriately + based on job scale, size of eager data, and the amount of unexpected + message traffic to reduce the need for flow control. + +*FI_CXI_OFLOW_BUF_MAX_CACHED* +: The maximum number of overflow buffers that will be cached. The default + maximum count is 3 * FI_CXI_OFLOW_BUF_MIN_POSTED. A value of zero indicates + that once a overflow buffer is allocated it will be cached and used as + needed. A non-zero value can be used with bursty traffic to shrink the + number of allocated buffers to the maximum count when they are no longer + needed. + +*FI_CXI_SAFE_DEVMEM_COPY_THRESHOLD +: Defines the maximum CPU memcpy size for HMEM device memory that is + accessible by the CPU with load/store operations. + +*FI_CXI_OPTIMIZED_MRS* +: Enables optimized memory regions. See section + *CXI Domain Control Extensions* on how to enable/disable optimized MRs at + the domain level instead of for the global process/job. + +*FI_CXI_MR_MATCH_EVENTS* +: Enabling MR match events in a client/server environment can be used + to ensure that memory backing a memory region cannot be remotely + accessed after the MR has been closed, even if it that memory remains + mapped in the libfabric MR cache. 
Manual progress must be made at the + target to process the MR match event accounting and avoid event queue + overflow. There is a slight additional cost in the creation and + tear-down of MR. This option is disabled by default. + + See section *CXI Domain Control Extensions* on how to enable MR match + events at the domain level instead of for the global process/job. + +*FI_CXI_PROV_KEY_CACHE* +: Enabled by default, the caching of remote MR provider keys can be + disable by setting to 0. + + See section *CXI Domain Control Extensions* on how to disable the + remote provider key cache at the domain level instead of for the + global process/job. + +*FI_CXI_LLRING_MODE* +: Set the policy for use of the low-latency command queue ring mechanism. + This mechanism improves the latency of command processing on an idle + command queue. Valid values are idle, always, and never. + +*FI_CXI_CQ_POLICY* +: Experimental. Set Command Queue write-back policy. Valid values are always, + high_empty, low_empty, and low. "always", "high", and "low" refer to the + frequency of write-backs. "empty" refers to whether a write-back is + performed when the queue becomes empty. + +*FI_CXI_DEFAULT_VNI* +: Default VNI value used only for service IDs where the VNI is not restricted. + +*FI_CXI_EQ_ACK_BATCH_SIZE* +: Number of EQ events to process before writing an acknowledgement to HW. + Batching ACKs amortizes the cost of event acknowledgement over multiple + network operations. + +*FI_CXI_RX_MATCH_MODE* +: Specify the receive message matching mode to be utilized. + *FI_CXI_RX_MATCH_MODE=*hardware | software | hybrid + + *hardware* - Message matching is fully offloaded, if resources become + exhausted flow control will be performed and existing unexpected message + headers will be onloaded to free resources. + + *software* - Message matching is fully onloaded. + + *hybrid* - Message matching begins fully offloaded, if resources become + exhuasted hardware will transition message matching to a hybrid of + hardware and software matching. + + For both *"hybrid"* and *"software"* modes and care should be taken to + minimize the threshold for rendezvous processing + (i.e. *FI_CXI_RDZV_THRESHOLD* + *FI_CXI_RDZV_GET_MIN*). When running in + software endpoint mode the environment variables *FI_CXI_REQ_BUF_SIZE* + and *FI_CXI_REQ_BUF_MIN_POSTED* are used to control the size and number + of the eager request buffers posted to handle incoming unmatched messages. + +*FI_CXI_HYBRID_PREEMPTIVE* +: When in hybrid mode, this variable can be used to enable preemptive + transitions to software matching. This is useful at scale for poorly + written applications with a large number of unexpected messages + where reserved resources may be insufficient to prevent to prevent + starvation of software request list match entries. Default is 0, disabled. + +*FI_CXI_HYBRID_RECV_PREEMPTIVE* +: When in hybrid mode, this variable can be used to enable preemptive + transitions to software matching. This is useful at scale for poorly + written applications with a large number of unmatched posted receives + where reserved resources may be insufficient to prevent starvation of + software request list match entries. Default is 0, disabled. + +*FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE* +: When in hybrid mode, this variable can be used to enable preemptive + transitions to software matching when the number of posted receives + exceeds the user requested RX size attribute. 
This is useful for + applications where they may not know the exact number of posted receives + and they are expereincing application termination due to event queue + overflow. Default is 0, disabled. + +*FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE* +: When in hybrid mode, this variable can be used to enable preemptive + transitions to software matching when the number of hardware unexpected + messages exceeds the user requested RX size attribute. This is useful for + applications where they may not know the exact number of posted receives + and they are expereincing application termination due to event queue + overflow. Default is 0, disabled. + +*FI_CXI_REQ_BUF_SIZE* +: Size of request buffers. Increasing the request buffer size allows for more + unmatched messages to be sent into a single request buffer. The default + size is 2MB. + +*FI_CXI_REQ_BUF_MIN_POSTED* +: The minimum number of request buffers that should be posted. The default + minimum posted count is 4. The number of buffers will grow unbounded to + support outstanding unexpected messages. Care should be taken to size + appropriately based on job scale and the size of eager data to reduce + the need for flow control. + +*FI_CXI_REQ_BUF_MAX_CACHED/FI_CXI_REQ_BUF_MAX_COUNT* +: The maximum number of request buffers that will be cached. The default + maximum count is 0. A value of zero indicates that once a request buffer + is allocated it will be cached and used as needed. A non-zero value can + be used with bursty traffic to shrink the number of allocated buffers to + a maximum count when they are no longer needed. + +*FI_CXI_MSG_LOSSLESS* +: Enable or disable lossless receive matching. If hardware resources are + exhausted, hardware will pause the associated traffic class until a + overflow buffer (hardware match mode) or request buffer (software match + mode or hybrid match mode) is posted. This is considered experimental and + defaults to disabled. + +*FI_CXI_FC_RETRY_USEC_DELAY* +: Number of micro-seconds to sleep before retrying a dropped side-band, flow + control message. Setting to zero will disable any sleep. + +*FI_UNIVERSE_SIZE* +: Defines the maximum number of processes that will be used by distribute + OFI application. Note that this value is used in setting the default + control EQ size, see FI_CXI_CTRL_RX_EQ_MAX_SIZE. + +*FI_CXI_CTRL_RX_EQ_MAX_SIZE* +: Max size of the receive event queue used for side-band/control messages. + Default receive event queue size is based on FI_UNIVERSE_SIZE. Increasing the + receive event queue size can help prevent side-band/control messages from + being dropped and retried but at the cost of additional memory usage. Size is + always aligned up to a 4KiB boundary. + +*FI_CXI_DEFAULT_CQ_SIZE* +: Change the provider default completion queue size expressed in entries. This + may be useful for applications which rely on middleware, and middleware defaults + the completion queue size to the provider default. + +*FI_CXI_DISABLE_EQ_HUGETLB/FI_CXI_DISABLE_CQ_HUGETLB* +: By default, the provider will attempt to allocate 2 MiB hugetlb pages for + provider event queues. Disabling hugetlb support will cause the provider + to fallback to memory allocators using host page sizes. + FI_CXI_DISABLE_EQ_HUGETLB replaces FI_CXI_DISABLE_CQ_HUGETLB, however use + of either is still supported. + +*FI_CXI_DEFAULT_TX_SIZE* +: Set the default tx_attr.size field to be used by the provider if the size + is not specified in the user provided fi_info hints. 
+ +*FI_CXI_DEFAULT_RX_SIZE* +: Set the default rx_attr.size field to be used by the provider if the size + is not specified in the user provided fi_info hints. + +*FI_CXI_SW_RX_TX_INIT_MAX* +: Debug control to override the number of TX operations that can be + outstanding that are initiated by software RX processing. It has no impact + on hardware initiated RX rendezvous gets. + +*FI_CXI_DEVICE_NAME* +: Restrict CXI provider to specific CXI devices. Format is a comma separated + list of CXI devices (e.g. cxi0,cxi1). + +*FI_CXI_TELEMETRY* +: Perform a telemetry delta between fi_domain open and close. Format is a + comma separated list of telemetry files as defined in + /sys/class/cxi/cxi*/device/telemetry/. The ALL-in-binary file in this + directory is invalid. Note that these are per CXI interface counters and not + per CXI process per interface counters. + +*FI_CXI_TELEMETRY_RGID* +: Resource group ID (RGID) to restrict the telemetry collection to. Value less + than 0 is no restrictions. + +*FI_CXI_CQ_FILL_PERCENT* +: Fill percent of underlying hardware event queue used to determine when + completion queue is saturated. A saturated completion queue results in the + provider returning -FI_EAGAIN for data transfer and other related libfabric + operations. + +*FI_CXI_COMPAT* +: Temporary compatibility to allow use of pre-upstream values for FI_ADDR_CXI and + FI_PROTO_CXI. Compatibility can be disabled to verify operation with upstream + constant values and to enable access to conflicting provider values. The default + setting of 1 specifies both old and new constants are supported. A setting of 0 + disables support for old constants and can be used to test that an application is + compatible with the upstream values. A setting of 2 is a safety fallback that if + used the provider will only export fi_info with old constants and will be incompatible + with libfabric clients that been recompiled. + +*FI_CXI_COLL_FABRIC_MGR_URL* +: **accelerated collectives:** Specify the HTTPS address of the fabric manager REST API + used to create specialized multicast trees for accelerated collectives. This parameter + is **REQUIRED** for accelerated collectives, and is a fixed, system-dependent value. + +*FI_CXI_COLL_TIMEOUT_USEC* +: **accelerated collectives:** Specify the reduction engine timeout. This should be + larger than the maximum expected compute cycle in repeated reductions, or acceleration + can create incast congestion in the switches. The relative performance benefit of + acceleration declines with increasing compute cycle time, dropping below one percent at + 32 msec (32000). Using acceleration with compute cycles larger than 32 msec is not + recommended except for experimental purposes. Default is 32 msec (32000), maximum is + 20 sec (20000000). + +*FI_CXI_COLL_USE_DMA_PUT* +: **accelerated collectives:** Use DMA for collective packet put. This uses DMA to + inject reduction packets rather than IDC, and is considered experimental. Default + is false. + +*FI_CXI_DISABLE_HMEM_DEV_REGISTER* +: Disable registering HMEM device buffer for load/store access. Some HMEM devices + (e.g. AMD, Nvidia, and Intel GPUs) support backing the device memory by the PCIe BAR. + This enables software to perform load/stores to the device memory via the BAR instead + of using device DMA engines. Direct load/store access may improve performance. + +*FI_CXI_FORCE_ZE_HMEM_SUPPORT* +: Force the enablement of ZE HMEM support. 
By default, the CXI provider will only + support ZE memory registration if implicit scaling is disabled (i.e. the environment + variables EnableImplicitScaling=0 NEOReadDebugKeys=1 are set). Set + FI_CXI_FORCE_ZE_HMEM_SUPPORT to 1 will cause the CXI provider to skip the implicit + scaling checks. GPU direct RDMA may or may not work in this case. + +*FI_CXI_ENABLE_TRIG_OP_LIMIT* +: Enable enforcement of triggered operation limit. Doing this can prevent + fi_control(FI_QUEUE_WORK) deadlocking at the cost of performance. + +Note: Use the fi_info utility to query provider environment variables: +fi_info -p cxi -e + +# CXI EXTENSIONS + +The CXI provider supports various fabric-specific extensions. Extensions are +accessed using the fi_open_ops function. + +### CXI Domain Control Extensions + +The **fi_control**() function is extended for domain FIDs to query and override +global environment settings for a specific domain. This is useful for example +where the application process also includes a client API that has different +optimizations and protections. + +Command *FI_OPT_CXI_GET_OPTIMIZED* where the argument is a pointer to a bool. +The call returns the setting for optimized MR usage for the domain. The default +is determined by the environment setting of *FI_CXI_OPTIMIZED_MRS*. + +Command *FI_OPT_CXI_SET_OPTIMIZED* where the argument is a pointer to a bool +initialized to true or false. The call enables or disables the use of optimized +MRs for the domain. If the domain is not configured for FI_MR_PROV_KEY MR mode, +the call will fail with -FI_EINVAL, it is not supported for client generated +keys. It must be called prior to MR being created. + +Command *FI_OPT_CXI_GET_MR_MATCH_EVENTS* where the argument is a pointer to a +bool. The call returns the setting for MR Match Event accounting for the +domain. The default is determined by the environment setting of +*FI_CXI_MR_MATCH_EVENTS*. + +Command *FI_OPT_CXI_SET_MR_MATCH_EVENTS* where the argument is a pointer to a +bool initialized to true or false. This call enables or disables the use of MR +Match Event counting. This ensures that memory backing a MR cannot be accessed +after invoking fi_close() on the MR, even if that memory remains in the +libfabric MR cache. Manual progress must be made to process events at the RMA +destination. It can only be changed prior to any EP or MR being created. + +Command *FI_OPT_CXI_GET_PROV_KEY_CACHE* where the argument is a pointer to a +bool. The call returns the setting for enabling use of the remote MR +cache for provider keys for the domain. The default is determined by the +environment setting of *FI_CXI_PROV_KEY_CACHE* and is only valid if +FI_MR_PROV_KEY MR mode is used. + +Command *FI_OPT_CXI_SET_PROV_KEY_CACHE* where the argument is a pointer to a +bool initialized to true or false. This call enables or disables the use of +the remote MR cache for provider keys for the domain. By default the cache +is enabled and can be used for provider keys that do not require events. +The command will fail with -FI_EINVAL if FI_MR_PROV_KEY MR mode is not in use. +It can only be changed prior to any MR being created. + +## CXI Domain Extensions + +CXI domain extensions have been named *FI_CXI_DOM_OPS_6*. The flags parameter +is ignored. The fi_open_ops function takes a `struct fi_cxi_dom_ops`. 
See an +example of usage below: + +```c +struct fi_cxi_dom_ops *dom_ops; + +ret = fi_open_ops(&domain->fid, FI_CXI_DOM_OPS_4, 0, (void **)&dom_ops, NULL); +``` + +The following domain extensions are defined: + +```c +struct fi_cxi_dom_ops { + int (*cntr_read)(struct fid *fid, unsigned int cntr, uint64_t *value, + struct timespec *ts); + int (*topology)(struct fid *fid, unsigned int *group_id, + unsigned int *switch_id, unsigned int *port_id); + int (*enable_hybrid_mr_desc)(struct fid *fid, bool enable); + size_t (*ep_get_unexp_msgs)(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, + size_t count, fi_addr_t *src_addr, + size_t *ux_count); + int (*get_dwq_depth)(struct fid *fid, size_t *depth); +}; +``` + +*cntr_read* extension is used to read hardware counter values. Valid values +of the cntr argument are found in the Cassini-specific header file +cassini_cntr_defs.h. Note that Counter accesses by applications may be +rate-limited to 1HZ. + +*topology* extension is used to return CXI NIC address topology information +for the domain. Currently only a dragonfly fabric topology is reported. + +The enablement of hybrid MR descriptor mode allows for libfabric users +to optionally pass in a valid MR desc for local communications operations. + +The get unexpected message function is used to obtain a list of +unexpected messages associated with an endpoint. The list is returned +as an array of CQ tagged entries set in the following manner: + +``` +struct fi_cq_tagged_entry { + .op_context = NULL, + .flags = any of [FI_TAGGED | FI_MSG | FI_REMOTE_CQ_DATA], + .len = message length, + .buf = NULL, + .data = CQ data if FI_REMOTE_CQ_DATA set + .tag = tag if FI_TAGGED set +}; +``` + +If the src_addr or entry array is NULL, only the ux_count of +available unexpected list entries will be returned. The parameter +count specifies the size of the array provided, if it is 0 then only +the ux_count will be returned. The function returns the number of +entries written to the array or a negative errno. On successful return, +ux_count will always be set to the total number of unexpected messages available. + +*enable_hybrid_mr_desc* is used to enable hybrid MR descriptor mode. Hybrid MR +desc allows for libfabric users to optionally pass in a valid MR desc for local +communication operations. This is currently only used for RMA and AMO transfers. + +*get_dwq_depth* is used to get the depth of the deferred work queue. The depth +is the number of triggered operation commands which can be queued to hardware. +The depth is not per fi_domain but rather per service ID. Since a single service +ID is intended to be shared between all processing using the same NIC in a job +step, the triggered operations are shared across processes. + +*enable_mr_match_events* and *enable_optimized_mrs* have been deprecated +in favor of using the fi_control() API. While the can be still be called via +the domain ops, They will be removed from the domain opts prior to software +release 2.2. + +## CXI Counter Extensions + +CXI counter extensions have been named *FI_CXI_COUNTER_OPS*. The flags parameter +is ignored. The fi_open_ops function takes a `struct fi_cxi_cntr_ops`. See an +example of usage below. + +```c +struct fi_cxi_cntr_ops *cntr_ops; + +ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, (void **)&cntr_ops, NULL); +``` + +The following domain extensions are defined: + +```c +struct fi_cxi_cntr_ops { + /* Set the counter writeback address to a client provided address. 
*/ + int (*set_wb_buffer)(struct fid *fid, const void *buf, size_t len); + + /* Get the counter MMIO region. */ + int (*get_mmio_addr)(struct fid *fid, void **addr, size_t *len); +}; +``` + +## CXI Counter Writeback Flag + +If a client is using the CXI counter extensions to define a counter writeback +buffer, the CXI provider will not update the writeback buffer success or +failure values for each hardware counter success or failure update. This can +especially create issues when clients expect the completion of a deferred +workqueue operation to generate a counter writeback. To support this, the flag +*FI_CXI_CNTR_WB* can be used in conjunction with a deferred workqueue operation +to force a writeback at the completion of the deferred workqueue operation. See +an example of usage below. + +```c +struct fi_op_rma rma = { + /* Signal to the provider the completion of the RMA should trigger a + * writeback. + */ + .flags = FI_CXI_CNTR_WB, +}; + +struct fi_deferred_work rma_work = { + .op_type = FI_OP_READ, + .triggering_counter = cntr, + .completion_cntr = cntr, + .threshold = 1, + .op.rma = &rma, +}; + +ret = fi_control(&domain->fid, FI_QUEUE_WORK, &rma_work); +``` + +**Note:** Using *FI_CXI_CNTR_WB* will lead to additional hardware usage. To +conserve hardware resources, it is recommended to only use the *FI_CXI_CNTR_WB* +when a counter writeback is absolutely required. + +## CXI Alias EP Overrides + +A transmit alias endpoint can be created and configured to utilize +a different traffic class than the original endpoint. This provides a +lightweight mechanism to utilize multiple traffic classes within a process. +Message order between the original endpoint and the alias endpoint is +not defined/guaranteed. See example usage below for setting the traffic +class of a transmit alias endpoint. + +```c +#include +#include +#include // Ultimately fi_ext.h + +struct fid_ep *ep; +. . . + +struct fid_ep *alias_ep = NULL; +uint32_t tclass = FI_TC_LOW_LATENCY; +uint64_t op_flags = FI_TRANSMIT | desired data operation flags; + +ret = fi_ep_alias(ep, &alias_ep, op_flags); +if (ret) + error; + +ret = fi_set_val(&alias_ep->fid, FI_OPT_CXI_SET_TCLASS, (void *)&tlcass); +if (ret) + error; +``` + +In addition, the alias endpoint message order may be modified to override +the default endpoint message order. Message order between the modified +alias endpoint and the original endpoint is not guaranteed. See example +usage below for setting the traffic class of a transmit alias endpoint. + +```c +uint64_t msg_order = FI_ORDER_RMA_WAW; + +ret = fi_set_val(&alias_ep->fid, FI_OPT_CXI_SET_MSG_ORDER, + (void *)&msg_order); +if (ret) + error; +``` + +When an endpoint does not support FI_FENCE (e.g. optimized MR), a provider +specific transmit flag, FI_CXI_WEAK_FENCE, may be specified on an alias EP +to issue a FENCE operation to create a data ordering point for the alias. +This is supported for one-sided operations only. + +Alias EP must be closed prior to closing the original EP. + +## PCIe Atomics +The CXI provider has the ability to issue a given libfabric atomic memory +operation as a PCIe operation as compared to a NIC operation. The CXI +provider extension flag FI_CXI_PCIE_AMO is used to signify this. + +Since not all libfabric atomic memory operations can be executed as a PCIe +atomic memory operation, `fi_query_atomic()` could be used to query if a +given libfabric atomic memory operation could be executed as PCIe atomic +memory operation. 
+ +The following is a query to see if a given libfabric operation can be a +PCIe atomic operation. +```c +int ret; +struct fi_atomic_attr out_attrs; + +/* Query if non-fetching PCIe atomic is supported. */ +ret = fi_query_atomic(domain, FI_UINT32, FI_SUM, &out_attrs, FI_CXI_PCIE_AMO); + +/* Query if fetching PCIe atomic is supported. */ +ret = fi_query_atomic(domain, FI_UINT32, FI_SUM, &out_attrs, + FI_FETCH_ATOMIC | FI_CXI_PCIE_AMO); +``` + +The following is how to issue a PCIe atomic operation. +```c +ssize_t ret; +struct fi_msg_atomic msg; +struct fi_ioc resultv; +void *result_desc; +size_t result_count; + +ret = fi_fetch_atomicmsg(ep, &msg, &resultv, &result_desc, result_count, + FI_CXI_PCIE_AMO); +``` + +**Note:** The CXI provider only supports PCIe fetch add for UINT32_T, INT32_t, +UINT64_T, and INT64_t. This support requires enablement of PCIe fetch add in +the CXI driver, and it comes at the cost of losing NIC atomic support for another +libfabric atomic operation. + +**Note:** Ordering between PCIe atomic operations and NIC atomic/RMA operations is +undefined. + +To enable PCIe fetch add for libfabric, the following CXI driver kernel module +parameter must be set to non-zero. + +``` +/sys/module/cxi_core/parameters/amo_remap_to_pcie_fadd +``` + +The following are the possible values for this kernel module and the impact of +each value: +- -1: Disable PCIe fetch add support. FI_CXI_PCIE_AMO is not supported. +- 0: Enable PCIe fetch add support. FI_MIN is not supported. +- 1: Enable PCIe fetch add support. FI_MAX is not supported. +- 2: Enable PCIe fetch add support. FI_SUM is not supported. +- 4: Enable PCIe fetch add support. FI_LOR is not supported. +- 5: Enable PCIe fetch add support. FI_LAND is not supported. +- 6: Enable PCIe fetch add support. FI_BOR is not supported. +- 7: Enable PCIe fetch add support. FI_BAND is not supported. +- 8: Enable PCIe fetch add support. FI_LXOR is not supported. +- 9: Enable PCIe fetch add support. FI_BXOR is not supported. +- 10: Enable PCIe fetch add support. No loss of default CXI provider AMO +functionality. + +Guidance is to default amo_remap_to_pcie_fadd to 10. + +# FABTESTS + +The CXI provider does not currently support fabtests which depend on IP +addressing. + +fabtest RDM benchmarks are supported, like: + +```c +# Start server by specifying source PID and interface +./fabtests/benchmarks/fi_rdm_tagged_pingpong -B 10 -s cxi0 + +# Read server NIC address +CXI0_ADDR=$(cat /sys/class/cxi/cxi0/device/properties/nic_addr) + +# Start client by specifying server PID and NIC address +./fabtests/benchmarks/fi_rdm_tagged_pingpong -P 10 $CXI0_ADDR + +# The client may be bound to a specific interface, like: +./fabtests/benchmarks/fi_rdm_tagged_pingpong -B 10 -s cxi1 -P 10 $CXI0_ADDR +``` + +Some functional fabtests are supported (including fi_bw). Others use IP sockets +and are not yet supported. + +multinode fabtests are not yet supported. + +ubertest is supported for test configs matching the provider's current +capabilities. + +unit tests are supported where the test feature set matches the CXI provider's +current capabilities. + +# ERRATA + +* Fetch and compare type AMOs with FI_DELIVERY_COMPLETE or FI_MATCH_COMPLETE + completion semantics are not supported with FI_RMA_EVENT. + +# Libfabric CXI Provider User Programming and Troubleshooting Guide + +The scope of the following subsection is to provide guidance and/or troubleshooting tips +for users of the libfabric CXI provider. The scope of this section is not a full guide +for user libfabric. 
+
+## Sizing Libfabric Objects Based on Expected Usage
+
+The CXI provider uses various libfabric object size attributes and/or libfabric environment
+variables to size hardware related resources accordingly. Failure to size resources properly
+can result in the CXI provider frequently returning -FI_EAGAIN, which may negatively impact
+performance. The following subsections outline important sizing related attributes and
+environment variables.
+
+### Completion Queue Size Attribute
+
+The CXI provider uses the completion queue size attribute to size various software and hardware
+event queues used to generate libfabric completion events. While the size of the software
+queues may grow, hardware event queue sizes are static. Failing to size hardware queues
+properly may result in the CXI provider returning -FI_EAGAIN frequently for data transfer
+operations. When this error is returned, the user should progress the corresponding endpoint
+completion queues by calling fi_cq_read().
+
+Users are encouraged to set the completion queue size attribute based on the expected
+number of inflight RDMA operations to and from a single endpoint. For users who are
+relying on the provider default value (e.g. MPI), the FI_CXI_DEFAULT_CQ_SIZE environment
+variable can be used to override the provider default value.
+
+### Endpoint Receive Size Attribute
+
+The CXI provider uses the endpoint receive size attribute to size internal command
+and hardware event queues. Failing to size either command queue correctly can result
+in the CXI provider returning -FI_EAGAIN frequently for data transfer operations. When
+this error is returned, the user should progress the corresponding endpoint completion queues
+by calling fi_cq_read().
+
+Users are encouraged to set the endpoint receive size attribute based on the expected
+number of inflight untagged and tagged RDMA operations. For users who are relying on the
+provider default value (e.g. MPI), the FI_CXI_DEFAULT_RX_SIZE environment variable can be
+used to override the provider default value.
+
+### Endpoint Transmit Size Attribute
+
+The CXI provider uses the endpoint transmit size attribute to size internal command
+and hardware event queues. Failing to size either command queue correctly can result
+in the CXI provider returning -FI_EAGAIN frequently for data transfer operations. When
+this error is returned, the user should progress the corresponding endpoint completion queues
+by calling fi_cq_read().
+
+At a minimum, users are encouraged to set the endpoint transmit size attribute based on
+the expected number of inflight, initiator RDMA operations. If users are going to be
+issuing message operations over the CXI provider rendezvous limit (FI_CXI_RDZV_THRESHOLD),
+the transmit size attribute must also include the number of outstanding, unexpected
+rendezvous operations (i.e. inflight, initiator RDMA operations + outstanding, unexpected
+rendezvous operations).
+
+For users who are relying on the provider default value (e.g. MPI), the
+FI_CXI_DEFAULT_TX_SIZE environment variable can be used to override the provider default
+value.
+
+### FI_UNIVERSE_SIZE Environment Variable
+
+The libfabric FI_UNIVERSE_SIZE environment variable defines the number of expected ranks/peers
+an application needs to communicate with. The CXI provider may use this environment variable
+to size resources tied to the number of peers. Users are encouraged to set this environment
+variable accordingly.
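+
+The following is a minimal sketch of how the size attributes discussed above could be
+requested through fi_getinfo() hints and fi_cq_open(). The sizing constants are
+hypothetical placeholders and should be replaced with values reflecting the
+application's expected number of inflight operations.
+
+```c
+#include <string.h>
+#include <rdma/fabric.h>
+#include <rdma/fi_domain.h>
+
+/* Hypothetical sizing values based on expected inflight operations. */
+#define EXPECTED_INFLIGHT_TX 1024
+#define EXPECTED_INFLIGHT_RX 1024
+
+struct fi_info *hints, *info;
+struct fid_fabric *fabric;
+struct fid_domain *domain;
+struct fid_cq *cq;
+struct fi_cq_attr cq_attr = {
+	/* Size the CQ for the expected inflight operations per endpoint. */
+	.size = EXPECTED_INFLIGHT_TX + EXPECTED_INFLIGHT_RX,
+	.format = FI_CQ_FORMAT_TAGGED,
+};
+int ret;
+
+hints = fi_allocinfo();
+hints->fabric_attr->prov_name = strdup("cxi");
+
+/* Request transmit/receive queue sizes matching expected usage rather
+ * than relying on the provider defaults. */
+hints->tx_attr->size = EXPECTED_INFLIGHT_TX;
+hints->rx_attr->size = EXPECTED_INFLIGHT_RX;
+
+ret = fi_getinfo(FI_VERSION(1, 15), NULL, NULL, 0, hints, &info);
+
+ret = fi_fabric(info->fabric_attr, &fabric, NULL);
+ret = fi_domain(fabric, info, &domain, NULL);
+ret = fi_cq_open(domain, &cq_attr, &cq, NULL);
+```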
+
+## Selecting Proper Receive Match Mode
+
+As mentioned in the *Runtime Parameters* section, the CXI provider supports three different
+receive match modes: hardware, hybrid, and software.
+
+Hardware match mode is appropriate for users who can ensure the sum of unexpected messages
+and posted receives does not exceed the configured hardware receive resource limit for the
+application. When resources are consumed, the endpoint will transition into a flow control
+operational mode which requires side-band messaging to recover from. Recovery will involve
+the CXI provider trying to reclaim hardware receive resources to help prevent future
+transitions into flow control. If the CXI provider is unable to reclaim hardware receive
+resources, this can lead to a cycle of entering and exiting flow control which may present
+itself as a hang to the libfabric user. Running with FI_LOG_LEVEL=warn and FI_LOG_PROV=cxi
+will report if this flow control transition is happening.
+
+Hybrid match mode is appropriate for users who are unsure whether the sum of unexpected messages
+and posted receives will exceed the configured hardware receive resource limit for the
+application but want to ensure the application still functions if hardware receive resources
+are consumed. Hybrid match mode extends hardware match mode by allowing for an automated
+transition into software match mode if resources are consumed.
+
+Software match mode is appropriate for users who know the sum of unexpected messages
+and posted receives will exceed the configured hardware receive resource limit for the
+application. In software match mode, the CXI provider maintains a software unexpected message
+and posted receive list rather than offloading to hardware. This avoids having to allocate a
+hardware receive resource for each unexpected message and posted receive.
+
+*Note*: In practice, dependent processes (e.g. a parallel job) will most likely be sharing a
+receive hardware resource pool.
+
+*Note*: Each match mode may still enter flow control. For example, if a user is not draining
+the libfabric completion queue at a reasonable rate, corresponding hardware events may fill
+up, which will trigger flow control.
+
+## Using Hybrid Match Mode Preemptive Options
+
+The high-level objective of the hybrid match mode preemptive environment variables (i.e.
+FI_CXI_HYBRID_PREEMPTIVE, FI_CXI_HYBRID_RECV_PREEMPTIVE,
+FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE, and FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE) is to
+ensure that a process requiring more hardware receive resources does not force other
+processes requiring fewer hardware receive resources into software match mode because no
+hardware receive resources remain available.
+
+For example, consider a parallel application which has multiple processes (i.e. ranks)
+per NIC all sharing the same hardware receive resource pool. Suppose that the application
+communication pattern results in an all-to-one communication to only a single rank (e.g.
+rank 0) while other ranks may be doing communication amongst each other. If the width of
+the all-to-one communication exhausts hardware receive resources, all ranks on the target
+NIC will transition to software match mode. The preemptive options may help ensure that only
+rank 0 would transition to software match mode instead of all the ranks on the target NIC.
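+
+As a sketch of how these settings might be applied, the match mode and preemptive
+options can be exported in the job launch environment or, as shown below, set
+programmatically before the provider is initialized. The combination shown here is
+illustrative, not a recommendation.
+
+```c
+#include <stdlib.h>
+#include <rdma/fabric.h>
+
+struct fi_info *info;
+int ret;
+
+/* Illustrative only: select hybrid matching and opt in to the preemptive
+ * transitions discussed in this section. */
+setenv("FI_CXI_RX_MATCH_MODE", "hybrid", 1);
+setenv("FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE", "1", 1);
+setenv("FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE", "1", 1);
+
+/* Environment variables are read when the provider is initialized, so
+ * they must be set before the first call to fi_getinfo(). */
+ret = fi_getinfo(FI_VERSION(1, 15), NULL, NULL, 0, NULL, &info);
+```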
+ +The FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE and FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE +environment variables will force the transition to software match mode if the user +requested endpoint recieve size attribute is exceeded. The benefit of running with +these enabled is that software match mode transition is 100% in control of the libfabric +user through the receive size attribute. One approach users could take here is set +receive size attribute to expected usage, and if this expected usage is exceeded, only +the offending endpoints will transition to software match mode. + +FI_CXI_HYBRID_PREEMPTIVE and FI_CXI_HYBRID_RECV_PREEMPTIVE environment variables will +force the transition to software match mode if hardware receive resources in the pool +are running low. The CXI provider will do a multi-step process to transition the libfabric +endpoint to software match mode. The benefit of running with these enabled is that the +number of endpoints transitioning to software match mode may be smaller when compared to +forced software match mode transition due to zero hardware resources available. + +## Preventing Messaging Flow Control Due to Hardware Event Queue Sizing + +As much as possible, CXI provider message flow control should be avoided. Flow control +results in expensive, side-band, CXI provider internal messaging to recover from. One +cause for flow control is due to improper hardware event queue sizing. If the hardware +event queue is undersized resulting it filling quicker than expected, the next incoming +message operation targeting a full event queue will result in the message operation +being dropped and flow control triggered. + +The default CXI provider behavior is to size hardware event queues based on endpoint +transmit and receive size attributes. Thus, it is critical for users to set these +attributes accordingly. + +The CQ size can be used to override the CXI provider calcuatled hardware event queue +size based on endpoint transmit and receive size attributes. If the CQ size is greater +than the CXI proviuder calcuation, the value from the CQ size will be used. + +The CQ fill percent can be used to define a threshold for when no new RDMA operations +can be queued until the libfabric CQ a progressed thus draining hardware event queues. + +## Interrupting CXI Provider CQ Error Event Errno + +The following are the libfabric errno value which may be returned in an RDMA CQ error event. + +FI_ETRUNC: Receive message truncation. + +FI_EHOSTUNREACH: Target is unreachable. This is due to connectivity issues, such as downed +links, between the two peers. + +FI_ENOTCONN: Cannot communicate due to no libfabric endpoint configure. In this case, the +target NIC is reachable. + +FI_EIO: Catch all errno. + +# SEE ALSO + +[`fabric`(7)](fabric.7.html), +[`fi_provider`(7)](fi_provider.7.html), diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 new file mode 100644 index 00000000000..bb8e7cba3e5 --- /dev/null +++ b/man/man7/fi_cxi.7 @@ -0,0 +1,2144 @@ +.\"t +.\" Automatically generated by Pandoc 2.9.2.1 +.\" +.TH "fi_cxi" "7" "2023\-11\-30" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.hy +.SH NAME +.PP +fi_cxi - The CXI Fabric Provider +.SH OVERVIEW +.PP +The CXI provider enables libfabric on Cray\[cq]s Slingshot network. +Slingshot is comprised of the Rosetta switch and Cassini NIC. +Slingshot is an Ethernet-compliant network. +However, The provider takes advantage of proprietary extensions to +support HPC applications. 
+.PP +The CXI provider supports reliable, connection-less endpoint semantics. +It supports two-sided messaging interfaces with message matching +offloaded by the Cassini NIC. +It also supports one-sided RMA and AMO interfaces, light-weight counting +events, triggered operations (via the deferred work API), and +fabric-accelerated small reductions. +.SH REQUIREMENTS +.PP +The CXI Provider requires Cassini\[cq]s optimized HPC protocol which is +only supported in combination with the Rosetta switch. +.PP +The provider uses the libCXI library for control operations and a set of +Cassini-specific header files to enable direct hardware access in the +data path. +.SH SUPPORTED FEATURES +.PP +The CXI provider supports a subset of OFI features. +.SS Endpoint types +.PP +The provider supports the \f[I]FI_EP_RDM\f[R] endpoint type, including +scalable endpoints. +.SS Memory registration modes +.PP +The provider implements scalable memory registration. +The provider requires \f[I]FI_MR_ENDPOINT\f[R]. +\f[I]FI_MR_ALLOCATED\f[R] is required if ODP in not enabled or not +desired. +Client specified 32-bit MR keys are the default unless +\f[I]FI_MR_PROV_KEY\f[R] is specified. +For \f[I]FI_MR_PROV_KEY\f[R] provider generated 64-bit MR keys are used. +An RMA initiator can work concurrently with client and provider +generated keys. +.PP +In client/server environments, if concerns with stale MR key usage +exists, then \f[I]FI_MR_PROV_KEY\f[R] generated keys should be used +along with \f[I]FI_CXI_MR_MATCH_EVENTS=1\f[R] and +\f[I]FI_CXI_OPTIMIZED_MRS=0\f[R]. +The former speeds up MR close, allowing non-remote MR cached keys to be +used that enable full remote memory access protection after an MR is +closed, even if that memory remains in the libfabric MR cache. +The latter uses only standard MR which use matching to enable robust key +usage, protecting against a stale MR key matching a newly generated MR +keys. +.SS Data transfer operations +.PP +The following data transfer interfaces are supported: +\f[I]FI_ATOMIC\f[R], \f[I]FI_MSG\f[R], \f[I]FI_RMA\f[R], +\f[I]FI_TAGGED\f[R]. +See DATA TRANSFER OPERATIONS below for more details. +.SS Completion events +.PP +The CXI provider supports all CQ event formats. +.SS Modes +.PP +The CXI provider does not require any operation modes. +.SS Progress +.PP +The CXI provider currently supports \f[I]FI_PROGRESS_MANUAL\f[R] data +and control progress modes. +.SS Multi-threading +.PP +The CXI provider supports FI_THREAD_SAFE and FI_THREAD_DOMAIN threading +models. +.SS Wait Objects +.PP +The CXI provider supports FI_WAIT_FD and FI_WAIT_POLLFD CQ wait object +types. +FI_WAIT_UNSPEC will default to FI_WAIT_FD. +However FI_WAIT_NONE should achieve the lowest latency and reduce +interrupt overhead. +.SS Additional Features +.PP +The CXI provider also supports the following capabilities and features: +.IP \[bu] 2 +\f[I]FI_MULTI_RECV\f[R] +.IP \[bu] 2 +\f[I]FI_SOURCE\f[R] +.IP \[bu] 2 +\f[I]FI_NAMED_RX_CTX\f[R] +.IP \[bu] 2 +\f[I]FI_RM_ENABLED\f[R] +.IP \[bu] 2 +\f[I]FI_RMA_EVENT\f[R] +.IP \[bu] 2 +\f[I]FI_REMOTE_CQ_DATA\f[R] +.IP \[bu] 2 +\f[I]FI_MORE\f[R] +.IP \[bu] 2 +\f[I]FI_FENCE\f[R] +.SS Addressing Format +.PP +The CXI provider uses a proprietary address format. +This format includes fields for NIC Address and PID. +NIC Address is the topological address of the NIC endpoint on the +fabric. +All OFI Endpoints sharing a Domain share the same NIC Address. +PID (for Port ID or Process ID, adopted from the Portals 4 +specification), is analogous to an IP socket port number. 
+Valid PIDs are in the range [0-510]. +.PP +A third component of Slingshot network addressing is the Virtual Network +ID (VNI). +VNI is a protection key used by the Slingshot network to provide +isolation between applications. +A VNI defines an isolated PID space for a given NIC. +Therefore, Endpoints must use the same VNI in order to communicate. +Note that VNI is not a field of the CXI address, but rather is specified +as part of the OFI Endpoint auth_key. +The combination of NIC Address, VNI, and PID is unique to a single OFI +Endpoint within a Slingshot fabric. +.PP +The NIC Address of an OFI Endpoint is inherited from the Domain. +By default, a PID is automatically assigned to an Endpoint when it is +enabled. +The address of an Endpoint can be queried using fi_getname. +The address received from fi_getname may then be inserted into a +peer\[cq]s Address Vector. +The resulting FI address may then be used to perform an RDMA operation. +.PP +Alternatively, a client may manage PID assignment. +fi_getinfo may be used to create an fi_info structure that can be used +to create an Endpoint with a client-specified address. +To achieve this, use fi_getinfo with the \f[I]FI_SOURCE\f[R] flag set +and set node and service strings to represent the local NIC interface +and PID to be assigned to the Endpoint. +The NIC interface string should match the name of an available CXI +domain (in the format cxi[0-9]). +The PID string will be interpreted as a 9-bit integer. +Address conflicts will be detected when the Endpoint is enabled. +.SS Authorization Keys +.PP +The CXI authorization key format is defined by struct cxi_auth_key. +This structure is defined in fi_cxi_ext.h. +.IP +.nf +\f[C] +struct cxi_auth_key { + uint32_t svc_id; + uint16_t vni; +}; +\f[R] +.fi +.PP +The CXI authorization key format includes a VNI and CXI service ID. +VNI is a component of the CXI Endpoint address that provides isolation. +A CXI service is a software container which defines a set of local CXI +resources, VNIs, and Traffic Classes which a libfabric user can access. +.PP +Two endpoints must use the same VNI in order to communicate. +Generally, a parallel application should be assigned to a unique VNI on +the fabric in order to achieve network traffic and address isolation. +Typically a privileged entity, like a job launcher, will allocate one or +more VNIs for use by the libfabric user. +.PP +The CXI service API is provided by libCXI. +It enables a privileged entity, like an application launcher, to control +an unprivileged process\[cq]s access to NIC resources. +Generally, a parallel application should be assigned to a unique CXI +service in order to control access to local resources, VNIs, and Traffic +Classes. +.PP +While a libfabric user provided authorization key is optional, it is +highly encouraged that libfabric users provide an authorization key +through the domain attribute hints during \f[C]fi_getinfo()\f[R]. +How libfabric users acquire the authorization key may vary between the +users and is outside the scope of this document. +.PP +If an authorization key is not provided by the libfabric user, the CXI +provider will attempt to generate an authorization key on behalf of the +user. +The following outlines how the CXI provider will attempt to generate an +authorization key. +.IP "1." 3 +Query for the following environment variables and generate an +authorization key using them. +.RS 4 +.IP \[bu] 2 +\f[I]SLINGSHOT_VNIS\f[R]: Comma separated list of VNIs. 
+The CXI provider will only use the first VNI if multiple are provide. +Example: \f[C]SLINGSHOT_VNIS=234\f[R]. +.IP \[bu] 2 +\f[I]SLINGSHOT_DEVICES\f[R]: Comma separated list of device names. +Each device index will use the same index to lookup the service ID in +\f[I]SLINGSHOT_SVC_IDS\f[R]. +Example: \f[C]SLINGSHOT_DEVICES=cxi0,cxi1\f[R]. +.IP \[bu] 2 +\f[I]SLINGSHOT_SVC_IDS\f[R]: Comma separated list of pre-configured CXI +service IDs. +Each service ID index will use the same index to lookup the CXI device +in \f[I]SLINGSHOT_DEVICES\f[R]. +Example: \f[C]SLINGSHOT_SVC_IDS=5,6\f[R]. +.PP +\f[B]Note:\f[R] How valid VNIs and device services are configured is +outside the responsibility of the CXI provider. +.RE +.IP "2." 3 +Query pre-configured device services and find first entry with same UID +as the libfabric user. +.IP "3." 3 +Query pre-configured device services and find first entry with same GID +as the libfabric user. +.IP "4." 3 +Query pre-configured device services and find first entry which does not +restrict member access. +If enabled, the default service is an example of an unrestricted +service. +.RS 4 +.PP +\f[B]Note:\f[R] There is a security concern with such services since it +allows for multiple independent libfabric users to use the same service. +.RE +.PP +\f[B]Note:\f[R] For above entries 2-4, it is possible the found device +service does not restrict VNI access. +For such cases, the CXI provider will query \f[I]FI_CXI_DEFAULT_VNI\f[R] +to assign a VNI. +.PP +During Domain allocation, if the domain auth_key attribute is NULL, the +CXI provider will attempt to generate a valid authorization key. +If the domain auth_key attribute is valid (i.e.\ not NULL and encoded +authorization key has been verified), the CXI provider will use the +encoded VNI and service ID. +Failure to generate a valid authorization key will result in Domain +allocation failure. +.PP +During Endpoint allocation, if the endpoint auth_key attribute is NULL, +the Endpoint with inherit the parent Domain\[cq]s VNI and service ID. +If the Endpoint auth_key attribute is valid, the encoded VNI and service +ID must match the parent Domain\[cq]s VNI and service ID. +Allocating an Endpoint with a different VNI and service from the parent +Domain is not supported. +.PP +The following is the expected parallel application launch workflow with +CXI integrated launcher and CXI authorization key aware libfabric user: +.IP "1." 3 +A parallel application is launched. +.IP "2." 3 +The launcher allocates one or more VNIs for use by the application. +.IP "3." 3 +The launcher communicates with compute node daemons where the +application will be run. +.IP "4." 3 +The launcher compute node daemon configures local CXI interfaces. +libCXI is used to allocate one or more services for the application. +The service will define the local resources, VNIs, and Traffic Classes +that the application may access. +Service allocation policies must be defined by the launcher. +libCXI returns an ID to represent a service. +.IP "5." 3 +The launcher forks application processes. +.IP "6." 3 +The launcher provides one or more service IDs and VNI values to the +application processes. +.IP "7." 3 +Application processes select from the list of available service IDs and +VNIs to form an authorization key to use for Endpoint allocation. +.SS Address Vectors +.PP +The CXI provider supports both \f[I]FI_AV_TABLE\f[R] and +\f[I]FI_AV_MAP\f[R] with the same internal implementation. 
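+.PP
+The following is a minimal, illustrative sketch (not taken from the provider
+source) of opening an Address Vector and inserting a peer address, assuming an
+already opened domain.
+The \f[C]expected_peer_count\f[R] value and the \f[C]addr\f[R] buffer, which
+would typically be obtained from the peer via \f[C]fi_getname()\f[R], are
+placeholders.
+.IP
+.nf
+\f[C]
+struct fi_av_attr av_attr = {
+    .type = FI_AV_TABLE,          /* FI_AV_MAP is handled identically */
+    .count = expected_peer_count, /* placeholder sizing hint */
+};
+struct fid_av *av;
+fi_addr_t peer;
+int ret;
+
+ret = fi_av_open(domain, &av_attr, &av, NULL);
+ret = fi_av_insert(av, addr, 1, &peer, 0, NULL);
+\f[R]
+.fi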
+.PP +The CXI provider uses the \f[I]FI_SYMMETRIC\f[R] AV flag for +optimization. +When used with \f[I]FI_AV_TABLE\f[R], the CXI provider can use the +fi_addr_t index as an endpoint identifier instead of a network address. +The benefit of this is when running with FI_SOURCE, a reverse lookup is +not needed to generate the source fi_addr_t for target CQ events. +Note: FI_SOURCE_ERR should not be used for this configuration. +.PP +If the AV is not configured with \f[I]FI_SYMMETRIC\f[R], +\f[I]FI_AV_USER_ID\f[R] is supported as a flag which can be passed into +AV insert. +.PP +Since scalable EPs are not support, fi_av_attr::rx_ctx_bits must be +zero. +.PP +The following AV capabilities and flags are not supported: FI_SHARED_AV, +FI_SYNC_ERR, FI_EVENT, and FI_READ. +.SS Operation flags +.PP +The CXI provider supports the following Operation flags: +.TP +\f[I]FI_MORE\f[R] +When \f[I]FI_MORE\f[R] is specified in a data transfer operation, the +provider will defer submission of RDMA commands to hardware. +When one or more data transfer operations is performed using +\f[I]FI_MORE\f[R], followed by an operation without \f[I]FI_MORE\f[R], +the provider will submit the entire batch of queued operations to +hardware using a single PCIe transaction, improving PCIe efficiency. +.RS +.PP +When \f[I]FI_MORE\f[R] is used, queued commands will not be submitted to +hardware until another data transfer operation is performed without +\f[I]FI_MORE\f[R]. +.RE +.TP +\f[I]FI_TRANSMIT_COMPLETE\f[R] +By default, all CXI provider completion events satisfy the requirements +of the `transmit complete' completion level. +Transmit complete events are generated when the intiator receives an Ack +from the target NIC. +The Ack is generated once all data has been received by the target NIC. +Transmit complete events do not guarantee that data is visibile to the +target process. +.TP +\f[I]FI_DELIVERY_COMPLETE\f[R] +When the `delivery complete' completion level is used, the event +guarantees that data is visible to the target process. +To support this, hardware at the target performs a zero-byte read +operation to flush data across the PCIe bus before generating an Ack. +Flushing reads are performed unconditionally and will lead to higher +latency. +.TP +\f[I]FI_MATCH_COMPLETE\f[R] +When the `match complete' completion level is used, the event guarantees +that the message has been matched to a client-provided buffer. +All messages longer than the eager threshold support this guarantee. +When `match complete' is used with a Send that is shorter than the eager +threshold, an additional handshake may be performed by the provider to +notify the initiator that the Send has been matched. +.PP +The CXI provider also supports the following operation flags: +.IP \[bu] 2 +\f[I]FI_INJECT\f[R] +.IP \[bu] 2 +\f[I]FI_FENCE\f[R] +.IP \[bu] 2 +\f[I]FI_COMPLETION\f[R] +.IP \[bu] 2 +\f[I]FI_REMOTE_CQ_DATA\f[R] +.SS Scalable Endpoints +.PP +Scalable Endpoints (SEPs) support is not enabled in the CXI provider. +Future releases of the provider will re-introduce SEP support. +.SS Messaging +.PP +The CXI provider supports both tagged (\f[I]FI_TAGGED\f[R]) and untagged +(\f[I]FI_MSG\f[R]) two-sided messaging interfaces. +In the normal case, message matching is performed by hardware. +In certain low resource conditions, the responsibility to perform +message matching may be transferred to software. 
+Specification of the receive message matching mode in the environment +(\f[I]FI_CXI_RX_MATCH_MODE\f[R]) controls the initial matching mode and +whether hardware matching can transparently transition matching to +software where a hybrid of hardware and software receive matching is +done. +.PP +If a Send operation arrives at a node where there is no matching Receive +operation posted, it is considered unexpected. +Unexpected messages are supported. +The provider manages buffers to hold unexpected message data. +.PP +Unexpected message handling is transparent to clients. +Despite that, clients should take care to avoid excessive use of +unexpected messages by pre-posting Receive operations. +An unexpected message ties up hardware and memory resources until it is +matched with a user buffer. +.PP +The CXI provider implements several message protocols internally. +A message protocol is selected based on payload length. +Short messages are transferred using the eager protocol. +In the eager protocol, the entire message payload is sent along with the +message header. +If an eager message arrives unexpectedly, the entire message is buffered +at the target until it is matched to a Receive operation. +.PP +Long messages are transferred using a rendezvous protocol. +The threshold at which the rendezvous protocol is used is controlled +with the \f[I]FI_CXI_RDZV_THRESHOLD\f[R] and +\f[I]FI_CXI_RDZV_GET_MIN\f[R] environment variables. +.PP +In the rendezvous protocol, a portion of the message payload is sent +along with the message header. +Once the header is matched to a Receive operation, the remainder of the +payload is pulled from the source using an RDMA Get operation. +If the message arrives unexpectedly, the eager portion of the payload is +buffered at the target until it is matched to a Receive operation. +In the normal case, the Get is performed by hardware and the operation +completes without software progress. +.PP +Unexpected rendezvous protocol messages can not complete and release +source side buffer resources until a matching receive is posted at the +destination and the non-eager data is read from the source with a +rendezvous get DMA. +The number of rendezvous messages that may be outstanding is limited by +the minimum of the hints->tx_attr->size value specified and the number +of rendezvous operation ID mappings available. +FI_TAGGED rendezvous messages have 32K-256 ID mappings, FI_MSG +rendezvous messages are limited to 256 ID mappings. +While this works well with MPI, care should be taken that this minimum +is large enough to ensure applications written in a manner that assumes +unlimited resources and use FI_MSG rendezvous messaging do not induce a +software deadlock. +If FI_MSG rendezvous messaging is done in a unexpected manner that may +exceed the FI_MSG ID mappings available, it may be sufficient to reduce +the number of rendezvous operations by increasing the rendezvous +threshold. +See \f[I]FI_CXI_RDZV_THRESHOLD\f[R] for information. +.PP +Message flow-control is triggered when hardware message matching +resources become exhausted. +Messages may be dropped and retransmitted in order to recover; impacting +performance significantly. +Programs should be careful to avoid posting large numbers of unmatched +receive operations and to minimize the number of outstanding unexpected +messages to prevent message flow-control. 
+If the RX message matching mode is configured to support hybrid mode, +when resources are exhausted, hardware will transition to hybrid +operation where hardware and software share matching responsibility. +.PP +To help avoid this condition, increase Overflow buffer space using +environment variables \f[I]FI_CXI_OFLOW_*\f[R], and for software and +hybrid RX match modes increase Request buffer space using the variables +\f[I]FI_CXI_REQ_*\f[R]. +.SS Message Ordering +.PP +The CXI provider supports the following ordering rules: +.IP \[bu] 2 +All message Send operations are always ordered. +.IP \[bu] 2 +RMA Writes may be ordered by specifying \f[I]FI_ORDER_RMA_WAW\f[R]. +.IP \[bu] 2 +AMOs may be ordered by specifying +\f[I]FI_ORDER_AMO_{WAW|WAR|RAW|RAR}\f[R]. +.IP \[bu] 2 +RMA Writes may be ordered with respect to AMOs by specifying +\f[I]FI_ORDER_WAW\f[R]. +Fetching AMOs may be used to perform short reads that are ordered with +respect to RMA Writes. +.PP +Ordered RMA size limits are set as follows: +.IP \[bu] 2 +\f[I]max_order_waw_size\f[R] is -1. +RMA Writes and non-fetching AMOs of any size are ordered with respect to +each other. +.IP \[bu] 2 +\f[I]max_order_raw_size\f[R] is -1. +Fetching AMOs of any size are ordered with respect to RMA Writes and +non-fetching AMOs. +.IP \[bu] 2 +\f[I]max_order_war_size\f[R] is -1. +RMA Writes and non-fetching AMOs of any size are ordered with respect to +fetching AMOs. +.SS PCIe Ordering +.PP +Generally, PCIe writes are strictly ordered. +As an optimization, PCIe TLPs may have the Relaxed Order (RO) bit set to +allow writes to be reordered. +Cassini sets the RO bit in PCIe TLPs when possible. +Cassini sets PCIe RO as follows: +.IP \[bu] 2 +Ordering of messaging operations is established using completion events. +Therefore, all PCIe TLPs related to two-sided message payloads will have +RO set. +.IP \[bu] 2 +Every PCIe TLP associated with an unordered RMA or AMO operation will +have RO cleared. +.IP \[bu] 2 +PCIe TLPs associated with the last packet of an ordered RMA or AMO +operation will have RO cleared. +.IP \[bu] 2 +PCIe TLPs associated with the body packets (all except the last packet +of an operation) of an ordered RMA operation will have RO set. +.SS Translation +.PP +The CXI provider supports two translation mechanisms: Address +Translation Services (ATS) and NIC Translation Agent (NTA). +Use the environment variable \f[I]FI_CXI_ATS\f[R] to select between +translation mechanisms. +.PP +ATS refers to NIC support for PCIe rev. +4 ATS, PRI and PASID features. +ATS enables the NIC to efficiently access the entire virtual address +space of a process. +ATS mode currently supports AMD hosts using the iommu_v2 API. +.PP +The NTA is an on-NIC translation unit. +The NTA supports two-level page tables and additional hugepage sizes. +Most CPUs support 2MB and 1GB hugepage sizes. +Other hugepage sizes may be supported by SW to enable the NIC to cache +more address space. +.PP +ATS and NTA both support on-demand paging (ODP) in the event of a page +fault. +Use the environment variable \f[I]FI_CXI_ODP\f[R] to enable ODP. +.PP +With ODP enabled, buffers used for data transfers are not required to be +backed by physical memory. +An un-populated buffer that is referenced by the NIC will incur a +network page fault. +Network page faults will significantly impact application performance. +Clients should take care to pre-populate buffers used for data-tranfer +operations to avoid network page faults. +Copy-on-write semantics work as expected with ODP. 
+.PP +With ODP disabled, all buffers used for data transfers are backed by +pinned physical memory. +Using Pinned mode avoids any overhead due to network page faults but +requires all buffers to be backed by physical memory. +Copy-on-write semantics are broken when using pinned memory. +See the Fork section for more information. +.SS Translation Cache +.PP +Mapping a buffer for use by the NIC is an expensive operation. +To avoid this penalty for each data transfer operation, the CXI provider +maintains an internal translation cache. +.PP +When using the ATS translation mode, the provider does not maintain +translations for individual buffers. +It follows that translation caching is not required. +.SS Triggered Operation +.PP +The CXI provider supports triggered operations through the deferred work +queue API. +The following deferred work queue operations are supported: FI_OP_SEND, +FI_OP_TSEND, FI_OP_READ, FI_OP_WRITE, FI_OP_ATOMIC, FI_OP_FETCH_ATOMIC, +and FI_OP_COMPARE_ATOMIC. +FI_OP_RECV and FI_OP_TRECV are also supported, but with only a threshold +of zero. +.PP +The CXI provider backs each triggered operation by hardware resources. +Exhausting triggered operation resources leads to indeterminate behavior +and should be prevented. +.PP +The CXI provider offers two methods to prevent triggered operation +resource exhaustion. +.SS Experimental FI_CXI_ENABLE_TRIG_OP_LIMIT Environment Variable +.PP +When FI_CXI_ENABLE_TRIG_OP_LIMIT is enabled, the CXI provider will use +semaphores to coordinate triggered operation usage between threads and +across processes using the same service ID. +When triggered operation resources are exhausted, +fi_control(FI_QUEUE_WORK) will return -FI_ENOSPC. +It is up to the libfabric user to recover from this situation. +.PP +\f[B]Note:\f[R] Preventing triggered operation resource exhaustion with +this method may be expensive and result in a negative performance +impact. +It is encouraged libfabric users avoid method unless absolutely needed. +By default, FI_CXI_ENABLE_TRIG_OP_LIMIT is disabled. +.PP +\f[B]Note:\f[R] Named semaphores are used to coordinated triggered +operation resource usage across multiple processes. +System/node software may need to be implemented to ensure all semaphores +are unlinked during unexpected application termination. +.PP +\f[B]Note:\f[R] This feature is considered experimental and +implementation may be subjected to changed. +.SS CXI Domain get_dwq_depth Extension +.PP +The CXI domain get_dwq_depth extension returns the deferred work queue +queue depth (i.e.\ the number of triggered operation resources assigned +to the service ID used by the fi_domain). +Libfabric users can use the returned queue depth to coordinate resource +usage. +.PP +For example, suppose the job launcher has configured a service ID with +for 512 triggered operation resources. +Since the CXI provider needs to consume 8 per service ID, 504 should be +usable by libfabric users. +If the libfabric user knows there are \f[I]N\f[R] processes using a +given service ID and NIC, it can divide the 504 triggered operation +resource among all \f[I]N\f[R] processes. +.PP +\f[B]Note:\f[R] This is the preferred method to prevent triggered +operation resource exhaustion since it does not introduce semaphores +into the fi_control(FI_QUEUE_WORK) critical path. +.SS Fork Support +.PP +The following subsections outline the CXI provider fork support. 
+.SS RDMA and Fork Overview +.PP +Under Linux, \f[C]fork()\f[R] is implemented using copy-on-write (COW) +pages, so the only penalty that it incurs is the time and memory +required to duplicate the parent\[cq]s page tables, mark all of the +process\[cq]s page structs as read only and COW, and create a unique +task structure for the child. +.PP +Due to the Linux COW fork policy, both parent and child processes\[cq] +virtual addresses are mapped to the same physical address. +The first process to write to the virtual address will get a new +physical page, and thus a new physical address, with the same content as +the previous physical page. +.PP +The Linux COW fork policy is problematic for RDMA NICs. +RDMA NICs require memory to be registered with the NIC prior to +executing any RDMA operations. +In user-space, memory registration results in establishing a virtual +address to physical address mapping with the RDMA NIC. +This resulting RDMA NIC mapping/memory region does not get updated when +the Linux COW fork policy is executed. +.PP +Consider the following example: - Process A is planning to perform RDMA +with virtual address 0xffff0000 and a size of 4096. +This virtual address maps to physical address 0x1000. +- Process A registers this virtual address range with the RDMA NIC. +The RDMA NIC device driver programs its page tables to establish the +virtual address 0xffff0000 to physical address 0x1000 mapping. +- Process A decides to fork Process B. +Virtual address 0xffff0000 will now be subjected to COW. +- Process A decides to write to virtual address 0xffff0000 before doing +the RDMA operation. +This will trigger the Linux COW fork policy resulting in the following: +- Process A: Virtual address 0xffff0000 maps to new physical address +0x2000 - Process B: Virtual address 0xffff0000 maps to previous physical +address 0x1000 - Process A now executes an RDMA operation using the +mapping/memory region associated with virtual address 0xffff0000. +Since COW occurred, the RDMA NIC executes the RDMA operation using +physical address 0x1000 which belongs to Process B. +This results in data corruption. +.PP +The crux of the issue is the parent issuing forks while trying to do +RDMA operations to registered memory regions. +Excluding software RDMA emulation, two options exist for RDMA NIC +vendors to resolve this data corruption issue. +- Linux \f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC +support for on-demand paging (ODP) +.SS Linux madvise() MADV_DONTFORK and MADV_DOFORK +.PP +The generic (i.e.\ non-vendor specific) RDMA NIC solution to the Linux +COW fork policy and RDMA problem is to use the following +\f[C]madvise()\f[R] operations during memory registration and +deregistration: - MADV_DONTFORK: Do not make the pages in this range +available to the child after a \f[C]fork()\f[R]. +This is useful to prevent copy-on-write semantics from changing the +physical location of a page if the parent writes to it after a +\f[C]fork()\f[R]. +(Such page relocations cause problems for hardware that DMAs into the +page.) - MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the +default behavior, whereby a mapping is inherited across +\f[C]fork()\f[R]. +.PP +In the Linux kernel, MADV_DONTFORK will result in the virtual memory +area struct (VMA) being marked with the VM_DONTCOPY flag. +VM_DONTCOPY signals to the Linux kernel to not duplicate this VMA on +fork. +This effectively leaves a hole in child process address space. 
+Should the child reference the virtual address corresponding to the VMA +which was not duplicated, it will segfault. +.PP +In the previous example, if Process A issued +\f[C]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing +RDMA memory registration, the physical address 0x1000 would have +remained with Process A. +This would prevent the Process A data corruption as well. +If Process B were to reference virtual address 0xffff0000, it will +segfault due to the hole in the virtual address space. +.PP +Using \f[C]madvise()\f[R] with MADV_DONTFORK may be problematic for +applications performing RDMA and page aliasing. +Paging aliasing is where the parent process uses part or all of a page +to share information with the child process. +If RDMA is also being used for a separate portion of this page, the +child process will segfault when an access causes page aliasing. +.SS RDMA NIC Support for ODP +.PP +An RDMA NIC vendor specific solution to the Linux COW fork policy and +RDMA problem is to use ODP. +ODP allows for the RDMA NIC to generate page requests for translations +it does not have a physical address for. +The following is an updated example with ODP: - Process A is planning to +perform RDMA with virtual address 0xffff0000 and a size of 4096. +This virtual address maps to physical address 0x1000. +- Process A registers this virtual address range with the RDMA NIC. +The RDMA NIC device driver may optionally program its page tables to +establish the virtual address 0xffff0000 to physical address 0x1000 +mapping. +- Process A decides to fork Process B. +Virtual address 0xffff0000 will now be subjected to COW. +- Process A decides to write to virtual address 0xffff0000 before doing +the RDMA operation. +This will trigger the Linux COW fork policy resulting in the following: +- Process A: Virtual address 0xffff0000 maps to new physical address +0x2000 - Process B: Virtual address 0xffff0000 maps to previous physical +address 0x1000 - RDMA NIC device driver: Receives MMU invalidation event +for Process A virtual address range 0xffff0000 through 0xffff0ffe. +The device driver updates the corresponding memory region to no longer +reference physical address 0x1000. +- Process A now executes an RDMA operation using the memory region +associated with 0xffff0000. +The RDMA NIC will recognize the corresponding memory region as no longer +having a valid physical address. +The RDMA NIC will then signal to the device driver to fault in the +corresponding address, if necessary, and update the physical address +associated with the memory region. +In this case, the memory region will be updated with physical address +0x2000. +Once completed, the device driver signals to the RDMA NIC to continue +the RDMA operation. +Data corruption does not occur since RDMA occurred to the correct +physical address. +.PP +A RDMA NIC vendor specific solution to the Linux COW fork policy and +RDMA problem is to use ODP. +ODP allows for the RDMA NIC to generate page requests for translations +it does not have a physical address for. +.SS CXI Provider Fork Support +.PP +The CXI provider is subjected to the Linux COW fork policy and RDMA +issues described in section \f[I]RDMA and Fork Overview\f[R]. 
+To prevent data corruption with fork, the CXI provider supports the
+following options:
+.IP \[bu] 2
+CXI specific fork environment variables to enable \f[C]madvise()\f[R]
+MADV_DONTFORK and MADV_DOFORK
+.IP \[bu] 2
+ODP support*
+.PP
+*Formal ODP support is pending.
+.SS CXI Specific Fork Environment Variables
+.PP
+The CXI software stack has two environment variables related to fork:
+.IP \[bu] 2
+CXI_FORK_SAFE: Enables base fork safe support.
+With this environment variable set, regardless of value, libcxi will
+issue \f[C]madvise()\f[R] with MADV_DONTFORK on the virtual address
+range being registered for RDMA.
+In addition, libcxi always aligns the \f[C]madvise()\f[R] call to the
+system default page size.
+On x86, this is 4 KiB.
+To prevent redundant \f[C]madvise()\f[R] calls with MADV_DONTFORK
+against the same virtual address region, reference counting is used
+against each tracked \f[C]madvise()\f[R] region.
+In addition, libcxi will split and merge tracked \f[C]madvise()\f[R]
+regions if needed.
+Once the reference count reaches zero, libcxi will call
+\f[C]madvise()\f[R] with MADV_DOFORK, and no longer track the region.
+.IP \[bu] 2
+CXI_FORK_SAFE_HP: With this environment variable set, in conjunction
+with CXI_FORK_SAFE, libcxi will not assume the page size is the system
+default page size.
+Instead, libcxi will walk \f[C]/proc/<pid>/smaps\f[R] to determine the
+correct page size and align the \f[C]madvise()\f[R] calls accordingly.
+This environment variable should be set if huge pages are being used for
+RDMA.
+To amortize the per memory registration walk of
+\f[C]/proc/<pid>/smaps\f[R], the libfabric MR cache should be used.
+.PP
+Setting these environment variables will prevent data corruption when
+the parent issues a fork, but it may result in the child process
+experiencing a segfault if it references a virtual address being used
+for RDMA in the parent process.
+.SS ODP Support and Fork
+.PP
+CXI provider ODP support would allow applications to avoid setting
+CXI_FORK_SAFE and CXI_FORK_SAFE_HP to prevent parent process data
+corruption.
+Enabling ODP to resolve the RDMA and fork issue may or may not result in
+a performance impact.
+The concern with ODP is that if the rate of invalidations and ODP page
+requests is relatively high and they occur at the same time, ODP
+timeouts may occur.
+This would result in application libfabric data transfer operations
+failing.
+.PP
+Please refer to the \f[I]CXI Provider ODP Support\f[R] section for more
+information on how to enable/disable ODP.
+.SS CXI Provider Fork Support Guidance
+.PP
+Since the CXI provider offloads the majority of the libfabric data
+transfer operations to the NIC, thus enabling end-to-end RDMA between
+libfabric user buffers, it is subject to the issue described in
+section \f[I]RDMA and Fork Overview\f[R].
+For comparison, software emulated RDMA libfabric providers may not have
+these issues since they rely on bounce buffers to facilitate data
+transfer.
+.PP
+The following is the CXI provider fork support guidance:
+.IP \[bu] 2
+Enable CXI_FORK_SAFE.
+If huge pages are also used, CXI_FORK_SAFE_HP should be enabled as well.
+Since enabling this will result in \f[C]madvise()\f[R] with
+MADV_DONTFORK, the following steps should be taken to prevent a child
+process segfault:
+.RS 2
+.IP \[bu] 2
+Avoid using stack memory for RDMA
+.IP \[bu] 2
+Avoid the child process having to access a virtual address range the
+parent process is performing RDMA against
+.IP \[bu] 2
+Use page-aligned heap allocations for RDMA
+.RE
+.IP \[bu] 2
+Enable ODP and run without CXI_FORK_SAFE and CXI_FORK_SAFE_HP.
+The functionality and performance of ODP with fork may be application +specific. +Currently, ODP is not formally supported. +.PP +The CXI provider preferred approach is to use CXI_FORK_SAFE and +CXI_FORK_SAFE_HP. +While it may require the application to take certain precautions, it +will result in a more portable application regardless of RDMA NIC. +.SS Heterogenous Memory (HMEM) Supported Interfaces +.PP +The CXI provider supports the following OFI iface types: FI_HMEM_CUDA, +FI_HMEM_ROCR, and FI_HMEM_ZE. +.SS FI_HMEM_ZE Limitations +.PP +The CXI provider only supports GPU direct RDMA with ZE device buffers if +implicit scaling is disabled. +The following ZE environment variables disable implicit scaling: +EnableImplicitScaling=0 NEOReadDebugKeys=1. +.PP +For testing purposes only, the implicit scaling check can be disabled by +setting the following environment variable: +FI_CXI_FORCE_ZE_HMEM_SUPPORT=1. +This may need to be combined with the following environment variable to +get CXI provider memory registration to work: +FI_CXI_DISABLE_HMEM_DEV_REGISTER=1. +.SS Collectives (accelerated) +.PP +The CXI provider supports a limited set of collective operations +specifically intended to support use of the hardware-accelerated +reduction features of the CXI-supported NIC and fabric hardware. +.PP +These features are implemented using the (experimental) OFI collectives +API. +The implementation supports the following collective functions: +.IP \[bu] 2 +\f[B]fi_query_collective\f[R]() +.IP \[bu] 2 +\f[B]fi_join_collective\f[R]() +.IP \[bu] 2 +\f[B]fi_barrier\f[R]() +.IP \[bu] 2 +\f[B]fi_broadcast\f[R]() +.IP \[bu] 2 +\f[B]fi_reduce\f[R]() +.IP \[bu] 2 +\f[B]fi_allreduce\f[R]() +.SS \f[B]fi_query_collective\f[R]() +.PP +Standard implementation that exposes the features described below. +.SS \f[B]fi_join_collective\f[R]() +.PP +The \f[B]fi_join_collective\f[R]() implementation is provider-managed. +However, the \f[I]coll_addr\f[R] parameter is not useful to the +implementation, and must be specified as FI_ADDR_NOTAVAIL. +The \f[I]set\f[R] parameter must contain fi_addr_t values that resolve +to meaningful CXI addresses in the endpoint \f[I]fi_av\f[R] structure. +\f[B]fi_join_collective\f[R]() must be called for every address in the +\f[I]set\f[R] list, and must be progressed until the join operation is +complete. +There is no inherent limit on join concurrency. +.PP +The join will create a multicast tree in the fabric to manage the +collective operations. +This operation requires access to a secure Fabric Manager REST API that +constructs this tree, so any application that attempts to use +accelerated collectives will bind to libcurl and associated security +libraries, which must be available on the system. +.PP +There are hard limits to the number of multicast addresses available on +a system, and administrators may impose additional limits on the number +of multicast addresses available to any given collective job. +.SS fi_reduction operations +.PP +Payloads are limited to 32-byte data structures, and because they all +use the same underlying hardware model, they are all synchronizing +calls. +Specifically, the supported functions are all variants of +fi_allreduce(). +.IP \[bu] 2 +\f[B]fi_barrier\f[R] is \f[B]fi_allreduce\f[R] using an optimized +no-data operator. +.IP \[bu] 2 +\f[B]fi_broadcast\f[R] is \f[B]fi_allreduce\f[R] using FI_BOR, with data +forced to zero for all but the root rank. 
+.IP \[bu] 2 +\f[B]fi_reduce\f[R] is \f[B]fi_allreduce\f[R] with a result pointer +ignored by all but the root rank. +.PP +All functions must be progressed to completion on all ranks +participating in the collective group. +There is a hard limit of eight concurrent reductions on each collective +group, and attempts to launch more operations will return -FI_EAGAIN. +.PP +\f[B]allreduce\f[R] supports the following hardware-accelerated +reduction operators: +.PP +.TS +tab(@); +l l. +T{ +Operator +T}@T{ +Supported Datatypes +T} +_ +T{ +FI_BOR +T}@T{ +FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 +T} +T{ +FI_BAND +T}@T{ +FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 +T} +T{ +FI_BXOR +T}@T{ +FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 +T} +T{ +FI_MIN +T}@T{ +FI_INT64, FI_DOUBLE +T} +T{ +FI_MAX +T}@T{ +FI_INT64, FI_DOUBLE +T} +T{ +FI_SUM +T}@T{ +FI_INT64, FI_DOUBLE +T} +T{ +FI_CXI_MINMAXLOC +T}@T{ +FI_INT64, FI_DOUBLE +T} +T{ +FI_CXI_REPSUM +T}@T{ +FI_DOUBLE +T} +.TE +.PP +Data space is limited to 32 bytes in all cases except REPSUM, which +supports only a single FI_DOUBLE. +.PP +Only unsigned bitwise operators are supported. +.PP +Only signed integer arithmetic operations are are supported. +.PP +The MINMAXLOC operators are a mixed data representation consisting of +two values, and two indices. +Each rank reports its minimum value and rank index, and its maximum +value and rank index. +The collective result is the global minimum value and rank index, and +the global maximum value and rank index. +Data structures for these functions can be found int the fi_cxi_ext.h +file. +The \f[I]datatype\f[R] should represent the type of the minimum/maximum +values, and the \f[I]count\f[R] must be 1. +.PP +The double-precision operators provide an associative (NUM) variant for +MIN, MAX, and MINMAXLOC. +Default IEEE behavior is to treat any operation with NaN as invalid, +including comparison, which has the interesting property of causing: +.IP +.nf +\f[C] +MIN(NaN, value) => NaN +MAX(NaN, value) => NaN +\f[R] +.fi +.PP +This means that if NaN creeps into a MIN/MAX reduction in any rank, it +tends to poison the entire result. +The associative variants instead effectively ignore the NaN, such that: +.IP +.nf +\f[C] +MIN(NaN, value) => value +MAX(NaN, value) => value +\f[R] +.fi +.PP +The REPSUM operator implements a reproducible (associative) sum of +double-precision values. +The payload can accommodate only a single double-precision value per +reduction, so \f[I]count\f[R] must be 1. +.PP +See: Berkeley reproducible sum algorithm +https://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-121.pdf +.SS double precision rounding +.PP +C99 defines four rounding modes for double-precision SUM, and some +systems may support a \[lq]flush-to-zero\[rq] mode for each of these, +resulting in a total of eight different modes for double-precision sum. +.PP +The fabric hardware supports all eight modes transparently. +.PP +Although the rounding modes have thread scope, all threads, processes, +and nodes should use the same rounding mode for any single reduction. +.SS reduction flags +.PP +The reduction operations supports two flags: +.IP \[bu] 2 +\f[B]FI_MORE\f[R] +.IP \[bu] 2 +\f[B]FI_CXI_PRE_REDUCED\f[R] (overloads \f[B]FI_SOURCE\f[R]) +.PP +The \f[B]FI_MORE\f[R] flag advises that the \f[I]result\f[R] data +pointer represents an opaque, local reduction accumulator, and will be +used as the destination of the reduction. 
+This operation can be repeated any number of times to accumulate results +locally, and spans the full set of all supported reduction operators. +The \f[I]op\f[R], \f[I]count\f[R], and \f[I]datatype\f[R] values must be +consistent for all calls. +The operation ignores all global or static variables \[em] it can be +treated as a \f[I]pure\f[R] function call \[em] and returns immediately. +The caller is responsible for protecting the accumulator memory if it is +used by multiple threads or processes on a compute node. +.PP +If \f[B]FI_MORE\f[R] is omitted, the destination is the fabric, and this +will initiate a fabric reduction through the associated endpoint. +The reduction must be progressed, and upon successful completion, the +\f[I]result\f[R] data pointer will be filled with the final reduction +result of \f[I]count\f[R] elements of type \f[I]datatype\f[R]. +.PP +The \f[B]FI_CXI_PRE_REDUCED\f[R] flag advises that the source data +pointer represents an opaque reduction accumulator containing +pre-reduced data. +The \f[I]count\f[R] and \f[I]datatype\f[R] arguments are ignored. +.PP +if \f[B]FI_CXI_PRE_REDUCED\f[R] is omitted, the source is taken to be +user data with \f[I]count\f[R] elements of type \f[I]datatype\f[R]. +.PP +The opaque reduction accumulator is exposed as \f[B]struct +cxip_coll_accumulator\f[R] in the fi_cxi_ext.h file. +.PP +\f[B]Note\f[R]: The opaque reduction accumulator provides extra space +for the expanded form of the reproducible sum, which carries the extra +data required to make the operation reproducible in software. +.SH OPTIMIZATION +.SS Optimized MRs +.PP +The CXI provider has two separate MR implementations: standard and +optimized. +Standard MRs are designed to support applications which require a large +number of remote memory regions. +Optimized MRs are designed to support one-sided programming models that +allocate a small number of large remote memory windows. +The CXI provider can achieve higher RMA Write rates when targeting an +optimized MR. +.PP +Both types of MRs are allocated using fi_mr_reg. +MRs with client-provided key in the range [0-99] are optimized MRs. +MRs with key greater or equal to 100 are standard MRs. +An application may create a mix of standard and optimized MRs. +To disable the use of optimized MRs, set environment variable +\f[I]FI_CXI_OPTIMIZED_MRS=false\f[R]. +When disabled, all MR keys are available and all MRs are implemented as +standard MRs. +All communicating processes must agree on the use of optimized MRs. +.PP +When FI_MR_PROV_KEY mr_mode is specified caching of remote access MRs is +enabled, which can improve registration/de-registration performance in +RPC type applications, that wrap RMA operations within a message RPC +protocol. +Optimized MRs will be preferred, but will fallback to standard MRs if +insufficient hardware resources are available. +.SS Optimized RMA +.PP +Optimized MRs are one requirement for the use of low overhead packet +formats which enable higher RMA Write rates. +An RMA Write will use the low overhead format when all the following +requirements are met: +.IP \[bu] 2 +The Write targets an optimized MR +.IP \[bu] 2 +The target MR does not require remote completion notifications (no +\f[I]FI_RMA_EVENT\f[R]) +.IP \[bu] 2 +The Write does not have ordering requirements (no \f[I]FI_RMA_WAW\f[R]) +.PP +Theoretically, Cassini has resources to support 64k standard MRs or 2k +optimized MRs. +Practically, the limits are much lower and depend greatly on application +behavior. 
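+.PP
+As an illustration of the client-specified key ranges described under
+\f[I]Optimized MRs\f[R] above, the following sketch registers one
+optimized MR (key 0) and one standard MR (key 100) and enables them for
+remote access.
+It assumes \f[C]domain\f[R], \f[C]ep\f[R], and the buffers already
+exist; with \f[I]FI_MR_ENDPOINT\f[R], each MR must be bound to the
+endpoint and enabled before use.
+Error handling is abbreviated.
+.IP
+.nf
+\f[C]
+struct fid_mr *opt_mr, *std_mr;
+int ret;
+
+/* Client-managed key 0 falls in the optimized range [0-99]. */
+ret = fi_mr_reg(domain, opt_buf, opt_len,
+                FI_REMOTE_READ | FI_REMOTE_WRITE, 0, 0, 0, &opt_mr, NULL);
+if (ret)
+    error;
+
+/* Client-managed key 100 selects a standard MR. */
+ret = fi_mr_reg(domain, std_buf, std_len,
+                FI_REMOTE_READ | FI_REMOTE_WRITE, 0, 100, 0, &std_mr, NULL);
+if (ret)
+    error;
+
+/* FI_MR_ENDPOINT: bind each MR to the endpoint, then enable it. */
+ret = fi_mr_bind(opt_mr, &ep->fid, 0);
+if (!ret)
+    ret = fi_mr_enable(opt_mr);
+ret = fi_mr_bind(std_mr, &ep->fid, 0);
+if (!ret)
+    ret = fi_mr_enable(std_mr);
+\f[R]
+.fi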
+.PP +Hardware counters can be used to validate the use of the low overhead +packets. +The counter C_CNTR_IXE_RX_PTL_RESTRICTED_PKT counts the number of low +overhead packets received at the target NIC. +Counter C_CNTR_IXE_RX_PTL_UNRESTRICTED_PKT counts the number of ordered +RDMA packets received at the target NIC. +.PP +Message rate performance may be further optimized by avoiding target +counting events. +To avoid counting events, do not bind a counter to the MR. +To validate optimal writes without target counting events, monitor the +counter: C_CNTR_LPE_PLEC_HITS. +.SS Unreliable AMOs +.PP +By default, all AMOs are resilient to intermittent packet loss in the +network. +Cassini implements a connection-based reliability model to support +reliable execution of AMOs. +.PP +The connection-based reliability model may be disabled for AMOs in order +to increase message rate. +With reliability disabled, a lost AMO packet will result in operation +failure. +A failed AMO will be reported to the client in a completion event as +usual. +Unreliable AMOs may be useful for applications that can tolerate +intermittent AMO failures or those where the benefit of increased +message rate outweighs by the cost of restarting after a failure. +.PP +Unreliable, non-fetching AMOs may be performed by specifying the +\f[I]FI_CXI_UNRELIABLE\f[R] flag. +Unreliable, fetching AMOs are not supported. +Unreliable AMOs must target an optimized MR and cannot use remote +completion notification. +Unreliable AMOs are not ordered. +.SS High Rate Put +.PP +High Rate Put (HRP) is a feature that increases message rate performance +of RMA and unreliable non-fetching AMO operations at the expense of +global ordering guarantees. +.PP +HRP responses are generated by the fabric egress port. +Responses are coalesced by the fabric to achieve higher message rates. +The completion event for an HRP operation guarantees delivery but does +not guarantee global ordering. +If global ordering is needed following an HRP operation, the source may +follow the operation with a normal, fenced Put. +.PP +HRP RMA and unreliable AMO operations may be performed by specifying the +\f[I]FI_CXI_HRP\f[R] flag. +HRP AMOs must also use the \f[I]FI_CXI_UNRELIABLE\f[R] flag. +Monitor the hardware counter C_CNTR_HNI_HRP_ACK at the initiator to +validate that HRP is in use. +.SS Counters +.PP +Cassini offloads light-weight counting events for certain types of +operations. +The rules for offloading are: +.IP \[bu] 2 +Counting events for RMA and AMO source events are always offloaded. +.IP \[bu] 2 +Counting events for RMA and AMO target events are always offloaded. +.IP \[bu] 2 +Counting events for Sends are offloaded when message size is less than +the rendezvous threshold. +.IP \[bu] 2 +Counting events for message Receives are never offloaded by default. +.PP +Software progress is required to update counters unless the criteria for +offloading are met. +.SH RUNTIME PARAMETERS +.PP +The CXI provider checks for the following environment variables: +.TP +\f[I]FI_CXI_ODP\f[R] +Enables on-demand paging. +If disabled, all DMA buffers are pinned. +If enabled and mr_mode bits in the hints exclude FI_MR_ALLOCATED, then +ODP mode will be used. +.TP +\f[I]FI_CXI_FORCE_ODP\f[R] +Experimental value that can be used to force the use of ODP mode even if +FI_MR_ALLOCATED is set in the mr_mode hint bits. +This is intended to be used primarily for testing. +.TP +\f[I]FI_CXI_ATS\f[R] +Enables PCIe ATS. +If disabled, the NTA mechanism is used. 
+.TP +\f[I]FI_CXI_ATS_MLOCK_MODE\f[R] +Sets ATS mlock mode. +The mlock() system call may be used in conjunction with ATS to help +avoid network page faults. +Valid values are \[lq]off\[rq] and \[lq]all\[rq]. +When mlock mode is \[lq]off\[rq], the provider does not use mlock(). +An application using ATS without mlock() may experience network page +faults, reducing network performance. +When ats_mlock_mode is set to \[lq]all\[rq], the provider uses +mlockall() during initialization with ATS. +mlockall() causes all mapped addresses to be locked in RAM at all times. +This helps to avoid most network page faults. +Using mlockall() may increase pressure on physical memory. +Ignored when ODP is disabled. +.TP +\f[I]FI_CXI_RDZV_THRESHOLD\f[R] +Message size threshold for rendezvous protocol. +.TP +\f[I]FI_CXI_RDZV_GET_MIN\f[R] +Minimum rendezvous Get payload size. +A Send with length less than or equal to \f[I]FI_CXI_RDZV_THRESHOLD\f[R] +plus \f[I]FI_CXI_RDZV_GET_MIN\f[R] will be performed using the eager +protocol. +Larger Sends will be performed using the rendezvous protocol with +\f[I]FI_CXI_RDZV_THRESHOLD\f[R] bytes of payload sent eagerly and the +remainder of the payload read from the source using a Get. +\f[I]FI_CXI_RDZV_THRESHOLD\f[R] plus \f[I]FI_CXI_RDZV_GET_MIN\f[R] must +be less than or equal to \f[I]FI_CXI_OFLOW_BUF_SIZE\f[R]. +.TP +\f[I]FI_CXI_RDZV_EAGER_SIZE\f[R] +Eager data size for rendezvous protocol. +.TP +\f[I]FI_CXI_RDZV_PROTO\f[R] +Direct the provider to use a preferred protocol to transfer non-eager +rendezvous data. +\f[I]FI_CXI_RDZV_PROTO\f[R]= default | alt_read +.RS +.PP +To use an alternate protocol, the CXI driver property rdzv_get_en should +be set to \[lq]0\[rq]. +The \[lq]alt_read\[rq] rendezvous protocol may help improve collective +operation performance. +Note that all rendezvous protocol use RDMA to transfer eager and +non-eager rendezvous data. +.RE +.TP +\f[I]FI_CXI_DISABLE_NON_INJECT_MSG_IDC\f[R] +Experimental option to disable favoring IDC for transmit of small +messages when FI_INJECT is not specified. +This can be useful with GPU source buffers to avoid the host copy in +cases a performant copy can not be used. +The default is to use IDC for all messages less than IDC size. +.TP +\f[I]FI_CXI_DISABLE_HOST_REGISTER\f[R] +Disable registration of host buffers (overflow and request) with GPU. +There are scenarios where using a large number of processes per GPU +results in page locking excessive amounts of memory degrading +performance and/or restricting process counts. +The default is to register buffers with the GPU. +.TP +\f[I]FI_CXI_OFLOW_BUF_SIZE\f[R] +Size of overflow buffers. +Increasing the overflow buffer size allows for more unexpected message +eager data to be held in single overflow buffer. +The default size is 2MB. +.TP +\f[I]FI_CXI_OFLOW_BUF_MIN_POSTED/FI_CXI_OFLOW_BUF_COUNT\f[R] +The minimum number of overflow buffers that should be posted. +The default minimum posted count is 3. +Buffers will grow unbounded to support outstanding unexpected messages. +Care should be taken to size appropriately based on job scale, size of +eager data, and the amount of unexpected message traffic to reduce the +need for flow control. +.TP +\f[I]FI_CXI_OFLOW_BUF_MAX_CACHED\f[R] +The maximum number of overflow buffers that will be cached. +The default maximum count is 3 * FI_CXI_OFLOW_BUF_MIN_POSTED. +A value of zero indicates that once a overflow buffer is allocated it +will be cached and used as needed. 
+A non-zero value can be used with bursty traffic to shrink the number of
+allocated buffers to the maximum count when they are no longer needed.
+.TP
+\f[I]FI_CXI_SAFE_DEVMEM_COPY_THRESHOLD\f[R]
+Defines the maximum CPU memcpy size for HMEM device memory that is
+accessible by the CPU with load/store operations.
+.TP
+\f[I]FI_CXI_OPTIMIZED_MRS\f[R]
+Enables optimized memory regions.
+See section \f[I]CXI Domain Control Extensions\f[R] on how to
+enable/disable optimized MRs at the domain level instead of for the
+global process/job.
+.TP
+\f[I]FI_CXI_MR_MATCH_EVENTS\f[R]
+Enabling MR match events in a client/server environment can be used to
+ensure that memory backing a memory region cannot be remotely accessed
+after the MR has been closed, even if that memory remains mapped in
+the libfabric MR cache.
+Manual progress must be made at the target to process the MR match event
+accounting and avoid event queue overflow.
+There is a slight additional cost in the creation and tear-down of MRs.
+This option is disabled by default.
+.RS
+.PP
+See section \f[I]CXI Domain Control Extensions\f[R] on how to enable MR
+match events at the domain level instead of for the global process/job.
+.RE
+.TP
+\f[I]FI_CXI_PROV_KEY_CACHE\f[R]
+Enabled by default, the caching of remote MR provider keys can be
+disabled by setting this variable to 0.
+.RS
+.PP
+See section \f[I]CXI Domain Control Extensions\f[R] on how to disable
+the remote provider key cache at the domain level instead of for the
+global process/job.
+.RE
+.TP
+\f[I]FI_CXI_LLRING_MODE\f[R]
+Set the policy for use of the low-latency command queue ring mechanism.
+This mechanism improves the latency of command processing on an idle
+command queue.
+Valid values are idle, always, and never.
+.TP
+\f[I]FI_CXI_CQ_POLICY\f[R]
+Experimental.
+Set Command Queue write-back policy.
+Valid values are always, high_empty, low_empty, and low.
+\[lq]always\[rq], \[lq]high\[rq], and \[lq]low\[rq] refer to the
+frequency of write-backs.
+\[lq]empty\[rq] refers to whether a write-back is performed when the
+queue becomes empty.
+.TP
+\f[I]FI_CXI_DEFAULT_VNI\f[R]
+Default VNI value used only for service IDs where the VNI is not
+restricted.
+.TP
+\f[I]FI_CXI_EQ_ACK_BATCH_SIZE\f[R]
+Number of EQ events to process before writing an acknowledgement to HW.
+Batching ACKs amortizes the cost of event acknowledgement over multiple
+network operations.
+.TP
+\f[I]FI_CXI_RX_MATCH_MODE\f[R]
+Specify the receive message matching mode to be utilized.
+\f[I]FI_CXI_RX_MATCH_MODE=\f[R]hardware | software | hybrid
+.RS
+.PP
+\f[I]hardware\f[R] - Message matching is fully offloaded; if resources
+become exhausted, flow control will be performed and existing unexpected
+message headers will be onloaded to free resources.
+.PP
+\f[I]software\f[R] - Message matching is fully onloaded.
+.PP
+\f[I]hybrid\f[R] - Message matching begins fully offloaded; if resources
+become exhausted, hardware will transition message matching to a hybrid
+of hardware and software matching.
+.PP
+For both \f[I]\[lq]hybrid\[rq]\f[R] and \f[I]\[lq]software\[rq]\f[R]
+modes, care should be taken to minimize the threshold for rendezvous
+processing (i.e.\ \f[I]FI_CXI_RDZV_THRESHOLD\f[R] +
+\f[I]FI_CXI_RDZV_GET_MIN\f[R]).
+When running in software endpoint mode, the environment variables
+\f[I]FI_CXI_REQ_BUF_SIZE\f[R] and \f[I]FI_CXI_REQ_BUF_MIN_POSTED\f[R]
+are used to control the size and number of the eager request buffers
+posted to handle incoming unmatched messages.
+.RE
+.TP
+\f[I]FI_CXI_HYBRID_PREEMPTIVE\f[R]
+When in hybrid mode, this variable can be used to enable preemptive
+transitions to software matching.
+This is useful at scale for poorly written applications with a large
+number of unexpected messages where reserved resources may be
+insufficient to prevent starvation of software request list match
+entries.
+Default is 0, disabled.
+.TP
+\f[I]FI_CXI_HYBRID_RECV_PREEMPTIVE\f[R]
+When in hybrid mode, this variable can be used to enable preemptive
+transitions to software matching.
+This is useful at scale for poorly written applications with a large
+number of unmatched posted receives where reserved resources may be
+insufficient to prevent starvation of software request list match
+entries.
+Default is 0, disabled.
+.TP
+\f[I]FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE\f[R]
+When in hybrid mode, this variable can be used to enable preemptive
+transitions to software matching when the number of posted receives
+exceeds the user requested RX size attribute.
+This is useful for applications that may not know the exact number of
+posted receives and are experiencing application termination due to
+event queue overflow.
+Default is 0, disabled.
+.TP
+\f[I]FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE\f[R]
+When in hybrid mode, this variable can be used to enable preemptive
+transitions to software matching when the number of hardware unexpected
+messages exceeds the user requested RX size attribute.
+This is useful for applications that may not know the exact number of
+posted receives and are experiencing application termination due to
+event queue overflow.
+Default is 0, disabled.
+.TP
+\f[I]FI_CXI_REQ_BUF_SIZE\f[R]
+Size of request buffers.
+Increasing the request buffer size allows for more unmatched messages to
+be sent into a single request buffer.
+The default size is 2MB.
+.TP
+\f[I]FI_CXI_REQ_BUF_MIN_POSTED\f[R]
+The minimum number of request buffers that should be posted.
+The default minimum posted count is 4.
+The number of buffers will grow unbounded to support outstanding
+unexpected messages.
+Care should be taken to size appropriately based on job scale and the
+size of eager data to reduce the need for flow control.
+.TP
+\f[I]FI_CXI_REQ_BUF_MAX_CACHED/FI_CXI_REQ_BUF_MAX_COUNT\f[R]
+The maximum number of request buffers that will be cached.
+The default maximum count is 0.
+A value of zero indicates that once a request buffer is allocated it
+will be cached and used as needed.
+A non-zero value can be used with bursty traffic to shrink the number of
+allocated buffers to a maximum count when they are no longer needed.
+.TP
+\f[I]FI_CXI_MSG_LOSSLESS\f[R]
+Enable or disable lossless receive matching.
+If hardware resources are exhausted, hardware will pause the associated
+traffic class until an overflow buffer (hardware match mode) or request
+buffer (software match mode or hybrid match mode) is posted.
+This is considered experimental and defaults to disabled.
+.TP
+\f[I]FI_CXI_FC_RETRY_USEC_DELAY\f[R]
+Number of microseconds to sleep before retrying a dropped side-band
+flow control message.
+Setting this to zero will disable any sleep.
+.TP
+\f[I]FI_UNIVERSE_SIZE\f[R]
+Defines the maximum number of processes that will be used by the
+distributed OFI application.
+Note that this value is used in setting the default control EQ size, see
+FI_CXI_CTRL_RX_EQ_MAX_SIZE.
+.TP
+\f[I]FI_CXI_CTRL_RX_EQ_MAX_SIZE\f[R]
+Max size of the receive event queue used for side-band/control messages.
+Default receive event queue size is based on FI_UNIVERSE_SIZE. +Increasing the receive event queue size can help prevent +side-band/control messages from being dropped and retried but at the +cost of additional memory usage. +Size is always aligned up to a 4KiB boundary. +.TP +\f[I]FI_CXI_DEFAULT_CQ_SIZE\f[R] +Change the provider default completion queue size expressed in entries. +This may be useful for applications which rely on middleware, and +middleware defaults the completion queue size to the provider default. +.TP +\f[I]FI_CXI_DISABLE_EQ_HUGETLB/FI_CXI_DISABLE_CQ_HUGETLB\f[R] +By default, the provider will attempt to allocate 2 MiB hugetlb pages +for provider event queues. +Disabling hugetlb support will cause the provider to fallback to memory +allocators using host page sizes. +FI_CXI_DISABLE_EQ_HUGETLB replaces FI_CXI_DISABLE_CQ_HUGETLB, however +use of either is still supported. +.TP +\f[I]FI_CXI_DEFAULT_TX_SIZE\f[R] +Set the default tx_attr.size field to be used by the provider if the +size is not specified in the user provided fi_info hints. +.TP +\f[I]FI_CXI_DEFAULT_RX_SIZE\f[R] +Set the default rx_attr.size field to be used by the provider if the +size is not specified in the user provided fi_info hints. +.TP +\f[I]FI_CXI_SW_RX_TX_INIT_MAX\f[R] +Debug control to override the number of TX operations that can be +outstanding that are initiated by software RX processing. +It has no impact on hardware initiated RX rendezvous gets. +.TP +\f[I]FI_CXI_DEVICE_NAME\f[R] +Restrict CXI provider to specific CXI devices. +Format is a comma separated list of CXI devices (e.g.\ cxi0,cxi1). +.TP +\f[I]FI_CXI_TELEMETRY\f[R] +Perform a telemetry delta between fi_domain open and close. +Format is a comma separated list of telemetry files as defined in +/sys/class/cxi/cxi*/device/telemetry/. +The ALL-in-binary file in this directory is invalid. +Note that these are per CXI interface counters and not per CXI process +per interface counters. +.TP +\f[I]FI_CXI_TELEMETRY_RGID\f[R] +Resource group ID (RGID) to restrict the telemetry collection to. +Value less than 0 is no restrictions. +.TP +\f[I]FI_CXI_CQ_FILL_PERCENT\f[R] +Fill percent of underlying hardware event queue used to determine when +completion queue is saturated. +A saturated completion queue results in the provider returning +-FI_EAGAIN for data transfer and other related libfabric operations. +.TP +\f[I]FI_CXI_COMPAT\f[R] +Temporary compatibility to allow use of pre-upstream values for +FI_ADDR_CXI and FI_PROTO_CXI. +Compatibility can be disabled to verify operation with upstream constant +values and to enable access to conflicting provider values. +The default setting of 1 specifies both old and new constants are +supported. +A setting of 0 disables support for old constants and can be used to +test that an application is compatible with the upstream values. +A setting of 2 is a safety fallback that if used the provider will only +export fi_info with old constants and will be incompatible with +libfabric clients that been recompiled. +.TP +\f[I]FI_CXI_COLL_FABRIC_MGR_URL\f[R] +\f[B]accelerated collectives:\f[R] Specify the HTTPS address of the +fabric manager REST API used to create specialized multicast trees for +accelerated collectives. +This parameter is \f[B]REQUIRED\f[R] for accelerated collectives, and is +a fixed, system-dependent value. +.TP +\f[I]FI_CXI_COLL_TIMEOUT_USEC\f[R] +\f[B]accelerated collectives:\f[R] Specify the reduction engine timeout. 
+Upon expiration, reduction engines in hardware will deliver any partial +results and expire. Any remaining results will arrive individually, without +hardware reduction, unless the retry period (below) expires and re-arms +the reduction. +The relative performance benefit of acceleration declines with +increasing compute cycle time, dropping below one percent at 32 msec +(32000). +Using acceleration with compute cycles larger than 32 msec is not +recommended except for experimental purposes. +.TP +\f[I]FI_CXI_COLL_RETRY_USEC\f[R] +\f[B]accelerated collectives:\f[R] Specify the reduction engine retry +period. Upon expiration, incomplete reductions will be automatically +restarted, forcing partial results from leaf endpoints to be sent again. This +allows dropped packets to be recovered, and prevents potential incast at the +root if many nodes submit late results (unexpectedly long compute cycles). +The relative performance benefit of acceleration declines with +increasing compute cycle time, dropping below one percent at 32 msec +(32000). +Using acceleration with compute cycles larger than 32 msec is not +recommended except for experimental purposes. +.TP +\f[I]FI_CXI_COLL_USE_DMA_PUT\f[R] +\f[B]accelerated collectives:\f[R] Use DMA for collective packet put. +This uses DMA to inject reduction packets rather than IDC, and is +considered experimental. +Default is false. +.TP +\f[I]FI_CXI_DISABLE_HMEM_DEV_REGISTER\f[R] +Disable registering HMEM device buffer for load/store access. +Some HMEM devices (e.g.\ AMD, Nvidia, and Intel GPUs) support backing +the device memory by the PCIe BAR. +This enables software to perform load/stores to the device memory via +the BAR instead of using device DMA engines. +Direct load/store access may improve performance. +.TP +\f[I]FI_CXI_FORCE_ZE_HMEM_SUPPORT\f[R] +Force the enablement of ZE HMEM support. +By default, the CXI provider will only support ZE memory registration if +implicit scaling is disabled (i.e.\ the environment variables +EnableImplicitScaling=0 NEOReadDebugKeys=1 are set). +Set FI_CXI_FORCE_ZE_HMEM_SUPPORT to 1 will cause the CXI provider to +skip the implicit scaling checks. +GPU direct RDMA may or may not work in this case. +.TP +\f[I]FI_CXI_ENABLE_TRIG_OP_LIMIT\f[R] +Enable enforcement of triggered operation limit. +Doing this can prevent fi_control(FI_QUEUE_WORK) deadlocking at the cost +of performance. +.PP +Note: Use the fi_info utility to query provider environment variables: +fi_info -p cxi -e +.SH CXI EXTENSIONS +.PP +The CXI provider supports various fabric-specific extensions. +Extensions are accessed using the fi_open_ops function. +.SS CXI Domain Control Extensions +.PP +The \f[B]fi_control\f[R]() function is extended for domain FIDs to query +and override global environment settings for a specific domain. +This is useful for example where the application process also includes a +client API that has different optimizations and protections. +.PP +Command \f[I]FI_OPT_CXI_GET_OPTIMIZED\f[R] where the argument is a +pointer to a bool. +The call returns the setting for optimized MR usage for the domain. +The default is determined by the environment setting of +\f[I]FI_CXI_OPTIMIZED_MRS\f[R]. +.PP +Command \f[I]FI_OPT_CXI_SET_OPTIMIZED\f[R] where the argument is a +pointer to a bool initialized to true or false. +The call enables or disables the use of optimized MRs for the domain. +If the domain is not configured for FI_MR_PROV_KEY MR mode, the call +will fail with -FI_EINVAL, it is not supported for client generated +keys. 
+It must be called prior to MR being created. +.PP +Command \f[I]FI_OPT_CXI_GET_MR_MATCH_EVENTS\f[R] where the argument is a +pointer to a bool. +The call returns the setting for MR Match Event accounting for the +domain. +The default is determined by the environment setting of +\f[I]FI_CXI_MR_MATCH_EVENTS\f[R]. +.PP +Command \f[I]FI_OPT_CXI_SET_MR_MATCH_EVENTS\f[R] where the argument is a +pointer to a bool initialized to true or false. +This call enables or disables the use of MR Match Event counting. +This ensures that memory backing a MR cannot be accessed after invoking +fi_close() on the MR, even if that memory remains in the libfabric MR +cache. +Manual progress must be made to process events at the RMA destination. +It can only be changed prior to any EP or MR being created. +.PP +Command \f[I]FI_OPT_CXI_GET_PROV_KEY_CACHE\f[R] where the argument is a +pointer to a bool. +The call returns the setting for enabling use of the remote MR cache for +provider keys for the domain. +The default is determined by the environment setting of +\f[I]FI_CXI_PROV_KEY_CACHE\f[R] and is only valid if FI_MR_PROV_KEY MR +mode is used. +.PP +Command \f[I]FI_OPT_CXI_SET_PROV_KEY_CACHE\f[R] where the argument is a +pointer to a bool initialized to true or false. +This call enables or disables the use of the remote MR cache for +provider keys for the domain. +By default the cache is enabled and can be used for provider keys that +do not require events. +The command will fail with -FI_EINVAL if FI_MR_PROV_KEY MR mode is not +in use. +It can only be changed prior to any MR being created. +.SS CXI Domain Extensions +.PP +CXI domain extensions have been named \f[I]FI_CXI_DOM_OPS_6\f[R]. +The flags parameter is ignored. +The fi_open_ops function takes a \f[C]struct fi_cxi_dom_ops\f[R]. +See an example of usage below: +.IP +.nf +\f[C] +struct fi_cxi_dom_ops *dom_ops; + +ret = fi_open_ops(&domain->fid, FI_CXI_DOM_OPS_4, 0, (void **)&dom_ops, NULL); +\f[R] +.fi +.PP +The following domain extensions are defined: +.IP +.nf +\f[C] +struct fi_cxi_dom_ops { + int (*cntr_read)(struct fid *fid, unsigned int cntr, uint64_t *value, + struct timespec *ts); + int (*topology)(struct fid *fid, unsigned int *group_id, + unsigned int *switch_id, unsigned int *port_id); + int (*enable_hybrid_mr_desc)(struct fid *fid, bool enable); + size_t (*ep_get_unexp_msgs)(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, + size_t count, fi_addr_t *src_addr, + size_t *ux_count); + int (*get_dwq_depth)(struct fid *fid, size_t *depth); +}; +\f[R] +.fi +.PP +\f[I]cntr_read\f[R] extension is used to read hardware counter values. +Valid values of the cntr argument are found in the Cassini-specific +header file cassini_cntr_defs.h. +Note that Counter accesses by applications may be rate-limited to 1HZ. +.PP +\f[I]topology\f[R] extension is used to return CXI NIC address topology +information for the domain. +Currently only a dragonfly fabric topology is reported. +.PP +The enablement of hybrid MR descriptor mode allows for libfabric users +to optionally pass in a valid MR desc for local communications +operations. +.PP +The get unexpected message function is used to obtain a list of +unexpected messages associated with an endpoint. 
+The list is returned as an array of CQ tagged entries set in the +following manner: +.IP +.nf +\f[C] +struct fi_cq_tagged_entry { + .op_context = NULL, + .flags = any of [FI_TAGGED | FI_MSG | FI_REMOTE_CQ_DATA], + .len = message length, + .buf = NULL, + .data = CQ data if FI_REMOTE_CQ_DATA set + .tag = tag if FI_TAGGED set +}; +\f[R] +.fi +.PP +If the src_addr or entry array is NULL, only the ux_count of available +unexpected list entries will be returned. +The parameter count specifies the size of the array provided, if it is 0 +then only the ux_count will be returned. +The function returns the number of entries written to the array or a +negative errno. +On successful return, ux_count will always be set to the total number of +unexpected messages available. +.PP +\f[I]enable_hybrid_mr_desc\f[R] is used to enable hybrid MR descriptor +mode. +Hybrid MR desc allows for libfabric users to optionally pass in a valid +MR desc for local communication operations. +This is currently only used for RMA and AMO transfers. +.PP +\f[I]get_dwq_depth\f[R] is used to get the depth of the deferred work +queue. +The depth is the number of triggered operation commands which can be +queued to hardware. +The depth is not per fi_domain but rather per service ID. +Since a single service ID is intended to be shared between all +processing using the same NIC in a job step, the triggered operations +are shared across processes. +.PP +\f[I]enable_mr_match_events\f[R] and \f[I]enable_optimized_mrs\f[R] have +been deprecated in favor of using the fi_control() API. +While the can be still be called via the domain ops, They will be +removed from the domain opts prior to software release 2.2. +.SS CXI Counter Extensions +.PP +CXI counter extensions have been named \f[I]FI_CXI_COUNTER_OPS\f[R]. +The flags parameter is ignored. +The fi_open_ops function takes a \f[C]struct fi_cxi_cntr_ops\f[R]. +See an example of usage below. +.IP +.nf +\f[C] +struct fi_cxi_cntr_ops *cntr_ops; + +ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, (void **)&cntr_ops, NULL); +\f[R] +.fi +.PP +The following domain extensions are defined: +.IP +.nf +\f[C] +struct fi_cxi_cntr_ops { + /* Set the counter writeback address to a client provided address. */ + int (*set_wb_buffer)(struct fid *fid, const void *buf, size_t len); + + /* Get the counter MMIO region. */ + int (*get_mmio_addr)(struct fid *fid, void **addr, size_t *len); +}; +\f[R] +.fi +.SS CXI Counter Writeback Flag +.PP +If a client is using the CXI counter extensions to define a counter +writeback buffer, the CXI provider will not update the writeback buffer +success or failure values for each hardware counter success or failure +update. +This can especially create issues when clients expect the completion of +a deferred workqueue operation to generate a counter writeback. +To support this, the flag \f[I]FI_CXI_CNTR_WB\f[R] can be used in +conjunction with a deferred workqueue operation to force a writeback at +the completion of the deferred workqueue operation. +See an example of usage below. +.IP +.nf +\f[C] +struct fi_op_rma rma = { + /* Signal to the provider the completion of the RMA should trigger a + * writeback. + */ + .flags = FI_CXI_CNTR_WB, +}; + +struct fi_deferred_work rma_work = { + .op_type = FI_OP_READ, + .triggering_counter = cntr, + .completion_cntr = cntr, + .threshold = 1, + .op.rma = &rma, +}; + +ret = fi_control(&domain->fid, FI_QUEUE_WORK, &rma_work); +\f[R] +.fi +.PP +\f[B]Note:\f[R] Using \f[I]FI_CXI_CNTR_WB\f[R] will lead to additional +hardware usage. 
+To conserve hardware resources, it is recommended to only use the +\f[I]FI_CXI_CNTR_WB\f[R] when a counter writeback is absolutely +required. +.SS CXI Alias EP Overrides +.PP +A transmit alias endpoint can be created and configured to utilize a +different traffic class than the original endpoint. +This provides a lightweight mechanism to utilize multiple traffic +classes within a process. +Message order between the original endpoint and the alias endpoint is +not defined/guaranteed. +See example usage below for setting the traffic class of a transmit +alias endpoint. +.IP +.nf +\f[C] +#include +#include +#include // Ultimately fi_ext.h + +struct fid_ep *ep; +\&. . . + +struct fid_ep *alias_ep = NULL; +uint32_t tclass = FI_TC_LOW_LATENCY; +uint64_t op_flags = FI_TRANSMIT | desired data operation flags; + +ret = fi_ep_alias(ep, &alias_ep, op_flags); +if (ret) + error; + +ret = fi_set_val(&alias_ep->fid, FI_OPT_CXI_SET_TCLASS, (void *)&tlcass); +if (ret) + error; +\f[R] +.fi +.PP +In addition, the alias endpoint message order may be modified to +override the default endpoint message order. +Message order between the modified alias endpoint and the original +endpoint is not guaranteed. +See example usage below for setting the traffic class of a transmit +alias endpoint. +.IP +.nf +\f[C] +uint64_t msg_order = FI_ORDER_RMA_WAW; + +ret = fi_set_val(&alias_ep->fid, FI_OPT_CXI_SET_MSG_ORDER, + (void *)&msg_order); +if (ret) + error; +\f[R] +.fi +.PP +When an endpoint does not support FI_FENCE (e.g.\ optimized MR), a +provider specific transmit flag, FI_CXI_WEAK_FENCE, may be specified on +an alias EP to issue a FENCE operation to create a data ordering point +for the alias. +This is supported for one-sided operations only. +.PP +Alias EP must be closed prior to closing the original EP. +.SS PCIe Atomics +.PP +The CXI provider has the ability to issue a given libfabric atomic +memory operation as a PCIe operation as compared to a NIC operation. +The CXI provider extension flag FI_CXI_PCIE_AMO is used to signify this. +.PP +Since not all libfabric atomic memory operations can be executed as a +PCIe atomic memory operation, \f[C]fi_query_atomic()\f[R] could be used +to query if a given libfabric atomic memory operation could be executed +as PCIe atomic memory operation. +.PP +The following is a query to see if a given libfabric operation can be a +PCIe atomic operation. +.IP +.nf +\f[C] +int ret; +struct fi_atomic_attr out_attrs; + +/* Query if non-fetching PCIe atomic is supported. */ +ret = fi_query_atomic(domain, FI_UINT32, FI_SUM, &out_attrs, FI_CXI_PCIE_AMO); + +/* Query if fetching PCIe atomic is supported. */ +ret = fi_query_atomic(domain, FI_UINT32, FI_SUM, &out_attrs, + FI_FETCH_ATOMIC | FI_CXI_PCIE_AMO); +\f[R] +.fi +.PP +The following is how to issue a PCIe atomic operation. +.IP +.nf +\f[C] +ssize_t ret; +struct fi_msg_atomic msg; +struct fi_ioc resultv; +void *result_desc; +size_t result_count; + +ret = fi_fetch_atomicmsg(ep, &msg, &resultv, &result_desc, result_count, + FI_CXI_PCIE_AMO); +\f[R] +.fi +.PP +\f[B]Note:\f[R] The CXI provider only supports PCIe fetch add for +UINT32_T, INT32_t, UINT64_T, and INT64_t. +This support requires enablement of PCIe fetch add in the CXI driver, +and it comes at the cost of losing NIC atomic support for another +libfabric atomic operation. +.PP +\f[B]Note:\f[R] Ordering between PCIe atomic operations and NIC +atomic/RMA operations is undefined. 
+.PP +To enable PCIe fetch add for libfabric, the following CXI driver kernel +module parameter must be set to non-zero. +.IP +.nf +\f[C] +/sys/module/cxi_core/parameters/amo_remap_to_pcie_fadd +\f[R] +.fi +.PP +The following are the possible values for this kernel module and the +impact of each value: - -1: Disable PCIe fetch add support. +FI_CXI_PCIE_AMO is not supported. +- 0: Enable PCIe fetch add support. +FI_MIN is not supported. +- 1: Enable PCIe fetch add support. +FI_MAX is not supported. +- 2: Enable PCIe fetch add support. +FI_SUM is not supported. +- 4: Enable PCIe fetch add support. +FI_LOR is not supported. +- 5: Enable PCIe fetch add support. +FI_LAND is not supported. +- 6: Enable PCIe fetch add support. +FI_BOR is not supported. +- 7: Enable PCIe fetch add support. +FI_BAND is not supported. +- 8: Enable PCIe fetch add support. +FI_LXOR is not supported. +- 9: Enable PCIe fetch add support. +FI_BXOR is not supported. +- 10: Enable PCIe fetch add support. +No loss of default CXI provider AMO functionality. +.PP +Guidance is to default amo_remap_to_pcie_fadd to 10. +.SH FABTESTS +.PP +The CXI provider does not currently support fabtests which depend on IP +addressing. +.PP +fabtest RDM benchmarks are supported, like: +.IP +.nf +\f[C] +# Start server by specifying source PID and interface +\&./fabtests/benchmarks/fi_rdm_tagged_pingpong -B 10 -s cxi0 + +# Read server NIC address +CXI0_ADDR=$(cat /sys/class/cxi/cxi0/device/properties/nic_addr) + +# Start client by specifying server PID and NIC address +\&./fabtests/benchmarks/fi_rdm_tagged_pingpong -P 10 $CXI0_ADDR + +# The client may be bound to a specific interface, like: +\&./fabtests/benchmarks/fi_rdm_tagged_pingpong -B 10 -s cxi1 -P 10 $CXI0_ADDR +\f[R] +.fi +.PP +Some functional fabtests are supported (including fi_bw). +Others use IP sockets and are not yet supported. +.PP +multinode fabtests are not yet supported. +.PP +ubertest is supported for test configs matching the provider\[cq]s +current capabilities. +.PP +unit tests are supported where the test feature set matches the CXI +provider\[cq]s current capabilities. +.SH ERRATA +.IP \[bu] 2 +Fetch and compare type AMOs with FI_DELIVERY_COMPLETE or +FI_MATCH_COMPLETE completion semantics are not supported with +FI_RMA_EVENT. +.SH Libfabric CXI Provider User Programming and Troubleshooting Guide +.PP +The scope of the following subsection is to provide guidance and/or +troubleshooting tips for users of the libfabric CXI provider. +The scope of this section is not a full guide for user libfabric. +.SS Sizing Libfabric Objects Based on Expected Usage +.PP +The CXI provider uses various libfabric object attribute size and/or +libfabric enviroment variables to size hardware related resources +accordingly. +Failure to size resources properly can result in the CXI provider +frequently returning -FI_EAGAIN which may negatively impact performance. +The following subsection outline important sizing related attributes and +environment variables. +.SS Completion Queue Size Attribute +.PP +The CXI provider uses completion queue attribute size to size various +software and hardware event queues used to generate libfabric completion +events. +While the size of the software queues may grow, hardware event queue +sizes are static. +Failing to size hardware queues properly may result in CXI provider +returning -FI_EAGAIN frequently for data transfer operations. +When this error is returned, user should progress the corresponding +endpoint completion queues by calling fi_cq_read(). 
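+.PP
+The following is a minimal sketch of sizing a completion queue for
+expected usage and progressing it when a transfer returns -FI_EAGAIN.
+It assumes \f[C]domain\f[R], \f[C]ep\f[R], \f[C]buf\f[R],
+\f[C]len\f[R], and \f[C]dest_addr\f[R] already exist, and the size value
+is only a placeholder; CQ/EP binding and full error handling are
+omitted.
+.IP
+.nf
+\f[C]
+struct fi_cq_attr cq_attr = {
+    /* Size for the expected number of inflight operations. */
+    .size = 16384,
+    .format = FI_CQ_FORMAT_TAGGED,
+};
+struct fid_cq *cq;
+struct fi_cq_tagged_entry comp;
+ssize_t ret;
+
+ret = fi_cq_open(domain, &cq_attr, &cq, NULL);
+if (ret)
+    error;
+
+/* On -FI_EAGAIN, progress the CQ and retry the operation. */
+do {
+    ret = fi_send(ep, buf, len, NULL, dest_addr, NULL);
+    if (ret == -FI_EAGAIN)
+        (void)fi_cq_read(cq, &comp, 1);
+} while (ret == -FI_EAGAIN);
+\f[R]
+.fi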
+.PP
+Users are encouraged to set the completion queue size attribute based on
+the expected number of inflight RDMA operations to and from a single
+endpoint.
+For users who rely on the provider default value (e.g.\ MPI),
+the FI_CXI_DEFAULT_CQ_SIZE environment variable can be used to override
+the provider default value.
+.SS Endpoint Receive Size Attribute
+.PP
+The CXI provider uses the endpoint receive size attribute to size
+internal command and hardware event queues.
+Failing to size these queues correctly can result in the CXI
+provider returning -FI_EAGAIN frequently for data transfer operations.
+When this error is returned, the user should progress the corresponding
+endpoint completion queues by calling fi_cq_read().
+.PP
+Users are encouraged to set the endpoint receive size attribute based on
+the expected number of inflight untagged and tagged RDMA operations.
+For users who rely on the provider default value (e.g.\ MPI),
+the FI_CXI_DEFAULT_RX_SIZE environment variable can be used to override
+the provider default value.
+.SS Endpoint Transmit Size Attribute
+.PP
+The CXI provider uses the endpoint transmit size attribute to size
+internal command and hardware event queues.
+Failing to size these queues correctly can result in the CXI
+provider returning -FI_EAGAIN frequently for data transfer operations.
+When this error is returned, the user should progress the corresponding
+endpoint completion queues by calling fi_cq_read().
+.PP
+At a minimum, users are encouraged to set the endpoint transmit size
+attribute based on the expected number of inflight, initiator RDMA
+operations.
+If users are going to be issuing message operations over the CXI
+provider rendezvous limit (FI_CXI_RDZV_THRESHOLD), the transmit size
+attribute must also include the number of outstanding, unexpected
+rendezvous operations (i.e.\ inflight, initiator RDMA operations +
+outstanding, unexpected rendezvous operations).
+.PP
+For users who rely on the provider default value (e.g.\ MPI),
+the FI_CXI_DEFAULT_TX_SIZE environment variable can be used to override
+the provider default value.
+.SS FI_UNIVERSE_SIZE Environment Variable
+.PP
+The libfabric FI_UNIVERSE_SIZE environment variable defines the number
+of expected ranks/peers an application needs to communicate with.
+The CXI provider may use this environment variable to size resources
+tied to the number of peers.
+Users are encouraged to set this environment variable accordingly.
+.SS Selecting Proper Receive Match Mode
+.PP
+As mentioned in the \f[I]Runtime Parameters\f[R] section, the CXI
+provider supports three different operational modes: hardware, hybrid,
+and software.
+.PP
+Hardware match mode is appropriate for users who can ensure the sum of
+unexpected messages and posted receives does not exceed the configured
+hardware receive resource limit for the application.
+When resources are consumed, the endpoint will transition into a flow
+control operational mode which requires side-band messaging to recover
+from.
+Recovery will involve the CXI provider trying to reclaim hardware
+receive resources to help prevent future transitions into flow control.
+If the CXI provider is unable to reclaim hardware receive resources,
+this can lead to a cycle of entering and exiting flow control which may
+present itself as a hang to the libfabric user.
+Running with FI_LOG_LEVEL=warn and FI_LOG_PROV=cxi will report if this
+flow control transition is happening.
+.PP
+Hybrid match mode is appropriate for users who are unsure whether the
+sum of unexpected messages and posted receives will exceed the
+configured hardware receive resource limit for the application, but who
+want to ensure the application still functions if hardware receive
+resources are consumed.
+Hybrid match mode extends hardware match mode by allowing for an
+automated transition into software match mode if resources are consumed.
+.PP
+Software match mode is appropriate for users who know the sum of
+unexpected messages and posted receives will exceed the configured
+hardware receive resource limit for the application.
+In software match mode, the CXI provider maintains software unexpected
+message and posted receive lists rather than offloading them to
+hardware.
+This avoids having to allocate a hardware receive resource for each
+unexpected message and posted receive.
+.PP
+\f[I]Note\f[R]: In practice, dependent processes (e.g.\ a parallel job)
+will most likely share a hardware receive resource pool.
+.PP
+\f[I]Note\f[R]: Each match mode may still enter flow control.
+For example, if a user is not draining the libfabric completion queue at
+a reasonable rate, the corresponding hardware events may fill up, which
+will trigger flow control.
+.SS Using Hybrid Match Mode Preemptive Options
+.PP
+The high-level objective of the hybrid match mode preemptive environment
+variables (i.e.
+FI_CXI_HYBRID_PREEMPTIVE, FI_CXI_HYBRID_RECV_PREEMPTIVE,
+FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE, and
+FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE) is to ensure that a process
+requiring more hardware receive resources does not force other processes
+requiring fewer hardware receive resources into software match mode
+because no hardware receive resources remain available.
+.PP
+For example, consider a parallel application which has multiple
+processes (i.e.\ ranks) per NIC all sharing the same hardware receive
+resource pool.
+Suppose that the application communication pattern results in an
+all-to-one communication to only a single rank (e.g.
+rank 0) while other ranks may be communicating amongst each other.
+If the width of the all-to-one communication exhausts the hardware
+receive resources, all ranks on the target NIC will transition to
+software match mode.
+The preemptive options may help ensure that only rank 0 transitions
+to software match mode instead of all the ranks on the target NIC.
+.PP
+The FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE and
+FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE environment variables will force
+the transition to software match mode if the user requested endpoint
+receive size attribute is exceeded.
+The benefit of running with these enabled is that the software match
+mode transition is entirely in the control of the libfabric user through
+the receive size attribute.
+One approach users could take here is to set the receive size attribute
+to the expected usage; if this expected usage is exceeded, only the
+offending endpoints will transition to software match mode.
+.PP
+The FI_CXI_HYBRID_PREEMPTIVE and FI_CXI_HYBRID_RECV_PREEMPTIVE
+environment variables will force the transition to software match mode
+if hardware receive resources in the pool are running low.
+The CXI provider uses a multi-step process to transition the
+libfabric endpoint to software match mode.
+The benefit of running with these enabled is that the number of
+endpoints transitioning to software match mode may be smaller when
+compared to a forced software match mode transition due to zero hardware
+resources being available.
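+.PP
+Because the posted receive and unexpected message preemptive options key
+off the endpoint receive size attribute, one way to keep those
+transitions predictable is to communicate expected usage through the
+endpoint size attributes.
+The following is a minimal sketch of requesting the provider with
+explicit transmit and receive size attributes; the numeric values are
+placeholders, not recommendations.
+.IP
+.nf
+\f[C]
+struct fi_info *hints, *info;
+int ret;
+
+hints = fi_allocinfo();
+if (!hints)
+    error;
+
+hints->fabric_attr->prov_name = strdup("cxi");
+hints->ep_attr->type = FI_EP_RDM;
+
+/* Placeholder sizing based on expected application usage. */
+hints->tx_attr->size = 4096;  /* expected inflight initiator operations */
+hints->rx_attr->size = 8192;  /* expected posted receives + unexpected msgs */
+
+ret = fi_getinfo(fi_version(), NULL, NULL, 0, hints, &info);
+if (ret)
+    error;
+
+fi_freeinfo(hints);
+\f[R]
+.fi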
+.SS Preventing Messaging Flow Control Due to Hardware Event Queue Sizing
+.PP
+As much as possible, CXI provider message flow control should be
+avoided.
+Flow control results in expensive, side-band, CXI provider internal
+messaging to recover from.
+One cause of flow control is improper hardware event queue sizing.
+If the hardware event queue is undersized, resulting in it filling more
+quickly than expected, the next incoming message operation targeting a
+full event queue will result in the message operation being dropped and
+flow control triggered.
+.PP
+The default CXI provider behavior is to size hardware event queues based
+on endpoint transmit and receive size attributes.
+Thus, it is critical for users to set these attributes accordingly.
+.PP
+The CQ size can be used to override the CXI provider calculated hardware
+event queue size based on endpoint transmit and receive size attributes.
+If the CQ size is greater than the CXI provider calculation, the value
+from the CQ size will be used.
+.PP
+The CQ fill percent can be used to define a threshold for when no new
+RDMA operations can be queued until the libfabric CQ is progressed, thus
+draining hardware event queues.
+.SS Interpreting CXI Provider CQ Error Event Errno
+.PP
+The following are the libfabric errno values which may be returned in an
+RDMA CQ error event.
+.PP
+FI_ETRUNC: Receive message truncation.
+.PP
+FI_EHOSTUNREACH: Target is unreachable.
+This is due to connectivity issues, such as downed links, between the
+two peers.
+.PP
+FI_ENOTCONN: Cannot communicate due to no libfabric endpoint being
+configured.
+In this case, the target NIC is reachable.
+.PP
+FI_EIO: Catch-all errno.
+.SH SEE ALSO
+.PP
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7)
+.SH AUTHORS
+OpenFabrics.
diff --git a/prov/cxi/.gitignore b/prov/cxi/.gitignore
new file mode 100644
index 00000000000..689e74dcd1d
--- /dev/null
+++ b/prov/cxi/.gitignore
@@ -0,0 +1,2 @@
+test/cxitest
+test/curltest
diff --git a/prov/cxi/Makefile.include b/prov/cxi/Makefile.include
new file mode 100644
index 00000000000..b2619dd2ec6
--- /dev/null
+++ b/prov/cxi/Makefile.include
@@ -0,0 +1,211 @@
+# SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
+#
+# Copyright 2018,2020-2023 Hewlett Packard Enterprise Development LP
+
+if HAVE_CXI
+
+AM_CPPFLAGS += \
+ -I$(top_srcdir)/prov/cxi/include
+
+_cxi_files = \
+ prov/cxi/src/cxip_if.c \
+ prov/cxi/src/cxip_fabric.c \
+ prov/cxi/src/cxip_repsum.c \
+ prov/cxi/src/cxip_coll.c \
+ prov/cxi/src/cxip_zbcoll.c \
+ prov/cxi/src/cxip_curl.c \
+ prov/cxi/src/cxip_dom.c \
+ prov/cxi/src/cxip_ep.c \
+ prov/cxi/src/cxip_txc.c \
+ prov/cxi/src/cxip_rxc.c \
+ prov/cxi/src/cxip_av.c \
+ prov/cxi/src/cxip_avset.c \
+ prov/cxi/src/cxip_eq.c \
+ prov/cxi/src/cxip_cq.c \
+ prov/cxi/src/cxip_cntr.c \
+ prov/cxi/src/cxip_rma.c \
+ prov/cxi/src/cxip_mr.c \
+ prov/cxi/src/cxip_msg.c \
+ prov/cxi/src/cxip_atomic.c \
+ prov/cxi/src/cxip_iomm.c \
+ prov/cxi/src/cxip_faults.c \
+ prov/cxi/src/cxip_info.c \
+ prov/cxi/src/cxip_ctrl.c \
+ prov/cxi/src/cxip_req_buf.c \
+ prov/cxi/src/cxip_rdzv_pte.c \
+ prov/cxi/src/cxip_trace.c \
+ prov/cxi/src/cxip_telemetry.c \
+ prov/cxi/src/cxip_ptelist_buf.c \
+ prov/cxi/src/cxip_evtq.c \
+ prov/cxi/src/cxip_nic.c \
+ prov/cxi/src/cxip_portals_table.c \
+ prov/cxi/src/cxip_pte.c \
+ prov/cxi/src/cxip_cmdq.c
+
+_cxi_headers = \
+ prov/cxi/include/cxip.h \
+ prov/cxi/include/cxip_faults.h \
+ prov/cxi/include/fi_cxi_ext.h
+
+rdmainclude_HEADERS += \
+ prov/cxi/include/fi_cxi_ext.h
+
+# Stand-alone srun tests for 
hardware testing environment +noinst_PROGRAMS += prov/cxi/test/multinode/test_frmwk +prov_cxi_test_multinode_test_frmwk_SOURCES = \ + prov/cxi/test/multinode/multinode_frmwk.h \ + prov/cxi/test/multinode/multinode_frmwk.c \ + prov/cxi/test/multinode/test_frmwk.c +prov_cxi_test_multinode_test_frmwk_CPPFLAGS = \ + $(AM_CPPFLAGS) $(cxi_CPPFLAGS) $(PTHREAD_FLAGS) +prov_cxi_test_multinode_test_frmwk_LDFLAGS = -static +prov_cxi_test_multinode_test_frmwk_LDADD = $(linkback) $(PTHREAD_LIBS) + +noinst_PROGRAMS += prov/cxi/test/multinode/test_zbcoll +prov_cxi_test_multinode_test_zbcoll_SOURCES = \ + prov/cxi/test/multinode/multinode_frmwk.h \ + prov/cxi/test/multinode/multinode_frmwk.c \ + prov/cxi/test/multinode/test_zbcoll.c +prov_cxi_test_multinode_test_zbcoll_CPPFLAGS = \ + $(AM_CPPFLAGS) $(cxi_CPPFLAGS) $(PTHREAD_FLAGS) +prov_cxi_test_multinode_test_zbcoll_LDFLAGS = -static +prov_cxi_test_multinode_test_zbcoll_LDADD = $(linkback) $(PTHREAD_LIBS) + +noinst_PROGRAMS += prov/cxi/test/multinode/test_coll +prov_cxi_test_multinode_test_coll_SOURCES = \ + prov/cxi/test/multinode/multinode_frmwk.h \ + prov/cxi/test/multinode/multinode_frmwk.c \ + prov/cxi/test/multinode/test_coll.c +prov_cxi_test_multinode_test_coll_CPPFLAGS = \ + $(AM_CPPFLAGS) $(cxi_CPPFLAGS) $(PTHREAD_FLAGS) +prov_cxi_test_multinode_test_coll_LDFLAGS = -static +prov_cxi_test_multinode_test_coll_LDADD = $(linkback) $(PTHREAD_LIBS) + +noinst_PROGRAMS += prov/cxi/test/multinode/test_barrier +prov_cxi_test_multinode_test_barrier_SOURCES = \ + prov/cxi/test/multinode/multinode_frmwk.h \ + prov/cxi/test/multinode/multinode_frmwk.c \ + prov/cxi/test/multinode/test_barrier.c +prov_cxi_test_multinode_test_barrier_CPPFLAGS = \ + $(AM_CPPFLAGS) $(cxi_CPPFLAGS) $(PTHREAD_FLAGS) +prov_cxi_test_multinode_test_barrier_LDFLAGS = -static +prov_cxi_test_multinode_test_barrier_LDADD = $(linkback) $(PTHREAD_LIBS) + +if HAVE_CRITERION + +# curltest is not expected to exist outside devel env +noinst_PROGRAMS += prov/cxi/test/curltest +prov_cxi_test_curltest_SOURCES = \ + prov/cxi/test/curl.c +prov_cxi_test_curltest_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_curltest_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_curltest_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) + +bin_PROGRAMS += prov/cxi/test/cxitest +nodist_prov_cxi_test_cxitest_SOURCES = \ + prov/cxi/test/cxip_test_common.c \ + prov/cxi/test/fabric.c \ + prov/cxi/test/domain.c \ + prov/cxi/test/ep.c \ + prov/cxi/test/eq.c \ + prov/cxi/test/cq.c \ + prov/cxi/test/av.c \ + prov/cxi/test/avset.c \ + prov/cxi/test/rma.c \ + prov/cxi/test/tagged.c \ + prov/cxi/test/msg.c \ + prov/cxi/test/atomic.c \ + prov/cxi/test/cntr.c \ + prov/cxi/test/tagged_stress.c \ + prov/cxi/test/mr.c \ + prov/cxi/test/deferred_work.c \ + prov/cxi/test/coll.c \ + prov/cxi/test/ctrl.c \ + prov/cxi/test/lat.c \ + prov/cxi/test/repsum.c \ + prov/cxi/test/auth_key.c \ + prov/cxi/test/fork.c \ + prov/cxi/test/mem_reg.c \ + prov/cxi/test/nic.c + +prov_cxi_test_cxitest_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_cxitest_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_cxitest_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) + +TESTS += prov/cxi/test/cxitest + +# ZE test suite is its own binary +if HAVE_ZE + +bin_PROGRAMS += prov/cxi/test/cxitestze +nodist_prov_cxi_test_cxitestze_SOURCES = \ + prov/cxi/test/cxip_test_common.c \ + prov/cxi/test/ze.c + +prov_cxi_test_cxitestze_CPPFLAGS = $(AM_CPPFLAGS) 
$(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_cxitestze_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_cxitestze_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) -lze_loader + +TESTS += prov/cxi/test/cxitestze + +endif HAVE_ZE + +# CUDA test suite is its own binary +if HAVE_CUDA + +bin_PROGRAMS += prov/cxi/test/cxitestcuda +nodist_prov_cxi_test_cxitestcuda_SOURCES = \ + prov/cxi/test/cxip_test_common.c \ + prov/cxi/test/cuda.c + +prov_cxi_test_cxitestcuda_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_cxitestcuda_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_cxitestcuda_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) -lcudart -lcuda + +TESTS += prov/cxi/test/cxitestcuda + +endif HAVE_CUDA + +# ROCR test suite is its own binary +if HAVE_ROCR + +bin_PROGRAMS += prov/cxi/test/cxitestrocr +nodist_prov_cxi_test_cxitestrocr_SOURCES = \ + prov/cxi/test/cxip_test_common.c \ + prov/cxi/test/rocr.c + +prov_cxi_test_cxitestrocr_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_cxitestrocr_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_cxitestrocr_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) -lhsa-runtime64 + +TESTS += prov/cxi/test/cxitestrocr + +endif HAVE_ROCR + +endif HAVE_CRITERION + +if HAVE_CXI_DL +pkglib_LTLIBRARIES += libcxi-fi.la +libcxi_fi_la_SOURCES = $(_cxi_files) $(_cxi_headers) $(common_srcs) +libcxi_fi_la_CPPFLAGS = $(cxi_CPPFLAGS) +libcxi_fi_la_LIBADD = $(linkback) $(cxi_LIBS) +libcxi_fi_la_LDFLAGS = $(cxi_LDFLAGS) \ + -module -avoid-version -shared -export-dynamic +libcxi_fi_la_DEPENDENCIES = $(linkback) +else !HAVE_CXI_DL +src_libfabric_la_SOURCES += $(_cxi_files) $(_cxi_headers) +src_libfabric_la_CPPFLAGS += $(cxi_CPPFLAGS) +src_libfabric_la_LIBADD += $(cxi_LIBS) +src_libfabric_la_LDFLAGS += $(cxi_LDFLAGS) +endif !HAVE_CXI_DL + +prov_install_man_pages += man/man7/fi_cxi.7 + +endif HAVE_CXI + +prov_dist_man_pages += man/man7/fi_cxi.7 diff --git a/prov/cxi/configure.m4 b/prov/cxi/configure.m4 new file mode 100644 index 00000000000..ec50e18f33c --- /dev/null +++ b/prov/cxi/configure.m4 @@ -0,0 +1,153 @@ +dnl SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +dnl +dnl Copyright 2018 Hewlett Packard Enterprise Development LP + +dnl CXI provider specific configuration + +dnl Called to configure this provider +dnl +dnl Arguments: +dnl +dnl $1: action if configured successfully +dnl $2: action if not configured successfully +dnl + +AM_CONDITIONAL([HAVE_PMI], [test "x$have_pmi" = "xtrue"]) +AM_CONDITIONAL([HAVE_ZE], [test "$have_ze" = "1" && test "$with_ze" != ""]) +AM_CONDITIONAL([HAVE_CUDA], [test "$have_cuda" = "1" && test "$with_cuda" != ""]) +AM_CONDITIONAL([HAVE_ROCR], [test "$have_rocr" = "1" && test "$with_rocr" != ""]) + + +AC_DEFUN([FI_CXI_CONFIGURE],[ + + cxi_happy=1 + + # Support non-standard install path for cassini headers. This is needed + # by libcxi. + AC_ARG_WITH([cassini-headers], + [AS_HELP_STRING([--with-cassin-headers=DIR], [Install directory for Cassini headers])], + [CPPFLAGS="-I$with_cassini_headers/include $CPPFLAGS"]) + + # Support non-standard install path for cxi kernel UAPI headers. This is + # needed by libcxi. + AC_ARG_WITH([cxi-uapi-headers], + [AS_HELP_STRING([--with-cxi-uapi-headers=DIR], [Install directory for kernel CXI UAPI headers])], + [CPPFLAGS="-I$with_cxi_uapi_headers/include $CPPFLAGS"]) + + # Support non-standard install path for curl. This is needed by CXI provider. 
+ AC_ARG_WITH([curl], + [AS_HELP_STRING([--with-curl=DIR], [Install directory for curl])]) + + # Support non-standard install path for json-c. This is needed by CXI provider. + AC_ARG_WITH([json-c], + [AS_HELP_STRING([--with-json-c=DIR], [Install directory for json-c])]) + + AS_IF([test x"$enable_cxi" != x"no"], + [ + AC_CHECK_HEADER(cxi_prov_hw.h, + [], + [cxi_happy=0]) + + AC_CHECK_HEADER(uapi/misc/cxi.h, + [], + [cxi_happy=0]) + + FI_CHECK_PACKAGE([libcxi], + [libcxi/libcxi.h], + [cxi], + [cxil_open_device], + [], + [$cxi_PREFIX], + [$cxi_LIBDIR], + [], + [cxi_happy=0]) + + cxi_CPPFLAGS=$libcxi_CPPFLAGS + cxi_LDFLAGS=$libcxi_LDFLAGS + cxi_LIBS=$libcxi_LIBS + + if test "$with_cassini_headers" != "" && test "$with_cassini_headers" != "no"; then + cxi_CPPFLAGS="$cxi_CPPFLAGS -I$with_cassini_headers/include" + fi + + if test "$with_cxi_uapi_headers" != "" && test "$with_cxi_uapi_headers" != "no"; then + cxi_CPPFLAGS="$cxi_CPPFLAGS -I$with_cxi_uapi_headers/include" + fi + + # Add on curl if installed in non-default location. + if test "$with_curl" != "" && test "$with_curl" != "no"; then + FI_CHECK_PREFIX_DIR([$with_curl], [curl]) + else + curl_PREFIX="" + curl_LIBDIR="" + fi + + FI_CHECK_PACKAGE([libcurl], + [curl/curl.h], + [curl], + [curl_global_init], + [], + [$curl_PREFIX], + [$curl_LIBDIR], + [], + [cxi_happy=0]) + + cxi_CPPFLAGS="$cxi_CPPFLAGS $libcurl_CPPFLAGS" + cxi_LDFLAGS="$cxi_LDFLAGS $libcurl_LDFLAGS" + cxi_LIBS="$cxi_LIBS $libcurl_LIBS" + + # Add on json if installed in non-default location. + if test "$with_json" != "" && test "$with_json" != "no"; then + FI_CHECK_PREFIX_DIR([$with_json], [json]) + else + json_PREFIX="" + json_LIBDIR="" + fi + + FI_CHECK_PACKAGE([libjson], + [json-c/json.h], + [json-c], + [json_object_get_type], + [], + [$json_PREFIX], + [$json_LIBDIR], + [], + [cxi_happy=0]) + + cxi_CPPFLAGS="$cxi_CPPFLAGS $libjson_CPPFLAGS" + cxi_LDFLAGS="$cxi_LDFLAGS $libjson_LDFLAGS" + cxi_LIBS="$cxi_LIBS $libjson_LIBS" + + # Need to explicitly link to libmath + cxi_LIBS="$cxi_LIBS -lm" + + AC_SUBST(cxi_CPPFLAGS) + AC_SUBST(cxi_LDFLAGS) + AC_SUBST(cxi_LIBS) + + # Checks to enable cxitest + AS_IF([test "$with_criterion" != ""], + [cxitest_CPPFLAGS="-I$with_criterion/include" + cxitest_LDFLAGS="-L$with_criterion/lib64 -Wl,-rpath=$(realpath $with_criterion/lib64)" + cxitest_LIBS="-lcriterion" + have_criterion=true]) + AM_CONDITIONAL([HAVE_CRITERION], [test "x$have_criterion" = "xtrue"]) + + AS_IF([test "$have_ze" = "1" && test "$with_ze" != "" && test x"$with_ze" != x"yes"], + [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_ze/include" + cxitest_LDFLAGS="$cxitest_LDFLAGS -L$with_ze/lib64"]) + AS_IF([test "$have_cuda" = "1" && test "$with_cuda" != "" && test x"$with_cuda" != x"yes"], + [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_cuda/include" + cxitest_LDFLAGS="$cxitest_LDFLAGS -L$with_cuda/lib64"]) + AS_IF([test "$have_rocr" = "1" && test "$with_rocr" != "" && test x"$with_rocr" != x"yes"], + [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_rocr/include" + cxitest_LDFLAGS="$cxitest_LDFLAGS -L$with_rocr/lib"]) + + AC_SUBST(cxitest_CPPFLAGS) + AC_SUBST(cxitest_LDFLAGS) + AC_SUBST(cxitest_LIBS) + ], + [cxi_happy=0]) + + AS_IF([test $cxi_happy -eq 1], [$1], [$2]) +]) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h new file mode 100644 index 00000000000..0a441e3bc2c --- /dev/null +++ b/prov/cxi/include/cxip.h @@ -0,0 +1,3348 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. 
+ * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. + * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + */ + +#ifndef _CXIP_PROV_H_ +#define _CXIP_PROV_H_ + +#include +#include "config.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip_faults.h" +#include "fi_cxi_ext.h" + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +#ifndef FLOOR +#define FLOOR(a, b) ((long long)(a) - (((long long)(a)) % (b))) +#endif + +#ifndef CEILING +#define CEILING(a, b) ((long long)(a) <= 0LL ? 0 : (FLOOR((a)-1, b) + (b))) +#endif + +#define CXIP_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define CXIP_ALIGN(x, a) CXIP_ALIGN_MASK(x, (typeof(x))(a) - 1) +#define CXIP_ALIGN_DOWN(x, a) CXIP_ALIGN((x) - ((a) - 1), (a)) + +#define CXIP_REQ_CLEANUP_TO 3000 + +#define CXIP_BUFFER_ID_MAX (1 << 16) + +/* Scalable EP not supported */ +#define CXIP_EP_MAX_CTX_BITS 0 +#define CXIP_EP_MAX_TX_CNT (1 << CXIP_EP_MAX_CTX_BITS) +#define CXIP_EP_MAX_RX_CNT (1 << CXIP_EP_MAX_CTX_BITS) +#define CXIP_EP_MAX_MSG_SZ ((1ULL << 32) - 1) +#define CXIP_EP_MIN_MULTI_RECV 64 +#define CXIP_EP_MAX_MULTI_RECV ((1 << 24) - 1) + +#define CXIP_TX_COMP_MODES (FI_INJECT_COMPLETE | \ + FI_TRANSMIT_COMPLETE | \ + FI_DELIVERY_COMPLETE | \ + FI_MATCH_COMPLETE) +#define CXIP_TX_OP_FLAGS (FI_INJECT | \ + FI_COMPLETION | \ + CXIP_TX_COMP_MODES | \ + FI_REMOTE_CQ_DATA | \ + FI_MORE | \ + FI_FENCE) +#define CXIP_RX_OP_FLAGS (FI_COMPLETION | \ + FI_MULTI_RECV | \ + FI_MORE) +/* Invalid OP flags for RX that can be silently ignored */ +#define CXIP_RX_IGNORE_OP_FLAGS (FI_REMOTE_CQ_DATA | \ + FI_INJECT) +#define CXIP_WRITEMSG_ALLOWED_FLAGS (FI_INJECT | \ + FI_COMPLETION | \ + FI_MORE | \ + FI_FENCE | \ + CXIP_TX_COMP_MODES) +#define CXIP_READMSG_ALLOWED_FLAGS (FI_COMPLETION | \ + FI_MORE | \ + FI_FENCE | \ + CXIP_TX_COMP_MODES) + +#define CXIP_AMO_MAX_IOV 1 +#define CXIP_EQ_DEF_SZ (1 << 8) +#define CXIP_CQ_DEF_SZ 1024U +#define CXIP_REMOTE_CQ_DATA_SZ 8 + +#define CXIP_PTE_IGNORE_DROPS ((1 << 24) - 1) +#define CXIP_RDZV_THRESHOLD 2048 +#define CXIP_OFLOW_BUF_SIZE (2*1024*1024) +#define CXIP_OFLOW_BUF_MIN_POSTED 3 +#define CXIP_OFLOW_BUF_MAX_CACHED (CXIP_OFLOW_BUF_MIN_POSTED * 3) +#define CXIP_REQ_BUF_SIZE (2*1024*1024) +#define CXIP_REQ_BUF_MIN_POSTED 4 +#define CXIP_REQ_BUF_MAX_CACHED 0 +#define CXIP_UX_BUFFER_SIZE (CXIP_OFLOW_BUF_MIN_POSTED * \ + CXIP_OFLOW_BUF_SIZE) + +/* When device memory is safe to access via load/store then the + * CPU will be used to move data below this threshold. 
+ */ +#define CXIP_SAFE_DEVMEM_COPY_THRESH 4096 + +#define CXIP_EP_PRI_CAPS \ + (FI_RMA | FI_ATOMICS | FI_TAGGED | FI_RECV | FI_SEND | \ + FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE | \ + FI_DIRECTED_RECV | FI_MSG | FI_NAMED_RX_CTX | \ + FI_COLLECTIVE | FI_HMEM) +#define CXIP_EP_SEC_CAPS \ + (FI_SOURCE | FI_SOURCE_ERR | FI_LOCAL_COMM | \ + FI_REMOTE_COMM | FI_RMA_EVENT | FI_MULTI_RECV | FI_FENCE | FI_TRIGGER) +#define CXIP_EP_CAPS (CXIP_EP_PRI_CAPS | CXIP_EP_SEC_CAPS) +#define CXIP_DOM_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID) +#define CXIP_CAPS (CXIP_DOM_CAPS | CXIP_EP_CAPS) +#define CXIP_MSG_ORDER (FI_ORDER_SAS | \ + FI_ORDER_WAW | \ + FI_ORDER_RMA_WAW | \ + FI_ORDER_ATOMIC_WAW | \ + FI_ORDER_ATOMIC_WAR | \ + FI_ORDER_ATOMIC_RAW | \ + FI_ORDER_ATOMIC_RAR) + +#define CXIP_EP_CQ_FLAGS \ + (FI_SEND | FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION | \ + FI_COLLECTIVE) +#define CXIP_EP_CNTR_FLAGS \ + (FI_SEND | FI_RECV | FI_READ | FI_WRITE | FI_REMOTE_READ | \ + FI_REMOTE_WRITE) + +#define CXIP_INJECT_SIZE C_MAX_IDC_PAYLOAD_UNR + +/* Max TX size of 16,384 translate to a 4MiB command queue buffer. */ +#define CXIP_MAX_TX_SIZE 16384U +#define CXIP_DEFAULT_TX_SIZE 512U + +/* Some LEs need to be used for internally by CXI provider. The user facing + * RX size should be updated to reflect this. + * + * Note: This value is an estimation and may be too high. + */ +#define CXI_PROV_LE_PER_EP 1024U + +/* Maximum number of LEs per endpoint. */ +#define LES_PER_EP_MAX 16384U + +#define CXIP_MAX_RX_SIZE (LES_PER_EP_MAX - CXI_PROV_LE_PER_EP) +#define CXIP_DEFAULT_RX_SIZE 512U + +#define CXIP_MAJOR_VERSION 0 +#define CXIP_MINOR_VERSION 1 +#define CXIP_PROV_VERSION FI_VERSION(CXIP_MAJOR_VERSION, \ + CXIP_MINOR_VERSION) +#define CXIP_FI_VERSION FI_VERSION(1, 20) +#define CXIP_WIRE_PROTO_VERSION 1 + +#define CXIP_COLL_MAX_CONCUR 8 +#define CXIP_COLL_MIN_RX_BUFS 8 +#define CXIP_COLL_MIN_RX_SIZE 4096 +#define CXIP_COLL_MIN_MULTI_RECV 64 +#define CXIP_COLL_MAX_DATA_SIZE 32 +#define CXIP_COLL_MAX_SEQNO (1 << 10) +// TODO adjust based on performance testing +#define CXIP_COLL_MIN_RETRY_USEC 1 +#define CXIP_COLL_MAX_RETRY_USEC 32000 +#define CXIP_COLL_MIN_TIMEOUT_USEC 1 +#define CXIP_COLL_MAX_TIMEOUT_USEC 32000 + +#define CXIP_REQ_BUF_HEADER_MAX_SIZE (sizeof(struct c_port_fab_hdr) + \ + sizeof(struct c_port_unrestricted_hdr)) +#define CXIP_REQ_BUF_HEADER_MIN_SIZE (sizeof(struct c_port_fab_hdr) + \ + sizeof(struct c_port_small_msg_hdr)) + +extern int s_page_size; +extern char cxip_prov_name[]; +extern struct fi_provider cxip_prov; +extern struct util_prov cxip_util_prov; + +extern int cxip_cq_def_sz; +extern int cxip_eq_def_sz; + +extern struct slist cxip_if_list; + +extern struct fi_fabric_attr cxip_fabric_attr; +extern struct fi_domain_attr cxip_domain_attr; +extern struct fi_ep_attr cxip_ep_attr; +extern struct fi_tx_attr cxip_tx_attr; +extern struct fi_rx_attr cxip_rx_attr; + +enum cxip_ats_mlock_mode { + CXIP_ATS_MLOCK_OFF, + CXIP_ATS_MLOCK_CACHE, + CXIP_ATS_MLOCK_ALL, +}; + +enum cxip_llring_mode { + CXIP_LLRING_NEVER, + CXIP_LLRING_IDLE, + CXIP_LLRING_ALWAYS, +}; + +enum cxip_ep_ptle_mode { + CXIP_PTLTE_HARDWARE_MODE, + CXIP_PTLTE_DEFAULT_MODE = CXIP_PTLTE_HARDWARE_MODE, + CXIP_PTLTE_SOFTWARE_MODE, + CXIP_PTLTE_HYBRID_MODE, +}; + +enum cxip_rdzv_proto { + CXIP_RDZV_PROTO_DEFAULT, /* unrestricted gets */ + CXIP_RDZV_PROTO_ALT_READ, /* restricted gets */ + CXIP_RDZV_PROTO_ALT_WRITE, /* restricted puts */ +}; + +const char *cxip_rdzv_proto_to_str(enum cxip_rdzv_proto proto); + +struct 
cxip_environment { + /* Translation */ + int odp; + int force_odp; + int ats; + int iotlb; + enum cxip_ats_mlock_mode ats_mlock_mode; + + /* Messaging */ + int fork_safe_requested; + enum cxip_ep_ptle_mode rx_match_mode; + int msg_offload; + int hybrid_preemptive; + int hybrid_recv_preemptive; + size_t rdzv_threshold; + size_t rdzv_get_min; + size_t rdzv_eager_size; + int rdzv_aligned_sw_rget; + int disable_non_inject_msg_idc; + int disable_host_register; + size_t oflow_buf_size; + size_t oflow_buf_min_posted; + size_t oflow_buf_max_cached; + size_t safe_devmem_copy_threshold; + size_t req_buf_size; + size_t req_buf_min_posted; + size_t req_buf_max_cached; + int sw_rx_tx_init_max; + int msg_lossless; + size_t default_cq_size; + size_t default_tx_size; + size_t default_rx_size; + int optimized_mrs; + int prov_key_cache; + int mr_match_events; + int disable_eq_hugetlb; + int zbcoll_radix; + + enum cxip_llring_mode llring_mode; + + int cq_policy; + + size_t default_vni; + + size_t eq_ack_batch_size; + int fc_retry_usec_delay; + size_t ctrl_rx_eq_max_size; + char *device_name; + size_t cq_fill_percent; + int enable_unrestricted_end_ro; + int rget_tc; + int cacheline_size; + + char *coll_job_id; + char *coll_job_step_id; + size_t coll_retry_usec; + size_t coll_timeout_usec; + char *coll_fabric_mgr_url; + char *coll_mcast_token; + size_t hwcoll_addrs_per_job; + size_t hwcoll_min_nodes; + int coll_use_dma_put; + + char hostname[255]; + char *telemetry; + int telemetry_rgid; + int disable_hmem_dev_register; + int ze_hmem_supported; + enum cxip_rdzv_proto rdzv_proto; + int enable_trig_op_limit; + int hybrid_posted_recv_preemptive; + int hybrid_unexpected_msg_preemptive; +}; + +extern struct cxip_environment cxip_env; + +static inline bool cxip_software_pte_allowed(void) +{ + return cxip_env.rx_match_mode != CXIP_PTLTE_HARDWARE_MODE; +} + +/* + * The CXI Provider Address format. + * + * A Cassini NIC Address and PID identify a libfabric Endpoint. Cassini + * borrows the name 'PID' from Portals. In CXI, a process can allocate several + * PID values. + * + * The PID value C_PID_ANY is reserved. When used, the library auto-assigns + * a free PID value. A PID value is assigned when network resources are + * allocated. Libfabric clients can achieve this by not specifying a 'service' + * in a call to fi_getinfo() or by not setting src_addr in the fi_info + * structure used to allocate an Endpoint. + */ +struct cxip_addr { + uint32_t pid : C_DFA_PID_BITS_MAX; + uint32_t nic : C_DFA_NIC_BITS; + uint16_t vni; +}; + +#define CXIP_ADDR_EQUAL(a, b) ((a).nic == (b).nic && (a).pid == (b).pid) + +/* + * A PID contains "pid_granule" logical endpoints. The PID granule is set per + * device and can be found in libCXI devinfo. The default pid_granule is 256. 
+ * These endpoints are partitioned by the provider for the following use: + * + * 0 RX Queue PtlTE + * 16 Collective PtlTE entry + * 17-116 Optimized write MR PtlTEs 0-99 + * For Client specified keys: + * 17-116 Non-cached optimized write MR PtlTEs 0-99 + * For Provider specified keys: + * 17-24 Cached optimized write MR PtlTEs 0-7 + * 25-116 Non-cached optimized write MR PtlTEs 8-99 + * 117 Standard client/provider cached/non-cached write MR + * PtlTE / Control messaging + * 127 Rendezvous destination write PtlTE + * 128-227 Optimized read MR PtlTEs 0-99 + * For Client specified keys: + * 128-227 Non-cached optimized read MR PtlTEs 0-99 + * For Provider specified keys: + * 128-135 Cached optimized read MR PtlTEs 0-7 + * 136-227 Non-cached optimized read MR PtlTEs 8-99 + * 228 Standard client or provider cached/non-cached read MR + * PtlTE + * 229-237 Rendezvous restricted read PtlTE (TODO consider merge with MR) + * 255 Rendezvous source PtlTE + * + * Note: Any logical endpoint within a PID granule that issues unrestricted Puts + * MUST be within the logical endpoint range 0 - 127 and unrestricted Gets MUST + * be within the logical endpoint range 128 - 255. + */ +#define CXIP_PTL_IDX_RXQ 0 +#define CXIP_PTL_IDX_WRITE_MR_OPT_BASE 17 +#define CXIP_PTL_IDX_READ_MR_OPT_BASE 128 +#define CXIP_PTL_IDX_MR_OPT_CNT 100 +#define CXIP_PTL_IDX_PROV_NUM_CACHE_IDX 8 +#define CXIP_PTL_IDX_PROV_MR_OPT_CNT \ + (CXIP_PTL_IDX_MR_OPT_CNT - CXIP_PTL_IDX_PROV_NUM_CACHE_IDX) + +/* Map non-cached optimized MR keys (client or FI_MR_PROV_KEY) + * to appropriate PTL index. + */ +#define CXIP_MR_PROV_KEY_MASK ((1ULL << 61) - 1) +#define CXIP_MR_PROV_KEY_ID_MASK ((1ULL << 16) - 1) +#define CXIP_MR_UNCACHED_KEY_TO_IDX(key) ((key) & CXIP_MR_PROV_KEY_ID_MASK) +#define CXIP_PTL_IDX_WRITE_MR_OPT(key) \ + (CXIP_PTL_IDX_WRITE_MR_OPT_BASE + \ + CXIP_MR_UNCACHED_KEY_TO_IDX(key)) +#define CXIP_PTL_IDX_READ_MR_OPT(key) \ + (CXIP_PTL_IDX_READ_MR_OPT_BASE + \ + CXIP_MR_UNCACHED_KEY_TO_IDX(key)) + +/* Map cached FI_MR_PROV_KEY optimized MR LAC to Index */ +#define CXIP_PTL_IDX_WRITE_PROV_CACHE_MR_OPT(lac) \ + (CXIP_PTL_IDX_WRITE_MR_OPT_BASE + (lac)) +#define CXIP_PTL_IDX_READ_PROV_CACHE_MR_OPT(lac) \ + (CXIP_PTL_IDX_READ_MR_OPT_BASE + (lac)) + +#define CXIP_PTL_IDX_WRITE_MR_STD 117 +#define CXIP_PTL_IDX_RDZV_DEST 127 +#define CXIP_PTL_IDX_COLL 6 +#define CXIP_PTL_IDX_CTRL CXIP_PTL_IDX_WRITE_MR_STD +#define CXIP_PTL_IDX_READ_MR_STD 228 +#define CXIP_PTL_IDX_RDZV_RESTRICTED_BASE 229 +#define CXIP_PTL_IDX_RDZV_RESTRICTED(lac) \ + (CXIP_PTL_IDX_RDZV_RESTRICTED_BASE + (lac)) + +#define CXIP_PTL_IDX_RDZV_SRC 255 + +/* The CXI provider supports both provider specified MR keys + * (FI_MR_PROV_KEY MR mode) and client specified keys on a per-domain + * basis. + * + * User specified keys: + * Hardware resources limit the number of active keys to 16 bits. + * Key size is 32-bit so there are only 64K unique keys. + * + * Provider specified keys: + * The key size is 64-bits and is separated from the MR hardware + * resources such that the associated MR can be cached if the + * following criteria are met: + * + * - The associated memory region is non-zero in length + * - The associated memory region mapping is cached + * - The MR is not bound to a counter + * + * Optimized caching is preferred by default. + * TODO: Fallback to standard optimized if PTE can not be allocated. + * + * FI_MR_PROV_KEY MR are associated with a unique domain wide + * 16-bit buffer ID, reducing the overhead of maintaining keys. 
+ * Provider keys should always be preferred over client keys + * unless well known keys are not exchanged between peers. + */ +#define CXIP_MR_KEY_SIZE sizeof(uint32_t) +#define CXIP_MR_KEY_MASK ((1ULL << (8 * CXIP_MR_KEY_SIZE)) - 1) +#define CXIP_MR_VALID_OFFSET_MASK ((1ULL << 56) - 1) + +/* For provider defined keys we define a 64 bit MR key that maps + * to provider required information. + */ +struct cxip_mr_key { + union { + /* Provider generated standard cached */ + struct { + uint64_t lac : 3; + uint64_t lac_off: 58; + uint64_t opt : 1; + uint64_t cached : 1; + uint64_t unused1: 1; + /* shares CXIP_CTRL_LE_TYPE_MR */ + }; + /* Client or Provider non-cached */ + struct { + uint64_t key : 61; + uint64_t unused2: 3; + /* Provider shares opt */ + /* Provider shares cached == 0 */ + /* Provider shares CXIP_CTRL_LE_TYPE_MR */ + }; + /* Provider Key Only */ + struct { + /* Non-cached key consists of unique MR ID and sequence + * number. The same MR ID can be used with sequence + * number to create 2^44 unique keys. That is, a + * single standard MR repeatedly created and destroyed + * every micro-second, would take months before + * it repeated. + */ + uint64_t id : 16; /* Unique - 64K MR */ + uint64_t seqnum : 44; /* Sequence with random seed */ + uint64_t events : 1; /* Requires event generation */ + uint64_t unused3: 2; + uint64_t is_prov: 1; + /* Overloads CXIP_CTRL_LE_TYPE_MR and must be cleared + * before appending MR LE or TX using in match bits. + */ + }; + uint64_t raw; + }; +}; + +#define CXIP_MR_PROV_KEY_SIZE sizeof(struct cxip_mr_key) +#define CXIP_NUM_CACHED_KEY_LE 8 + +struct cxip_domain; +struct cxip_mr_domain; +struct cxip_mr; + +/* CXI provider MR operations that are specific for the MR + * based on MR key type and caching. + */ +struct cxip_mr_util_ops { + bool is_cached; + int (*init_key)(struct cxip_mr *mr, uint64_t req_key); + int (*enable_opt)(struct cxip_mr *mr); + int (*disable_opt)(struct cxip_mr *mr); + int (*enable_std)(struct cxip_mr *mr); + int (*disable_std)(struct cxip_mr *mr); +}; + +struct cxip_ep_obj; + +/* + * cxip_ctrl_mr_cache_flush() - Flush LE associated with remote MR cache. + */ +void cxip_ctrl_mr_cache_flush(struct cxip_ep_obj *ep_obj); + +/* + * cxip_adjust_remote_offset() - Update address with the appropriate offset + * for key. + */ +static inline +uint64_t cxip_adjust_remote_offset(uint64_t *addr, uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.cached) { + *addr += cxip_key.lac_off; + if (*addr & ~CXIP_MR_VALID_OFFSET_MASK) + return -FI_EINVAL; + } + return FI_SUCCESS; +} + +int cxip_generic_mr_key_to_ptl_idx(struct cxip_domain *dom, + uint64_t key, bool write); +bool cxip_generic_is_mr_key_opt(uint64_t key); +bool cxip_generic_is_mr_key_events(uint64_t caps, uint64_t key); +bool cxip_generic_is_valid_mr_key(uint64_t key); + +/* Messaging Match Bit layout */ +#define CXIP_TX_ID_WIDTH 11 +#define CXIP_TAG_WIDTH 48 +#define CXIP_RDZV_ID_CMD_WIDTH 8 +#define CXIP_RDZV_ID_HIGH_WIDTH 7 +#define CXIP_TOTAL_RDZV_ID_WIDTH (CXIP_RDZV_ID_CMD_WIDTH + \ + CXIP_RDZV_ID_HIGH_WIDTH) +#define CXIP_TAG_MASK ((1UL << CXIP_TAG_WIDTH) - 1) + +/* Define several types of LEs */ +enum cxip_le_type { + CXIP_LE_TYPE_RX = 0, /* RX data LE */ + CXIP_LE_TYPE_ZBP, /* Zero-byte Put control message LE. Used to + * exchange data in the EQ header_data and + * match_bits fields. Unexpected headers are + * disabled. 
+ */ +}; + +enum cxip_ctrl_le_type { + CXIP_CTRL_LE_TYPE_MR = 0, /* Memory Region LE */ + CXIP_CTRL_LE_TYPE_CTRL_MSG, /* Control Message LE */ +}; + +enum cxip_ctrl_msg_type { + CXIP_CTRL_MSG_FC_NOTIFY = 0, + CXIP_CTRL_MSG_FC_RESUME, + CXIP_CTRL_MSG_ZB_DATA, +}; + +union cxip_match_bits { + struct { + uint64_t tag : CXIP_TAG_WIDTH; /* User tag value */ + uint64_t tx_id : CXIP_TX_ID_WIDTH; /* Prov. tracked ID */ + uint64_t cq_data : 1; /* Header data is valid */ + uint64_t tagged : 1; /* Tagged API */ + uint64_t match_comp : 1; /* Notify initiator on match */ + uint64_t rdzv_done : 1; /* Notify initiator when rdzv done */ + uint64_t le_type : 1; + }; + /* Rendezvous protocol request, overloads match_comp and rdzv_done + * to specify requested protocol. + */ + struct { + uint64_t pad0 : 61; + uint64_t rdzv_proto : 2; + uint64_t pad1 : 1; + }; + /* Split TX ID for rendezvous operations. */ + struct { + uint64_t pad2 : CXIP_TAG_WIDTH; /* User tag value */ + uint64_t rdzv_id_hi : CXIP_RDZV_ID_HIGH_WIDTH; + uint64_t rdzv_lac : 4; /* Rendezvous Get LAC */ + }; + struct { + uint64_t rdzv_id_lo : CXIP_RDZV_ID_CMD_WIDTH; + }; + /* Control LE match bit format for notify/resume */ + struct { + uint64_t txc_id : 8; + uint64_t rxc_id : 8; + uint64_t drops : 16; + uint64_t pad3 : 29; + uint64_t ctrl_msg_type: 2; + uint64_t ctrl_le_type : 1; + }; + /* Control LE match bit format for zbcollectives */ + struct { + uint64_t zb_data :61; + uint64_t zb_pad : 3; + /* shares ctrl_le_type == CXIP_CTRL_LE_TYPE_CTRL_MSG + * shares ctrl_msg_type == CXIP_CTRL_MSG_ZB_BCAST + */ + }; + /* Control LE match bit format for cached MR */ + struct { + uint64_t mr_lac : 3; + uint64_t mr_lac_off : 58; + uint64_t mr_opt : 1; + uint64_t mr_cached : 1; + uint64_t mr_unused : 1; + /* shares ctrl_le_type == CXIP_CTRL_LE_TYPE_MR */ + }; + struct { + uint64_t mr_key : 61; + uint64_t mr_pad : 3; + /* shares mr_opt + * shares mr_cached == 0 + * shares ctrl_le_type == CXIP_CTRL_LE_TYPE_MR + */ + }; + struct { + uint64_t unused2 : 63; + uint64_t is_prov : 1; + /* Indicates provider generated key and shares ctrl_le_type == + * CXIP_CTRL_LE_TYPE_MR so it must be cleared before matching. + */ + }; + uint64_t raw; +}; +#define CXIP_IS_PROV_MR_KEY_BIT (1ULL << 63) +#define CXIP_KEY_MATCH_BITS(key) ((key) & ~CXIP_IS_PROV_MR_KEY_BIT) + +/* libcxi Wrapper Structures */ + +#define CXI_PLATFORM_ASIC 0 +#define CXI_PLATFORM_NETSIM 1 +#define CXI_PLATFORM_Z1 2 +#define CXI_PLATFORM_FPGA 3 + +/* + * CXI Device wrapper + * + * There will be one of these for every local Cassini device on the node. + */ +struct cxip_if { + struct slist_entry if_entry; + + /* Device description */ + struct cxil_devinfo *info; + int speed; + int link; + + struct cxil_dev *dev; + + /* PtlTEs (searched during state change events) */ + struct dlist_entry ptes; + + ofi_atomic32_t ref; + ofi_spin_t lock; +}; + +/* + * CXI communication profile wrapper. + * + * The wrapper is used to remap user requested traffic class to a communication + * profile which actually can be allocated. + */ +struct cxip_remap_cp { + struct dlist_entry remap_entry; + struct cxi_cp remap_cp; + struct cxi_cp *hw_cp; +}; + +/* + * CXI Logical Network Interface (LNI) wrapper + * + * An LNI is a container used allocate resources from a NIC. + */ +struct cxip_lni { + struct cxip_if *iface; + struct cxil_lni *lni; + + /* Hardware communication profiles */ + struct cxi_cp *hw_cps[16]; + int n_cps; + + /* Software remapped communication profiles. 
*/ + struct dlist_entry remap_cps; + + ofi_spin_t lock; +}; + +/* A portals table define a network endpoint address. The endpoint address is + * a {NIC + PID} and this can be configured against multiple VNIs + */ +struct cxip_portals_table { + struct cxip_lni *lni; + uint32_t pid; + struct cxil_domain **doms; + size_t doms_count; +}; + +int cxip_portals_table_alloc(struct cxip_lni *lni, uint16_t *vni, + size_t vni_count, uint32_t pid, + struct cxip_portals_table **ptable); +void cxip_portals_table_free(struct cxip_portals_table *ptable); + +struct cxip_pte_map_entry { + struct dlist_entry entry; + struct cxil_pte_map *map; +}; + +/* + * CXI Portal Table Entry (PtlTE) wrapper + * + * Represents PtlTE mapped in a CXI domain. + */ +struct cxip_pte { + struct dlist_entry pte_entry; + struct cxip_portals_table *ptable; + struct cxil_pte *pte; + enum c_ptlte_state state; + struct dlist_entry map_list; + + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event); + void *ctx; +}; + +/* + * CXI Command Queue wrapper + */ +struct cxip_cmdq { + struct cxi_cq *dev_cmdq; + struct c_cstate_cmd c_state; + enum cxip_llring_mode llring_mode; + + struct cxi_cp *cur_cp; + struct cxip_lni *lni; +}; + +int cxip_cmdq_emit_idc_put(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags); +int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, + uint64_t flags); +int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); +int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, + uint64_t flags, bool fetching, bool flush); + +/* OFI Provider Structures */ + +/* + * CXI Provider Fabric object + */ +struct cxip_fabric { + struct util_fabric util_fabric; + ofi_atomic32_t ref; +}; + +/* + * CXI Provider Memory Descriptor + */ +struct cxip_md { + struct cxip_domain *dom; + struct cxi_md *md; + struct ofi_mr_info info; + uint64_t handle; + bool handle_valid; + bool cached; +}; + +#define CXIP_MR_DOMAIN_HT_BUCKETS 16 + +struct cxip_mr_domain { + struct dlist_entry buckets[CXIP_MR_DOMAIN_HT_BUCKETS]; + ofi_spin_t lock; +}; + +void cxip_mr_domain_init(struct cxip_mr_domain *mr_domain); +void cxip_mr_domain_fini(struct cxip_mr_domain *mr_domain); + +struct cxip_telemetry { + struct cxip_domain *dom; + + /* List of telemetry entries to being monitored. */ + struct dlist_entry telemetry_list; +}; + +void cxip_telemetry_dump_delta(struct cxip_telemetry *telemetry); +void cxip_telemetry_free(struct cxip_telemetry *telemetry); +int cxip_telemetry_alloc(struct cxip_domain *dom, + struct cxip_telemetry **telemetry); + +#define TELEMETRY_ENTRY_NAME_SIZE 64U + +struct cxip_telemetry_entry { + struct cxip_telemetry *telemetry; + struct dlist_entry telemetry_entry; + + /* Telemetry name. */ + char name[TELEMETRY_ENTRY_NAME_SIZE]; + + /* Telemetry value. 
*/ + unsigned long value; +}; + +/* + * CXI Provider Domain object + */ +struct cxip_domain { + struct util_domain util_domain; + struct cxip_fabric *fab; + ofi_spin_t lock; + ofi_atomic32_t ref; + + uint32_t tclass; + + struct cxip_eq *eq; //unused + struct cxip_eq *mr_eq; //unused + + /* Assigned NIC address */ + uint32_t nic_addr; + + /* Device info */ + struct cxip_if *iface; + + /* Device partition */ + struct cxip_lni *lni; + + /* Trigger and CT support */ + struct cxip_cmdq *trig_cmdq; + struct ofi_genlock trig_cmdq_lock; + bool cntr_init; + + /* Provider generated RKEYs, else client */ + bool is_prov_key; + + /* Can disable caching of provider generated RKEYs */ + bool prov_key_cache; + + /* Provider generated RKEYs optimized MR disablement/enablement */ + bool optimized_mrs; + + /* Enable MR match event counting enables a more robust + * MR when using FI_MR_PROV_KEY. It disables hardware cached + * MR keys and ensures memory backing a MR cannot be + * remotely accessed even if that memory remains in the + * libfabric MR cache. + */ + bool mr_match_events; + + /* Domain wide MR resources. + * Req IDs are control buffer IDs to map MR or MR cache to an LE. + * MR IDs are used by non-cached provider key MR to decouple the + * MR and Req ID, and do not map directly to the MR LE. + */ + ofi_spin_t ctrl_id_lock; + struct indexer req_ids; + struct indexer mr_ids; + + /* If FI_MR_PROV_KEY is not cached, keys include a sequence number + * to reduce the likelyhood of a stale key being used to access + * a recycled MR key. + */ + uint32_t prov_key_seqnum; + + /* Translation cache */ + struct ofi_mr_cache iomm; + bool odp; + bool ats; + bool hmem; + + /* ATS translation support */ + struct cxip_md scalable_md; + bool scalable_iomm; + bool rocr_dev_mem_only; + + /* Domain state */ + bool enabled; + + /* List of allocated resources used for deferred work queue processing. + */ + struct dlist_entry txc_list; + struct dlist_entry cntr_list; + struct dlist_entry cq_list; + + struct fi_hmem_override_ops hmem_ops; + bool hybrid_mr_desc; + + /* Container of in-use MRs against this domain. */ + struct cxip_mr_domain mr_domain; + + /* Counters collected for the duration of the domain existence. */ + struct cxip_telemetry *telemetry; + + /* NIC AMO operation which is remapped to a PCIe operation. */ + int amo_remap_to_pcie_fadd; + + /* Maximum number of triggered operations configured for the service + * ID. + */ + int max_trig_op_in_use; + sem_t *trig_op_lock; + + /* Domain has been configured with FI_AV_AUTH_KEY. */ + bool av_auth_key; + + /* This is only valid if FI_AV_AUTH_KEY is false. */ + struct cxi_auth_key auth_key; + + /* Maximum number of auth keys requested by user. */ + size_t auth_key_entry_max; + + /* Domain has been configured with FI_AV_USER_ID. */ + bool av_user_id; +}; + +static inline bool cxip_domain_mr_cache_enabled(struct cxip_domain *dom) +{ + return dom->iomm.domain == &dom->util_domain; +} + +static inline bool cxip_domain_mr_cache_iface_enabled(struct cxip_domain *dom, + enum fi_hmem_iface iface) +{ + return cxip_domain_mr_cache_enabled(dom) && dom->iomm.monitors[iface]; +} + +int cxip_domain_valid_vni(struct cxip_domain *dom, unsigned int vni); + +/* This structure implies knowledge about the breakdown of the NIC address, + * which is taken from the AMA, that the provider does not know in a flexible + * way. However, the domain fi_open_ops() API includes a topology function + * that requires knowledge of the address breakdown into topology components. 
+ * TODO: Research a less restricted way to get this information. + */ +#define CXIP_ADDR_PORT_BITS 6 +#define CXIP_ADDR_SWITCH_BITS 5 +#define CXIP_ADDR_GROUP_BITS 9 +#define CXIP_ADDR_FATTREE_PORT_BITS 6 +#define CXIP_ADDR_FATTREE_SWITCH_BITS 14 + +struct cxip_topo_addr { + union { + uint32_t addr; + struct { + uint32_t port_num:CXIP_ADDR_PORT_BITS; + uint32_t switch_num:CXIP_ADDR_SWITCH_BITS; + uint32_t group_num:CXIP_ADDR_GROUP_BITS; + } dragonfly; + struct { + uint32_t port_num:CXIP_ADDR_FATTREE_PORT_BITS; + uint32_t switch_num:CXIP_ADDR_FATTREE_SWITCH_BITS; + } fat_tree; + }; +}; + +static inline ssize_t +cxip_copy_to_hmem_iov(struct cxip_domain *domain, enum fi_hmem_iface hmem_iface, + uint64_t device, const struct iovec *hmem_iov, + size_t hmem_iov_count, uint64_t hmem_iov_offset, + const void *src, size_t size) +{ + return domain->hmem_ops.copy_to_hmem_iov(hmem_iface, device, hmem_iov, + hmem_iov_count, + hmem_iov_offset, src, size); +} + +/* + * Event Queue + * + * libfabric fi_eq implementation. + * + * Created in cxip_eq_open(). + */ +struct cxip_eq { + struct util_eq util_eq; + struct fi_eq_attr attr; + struct dlist_entry ep_list; + ofi_mutex_t list_lock; +}; + +#define CXIP_EQ_MAP_FLAGS \ + (CXI_MAP_WRITE | CXI_MAP_PIN | CXI_MAP_IOVA_ALLOC) + +/* + * RMA request + * + * Support structures, accumulated in a union. + */ +struct cxip_req_rma { + struct cxip_txc *txc; + struct cxip_md *local_md; // RMA target buffer + void *ibuf; +}; + +struct cxip_req_amo { + struct cxip_txc *txc; + struct cxip_md *result_md; + struct cxip_md *oper1_md; + char result[16]; + char oper1[16]; + bool tmp_result; + bool tmp_oper1; + void *ibuf; + bool fetching_amo_flush; + uint8_t fetching_amo_flush_event_count; + unsigned int fetching_amo_flush_event_rc; + struct cxip_cntr *fetching_amo_flush_cntr; +}; + +/* Used with receive request to maintain state associated + * with MQD support for dumping unexpected messages. + */ +struct cxip_ux_dump_state { + bool done; + + size_t max_count; /* Number entries/src_addr provided */ + size_t ret_count; /* Number of UX entries returned */ + size_t ux_count; /* Total UX entries available */ + + struct fi_cq_tagged_entry *entry; + fi_addr_t *src_addr; +}; + +struct cxip_req_recv { + /* Receive parameters */ + struct dlist_entry rxc_entry; + struct cxip_rxc *rxc; // receive context + struct cxip_cntr *cntr; + void *recv_buf; // local receive buffer + struct cxip_md *recv_md; // local receive MD + uint32_t ulen; // User buffer length + bool tagged; + uint64_t tag; + uint64_t ignore; + uint32_t match_id; + uint64_t flags; + + /* FI_CLAIM work around to hold UX remote offsets for duration of + * H/W UX entry matching and deletion. Array of 8-byte unexpected + * headers remote offsets, and current remote offset used when + * processing search results to match remote offsets. 
+ */ + uint64_t *ule_offsets; + uint64_t ule_offset; + unsigned int num_ule_offsets; + unsigned int cur_ule_offsets; + bool offset_found; + + /* UX list dump state */ + struct cxip_ux_dump_state *ux_dump; + + /* Control info */ + int rc; // DMA return code + uint32_t rlen; // Send length + uint64_t oflow_start; // Overflow buffer address + uint16_t vni; // VNI operation came in on + uint32_t initiator; // DMA initiator address + uint32_t rdzv_id; // DMA initiator rendezvous ID + uint8_t rdzv_lac; // Rendezvous source LAC + bool done_notify; // Must send done notification + enum cxip_rdzv_proto rdzv_proto; + int rdzv_events; // Processed rdzv event count + enum c_event_type rdzv_event_types[4]; + uint32_t rdzv_initiator; // Rendezvous initiator used for mrecvs + uint32_t rget_nic; + uint32_t rget_pid; + bool software_list; // Appended to HW or SW + bool canceled; // Request canceled? + bool unlinked; + bool multi_recv; + bool tgt_event; + uint64_t start_offset; + uint64_t mrecv_bytes; + uint64_t mrecv_unlink_bytes; + bool auto_unlinked; + bool hw_offloaded; + struct cxip_req *parent; + struct dlist_entry children; + uint64_t src_offset; + uint16_t rdzv_mlen; +}; + +struct cxip_req_send { + /* Send parameters */ + struct cxip_txc *txc; + struct cxip_cntr *cntr; + const void *buf; // local send buffer + size_t len; // request length + struct cxip_md *send_md; // send buffer memory descriptor + struct cxip_addr caddr; + fi_addr_t dest_addr; + bool tagged; + uint32_t tclass; + uint64_t tag; + uint64_t data; + uint64_t flags; + void *ibuf; + + /* Control info */ + struct dlist_entry txc_entry; + struct cxip_fc_peer *fc_peer; + union { + int rdzv_id; // SW RDZV ID for long messages + int tx_id; + }; + int rc; // DMA return code + int rdzv_send_events; // Processed event count +}; + +struct cxip_req_rdzv_src { + struct dlist_entry list; + struct cxip_txc *txc; + uint32_t lac; + int rc; +}; + +struct cxip_req_search { + struct cxip_rxc *rxc; + bool complete; + int puts_pending; +}; + +struct cxip_req_coll { + struct cxip_coll_pte *coll_pte; + struct cxip_coll_buf *coll_buf; + uint32_t mrecv_space; + size_t hw_req_len; + bool isred; + enum c_return_code cxi_rc; +}; + +enum cxip_req_type { + CXIP_REQ_RMA, + CXIP_REQ_AMO, + CXIP_REQ_OFLOW, + CXIP_REQ_RECV, + CXIP_REQ_SEND, + CXIP_REQ_RDZV_SRC, + CXIP_REQ_SEARCH, + CXIP_REQ_COLL, + CXIP_REQ_RBUF, +}; + +/* + * Async Request + * + * Support structure. + * + * Created in cxip_cq_req_alloc(). + * + * This implements an async-request/callback mechanism. It uses the libfabric + * utility pool, which provides a pool of reusable memory objects that supports + * a fast lookup through the req_id index value, and can be bound to a CQ. + * + * The request is allocated and bound to the CQ, and then the command is + * issued. When the completion queue signals completion, this request is found, + * and the callback function is called. + */ +struct cxip_req { + /* Control info */ + struct dlist_entry evtq_entry; + void *req_ctx; + struct cxip_cq *cq; // request CQ + struct cxip_evtq *evtq; // request event queue + int req_id; // fast lookup in index table + int (*cb)(struct cxip_req *req, const union c_event *evt); + // completion event callback + bool discard; + + /* Triggered related fields. 
*/ + bool triggered; + uint64_t trig_thresh; + struct cxip_cntr *trig_cntr; + + /* CQ event fields, set according to fi_cq.3 + * - set by provider + * - returned to user in completion event + */ + uint64_t context; + uint64_t flags; + uint64_t data_len; + uint64_t buf; + uint64_t data; + uint64_t tag; + fi_addr_t addr; + + /* Request parameters */ + enum cxip_req_type type; + union { + struct cxip_req_rma rma; + struct cxip_req_amo amo; + struct cxip_req_recv recv; + struct cxip_req_send send; + struct cxip_req_rdzv_src rdzv_src; + struct cxip_req_search search; + struct cxip_req_coll coll; + }; +}; + +static inline bool cxip_is_trig_req(struct cxip_req *req) +{ + return req->trig_cntr != NULL; +} + +struct cxip_ctrl_req_mr { + struct cxip_mr *mr; +}; + +struct cxip_ctrl_send { + uint32_t nic_addr; + uint32_t pid; + union cxip_match_bits mb; +}; + +struct cxip_ctrl_req { + struct dlist_entry ep_entry; + struct cxip_ep_obj *ep_obj; + int req_id; + int (*cb)(struct cxip_ctrl_req *req, const union c_event *evt); + + union { + struct cxip_ctrl_req_mr mr; + struct cxip_ctrl_send send; + }; +}; + +struct cxip_mr_lac_cache { + /* MR referencing the associated MR cache LE, can only + * be flushed if reference count is 0. + */ + ofi_atomic32_t ref; + union cxip_match_bits mb; + struct cxip_ctrl_req *ctrl_req; +}; + +struct cxip_fc_peer { + struct dlist_entry txc_entry; + struct cxip_txc *txc; + struct cxip_ctrl_req req; + struct cxip_addr caddr; + struct dlist_entry msg_queue; + uint16_t pending; + uint16_t dropped; + uint16_t pending_acks; + bool replayed; + unsigned int retry_count; +}; + +struct cxip_fc_drops { + struct dlist_entry rxc_entry; + struct cxip_rxc *rxc; + struct cxip_ctrl_req req; + uint32_t nic_addr; + uint32_t pid; + uint16_t drops; + unsigned int retry_count; +}; + +/* Completion queue specific wrapper around CXI event queue. */ +struct cxip_cq_eq { + struct cxi_eq *eq; + void *buf; + size_t len; + struct cxi_md *md; + bool mmap; + unsigned int unacked_events; + struct c_eq_status prev_eq_status; + bool eq_saturated; +}; + +struct cxip_evtq { + struct cxi_eq *eq; + void *buf; + size_t len; + struct cxi_md *md; + bool mmap; + unsigned int unacked_events; + unsigned int ack_batch_size; + struct c_eq_status prev_eq_status; + bool eq_saturated; + + /* Point back to CQ */ + struct cxip_cq *cq; + + /* Protected with ep_ob->lock */ + struct ofi_bufpool *req_pool; + struct indexer req_table; + struct dlist_entry req_list; +}; + +/* + * CXI Libfbric software completion queue + */ +struct cxip_cq { + struct util_cq util_cq; + struct fi_cq_attr attr; + + /* Implement our own CQ ep_list_lock since common code util_cq + * implementation is a mutex and can not be optimized. This lock + * is always taken walking the CQ EP, but can be optimized to no-op. + */ + struct ofi_genlock ep_list_lock; + + /* Internal CXI wait object allocated only if required. */ + struct cxil_wait_obj *priv_wait; + + /* CXI specific fields. 
*/ + struct cxip_domain *domain; + unsigned int ack_batch_size; + struct dlist_entry dom_entry; +}; + +static inline uint16_t cxip_evtq_eqn(struct cxip_evtq *evtq) +{ + return evtq->eq->eqn; +} + +/* + * CXI libfabric completion counter + */ +struct cxip_cntr { + struct fid_cntr cntr_fid; + struct cxip_domain *domain; // parent domain + ofi_atomic32_t ref; + struct fi_cntr_attr attr; // copy of user or default attributes + struct fid_wait *wait; + /* Contexts to which counter is bound */ + struct dlist_entry ctx_list; + + ofi_mutex_t lock; + + struct cxi_ct *ct; + struct c_ct_writeback *wb; + uint64_t wb_device; + enum fi_hmem_iface wb_iface; + uint64_t wb_handle; + bool wb_handle_valid; + struct c_ct_writeback lwb; + + struct dlist_entry dom_entry; +}; + +struct cxip_ux_send { + struct dlist_entry rxc_entry; + struct cxip_req *req; + union c_event put_ev; + bool claimed; /* Reserved with FI_PEEK | FI_CLAIM */ +}; + +/* Key used to associate PUT and PUT_OVERFLOW events */ +union cxip_def_event_key { + struct { + uint64_t initiator : 32; + uint64_t rdzv_id : 15; + uint64_t pad0 : 16; + uint64_t rdzv : 1; + }; + struct { + uint64_t start_addr : 57; + uint64_t pad1 : 7; + }; + uint64_t raw; +}; + +struct cxip_deferred_event { + struct dlist_entry rxc_entry; + union cxip_def_event_key key; + struct cxip_req *req; + union c_event ev; + uint64_t mrecv_start; + uint32_t mrecv_len; + + struct cxip_ux_send *ux_send; +}; + +/* A very specific (non-generic) hash table is used to map + * deferred CXI events to associate PUT and PUT_OVERFLOW events. + * Hash entries are added and removed at a high rate and the + * overhead of generic implementations is insufficient. + */ +#define CXIP_DEF_EVENT_HT_BUCKETS 256 + +struct def_event_ht { + struct dlist_entry bh[CXIP_DEF_EVENT_HT_BUCKETS]; +}; + +/* + * Zero-buffer collectives. 
+ */ +#define ZB_NOSIM -1 +#define ZB_ALLSIM -2 + +struct cxip_zbcoll_obj; +typedef void (*zbcomplete_t)(struct cxip_zbcoll_obj *zb, void *usrptr); + +struct cxip_zbcoll_cb_obj { + zbcomplete_t usrfunc; // callback function + void *usrptr; // callback data +}; + +/* Used to track state for one or more zbcoll endpoints */ +struct cxip_zbcoll_state { + struct cxip_zbcoll_obj *zb; // backpointer to zbcoll_obj + uint64_t *dataptr; // user-supplied target + uint64_t dataval; // collective data + int num_relatives; // number of nearest relatives + int *relatives; // nearest relative indices + int contribs; // contribution count + int grp_rank; // local rank within group +}; + +/* Used to track concurrent zbcoll operations */ +struct cxip_zbcoll_obj { + struct dlist_entry ready_link; // link to zb_coll ready_list + struct cxip_ep_obj *ep_obj; // backpointer to endpoint + struct cxip_zbcoll_state *state;// state array + struct cxip_addr *caddrs; // cxip addresses in collective + int num_caddrs; // number of cxip addresses + zbcomplete_t userfunc; // completion callback function + void *userptr; // completion callback data + uint64_t *grpmskp; // pointer to global group mask + uint32_t *shuffle; // TEST shuffle array + int simcount; // TEST count of states + int simrank; // TEST simulated rank + int simref; // TEST zb0 reference count + int busy; // serialize collectives in zb + int grpid; // zb collective grpid + int error; // error code + int reduce; // set to report reduction data +}; + +/* zbcoll extension to struct cxip_ep_obj */ +struct cxip_ep_zbcoll_obj { + struct dlist_entry ready_list; // zbcoll ops ready to advance + struct cxip_zbcoll_obj **grptbl;// group lookup table + uint64_t grpmsk; // mask of used grptbl entries + int refcnt; // grptbl reference count + bool disable; // low level tests + ofi_spin_t lock; // group ID negotiation lock + ofi_atomic32_t dsc_count; // cumulative RCV discard count + ofi_atomic32_t err_count; // cumulative ACK error count + ofi_atomic32_t ack_count; // cumulative ACK success count + ofi_atomic32_t rcv_count; // cumulative RCV success count +}; + +/* + * Collectives context. + * + * Extension to cxip_ep_obj for collectives. + * + * Initialized in cxip_coll_init() during EP creation. + */ +struct cxip_ep_coll_obj { + struct index_map mcast_map; // mc address -> object + struct dlist_entry mc_list; // list of mcast addresses + struct cxip_coll_pte *coll_pte; // PTE extensions + struct dlist_ts sched_list; // scheduled actions + struct cxip_cmdq *rx_cmdq; // shared with STD EP + struct cxip_cmdq *tx_cmdq; // shared with STD EP + struct cxip_cntr *rx_cntr; // shared with STD EP + struct cxip_cntr *tx_cntr; // shared with STD EP + struct cxip_evtq *rx_evtq; // shared with STD EP + struct cxip_evtq *tx_evtq; // shared with STD EP + struct cxip_eq *eq; // shared with STD EP + ofi_atomic32_t num_mc; // count of MC objects + ofi_atomic32_t join_cnt; // advanced on every join + size_t min_multi_recv; // trigger value to rotate bufs + size_t buffer_size; // size of receive buffers + size_t buffer_count; // count of receive buffers + bool join_busy; // serialize joins on a node + bool is_hwroot; // set if ep is hw_root + bool enabled; // enabled +}; + +/* Receive context state machine. + * TODO: Handle unexpected RMA. + */ +enum cxip_rxc_state { + /* Initial state of an RXC. All user posted receives are rejected until + * the RXC has been enabled. + * + * Note that an RXC can be transitioned from any state into + * RXC_DISABLED. 
+ * + * Validate state changes: + * RXC_ENABLED: User has successfully enabled the RXC. + * RXC_ENABLED_SOFTWARE: User has successfully initialized the RXC + * in a software only RX matching mode. + */ + RXC_DISABLED = 0, + + /* User posted receives are matched against the software unexpected + * list before being offloaded to hardware. Hardware matches against + * the corresponding PtlTE priority and overflow list. + * + * Validate state changes: + * RXC_ONLOAD_FLOW_CONTROL: Several scenarios can initiate this state + * change. + * 1. Hardware fails to allocate an LE for an unexpected message + * or a priority list LE append fails, and hybrid mode is not + * enabled. Hardware transitions the PtlTE from enabled to disabled. + * 2. Hardware fails to allocate an LE during an overflow list + * append. The PtlTE remains in the enabled state but appends to + * the overflow list are disabled. Software manually disables + * the PtlTE. + * 3. Hardware fails to successfully match on the overflow list. + * Hardware automatically transitions the PtlTE from enabled to + * disabled. + * RXC_ONLOAD_FLOW_CONTROL_REENABLE: Several scenarios can initiate + * it this state change: + * 1. The hardware EQ is full, hardware transitions the PtlTE from + * enabled/software managed to disabled to recover drops, but it + * can re-enable if an LE resource is not recovered. + * 2. Running "hardware" RX match mode and matching failed because + * the overflow list buffers were full. Hardware transitions the + * PtlTE from enabled to disabled. The overflow list must be + * replenished and processing can continue if an LE resource is not + * recovered. + * 3. Running "hybrid" or "software" RX match mode and a message + * is received, but there is not a buffer available on the request + * list. Hardware transitions the PtlTE from software managed to + * disabled. The request list must be replenished and processing + * can continue if an LE resource is not recovered. + * RXC_PENDING_PTLTE_SOFTWARE_MANAGED: When the provider is configured + * to run in "hybrid" RX match mode and hardware fails to allocate an + * LE for an unexpected message match or an priority list append fails. + * Hardware will automatically transition the PtlTE from enabled to + * software managed and onload of UX messages will be initiated. + */ + RXC_ENABLED, + + /* The NIC has initiated a transition to software managed EP matching. + * + * Software must onload/reonload the hardware unexpected list while + * creating a pending unexpected list from entries received on the PtlTE + * request list. Any in flight appends will fail and be added to + * a receive replay list, further attempts to post receive operations + * will return -FI_EAGAIN. When onloading completes, the pending + * UX list is appended to the onloaded UX list and then failed appends + * are replayed prior to enabling the posting of receive operations. + * + * Validate state changes: + * RXC_ENABLED_SOFTWARE: The HW to SW transition onloading has + * completed and the onloaded and pending request UX list have been + * combined. + */ + RXC_PENDING_PTLTE_SOFTWARE_MANAGED, + + /* Executing as a software managed PtlTE either due to hybrid + * transition from hardware or initial startup in software + * RX matching mode. + * + * Validate state changes: + * RXC_PENDING_PTLTE_HARDWARE: TODO: When able, software may + * initiate a transition from software managed mode back to + * fully offloaded operation. 
+ * RXC_ONLODAD_FLOW_CONTROL_REENABLE: Hardware was unable to match + * on the request list or the EQ is full. Hardware has disabled the + * PtlTE initiating flow control. Operation can continue if LE + * resources are not recovered as long as request buffers can be + * replenished. + */ + RXC_ENABLED_SOFTWARE, + + /* TODO: Hybrid RX match mode PtlTE is transitioning from software + * managed operation back to fully offloaded operation. + * + * Validate state changes: + * RXC_ENABLED: Hybrid software managed PtlTE successfully + * transitions back to fully offloaded operation. + * RXC_ENABLED_SOFTWARE: Hybrid software managed PtlTE was + * not able to transition to fully offloaded operation. + */ + RXC_PENDING_PTLTE_HARDWARE, + + /* Software has encountered a condition which requires manual transition + * of the PtlTE into disable. This state change occurs when a posted + * receive could not be appended due to LE exhaustion and software + * managed EP PtlTE operation has been disabled or is not possible. + * + * Validate state changes: + * RXC_ONLOAD_FLOW_CONTROL: PtlTE disabled event has successfully been + * received and onloading can begin. + */ + RXC_PENDING_PTLTE_DISABLE, + + /* Flow control has occurred and the PtlTE is disabled. Software is + * in the process of onloading the hardware unexpected headers to free + * up LEs. User posted receives are matched against the software + * unexpected list. If a match is not found on the software unexpected + * list, -FI_EAGAIN is returned to the user. Hardware matching is + * disabled. + * + * Validate state changes: + * RXC_ONLOAD_FLOW_CONTROL_REENABLE: An unexpected list entry matched + * a user posted receive, the search and delete command free a + * unexpected list entry, or a transition to software managed EP is + * occuring. + */ + RXC_ONLOAD_FLOW_CONTROL, + + /* PtlTE is in the same state as RXC_ONLOAD_FLOW_CONTROL, but the RXC + * should attempt to be re-enabled. + * + * Validate state changes: + * RXC_FLOW_CONTROL: Onloading of the unexpected headers has completed. + */ + RXC_ONLOAD_FLOW_CONTROL_REENABLE, + + /* Software is performing sideband communication to recover the dropped + * messages. User posted receives are matched against the software + * unexpected list. If a match is not found on the software unexpected + * list, -FI_EAGAIN is returned to the user. Hardware matching is + * disabled. + * + * If an append fails due to RC_NO_SPACE while in the RXC_FLOW_CONTROL + * state, hardware LEs are exhausted and no more LEs can be freed by + * onloading unexpected headers into software. This is a fatal event + * which requires software endpoint mode to workaround. + * + * Validate state changes: + * RXC_ENABLED: Sideband communication is complete and PtlTE is + * successfully re-enabled. + * RXC_SOFTWARE_MANAGED: When executing in "hybrid" or "software" + * RX match mode and processing has requested to re-enable as a + * software managed EP. + */ + RXC_FLOW_CONTROL, +}; + +#define CXIP_COUNTER_BUCKETS 31U +#define CXIP_BUCKET_MAX (CXIP_COUNTER_BUCKETS - 1) +#define CXIP_LIST_COUNTS 3U + +struct cxip_msg_counters { + /* Histogram counting the number of messages based on priority, buffer + * type (HMEM), and message size. 
+ */ + ofi_atomic32_t msg_count[CXIP_LIST_COUNTS][OFI_HMEM_MAX][CXIP_COUNTER_BUCKETS]; +}; + +/* Returns the most significant bit set (indexed from 1 - the LSB) */ +static inline int fls64(uint64_t x) +{ + if (!x) + return 0; + + return (sizeof(x) * 8) - __builtin_clzl(x); +} + +static inline void cxip_msg_counters_init(struct cxip_msg_counters *cntrs) +{ + int i; + int j; + int k; + + for (i = 0; i < CXIP_LIST_COUNTS; i++) { + for (j = 0; j < OFI_HMEM_MAX; j++) { + for (k = 0; k < CXIP_COUNTER_BUCKETS; k++) + ofi_atomic_initialize32(&cntrs->msg_count[i][j][k], 0); + } + } +} + +static inline void +cxip_msg_counters_msg_record(struct cxip_msg_counters *cntrs, + enum c_ptl_list list, enum fi_hmem_iface buf_type, + size_t msg_size) +{ + unsigned int bucket; + + /* Buckets to bytes + * Bucket 0: 0 bytes + * Bucket 1: 1 byte + * Bucket 2: 2 bytes + * Bucket 3: 4 bytes + * ... + * Bucket CXIP_BUCKET_MAX: (1 << (CXIP_BUCKET_MAX - 1)) + */ + + /* Round size up to the nearest power of 2. */ + bucket = fls64(msg_size); + if ((1ULL << bucket) < msg_size) + bucket++; + + bucket = MIN(CXIP_BUCKET_MAX, bucket); + + ofi_atomic_add32(&cntrs->msg_count[list][buf_type][bucket], 1); +} + +/* + * The default for the number of SW initiated TX operation that may + * be initiated by RX processing and be outstanding. This has no + * impact on hardware initiated rendezvous gets. This value can be + * adjusted if necessary with FI_CXI_SW_RX_TX_INIT_MAX=#. + */ +#define CXIP_SW_RX_TX_INIT_MAX_DEFAULT 1024 +#define CXIP_SW_RX_TX_INIT_MIN 64 + +/* If a restricted rendezvous protocol notify done message + * cannot be delivered due to EQ full, delay before retrying. + */ +#define CXIP_DONE_NOTIFY_RETRY_DELAY_US 100 +/* + * Endpoint object receive context + */ +struct cxip_rxc { + void *context; + struct cxip_cq *recv_cq; + struct cxip_cntr *recv_cntr; + + struct cxip_ep_obj *ep_obj; // parent EP object + struct cxip_domain *domain; // parent domain + uint8_t pid_bits; + + struct dlist_entry ep_list; // contains EPs using shared context + + struct fi_rx_attr attr; + bool selective_completion; + bool sw_ep_only; + + struct cxip_evtq rx_evtq; + struct cxip_pte *rx_pte; // HW RX Queue + struct cxip_cmdq *rx_cmdq; // RX CMDQ for posting receive buffers + struct cxip_cmdq *tx_cmdq; // TX CMDQ for Message Gets + + /* Number of unexpected list entries in HW. */ + ofi_atomic32_t orx_hw_ule_cnt; + ofi_atomic32_t orx_reqs; // outstanding receive requests + ofi_atomic32_t orx_tx_reqs; // outstanding RX initiated TX requests + int32_t max_tx; + unsigned int recv_appends; + + /* Window when FI_CLAIM mutual exclusive access is required */ + bool hw_claim_in_progress; + + size_t min_multi_recv; + int max_eager_size; + + /* Flow control/software state change metrics */ + int num_fc_eq_full; + int num_fc_no_match; + int num_fc_unexp; + int num_fc_append_fail; + int num_fc_req_full; + int num_sc_nic_hw2sw_append_fail; + int num_sc_nic_hw2sw_unexp; + + /* Unexpected message handling */ + struct cxip_ptelist_bufpool *req_list_bufpool; + struct cxip_ptelist_bufpool *oflow_list_bufpool; + + /* Defer events to wait for both put and put overflow */ + struct def_event_ht deferred_events; + + struct dlist_entry fc_drops; + struct dlist_entry replay_queue; + struct dlist_entry sw_ux_list; + struct dlist_entry sw_pending_ux_list; + int sw_ux_list_len; + int sw_pending_ux_list_len; + + /* Array of 8-byte of unexpected headers remote offsets. */ + uint64_t *ule_offsets; + unsigned int num_ule_offsets; + + /* Current remote offset to be processed. 
Incremented after processing + * a search and delete put event. + */ + unsigned int cur_ule_offsets; + + /* Software receive queue. User posted requests are queued here instead + * of on hardware if the RXC is in software endpoint mode. + */ + struct dlist_entry sw_recv_queue; + + enum cxip_rxc_state state; + enum cxip_rxc_state prev_state; + enum cxip_rxc_state new_state; + enum c_sc_reason fc_reason; + + bool msg_offload; + uint64_t rget_align_mask; + + /* RXC drop count used for FC accounting. */ + int drop_count; + bool hmem; + + struct cxip_msg_counters cntrs; +}; + +static inline void cxip_copy_to_md(struct cxip_md *md, void *dest, + const void *src, size_t size) +{ + ssize_t ret __attribute__((unused)); + struct iovec iov; + + /* Favor CPU store access instead of relying on HMEM copy functions. */ + if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) { + ret = ofi_hmem_dev_reg_copy_to_hmem(md->info.iface, md->handle, + dest, src, size); + assert(ret == FI_SUCCESS); + } else { + iov.iov_base = dest; + iov.iov_len = size; + + ret = md->dom->hmem_ops.copy_to_hmem_iov(md->info.iface, + md->info.device, &iov, + 1, 0, src, size); + assert(ret == size); + } +} + +static inline void cxip_copy_from_md(struct cxip_md *md, void *dest, + const void *src, size_t size) +{ + ssize_t ret __attribute__((unused)); + struct iovec iov; + + /* Favor CPU store access instead of relying on HMEM copy functions. */ + if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) { + ret = ofi_hmem_dev_reg_copy_from_hmem(md->info.iface, + md->handle, + dest, src, size); + assert(ret == FI_SUCCESS); + } else { + iov.iov_base = (void *)src; + iov.iov_len = size; + + + ret = md->dom->hmem_ops.copy_from_hmem_iov(dest, size, + md->info.iface, + md->info.device, + &iov, 1, 0); + assert(ret == size); + } +} + +/* PtlTE buffer pool - Common PtlTE request/overflow list buffer + * management. + * + * Only C_PTL_LIST_REQUEST and C_PTL_LIST_OVERFLOW are supported. + */ +struct cxip_ptelist_bufpool_attr { + enum c_ptl_list list_type; + + /* Callback to handle PtlTE link error/unlink events */ + int (*ptelist_cb)(struct cxip_req *req, const union c_event *event); + size_t buf_size; + size_t min_space_avail; + size_t min_posted; + size_t max_posted; + size_t max_cached; +}; + +struct cxip_ptelist_bufpool { + struct cxip_ptelist_bufpool_attr attr; + struct cxip_rxc *rxc; + size_t buf_alignment; + + /* Ordered list of buffers emitted to hardware */ + struct dlist_entry active_bufs; + + /* List of consumed buffers which cannot be reposted yet + * since unexpected entries have not been matched. + */ + struct dlist_entry consumed_bufs; + + /* List of available buffers that may be appended to the list. + * These could be from a previous append failure or be cached + * from previous message processing to avoid map/unmap of + * list buffer. + */ + struct dlist_entry free_bufs; + + ofi_atomic32_t bufs_linked; + ofi_atomic32_t bufs_allocated; + ofi_atomic32_t bufs_free; +}; + +struct cxip_ptelist_req { + /* Pending list of unexpected header entries which could not be placed + * on the RX context unexpected header list due to put events being + * received out-of-order. + */ + struct dlist_entry pending_ux_list; +}; + +struct cxip_ptelist_buf { + struct cxip_ptelist_bufpool *pool; + + /* RX context the request buffer is posted on. */ + struct cxip_rxc *rxc; + enum cxip_le_type le_type; + struct dlist_entry buf_entry; + struct cxip_req *req; + + /* Memory mapping of req_buf field. 
*/ + struct cxip_md *md; + + /* The number of bytes consume by hardware when the request buffer was + * unlinked. + */ + size_t unlink_length; + + /* Current offset into the buffer where packets/data are landing. When + * the cur_offset is equal to unlink_length, software has completed + * event processing for the buffer. + */ + size_t cur_offset; + + /* Request list specific control information */ + struct cxip_ptelist_req request; + + /* The number of unexpected headers posted placed on the RX context + * unexpected header list which have not been matched. + */ + ofi_atomic32_t refcount; + + /* Buffer used to land packets. */ + char *data; +}; + +int cxip_ptelist_bufpool_init(struct cxip_rxc *rxc, + struct cxip_ptelist_bufpool **pool, + struct cxip_ptelist_bufpool_attr *attr); +void cxip_ptelist_bufpool_fini(struct cxip_ptelist_bufpool *pool); +int cxip_ptelist_buf_replenish(struct cxip_ptelist_bufpool *pool, + bool seq_restart); +void cxip_ptelist_buf_link_err(struct cxip_ptelist_buf *buf, + int rc_link_error); +void cxip_ptelist_buf_unlink(struct cxip_ptelist_buf *buf); +void cxip_ptelist_buf_put(struct cxip_ptelist_buf *buf, bool repost); +void cxip_ptelist_buf_get(struct cxip_ptelist_buf *buf); +void cxip_ptelist_buf_consumed(struct cxip_ptelist_buf *buf); + +/* + * cxip_req_bufpool_init() - Initialize PtlTE request list buffer management + * object. + */ +int cxip_req_bufpool_init(struct cxip_rxc *rxc); +void cxip_req_bufpool_fini(struct cxip_rxc *rxc); + +/* + * cxip_oflow_bufpool_init() - Initialize PtlTE overflow list buffer management + * object. + */ +int cxip_oflow_bufpool_init(struct cxip_rxc *rxc); +void cxip_oflow_bufpool_fini(struct cxip_rxc *rxc); + +void _cxip_req_buf_ux_free(struct cxip_ux_send *ux, bool repost); +void cxip_req_buf_ux_free(struct cxip_ux_send *ux); + +#define CXIP_RDZV_IDS (1 << CXIP_TOTAL_RDZV_ID_WIDTH) +#define CXIP_RDZV_IDS_MULTI_RECV (1 << CXIP_RDZV_ID_CMD_WIDTH) +#define CXIP_TX_IDS (1 << CXIP_TX_ID_WIDTH) + +/* One per LAC */ +#define RDZV_SRC_LES 8U +#define RDZV_NO_MATCH_PTES 8U + +/* Base rendezvous PtlTE object */ +struct cxip_rdzv_pte { + struct cxip_txc *txc; + struct cxip_pte *pte; + + /* Count of the number of buffers successfully linked on this PtlTE. */ + ofi_atomic32_t le_linked_success_count; + + /* Count of the number of buffers failed to link on this PtlTE. */ + ofi_atomic32_t le_linked_failure_count; +}; + +/* Matching PtlTE for user generated unrestricted get DMA */ +struct cxip_rdzv_match_pte { + struct cxip_rdzv_pte base_pte; + + /* Request structure used to handle zero byte puts used for match + * complete. + */ + struct cxip_req *zbp_req; + + /* Request structures used to handle rendezvous source/data transfers. + * There is one request structure (and LE) for each LAC. + */ + struct cxip_req *src_reqs[RDZV_SRC_LES]; +}; + +/* Matching PtlTE for user generated restricted get DMA. One PtlTE + * per LAC used. + */ +struct cxip_rdzv_nomatch_pte { + struct cxip_rdzv_pte base_pte; + struct cxip_req *le_req; +}; + +#if ENABLE_DEBUG +/* Defines to force hard to test TXC error path failures; + * only valid for debug unit testing. See txc->force_err. 
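+ * (Illustrative use, not part of the provider: a debug-only unit test could
+ * set txc->force_err |= CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC to exercise
+ * the alternate read protocol allocation failure path.)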
+ */ +#define CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC (1 << 0) +#endif + +/* + * Endpoint object transmit context + */ +struct cxip_txc { + void *context; + bool enabled; + bool hrp_war_req; // Non-fetching 32-bit HRP + + bool hmem; + + struct cxip_cq *send_cq; + struct cxip_cntr *send_cntr; + struct cxip_cntr *read_cntr; + struct cxip_cntr *write_cntr; + + struct cxip_ep_obj *ep_obj; // parent EP object + struct cxip_domain *domain; // parent domain + uint8_t pid_bits; + + struct dlist_entry ep_list; // contains EPs using shared context + + struct fi_tx_attr attr; // attributes + bool selective_completion; + uint32_t tclass; + + /* TX H/W Event Queue */ + struct cxip_evtq tx_evtq; + + /* Inject buffers for EP, protected by ep_obj->lock */ + struct ofi_bufpool *ibuf_pool; + + struct cxip_cmdq *tx_cmdq; // added during cxip_txc_enable() + ofi_atomic32_t otx_reqs; // outstanding transmit requests + + struct cxip_req *rma_write_selective_completion_req; + struct cxip_req *rma_read_selective_completion_req; + struct cxip_req *amo_selective_completion_req; + struct cxip_req *amo_fetch_selective_completion_req; + + /* Rendezvous related structures */ + struct cxip_rdzv_match_pte *rdzv_pte; + struct cxip_rdzv_nomatch_pte *rdzv_nomatch_pte[RDZV_NO_MATCH_PTES]; + struct indexer rdzv_ids; + struct indexer msg_rdzv_ids; + enum cxip_rdzv_proto rdzv_proto; + + /* Match complete IDs */ + struct indexer tx_ids; + + int max_eager_size; + int rdzv_eager_size; + struct cxip_cmdq *rx_cmdq; // Target cmdq for Rendezvous buffers + +#if ENABLE_DEBUG + uint64_t force_err; +#endif + /* Flow Control recovery */ + struct dlist_entry msg_queue; + struct dlist_entry fc_peers; + + struct dlist_entry dom_entry; +}; + +int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags); +int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_full_dma_cmd *dma, uint64_t flags); +int cxip_txc_emit_idc_amo(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); +int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_dma_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); + +void cxip_txc_flush_msg_trig_reqs(struct cxip_txc *txc); + +/* + * Base Endpoint Object + * + * Support structure, libfabric fi_endpoint implementation. + * + * This is the meat of the endpoint object. It has been separated from cxip_ep + * to support aliasing. + */ +struct cxip_ep_obj { + /* Allow lock to be optimized out with FI_THREAD_DOMAIN */ + struct ofi_genlock lock; + struct cxip_domain *domain; + struct cxip_av *av; + + /* Domain has been configured with FI_AV_AUTH_KEY. */ + bool av_auth_key; + + /* This is only valid if FI_AV_AUTH_KEY is false. */ + struct cxi_auth_key auth_key; + + /* Array of VNIs if FI_AV_AUTH_KEY is true. 
*/ + uint16_t *vnis; + size_t vni_count; + + bool enabled; + + struct cxil_wait_obj *ctrl_wait; + struct cxi_eq *ctrl_tgt_evtq; + struct cxi_eq *ctrl_tx_evtq; + + struct cxip_addr src_addr; + fi_addr_t fi_addr; + + /* ASIC version associated with EP/Domain */ + enum cassini_version asic_ver; + + struct cxip_txc txc; + struct cxip_rxc rxc; + + /* Command queues. Each EP has 1 transmit and 1 target + * command queue that can be shared. An optional 2nd transmit + * command queue may be created for RX initiated rgets. + */ + struct cxip_cmdq *txq; + ofi_atomic32_t txq_ref; + struct cxip_cmdq *tgq; + ofi_atomic32_t tgq_ref; + struct cxip_cmdq *rx_txq; + + /* Portals flow-control recovery messaging uses a credit + * scheme to avoid over-running the associated event queue. + */ + struct cxip_cmdq *ctrl_txq; + struct cxip_cmdq *ctrl_tgq; + unsigned int ctrl_tx_credits; + struct cxip_pte *ctrl_pte; + struct cxip_ctrl_req ctrl_msg_req; + + /* Libfabric software EQ resource */ + struct cxip_eq *eq; + struct dlist_entry eq_link; + + /* Values at base EP creation */ + uint64_t caps; + struct fi_ep_attr ep_attr; + struct fi_tx_attr tx_attr; + struct fi_rx_attr rx_attr; + + /* Collectives support */ + struct cxip_ep_coll_obj coll; + struct cxip_ep_zbcoll_obj zbcoll; + + /* Flow control recovery event queue buffers */ + void *ctrl_tgt_evtq_buf; + struct cxi_md *ctrl_tgt_evtq_buf_md; + void *ctrl_tx_evtq_buf; + struct cxi_md *ctrl_tx_evtq_buf_md; + + /* FI_MR_PROV_KEY caching, protected with ep_obj->lock */ + struct cxip_mr_lac_cache std_mr_cache[CXIP_NUM_CACHED_KEY_LE]; + struct cxip_mr_lac_cache opt_mr_cache[CXIP_NUM_CACHED_KEY_LE]; + struct dlist_entry mr_list; + + size_t txq_size; + size_t tgq_size; + ofi_atomic32_t ref; + struct cxip_portals_table *ptable; +}; + +/* + * CXI endpoint implementations to support FI_CLASS_EP. + */ +struct cxip_ep { + struct fid_ep ep; + struct fi_tx_attr tx_attr; + struct fi_rx_attr rx_attr; + struct cxip_ep_obj *ep_obj; + int is_alias; +}; + +size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, size_t count, + fi_addr_t *src_addr, size_t *ux_count); +int cxip_build_ux_entry_info(struct cxip_ep *ep, + struct fi_cq_tagged_entry *entry, size_t count, + fi_addr_t *src_addr, size_t *ux_count); + +enum cxip_mr_state { + CXIP_MR_DISABLED = 1, + CXIP_MR_ENABLED, + CXIP_MR_LINKED, + CXIP_MR_UNLINKED, + CXIP_MR_LINK_ERR, +}; + +/* + * Memory Region + * + * libfabric fi_mr implementation. + * + * Created in cxip_regattr(). + */ +struct cxip_mr { + struct fid_mr mr_fid; + struct cxip_domain *domain; // parent domain + struct cxip_ep *ep; // endpoint for remote memory + uint64_t key; // memory key + uint64_t flags; // special flags + struct fi_mr_attr attr; // attributes + struct cxip_cntr *cntr; // if bound to cntr + + /* Indicates if FI_RMA_EVENT was specified at creation and + * will be used to enable fi_writedata() and fi_inject_writedata() + * support for this MR (TODO). + */ + bool rma_events; + + /* If requested then count MR events to determine if RMA are in + * progress. At close if no RMA are in progress bypass the invalidate + * of the PTLTE LE. This improves non-cached key close performance, + * enabling their use so that after closing the MR the associated + * memory cannot be remotely accessed, even if it remains in the + * libfabric MR cache. 
+ */ + bool count_events; + ofi_atomic32_t match_events; + ofi_atomic32_t access_events; + + ofi_spin_t lock; + + struct cxip_mr_util_ops *mr_util; + bool enabled; + struct cxip_pte *pte; + enum cxip_mr_state mr_state; + int64_t mr_id; // Non-cached provider key uniqueness + struct cxip_ctrl_req req; + bool optimized; + + void *buf; // memory buffer VA + uint64_t len; // memory length + struct cxip_md *md; // buffer IO descriptor + struct dlist_entry ep_entry; + + struct dlist_entry mr_domain_entry; +}; + +struct cxip_av_auth_key_entry { + ofi_atomic32_t use_cnt; + ofi_atomic32_t ref_cnt; + UT_hash_handle hh; + struct dlist_entry entry; + struct cxi_auth_key key; + fi_addr_t fi_addr; +}; + +struct cxip_av_entry { + ofi_atomic32_t use_cnt; + UT_hash_handle hh; + struct cxip_addr addr; + fi_addr_t fi_addr; + struct cxip_av_auth_key_entry *auth_key; +}; + +struct cxip_av { + struct fid_av av_fid; + struct cxip_domain *domain; + + /* List of endpoints bound to this AV. Each bind takes a reference + * as well. + */ + struct dlist_entry ep_list; + ofi_atomic32_t ref; + + /* Memory used to implement lookups. Two data structures are used. + * 1. ibuf pool for O(1) lookup on the data path + * 2. hash table for O(1) on the receive path + */ + struct cxip_av_entry *av_entry_hash; + struct ofi_bufpool *av_entry_pool; + ofi_atomic32_t av_entry_cnt; + + /* Memory used to support AV authorization key. Three data structures + * are needed. + * 1. ibuf pool for memory allocation and lookup O(1) access. + * 2. hash table for O(1) reverse lookup + * 3. List for iterating + */ + struct cxip_av_auth_key_entry *auth_key_entry_hash; + struct ofi_bufpool *auth_key_entry_pool; + struct dlist_entry auth_key_entry_list; + ofi_atomic32_t auth_key_entry_cnt; + size_t auth_key_entry_max; + + /* Single lock is used to protect entire AV. With domain level + * threading, this lock is not used. + */ + bool lockless; + pthread_rwlock_t lock; + + /* AV is configured as symmetric. This is an optimization which enables + * endpoints to use logical address. + */ + bool symmetric; + + /* Address vector type. */ + enum fi_av_type type; + + /* Whether or not the AV is operating in FI_AV_AUTH_KEY mode. */ + bool av_auth_key; + + /* Whether or not the AV was opened with FI_AV_USER_ID. */ + bool av_user_id; +}; + +int cxip_av_auth_key_get_vnis(struct cxip_av *av, uint16_t **vni, + size_t *vni_count); +void cxip_av_auth_key_put_vnis(struct cxip_av *av, uint16_t *vni, + size_t vni_count); +extern struct cxip_addr *(*cxip_av_addr_in)(const void *addr); +extern void (*cxip_av_addr_out)(struct cxip_addr *addr_out, + struct cxip_addr *addr); +int cxip_av_lookup_addr(struct cxip_av *av, fi_addr_t fi_addr, + struct cxip_addr *addr); +fi_addr_t cxip_av_lookup_fi_addr(struct cxip_av *av, + const struct cxip_addr *addr); +fi_addr_t cxip_av_lookup_auth_key_fi_addr(struct cxip_av *av, unsigned int vni); +int cxip_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context); +int cxip_av_bind_ep(struct cxip_av *av, struct cxip_ep *ep); +void cxip_av_unbind_ep(struct cxip_av *av, struct cxip_ep *ep); +static inline int cxip_av_entry_count(struct cxip_av *av) +{ + return ofi_atomic_get32(&av->av_entry_cnt); +} + +/* + * AV Set + * + * libfabric fi_av_set implementation. + * + * Created in cxip_av_set(). 
+ */ +struct cxip_av_set { + struct fid_av_set av_set_fid; + struct cxip_av *cxi_av; // associated AV + struct cxip_coll_mc *mc_obj; // reference MC + fi_addr_t *fi_addr_ary; // addresses in set + size_t fi_addr_cnt; // count of addresses + struct cxip_comm_key comm_key; // communication key + uint64_t flags; +}; + +/* Needed for math functions */ +union cxip_dbl_bits { + struct { + uint64_t mantissa:52; + uint64_t exponent:11; + uint64_t sign:1; + } __attribute__((__packed__)); + double dval; + uint64_t ival; +}; + +static inline uint64_t _dbl2bits(double d) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + union cxip_dbl_bits x = {.dval = d}; + return x.ival; +#else +#error "Unsupported processor byte ordering" +#endif +} + +static inline double _bits2dbl(uint64_t i) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + union cxip_dbl_bits x = {.ival = i}; + return x.dval; +#else +#error "Unsupported processor byte ordering" +#endif +} + +static inline void _decompose_dbl(double d, int *sgn, int *exp, + unsigned long *man) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + union cxip_dbl_bits x = {.dval = d}; + *sgn = (x.sign) ? -1 : 1; + *exp = x.exponent; + *man = x.mantissa; +#else +#error "Unsupported processor byte ordering" +#endif +} + +/* data structures for reduction support */ +enum cxip_coll_redtype { + REDTYPE_BYT, + REDTYPE_INT, + REDTYPE_FLT, + REDTYPE_IMINMAX, + REDTYPE_FMINMAX, + REDTYPE_REPSUM +}; + +/* int AND, OR, XOR, MIN, MAX, SUM */ +struct cxip_intval { + int64_t ival[4]; +}; + +/* flt MIN, MAX, SUM */ +struct cxip_fltval { + double fval[4]; +}; + +/* int MINMAXLOC */ +struct cxip_iminmax { + int64_t iminval; + uint64_t iminidx; + int64_t imaxval; + uint64_t imaxidx; +}; + +/* flt MINMAXLOC */ +struct cxip_fltminmax { + double fminval; + uint64_t fminidx; + double fmaxval; + uint64_t fmaxidx; +}; + +/* repsum SUM */ +struct cxip_repsum { + int64_t T[4]; + int32_t M; + int8_t overflow_id; + bool inexact; + bool overflow; + bool invalid; +}; + +/* Collective operation states */ +enum cxip_coll_state { + CXIP_COLL_STATE_NONE, + CXIP_COLL_STATE_READY, + CXIP_COLL_STATE_FAULT, +}; + +/* Similar to C_RC_* provider errors, but pure libfabric */ +/* These should be in priority order, from lowest to highest */ +enum cxip_coll_prov_errno { + CXIP_PROV_ERRNO_OK = -1, // good + CXIP_PROV_ERRNO_PTE = -2, // PTE setup failure + CXIP_PROV_ERRNO_MCAST_INUSE = -3, // multicast in-use + CXIP_PROV_ERRNO_HWROOT_INUSE = -4, // hwroot in-use + CXIP_PROV_ERRNO_MCAST_INVALID = -5, // multicast invalid + CXIP_PROV_ERRNO_HWROOT_INVALID = -6, // hwroot invalid + CXIP_PROV_ERRNO_CURL = -7, // CURL failure + CXIP_PROV_ERRNO_LAST = -8, // last error code (unused) +}; + +/* Rosetta reduction engine error codes */ +typedef enum cxip_coll_rc { + CXIP_COLL_RC_SUCCESS = 0, // good + CXIP_COLL_RC_FLT_INEXACT = 1, // result was rounded + CXIP_COLL_RC_FLT_OVERFLOW = 3, // result too large to represent + CXIP_COLL_RC_FLT_INVALID = 4, // operand was signalling NaN, + // or infinities subtracted + CXIP_COLL_RC_REP_INEXACT = 5, // reproducible sum was rounded + CXIP_COLL_RC_INT_OVERFLOW = 6, // reproducible sum overflow + CXIP_COLL_RC_CONTR_OVERFLOW = 7, // too many contributions seen + CXIP_COLL_RC_OP_MISMATCH = 8, // conflicting opcodes + CXIP_COLL_RC_TX_FAILURE = 9, // internal send error + CXIP_COLL_RC_MAX = 10 +} cxip_coll_rc_t; + +struct cxip_coll_buf { + struct dlist_entry buf_entry; // linked list of buffers + struct cxip_req *req; // associated LINK request + struct cxip_md *cxi_md; // buffer memory descriptor + size_t bufsiz; // 
buffer size in bytes + uint8_t buffer[]; // buffer space itself +}; + +struct cxip_coll_pte { + struct cxip_pte *pte; // Collectives PTE + struct cxip_ep_obj *ep_obj; // Associated endpoint + struct cxip_coll_mc *mc_obj; // Associated multicast object + struct dlist_entry buf_list; // PTE receive buffers + ofi_atomic32_t buf_cnt; // count of linked buffers + ofi_atomic32_t buf_swap_cnt; // for diagnostics + ofi_atomic32_t recv_cnt; // for diagnostics + int buf_low_water; // for diagnostics + bool enabled; // enabled +}; + +/* REQUIRED: + * sizeof(struct cxip_coll_accumulator) >= sizeof(struct cxip_coll_data) + * (opaque) struct cxip_coll_accumulator exported in fi_cxi_ext.h + */ +struct cxip_coll_data { + union { + uint8_t databuf[32]; // raw data buffer + struct cxip_intval intval; // 4 integer values + flags + struct cxip_fltval fltval; // 4 double values + flags + struct cxip_iminmax intminmax; // 1 intminmax structure + flags + struct cxip_fltminmax fltminmax;// 1 fltminmax structure + flags + struct cxip_repsum repsum; // 1 repsum structure + flags + }; + cxip_coll_op_t red_op; // reduction opcode + cxip_coll_rc_t red_rc; // reduction return code + int red_cnt; // reduction contrib count + bool initialized; +}; + +struct cxip_coll_reduction { + struct cxip_coll_mc *mc_obj; // parent mc_obj + uint32_t red_id; // reduction id + uint16_t seqno; // reduction sequence number + uint16_t resno; // reduction result number + struct cxip_req *op_inject_req; // active operation request + enum cxip_coll_state coll_state; // reduction state on node + struct cxip_coll_data accum; // reduction accumulator + void *op_rslt_data; // user recv buffer (or NULL) + int op_data_bytcnt; // bytes in send/recv buffers + void *op_context; // caller's context + bool in_use; // reduction is in-use + bool pktsent; // reduction packet sent + bool completed; // reduction is completed + bool drop_send; // drop the next send operation + bool drop_recv; // drop the next recv operation + enum cxip_coll_rc red_rc; // set by first error + struct timespec tv_expires; // reduction expiration time + uint8_t tx_msg[64]; // static packet memory +}; + +struct cxip_coll_mc { + struct fid_mc mc_fid; + struct dlist_entry entry; // Link to mc object list + struct cxip_ep_obj *ep_obj; // Associated endpoint + struct cxip_av_set *av_set_obj; // associated AV set + struct cxip_zbcoll_obj *zb; // zb object for zbcol + struct cxip_coll_pte *coll_pte; // collective PTE + struct timespec timeout; // state machine timeout + fi_addr_t mynode_fiaddr; // fi_addr of this node + int mynode_idx; // av_set index of this node + uint32_t hwroot_idx; // av_set index of hwroot node + uint32_t mcast_addr; // multicast target address + int tail_red_id; // tail active red_id + int next_red_id; // next available red_id + int max_red_id; // limit total concurrency + int seqno; // rolling seqno for packets + bool arm_disable; // arm-disable for testing + bool is_joined; // true if joined + bool rx_discard; // true to discard RX events + enum cxi_traffic_class tc; // traffic class + enum cxi_traffic_class_type tc_type; // traffic class type + ofi_atomic32_t send_cnt; // for diagnostics + ofi_atomic32_t recv_cnt; // for diagnostics + ofi_atomic32_t pkt_cnt; // for diagnostics + ofi_atomic32_t seq_err_cnt; // for diagnostics + ofi_atomic32_t tmout_cnt; // for diagnostics + ofi_spin_t lock; + + struct cxi_md *reduction_md; // memory descriptor for DMA + struct cxip_coll_reduction reduction[CXIP_COLL_MAX_CONCUR]; +}; + +struct cxip_curl_handle; + +typedef void 
(*curlcomplete_t)(struct cxip_curl_handle *); + +struct cxip_curl_handle { + long status; // HTTP status, 0 for no server, -1 busy + const char *endpoint; // HTTP server endpoint address + const char *request; // HTTP request data + const char *response; // HTTP response data, NULL until complete + curlcomplete_t usrfunc; // user completion function + void *usrptr; // user function argument + void *recv; // opaque + void *headers; // opaque +}; + +/* Low-level CURL POST/DELETE async wrappers */ +enum curl_ops { + CURL_GET, + CURL_PUT, + CURL_POST, + CURL_PATCH, + CURL_DELETE, + CURL_MAX +}; +int cxip_curl_init(void); +void cxip_curl_fini(void); +const char *cxip_curl_opname(enum curl_ops op); +int cxip_curl_perform(const char *endpoint, const char *request, + const char *sessionToken, size_t rsp_init_size, + enum curl_ops op, bool verbose, + curlcomplete_t usrfunc, void *usrptr); +int cxip_curl_progress(struct cxip_curl_handle **handleptr); +void cxip_curl_free(struct cxip_curl_handle *handle); + +static inline void single_to_double_quote(char *str) +{ + do {if (*str == '\'') *str = '"';} while (*(++str)); +} +enum json_type cxip_json_obj(const char *desc, struct json_object *jobj, + struct json_object **jval); +int cxip_json_bool(const char *desc, struct json_object *jobj, bool *val); +int cxip_json_int(const char *desc, struct json_object *jobj, int *val); +int cxip_json_int64(const char *desc, struct json_object *jobj, int64_t *val); +int cxip_json_double(const char *desc, struct json_object *jobj, double *val); +int cxip_json_string(const char *desc, struct json_object *jobj, + const char **val); + +/* Perform zero-buffer collectives */ +void cxip_tree_rowcol(int radix, int nodeidx, int *row, int *col, int *siz); +void cxip_tree_nodeidx(int radix, int row, int col, int *nodeidx); +int cxip_tree_relatives(int radix, int nodeidx, int maxnodes, int *rels); + +int cxip_zbcoll_recv_cb(struct cxip_ep_obj *ep_obj, uint32_t init_nic, + uint32_t init_pid, uint64_t mbv); +void cxip_zbcoll_send(struct cxip_zbcoll_obj *zb, int srcidx, int dstidx, + uint64_t payload); +void cxip_zbcoll_free(struct cxip_zbcoll_obj *zb); +int cxip_zbcoll_alloc(struct cxip_ep_obj *ep_obj, int num_addrs, + fi_addr_t *fiaddrs, int simrank, + struct cxip_zbcoll_obj **zbp); +int cxip_zbcoll_simlink(struct cxip_zbcoll_obj *zb0, + struct cxip_zbcoll_obj *zb); +void cxip_zbcoll_set_user_cb(struct cxip_zbcoll_obj *zb, + zbcomplete_t userfunc, void *userptr); + +int cxip_zbcoll_max_grps(bool sim); +int cxip_zbcoll_getgroup(struct cxip_zbcoll_obj *zb); +void cxip_zbcoll_rlsgroup(struct cxip_zbcoll_obj *zb); +int cxip_zbcoll_broadcast(struct cxip_zbcoll_obj *zb, uint64_t *dataptr); +int cxip_zbcoll_reduce(struct cxip_zbcoll_obj *zb, uint64_t *dataptr); +int cxip_zbcoll_barrier(struct cxip_zbcoll_obj *zb); +void cxip_ep_zbcoll_progress(struct cxip_ep_obj *ep_obj); + +void cxip_zbcoll_reset_counters(struct cxip_ep_obj *ep_obj); +void cxip_zbcoll_get_counters(struct cxip_ep_obj *ep_obj, uint32_t *dsc, + uint32_t *err, uint32_t *ack, uint32_t *rcv); +void cxip_zbcoll_fini(struct cxip_ep_obj *ep_obj); +int cxip_zbcoll_init(struct cxip_ep_obj *ep_obj); + +/* + * CNTR/CQ wait object file list element + * + * Support structure. + * + * Created in cxip_cntr_open(), cxip_cq_open(). 
+ */ +struct cxip_fid_list { + struct dlist_entry entry; + struct fid *fid; +}; + +int cxip_rdzv_match_pte_alloc(struct cxip_txc *txc, + struct cxip_rdzv_match_pte **rdzv_pte); +int cxip_rdzv_nomatch_pte_alloc(struct cxip_txc *txc, int lac, + struct cxip_rdzv_nomatch_pte **rdzv_pte); +int cxip_rdzv_pte_src_req_alloc(struct cxip_rdzv_match_pte *pte, int lac); +void cxip_rdzv_match_pte_free(struct cxip_rdzv_match_pte *pte); +void cxip_rdzv_nomatch_pte_free(struct cxip_rdzv_nomatch_pte *pte); +int cxip_rdzv_pte_zbp_cb(struct cxip_req *req, const union c_event *event); +int cxip_rdzv_pte_src_cb(struct cxip_req *req, const union c_event *event); + +struct cxip_if *cxip_if_lookup_addr(uint32_t nic_addr); +struct cxip_if *cxip_if_lookup_name(const char *name); +int cxip_get_if(uint32_t nic_addr, struct cxip_if **dev_if); +void cxip_put_if(struct cxip_if *dev_if); +int cxip_if_valid_rgroup_vni(struct cxip_if *iface, unsigned int rgroup_id, + unsigned int vni); +int cxip_alloc_lni(struct cxip_if *iface, uint32_t svc_id, + struct cxip_lni **if_lni); +void cxip_free_lni(struct cxip_lni *lni); +const char *cxi_tc_str(enum cxi_traffic_class tc); +enum cxi_traffic_class cxip_ofi_to_cxi_tc(uint32_t ofi_tclass); +int cxip_txq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type); +void cxip_if_init(void); +void cxip_if_fini(void); + +int cxip_pte_set_state(struct cxip_pte *pte, struct cxip_cmdq *cmdq, + enum c_ptlte_state new_state, uint32_t drop_count); +int cxip_pte_set_state_wait(struct cxip_pte *pte, struct cxip_cmdq *cmdq, + struct cxip_evtq *evtq, + enum c_ptlte_state new_state, uint32_t drop_count); +int cxip_pte_append(struct cxip_pte *pte, uint64_t iova, size_t len, + unsigned int lac, enum c_ptl_list list, + uint32_t buffer_id, uint64_t match_bits, + uint64_t ignore_bits, uint32_t match_id, + uint64_t min_free, uint32_t flags, + struct cxip_cntr *cntr, struct cxip_cmdq *cmdq, + bool ring); +int cxip_pte_unlink(struct cxip_pte *pte, enum c_ptl_list list, + int buffer_id, struct cxip_cmdq *cmdq); +int cxip_pte_map(struct cxip_pte *pte, uint64_t pid_idx, bool is_multicast); +int cxip_pte_alloc_nomap(struct cxip_portals_table *ptable, struct cxi_eq *evtq, + struct cxi_pt_alloc_opts *opts, + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event), + void *ctx, struct cxip_pte **pte); +int cxip_pte_alloc(struct cxip_portals_table *ptable, struct cxi_eq *evtq, + uint64_t pid_idx, bool is_multicast, + struct cxi_pt_alloc_opts *opts, + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event), + void *ctx, struct cxip_pte **pte); +void cxip_pte_free(struct cxip_pte *pte); +int cxip_pte_state_change(struct cxip_if *dev_if, const union c_event *event); + +int cxip_cmdq_alloc(struct cxip_lni *lni, struct cxi_eq *evtq, + struct cxi_cq_alloc_opts *cq_opts, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cmdq **cmdq); +void cxip_cmdq_free(struct cxip_cmdq *cmdq); +int cxip_cmdq_emit_c_state(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *cmd); + +int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, + size_t num_events, size_t num_fc_events); +void cxip_evtq_fini(struct cxip_evtq *eq); + +int cxip_domain(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **dom, void *context); + +int cxip_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context); + +int cxip_endpoint(struct fid_domain *domain, struct fi_info 
*info, + struct fid_ep **ep, void *context); + +int cxip_tx_id_alloc(struct cxip_txc *txc, void *ctx); +int cxip_tx_id_free(struct cxip_txc *txc, int id); +void *cxip_tx_id_lookup(struct cxip_txc *txc, int id); +int cxip_rdzv_id_alloc(struct cxip_txc *txc, struct cxip_req *req); +int cxip_rdzv_id_free(struct cxip_txc *txc, int id); +void *cxip_rdzv_id_lookup(struct cxip_txc *txc, int id); +int cxip_ep_cmdq(struct cxip_ep_obj *ep_obj, bool transmit, uint32_t tclass, + struct cxi_eq *evtq, struct cxip_cmdq **cmdq); +void cxip_ep_cmdq_put(struct cxip_ep_obj *ep_obj, bool transmit); + +int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux); +int cxip_recv_req_sw_matcher(struct cxip_req *req); +int cxip_recv_cancel(struct cxip_req *req); +int cxip_fc_process_drops(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, + uint32_t pid, uint16_t drops); +void cxip_recv_pte_cb(struct cxip_pte *pte, const union c_event *event); +void cxip_rxc_req_fini(struct cxip_rxc *rxc); +int cxip_rxc_oflow_init(struct cxip_rxc *rxc); +void cxip_rxc_oflow_fini(struct cxip_rxc *rxc); +int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid); + +void cxip_txc_struct_init(struct cxip_txc *txc, const struct fi_tx_attr *attr, + void *context); +int cxip_txc_enable(struct cxip_txc *txc); +void cxip_txc_disable(struct cxip_txc *txc); +struct cxip_txc *cxip_stx_alloc(const struct fi_tx_attr *attr, void *context); +int cxip_rxc_msg_enable(struct cxip_rxc *rxc, uint32_t drop_count); +int cxip_rxc_enable(struct cxip_rxc *rxc); +void cxip_rxc_disable(struct cxip_rxc *rxc); +void cxip_rxc_struct_init(struct cxip_rxc *rxc, const struct fi_rx_attr *attr, + void *context); + +int cxip_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, + struct fid_eq **eq, void *context); + +bool cxip_evtq_saturated(struct cxip_evtq *evtq); +struct cxip_md *cxip_txc_ibuf_md(void *ibuf); +void *cxip_txc_ibuf_alloc(struct cxip_txc *txc); +void cxip_txc_ibuf_free(struct cxip_txc *txc, void *ibuf); +int cxip_ibuf_chunk_init(struct ofi_bufpool_region *region); +void cxip_ibuf_chunk_fini(struct ofi_bufpool_region *region); +int cxip_evtq_req_cancel(struct cxip_evtq *evtq, void *req_ctx, + void *op_ctx, bool match); +void cxip_evtq_req_discard(struct cxip_evtq *evtq, void *req_ctx); +void cxip_evtq_flush_trig_reqs(struct cxip_evtq *evtq); +int cxip_cq_req_complete(struct cxip_req *req); +int cxip_cq_req_complete_addr(struct cxip_req *req, fi_addr_t src); +int cxip_cq_req_error(struct cxip_req *req, size_t olen, + int err, int prov_errno, void *err_data, + size_t err_data_size, fi_addr_t src_addr); +int proverr2errno(int err); +struct cxip_req *cxip_evtq_req_alloc(struct cxip_evtq *evtq, + int remap, void *req_ctx); +void cxip_evtq_req_free(struct cxip_req *req); +void cxip_evtq_progress(struct cxip_evtq *evtq); + +void cxip_ep_progress(struct fid *fid); +int cxip_ep_peek(struct fid *fid); +void cxip_ep_flush_trig_reqs(struct cxip_ep_obj *ep_obj); + +void cxip_cq_progress(struct cxip_cq *cq); +void cxip_util_cq_progress(struct util_cq *util_cq); +int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context); +int cxip_evtq_adjust_reserved_fc_event_slots(struct cxip_evtq *evtq, int value); +void cxip_cq_flush_trig_reqs(struct cxip_cq *cq); + +void cxip_dom_cntr_disable(struct cxip_domain *dom); +int cxip_cntr_mod(struct cxip_cntr *cxi_cntr, uint64_t value, bool set, + bool err); +int cxip_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, + struct fid_cntr **cntr, void 
*context); + +int cxip_iomm_init(struct cxip_domain *dom); +void cxip_iomm_fini(struct cxip_domain *dom); +int cxip_map(struct cxip_domain *dom, const void *buf, unsigned long len, + uint64_t flags, struct cxip_md **md); +void cxip_unmap(struct cxip_md *md); + +int cxip_ctrl_msg_send(struct cxip_ctrl_req *req); +void cxip_ep_ctrl_progress(struct cxip_ep_obj *ep_obj); +void cxip_ep_ctrl_progress_locked(struct cxip_ep_obj *ep_obj); +void cxip_ep_tx_ctrl_progress(struct cxip_ep_obj *ep_obj); +void cxip_ep_tx_ctrl_progress_locked(struct cxip_ep_obj *ep_obj); +void cxip_ep_tgt_ctrl_progress(struct cxip_ep_obj *ep_obj); +void cxip_ep_tgt_ctrl_progress_locked(struct cxip_ep_obj *ep_obj); +int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj); +void cxip_ep_ctrl_fini(struct cxip_ep_obj *ep_obj); +void cxip_ep_ctrl_del_wait(struct cxip_ep_obj *ep_obj); +int cxip_ep_ctrl_trywait(void *arg); + +int cxip_av_set(struct fid_av *av, struct fi_av_set_attr *attr, + struct fid_av_set **av_set_fid, void * context); + +// TODO: naming convention for testing hooks +void cxip_coll_init(struct cxip_ep_obj *ep_obj); +int cxip_coll_enable(struct cxip_ep *ep); +int cxip_coll_disable(struct cxip_ep_obj *ep_obj); +void cxip_coll_close(struct cxip_ep_obj *ep_obj); +void cxip_coll_populate_opcodes(void); +int cxip_coll_send(struct cxip_coll_reduction *reduction, + int av_set_idx, const void *buffer, size_t buflen, + struct cxi_md *md); +int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, + const struct cxip_coll_data *coll_data, + bool arm, bool retry); + +void cxip_capture_red_id(int *red_id_buf); +ssize_t cxip_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context); +ssize_t cxip_broadcast(struct fid_ep *ep, void *buf, size_t count, + void *desc, fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, + void *context); +ssize_t cxip_reduce(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, enum fi_op op, uint64_t flags, + void *context); +ssize_t cxip_allreduce(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, enum fi_datatype datatype, + enum fi_op op, uint64_t flags, void *context); +int cxip_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, + const struct fid_av_set *coll_av_set, + uint64_t flags, struct fid_mc **mc, void *context); +void cxip_coll_progress_join(struct cxip_ep_obj *ep_obj); + +int cxip_coll_arm_disable(struct fid_mc *mc, bool disable); +void cxip_coll_limit_red_id(struct fid_mc *mc, int max_red_id); +void cxip_coll_drop_send(struct cxip_coll_reduction *reduction); +void cxip_coll_drop_recv(struct cxip_coll_reduction *reduction); + +void cxip_coll_reset_mc_ctrs(struct fid_mc *mc); + +void cxip_dbl_to_rep(struct cxip_repsum *x, double d); +void cxip_rep_to_dbl(double *d, const struct cxip_repsum *x); +void cxip_rep_add(struct cxip_repsum *x, const struct cxip_repsum *y); +double cxip_rep_add_dbl(double d1, double d2); +double cxip_rep_sum(size_t count, double *values); + +int cxip_check_auth_key_info(struct fi_info *info); +int cxip_gen_auth_key(struct fi_info *info, struct cxi_auth_key *key); + +#define CXIP_FC_SOFTWARE_INITIATED -1 + +/* cxip_fc_reason() - Returns the event reason for portal state + * change (FC reason or SC reason). 
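+ *
+ * Returns CXIP_FC_SOFTWARE_INITIATED (-1) when the state change was not
+ * automatically initiated by the NIC (sc_nic_auto is clear), i.e. software
+ * disabled the PtlTE; otherwise returns the hardware supplied sc_reason.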
+ */ +static inline int cxip_fc_reason(const union c_event *event) +{ + if (!event->tgt_long.initiator.state_change.sc_nic_auto) + return CXIP_FC_SOFTWARE_INITIATED; + + return event->tgt_long.initiator.state_change.sc_reason; +} + +static inline void cxip_txq_ring(struct cxip_cmdq *cmdq, bool more, + int otx_reqs) +{ + if (!more) { + switch (cmdq->llring_mode) { + case CXIP_LLRING_IDLE: + if (!otx_reqs) + cxi_cq_ll_ring(cmdq->dev_cmdq); + else + cxi_cq_ring(cmdq->dev_cmdq); + break; + case CXIP_LLRING_ALWAYS: + cxi_cq_ll_ring(cmdq->dev_cmdq); + break; + case CXIP_LLRING_NEVER: + default: + cxi_cq_ring(cmdq->dev_cmdq); + break; + } + } +} + +ssize_t cxip_send_common(struct cxip_txc *txc, uint32_t tclass, + const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context, uint64_t flags, + bool tagged, bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr); + +ssize_t cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags, + bool tagged, struct cxip_cntr *comp_cntr); + +ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, + const void *buf, size_t len, void *desc, + fi_addr_t tgt_addr, uint64_t addr, + uint64_t key, uint64_t data, uint64_t flags, + uint32_t tclass, uint64_t msg_order, void *context, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr); + +/* + * Request variants: + * CXIP_RQ_AMO + * Passes one argument (operand1), and applies that to a remote memory + * address content. + * + * CXIP_RQ_AMO_FETCH + * Passes two arguments (operand1, resultptr), applies operand1 to a + * remote memory address content, and returns the prior content of the + * remote memory in resultptr. + * + * CXIP_RQ_AMO_SWAP + * Passes three arguments (operand1, compare, resultptr). If remote memory + * address content satisfies the comparison operation with compare, + * replaces the remote memory content with operand1, and returns the prior + * content of the remote memory in resultptr. + * + * CXIP_RQ_AMO_PCIE_FETCH + * Passes two arguments (operand1, resultptr), applies operand1 to a + * remote memory address content, and returns the prior content of the + * remote memory in resultptr. + * + * The resulting operation should be a PCIe AMO instead of NIC AMO. 
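+ *
+ * As a rough mapping to the libfabric API: fi_atomic*() calls use
+ * CXIP_RQ_AMO, fi_fetch_atomic*() calls use CXIP_RQ_AMO_FETCH,
+ * fi_compare_atomic*() calls use CXIP_RQ_AMO_SWAP, and a fetching AMO
+ * issued with the provider specific FI_CXI_PCIE_AMO flag uses
+ * CXIP_RQ_AMO_PCIE_FETCH.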
+ */ +enum cxip_amo_req_type { + CXIP_RQ_AMO, + CXIP_RQ_AMO_FETCH, + CXIP_RQ_AMO_SWAP, + CXIP_RQ_AMO_PCIE_FETCH, + CXIP_RQ_AMO_LAST, +}; + +int cxip_amo_common(enum cxip_amo_req_type req_type, struct cxip_txc *txc, + uint32_t tclass, const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **comparedesc, + size_t compare_count, const struct fi_ioc *resultv, + void **resultdesc, size_t result_count, uint64_t flags, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, struct cxip_cntr *comp_cntr); +int _cxip_atomic_opcode(enum cxip_amo_req_type req_type, enum fi_datatype dt, + enum fi_op op, int amo_remap_to_pcie_fadd, + enum c_atomic_op *cop, enum c_atomic_type *cdt, + enum c_cswap_op *copswp, unsigned int *cdtlen); + +static inline void +cxip_domain_add_txc(struct cxip_domain *dom, struct cxip_txc *txc) +{ + ofi_spin_lock(&dom->lock); + dlist_insert_tail(&txc->dom_entry, &dom->txc_list); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_remove_txc(struct cxip_domain *dom, struct cxip_txc *txc) +{ + ofi_spin_lock(&dom->lock); + dlist_remove(&txc->dom_entry); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_add_cntr(struct cxip_domain *dom, struct cxip_cntr *cntr) +{ + ofi_spin_lock(&dom->lock); + dlist_insert_tail(&cntr->dom_entry, &dom->cntr_list); + ofi_atomic_inc32(&dom->ref); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_remove_cntr(struct cxip_domain *dom, struct cxip_cntr *cntr) +{ + ofi_spin_lock(&dom->lock); + dlist_remove(&cntr->dom_entry); + ofi_atomic_dec32(&dom->ref); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_add_cq(struct cxip_domain *dom, struct cxip_cq *cq) +{ + ofi_spin_lock(&dom->lock); + dlist_insert_tail(&cq->dom_entry, &dom->cq_list); + ofi_atomic_inc32(&dom->ref); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_remove_cq(struct cxip_domain *dom, struct cxip_cq *cq) +{ + ofi_spin_lock(&dom->lock); + dlist_remove(&cq->dom_entry); + ofi_atomic_dec32(&dom->ref); + ofi_spin_unlock(&dom->lock); +} + +int cxip_domain_ctrl_id_alloc(struct cxip_domain *dom, + struct cxip_ctrl_req *req); +void cxip_domain_ctrl_id_free(struct cxip_domain *dom, + struct cxip_ctrl_req *req); +int cxip_domain_prov_mr_id_alloc(struct cxip_domain *dom, + struct cxip_mr *mr); +void cxip_domain_prov_mr_id_free(struct cxip_domain *dom, + struct cxip_mr *mr); + +static inline +struct cxip_ctrl_req *cxip_domain_ctrl_id_at(struct cxip_domain *dom, + int buffer_id) +{ + if (ofi_idx_is_valid(&dom->req_ids, buffer_id)) + return ofi_idx_at(&dom->req_ids, buffer_id); + return NULL; +} + +static inline uint32_t cxip_mac_to_nic(struct ether_addr *mac) +{ + return mac->ether_addr_octet[5] | + (mac->ether_addr_octet[4] << 8) | + ((mac->ether_addr_octet[3] & 0xF) << 16); +} + +static inline bool is_netsim(struct cxip_ep_obj *ep_obj) +{ + return (ep_obj->domain->iface->info->device_platform == + CXI_PLATFORM_NETSIM); +} + +/* debugging TRACE functions */ +#define cxip_trace_attr __attribute__((format(__printf__, 1, 2))) +typedef int (*cxip_trace_t)(const char *fmt, ...); +extern cxip_trace_t cxip_trace_attr cxip_trace_fn; + +typedef void (*cxip_trace_flush_t)(void); +extern cxip_trace_flush_t cxip_trace_flush_fn; + +typedef void (*cxip_trace_close_t)(void); +extern cxip_trace_close_t cxip_trace_close_fn; + +typedef bool (*cxip_trace_enable_t)(bool enable); +extern cxip_trace_enable_t cxip_trace_enable_fn; + +extern bool cxip_trace_enabled; // true if tracing is enabled +extern 
bool cxip_trace_append; // append open for trace file +extern bool cxip_trace_linebuf; // set line buffering for trace +extern int cxip_trace_rank; // tracing rank +extern int cxip_trace_numranks; // tracing number of ranks +extern FILE *cxip_trace_fid; // trace output file descriptor + +int cxip_trace_attr cxip_trace(const char *fmt, ...); +void cxip_trace_flush(void); +void cxip_trace_close(void); +bool cxip_trace_enable(bool enable); + +/* debugging TRACE filtering control */ +enum cxip_trace_module { + CXIP_TRC_CTRL, + CXIP_TRC_ZBCOLL, + CXIP_TRC_CURL, + CXIP_TRC_COLL_PKT, + CXIP_TRC_COLL_JOIN, + CXIP_TRC_COLL_DEBUG, + CXIP_TRC_TEST_CODE, + CXIP_TRC_MAX +}; +extern uint64_t cxip_trace_mask; + +static inline void cxip_trace_set(int mod) +{ + cxip_trace_mask |= (1L << mod); +} + +static inline void cxip_trace_clr(int mod) +{ + cxip_trace_mask &= ~(1L << mod); +} + +static inline bool cxip_trace_true(int mod) +{ + return cxip_trace_enabled && (cxip_trace_mask & (1L << mod)); +} + +#if ENABLE_DEBUG +#define CXIP_TRACE(mod, fmt, ...) \ + do {if (cxip_trace_true(mod)) cxip_trace_fn(fmt, ##__VA_ARGS__);} while (0) +#else +#define CXIP_TRACE(mod, fmt, ...) do {} while (0) +#endif + +/* fabric logging implementation functions */ +#define _CXIP_DBG(subsys, fmt, ...) \ + FI_DBG(&cxip_prov, subsys, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) +#define _CXIP_INFO(subsys, fmt, ...) \ + FI_INFO(&cxip_prov, subsys, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) +#define _CXIP_WARN(subsys, fmt, ...) \ + FI_WARN(&cxip_prov, subsys, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) +#define _CXIP_WARN_ONCE(subsys, fmt, ...) \ + FI_WARN_ONCE(&cxip_prov, subsys, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) +#define CXIP_LOG(fmt, ...) \ + fi_log(&cxip_prov, FI_LOG_WARN, FI_LOG_CORE, \ + __func__, __LINE__, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) + +#define CXIP_FATAL(fmt, ...) \ + do { \ + CXIP_LOG(fmt, ##__VA_ARGS__); \ + abort(); \ + } while (0) + +#define TXC_DBG(txc, fmt, ...) \ + _CXIP_DBG(FI_LOG_EP_DATA, "TXC (%#x:%u): " fmt "", \ + (txc)->ep_obj->src_addr.nic, (txc)->ep_obj->src_addr.pid, \ + ##__VA_ARGS__) +#define TXC_WARN(txc, fmt, ...) \ + _CXIP_WARN(FI_LOG_EP_DATA, "TXC (%#x:%u): " fmt "", \ + (txc)->ep_obj->src_addr.nic, (txc)->ep_obj->src_addr.pid, \ + ##__VA_ARGS__) +#define TXC_WARN_RET(txc, ret, fmt, ...) \ + TXC_WARN(txc, "%d:%s: " fmt "", ret, fi_strerror(-ret), ##__VA_ARGS__) +#define TXC_FATAL(txc, fmt, ...) \ + CXIP_FATAL("TXC (%#x:%u):: " fmt "", (txc)->ep_obj->src_addr.nic, \ + (txc)->ep_obj->src_addr.pid, ##__VA_ARGS__) + +#define RXC_DBG(rxc, fmt, ...) \ + _CXIP_DBG(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ + (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) +#define RXC_INFO(rxc, fmt, ...) \ + _CXIP_INFO(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ + (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) +#define RXC_WARN(rxc, fmt, ...) \ + _CXIP_WARN(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ + (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) +#define RXC_WARN_ONCE(rxc, fmt, ...) \ + _CXIP_WARN_ONCE(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ + (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) +#define RXC_FATAL(rxc, fmt, ...) 
\ + CXIP_FATAL("RXC (%#x:%u) PtlTE %u:[Fatal] " fmt "", \ + (rxc)->ep_obj->src_addr.nic, \ + (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) + +#define DOM_INFO(dom, fmt, ...) \ + _CXIP_INFO(FI_LOG_DOMAIN, "DOM (cxi%u:%u:%u:%u:%#x): " fmt "", \ + (dom)->iface->info->dev_id, (dom)->lni->lni->id, \ + (dom)->auth_key.svc_id, (dom)->auth_key.vni, \ + (dom)->nic_addr, ##__VA_ARGS__) +#define DOM_WARN(dom, fmt, ...) \ + _CXIP_WARN(FI_LOG_DOMAIN, "DOM (cxi%u:%u:%u:%u:%#x): " fmt "", \ + (dom)->iface->info->dev_id, (dom)->lni->lni->id, \ + (dom)->auth_key.svc_id, (dom)->auth_key.vni, \ + (dom)->nic_addr, ##__VA_ARGS__) + +#define CXIP_UNEXPECTED_EVENT_STS "Unexpected event status, %s rc = %s\n" +#define CXIP_UNEXPECTED_EVENT "Unexpected event %s, rc = %s\n" + +#define CXIP_DEFAULT_CACHE_LINE_SIZE 64 + +#define CXIP_SYSFS_CACHE_LINE_SIZE \ + "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" + +/* cxip_cacheline_size() - Return the CPU cache-line size, if unable to + * read then return the assumed cache size. + */ +static inline int cxip_cacheline_size(void) +{ + FILE *f; + int cache_line_size; + int ret; + + f = fopen(CXIP_SYSFS_CACHE_LINE_SIZE, "r"); + if (!f) { + _CXIP_WARN(FI_LOG_CORE, + "Error %d determining cacheline size\n", + errno); + cache_line_size = CXIP_DEFAULT_CACHE_LINE_SIZE; + } else { + ret = fscanf(f, "%d", &cache_line_size); + if (ret != 1) { + _CXIP_WARN(FI_LOG_CORE, + "Error reading cacheline size\n"); + cache_line_size = CXIP_DEFAULT_CACHE_LINE_SIZE; + } + + fclose(f); + } + + return cache_line_size; +} + +static inline int +cxip_txc_copy_from_hmem(struct cxip_txc *txc, struct cxip_md *hmem_md, + void *dest, const void *hmem_src, size_t size) +{ + enum fi_hmem_iface iface; + uint64_t device; + struct iovec hmem_iov; + struct cxip_domain *domain = txc->domain; + uint64_t flags; + bool unmap_hmem_md = false; + int ret; + + /* Default to memcpy unless FI_HMEM is set. */ + if (!txc->hmem) { + memcpy(dest, hmem_src, size); + return FI_SUCCESS; + } + + /* With HMEM enabled, performing memory registration will also cause + * the device buffer to be registered for CPU load/store access. Being + * able to perform load/store instead of using the generic HMEM copy + * routines and/or HMEM override copy routines can significantly reduce + * latency. Thus, this path is favored. + * + * However, if FORK_SAFE variables are enabled, we avoid this mapping + * to keep from designating the entire page in which the buffer + * resides as don't copy, and take the performance hit. + * + * Memory registration can result in additional latency. Expectation is + * the MR cache can amortize the additional memory registration latency. 
+ */ + if (!cxip_env.fork_safe_requested) { + if (!hmem_md) { + ret = cxip_map(domain, hmem_src, size, 0, &hmem_md); + if (ret) { + TXC_WARN(txc, "cxip_map failed: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + unmap_hmem_md = true; + } + + cxip_copy_from_md(hmem_md, dest, hmem_src, size); + if (unmap_hmem_md) + cxip_unmap(hmem_md); + + return FI_SUCCESS; + } + + /* Slow path HMEM copy path.*/ + iface = ofi_get_hmem_iface(hmem_src, &device, &flags); + hmem_iov.iov_base = (void *)hmem_src; + hmem_iov.iov_len = size; + + ret = domain->hmem_ops.copy_from_hmem_iov(dest, size, iface, device, + &hmem_iov, 1, 0); + if (ret != size) { + if (ret < 0) { + TXC_WARN(txc, "copy_from_hmem_iov failed: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + TXC_WARN(txc, + "copy_from_hmem_iov short copy: expect=%ld got=%d\n", + size, ret); + return -FI_EIO; + } + + return FI_SUCCESS; +} + +size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, size_t count, + fi_addr_t *src_addr, size_t *ux_count); + +int cxip_nic_alloc(struct cxip_if *nic_if, struct fid_nic **fid_nic); + +int cxip_domain_dwq_emit_dma(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_full_dma_cmd *dma, uint64_t flags); +int cxip_domain_dwq_emit_amo(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_dma_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); + +#endif diff --git a/prov/cxi/include/cxip_faults.h b/prov/cxi/include/cxip_faults.h new file mode 100644 index 00000000000..e9b28f17fe9 --- /dev/null +++ b/prov/cxi/include/cxip_faults.h @@ -0,0 +1,148 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + */ + +/* Fault injection. */ + +struct cxip_fault { + char *env; /* Configuration env. var. name */ + int prop; /* Proportion of rand() values */ + size_t count; /* Count of injected faults */ +}; + +extern struct cxip_fault dma_fault; +extern struct cxip_fault malloc_fault; + +void cxip_fault_inject_fini(void); +void cxip_fault_inject_init(void); + +#if ENABLE_DEBUG +#define INJECT_FAULT(fault) \ + ((fault).prop && rand() < (fault).prop && (fault).count++) +#else +#define INJECT_FAULT(fault) 0 +#endif + +#define cxi_cq_emit_dma_f(...) \ + (INJECT_FAULT(dma_fault) ? -ENOSPC : \ + cxi_cq_emit_dma(__VA_ARGS__)) + +#define cxip_pte_unlink_f(...) \ + (INJECT_FAULT(dma_fault) ? -FI_EAGAIN : \ + cxip_pte_unlink(__VA_ARGS__)) + +#define malloc_f(...) \ + (INJECT_FAULT(malloc_fault) ? NULL : \ + malloc(__VA_ARGS__)) + +/** + * Collective traps, can be extended for other uses. + * + * This creates a dlist of "traps" that are keyed to an index, and a trap + * identifier. When the search results in a match of both index and trap, this + * sets the *err variable to the specified trap error, and returns true. + * Otherwise it returns false. + * + * The close, and set functions are generally called in the test code. The + * search function is generally embedded in the provider. + * + * If the trap logic branches on search returning true, search should be a no-op + * with no performance penalty when ENABLE_DEBUG is FALSE. + * + * This will slow operations if ENABLE_DEBUG is TRUE, and there is a large list + * of traps. 
Normally, the test case will set only one trap, since the objective + * is to force a controlled fault and observe the result. + */ +enum { + CXIP_TRAP_NONE = 0, + CXIP_TRAP_GETGRP, + CXIP_TRAP_BCAST, + CXIP_TRAP_REDUCE, + CXIP_TRAP_INITPTE, + CXIP_TRAP_CURLSND, + CXIP_TRAP_CURLRCV, +}; + +#if ENABLE_DEBUG +/* structure used to simulate failures */ +struct _cxip_trap { + struct dlist_entry link; + int index; + int trap; + int err; +}; + +struct dlist_entry _trap_list; +bool _trap_initialized; + +static void _cxip_trap_close(void) +{ + struct _cxip_trap *trap_obj; + + if (!_trap_initialized) + return; + while (!dlist_empty(&_trap_list)) { + dlist_pop_front(&_trap_list, struct _cxip_trap, trap_obj, link); + free(trap_obj); + } +} + +static void _cxip_trap_set(int index, int trap, int err) +{ + struct _cxip_trap *trap_obj; + + if (!_trap_initialized) { + dlist_init(&_trap_list); + _trap_initialized = true; + } + trap_obj = calloc(1, sizeof(*trap_obj)); + if (!trap_obj) + return; + dlist_init(&trap_obj->link); + trap_obj->index = index; + trap_obj->trap = trap; + trap_obj->err = err; + dlist_insert_tail(&_trap_list, &trap_obj->link); +} + +static bool _cxip_trap_search(int index, int trap, int *err) +{ + struct _cxip_trap *trap_obj; + struct dlist_entry *item; + + if (!_trap_initialized) + return false; + + dlist_foreach(&_trap_list, item) { + trap_obj = container_of(item, struct _cxip_trap, link); + if (trap_obj->index != index) + continue; + if (trap_obj->trap != trap) + continue; + dlist_remove(item); + *err = trap_obj->err; + free(trap_obj); + return true; + } + return false; +} + +static inline void cxip_trap_close(void) +{ + _cxip_trap_close(); +} +static inline void cxip_trap_set(int index, int trap, int err) +{ + _cxip_trap_set(index, trap, err); +} +static inline bool cxip_trap_search(int index, int trap, int *err) +{ + return _cxip_trap_search(index, trap, err); +} +#else +static inline void cxip_trap_close(void) {} +static inline void cxip_trap_set(int a, int b, int c) {} +static inline bool cxip_trap_search(int a, int b, int *c) {return false;} +#endif diff --git a/prov/cxi/include/fi_cxi_ext.h b/prov/cxi/include/fi_cxi_ext.h new file mode 100644 index 00000000000..bee868450e1 --- /dev/null +++ b/prov/cxi/include/fi_cxi_ext.h @@ -0,0 +1,455 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020-2022 Hewlett Packard Enterprise Development LP + */ + +#ifndef _FI_CXI_EXT_H_ +#define _FI_CXI_EXT_H_ + +/* CXI provider specific NIC attributes. This information is returned in + * fi_info::nid::prov_attr. + + * Users can optionally modify some fields. Depending on the field adjusted, + * this can impact fi_domain() or other endpoint allocation behavior. + */ + +#define FI_CXI_NIC_ATTR_VER 1U + +struct cxip_nic_attr { + /* Version of NIC attr. Must remain at the top of this struct. */ + uint32_t version; + + /* NIC address. Should never be modified. */ + const unsigned int addr; + + /* On output from fi_getinfo(), rgroup_id will be set in the following + * order: + * 1. Resource group ID returned from SLINGSHOT_SVC_ID environment + * variable + * 2. First resource group ID with matching UID + * 3. First resource group ID with matching GID + * 4. First resource group ID with open permissions + */ + const unsigned int default_rgroup_id; + + /* Default VNI used with the rgroup ID. 
*/ + const unsigned int default_vni; +}; + +/* + * TODO: The following should be integrated into the include/rdma/fi_ext.h + * and are use for provider specific fi_control() operations. + */ +#define FI_PROV_SPECIFIC_CXI (0xccc << 16) + +enum { + FI_OPT_CXI_SET_TCLASS = -FI_PROV_SPECIFIC_CXI, /* uint32_t */ + FI_OPT_CXI_SET_MSG_ORDER, /* uint64_t */ + + /* fid_nic control operation to refresh NIC attributes. */ + FI_OPT_CXI_NIC_REFRESH_ATTR, + + FI_OPT_CXI_SET_MR_MATCH_EVENTS, /* bool */ + FI_OPT_CXI_GET_MR_MATCH_EVENTS, /* bool */ + FI_OPT_CXI_SET_OPTIMIZED_MRS, /* bool */ + FI_OPT_CXI_GET_OPTIMIZED_MRS, /* bool */ + FI_OPT_CXI_SET_PROV_KEY_CACHE, /* bool */ + FI_OPT_CXI_GET_PROV_KEY_CACHE, /* bool */ +}; + +/* + * Execute a given libfabric atomic memory operation as a PCIe operation as + * compared to a NIC operation. + * + * Note: Ordering between PCIe atomic operations and NIC atomic/RMA operations + * is undefined. + * + * Note: This flag overloads the bit used for FI_SOURCE. But, since FI_SOURCE + * is invalid for AMO operations, overloading this bit is not an issue. + */ +#define FI_CXI_PCIE_AMO (1ULL << 57) + +/* + * Flag an accelerated collective as pre-reduced. + * + * This can be passed to the accelerated collectives operations to indicate + * that the supplied data is a pre-reduced cxip_coll_accumulator structure. + * + * Note: This flag overloads FI_CXI_PCIE_AMO. Accelerated collectives do not + * use FI_CXI_PCIE_AMO or FI_SOURCE. + */ +#define FI_CXI_PRE_REDUCED (1ULL << 57) + +/* + * Use CXI High Rate Puts (HRP). Increases message rate performance. Applies to + * RMA and unreliable, non-fetching AMO operations. + */ +#define FI_CXI_HRP (1ULL << 60) + +/* + * Disable AMO reliability. Increases message rate performance. Applies to + * non-fetching AMOs. Required for HRP AMOs. + */ +#define FI_CXI_UNRELIABLE (1ULL << 61) + +/* + * Request a provider specific weak FENCE operation to facilitate an + * EP alias ordering point, when the original EP utilizes PCIe RO=1. + */ +#define FI_CXI_WEAK_FENCE (1ULL << 63) + +/* + * Used in conjunction with the deferred work queue API. If a deferred work + * queue operation has this flag set, the CXI provider will ensure a counter + * writeback occurs once the deferred work queue operation completes. + * Note: Addition hardware resources will be used to ensure a counter writeback + * occurs at the completion of the deferred work queue operation. + */ +#define FI_CXI_CNTR_WB (1ULL << 62) +#define FI_CXI_COUNTER_OPS "cxi_counter_ops" + +struct fi_cxi_cntr_ops { + /* Set the counter writeback address to a client provided address. */ + int (*set_wb_buffer)(struct fid *fid, void *buf, size_t len); + + /* Get the counter MMIO region. */ + int (*get_mmio_addr)(struct fid *fid, void **addr, size_t *len); +}; + +/* Success values cannot exceed FI_CXI_CNTR_SUCCESS_MAX */ +#define FI_CXI_CNTR_SUCCESS_MAX ((1ULL << 48) - 1) + +/* Failure values cannot exceed FI_CXI_CNTR_FAILURE_MAX */ +#define FI_CXI_CNTR_FAILURE_MAX ((1ULL << 7) - 1) + +/* fi_cntr_read() equivalent but for the writeback buffer. */ +static inline uint64_t fi_cxi_cntr_wb_read(const void *wb_buf) +{ + return (*(uint64_t *)wb_buf) & FI_CXI_CNTR_SUCCESS_MAX; +}; + +/* fi_cntr_reader() equivalent but for the writeback buffer. */ +static inline uint64_t fi_cxi_cntr_wb_readerr(const void *wb_buf) +{ + return ((*(uint64_t *)wb_buf) >> 48) & FI_CXI_CNTR_FAILURE_MAX; +}; + +/* Generate a counter success value which can be polled on. 
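+ *
+ * A hedged polling sketch built from the writeback helpers above
+ * (wb_buf is assumed to be the buffer registered through
+ * fi_cxi_cntr_ops::set_wb_buffer, and "expected" a caller-chosen count
+ * no larger than FI_CXI_CNTR_SUCCESS_MAX):
+ *
+ *   while (fi_cxi_cntr_wb_read(wb_buf) < expected &&
+ *          fi_cxi_cntr_wb_readerr(wb_buf) == 0)
+ *       ;   // spin until the NIC writes back "expected" successes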
*/ +static inline int fi_cxi_gen_cntr_success(uint64_t value, uint64_t *cxi_value) +{ + if (value > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + *cxi_value = (1ULL << 63) | value; + return FI_SUCCESS; +}; + +/* fi_cntr_add() equivalent but for the MMIO region. */ +static inline int fi_cxi_cntr_add(void *cntr_mmio, uint64_t value) +{ + /* Success counter is only 48 bits wide. */ + if (value > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + *((uint64_t *)cntr_mmio) = value; + return FI_SUCCESS; +} + +/* fi_cntr_adderr() equivalent but for the MMIO region. */ +static inline int fi_cxi_cntr_adderr(void *cntr_mmio, uint64_t value) +{ + /* Error counter is only 7 bits wide. */ + if (value > FI_CXI_CNTR_FAILURE_MAX) + return -FI_EINVAL; + + *((uint64_t *)cntr_mmio + 8) = value; + return FI_SUCCESS; +} + +/* fi_cntr_set() equivalent but for the MMIO region. */ +static inline int fi_cxi_cntr_set(void *cntr_mmio, uint64_t value) +{ + /* Only set of zero is supported through MMIO region. */ + if (value > 0) + return -FI_EINVAL; + + *((uint64_t *)cntr_mmio + 16) = 0; + return FI_SUCCESS; +} + +/* fi_cntr_seterr() equivalent but for MMIO region. */ +static inline int fi_cxi_cntr_seterr(void *cntr_mmio, uint64_t value) +{ + /* Only set of zero is supported through MMIO region. */ + if (value > 0) + return -FI_EINVAL; + + *((uint64_t *)cntr_mmio + 24) = 0; + return FI_SUCCESS; +} + +/* fi_cntr_add() equivalent but for the MMIO region. */ +static inline void *fi_cxi_get_cntr_add_addr(void *cntr_mmio) +{ + return cntr_mmio; +} + +/* fi_cntr_adderr() equivalent but for the MMIO region. */ +static inline void *fi_cxi_get_cntr_adderr_addr(void *cntr_mmio) +{ + return (void *)((uint64_t *)cntr_mmio + 8); +} + +/* fi_cntr_set() equivalent but for the MMIO region reset. + * NOTE: CXI does not support set to counter MMIO region. Only reset. + */ +static inline void *fi_cxi_get_cntr_reset_addr(void *cntr_mmio) +{ + return (void *)((uint64_t *)cntr_mmio + 16); +} + +/* fi_cntr_seterr() equivalent but for MMIO region reset. + * NOTE: CXI does not support set to counter MMIO region. Only reset. + */ +static inline void *fi_cxi_get_cntr_reseterr_addr(void *cntr_mmio) +{ + return (void *)((uint64_t *)cntr_mmio + 24); +} + +#define FI_CXI_DOM_OPS_1 "dom_ops_v1" +#define FI_CXI_DOM_OPS_2 "dom_ops_v2" +#define FI_CXI_DOM_OPS_3 "dom_ops_v3" +#define FI_CXI_DOM_OPS_4 "dom_ops_v4" +#define FI_CXI_DOM_OPS_5 "dom_ops_v5" +#define FI_CXI_DOM_OPS_6 "dom_ops_v6" + +/* v1 to v6 can use the same struct since they only appended a routine */ +struct fi_cxi_dom_ops { + int (*cntr_read)(struct fid *fid, unsigned int cntr, uint64_t *value, + struct timespec *ts); + int (*topology)(struct fid *fid, unsigned int *group_id, + unsigned int *switch_id, unsigned int *port_id); + + /* Enable hybrid MR desc mode. Hybrid MR desc allows for libfabric users + * to optionally pass in a valid MR desc for local communication + * operations. + * + * When enabled, if the MR desc is NULL, the provider will + * perform internal memory registration. Else, the provider will assume + * the MR desc field is valid and skip internal memory registration. + * + * When disabled, the provider will ignore the MR desc field and always + * perform internal memory registration. This is the default behavior. + * + * All child endpoints will inherit the current domain status of hybrid + * MR desc only during endpoint creation. Dynamically changing the + * domain hybrid MR desc status with endpoint allocate may not propagate + * to child endpoints. 
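+ *
+ * A hypothetical enable sequence (error handling elided; "domain" is
+ * the fid_domain returned by fi_domain(), and FI_CXI_DOM_OPS_6 the
+ * newest ops tag defined above):
+ *
+ *   struct fi_cxi_dom_ops *dom_ops;
+ *
+ *   fi_open_ops(&domain->fid, FI_CXI_DOM_OPS_6, 0,
+ *               (void **)&dom_ops, NULL);
+ *   dom_ops->enable_hybrid_mr_desc(&domain->fid, true);
+ *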
Thus, it is recommended to set hybrid MR desc + * status prior to allocating endpoints. + */ + int (*enable_hybrid_mr_desc)(struct fid *fid, bool enable); + + /* Get unexpected message information. + * + * Obtain a list of unexpected messages associated with the endpoint. + * The list is returned as an array of CQ tagged entries. The following + * is how the fields in fi_cq_tagged_entry are used. + * + * op_context: NULL since this message has not matched a posted receive + * flags: A combination of FI_MSG, FI_TAGGED, FI_RECV, + * and/or FI_REMOTE_CQ_DATA + * len: Unexpected message request length + * data: Completion queue data (only valid if FI_REMOTE_CQ_DATA + * is set) + * tag: Unexpected message tag (only valid if FI_TAGGED is set) + * + * @ep: Endpoint FID to have unexpected messages returned to user. + * @entry: Tagged entry array to be filled in by the provider. If the + * entry is NULL, only ux_count will be set. + * @count: Number of entries in entry and src_addr array. If count is + * zero,then only the ux_count will be set on return. + * @src_addr: Source address array to be filled in by the provider. If + * the entry is NULL, only ux_count will be set. + * @ux_count: Output variable used to return the number of unexpected + * messages queued on the given endpoint. + * + * Return: On success, number of entries copied into the users entry + * and src_addr arrays. On error, -FI_ERRNO. + */ + size_t (*ep_get_unexp_msgs)(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, + size_t count, fi_addr_t *src_addr, + size_t *ux_count); + + /* Get the depth of the deferred work queue. The depth is the number of + * of triggered operation commands which can be queued to hardware. The + * depth is not per fi_domain. The depth is across all processes using + * the same CXI service which usually maps to a job-step. + */ + int (*get_dwq_depth)(struct fid *fid, size_t *depth); + + /* The following two functions have been deprecated in favor of + * using the fi_control() standardized interface. They will be + * removed in a future software release, but are left here initially + * to allow early users to adjust their usage. + */ + int (*enable_mr_match_events)(struct fid *fid, bool enable); + int (*enable_optimized_mrs)(struct fid *fid, bool enable); +}; + +/* + * CXI Authorization Key + */ +struct cxi_auth_key { + /* The CXI service assigned to the Domain and Endpoints. A CXI service + * is associated with a set of local resource limits, VNIs, and Traffic + * Classes. + * + * The svc_id used by an OFI Domain must match all Endpoints belonging + * to the Domain. + */ + uint32_t svc_id; + + /* The Virtual Network ID (VNI) assigned to the Endpoint. Two Endpoints + * must use the same VNI in order to communicate. + * + * Note that while the CXI service may define one or more VNIs which a + * process can access, an Endpoint is assigned to only one. + */ + uint16_t vni; +}; + +/* + * CXI Collectives + */ + +/* + * AV Set communication key. + * + * For production: + * - Set cxip_comm_key.keytype = COMM_KEY_NONE. + * - Initialize cxip_comm_key structure to zeros. + * - Create one av_set on each node. + * - Initialize each av_set to contain the NIC addresses of all endpoints. + * - Call fi_join_collective() once on each endpoint. + * - dest_addr is a multicast address created by the join. + * - hwroot_nic is assigned by the join. + * - The PTE will receive at the multicast ID value, index extension of zero. 
+ * - Sending to the multicast ID will cause delivery to nodes according to the + * tree topology. + * + * For testing with externally established multicast address: + * - NOT IMPLEMENTED. + * + * For testing on a multinode system without multicast: + * - Set cxip_comm_key.keytype = COMM_KEY_UNICAST. + * - Set cxip_comm_key.ucast.hwroot_idx to the desired hw_root index. + * - Create one av_set on each node. + * - Initialize each av_set to contain the NIC addresses of all endpoints. + * - Call fi_join_collective() once one each endpoint. + * - hwroot_nic is the NIC address of the node that serves as the emulated + * hardware root of the tree. + * - The PTE will use the EP source NIC address and process PID, with a + * PID_IDX of CXIP_PTL_IDX_COLL. + * - Sending to any (valid) node address with CXIP_PTL_IDX_COLL will target the + * collectives PTE on that node. + * - The root/leaf send routines will distribute one or more packets to all + * fi_addr_t in the av_set as appropriate. + * + * For testing under NETSIM on a single node: + * - Set cxip_comm_key.keytype = COMM_KEY_RANK. + * - Set cxip_comm_key.rank.hwroot_idx to the desired hw_root index. + * - Set cxip_comm_key.rank.rank to the simulated rank. + * - Create N av_set objects, one for each simulated rank. + * - Call fi_join_collective() once for each simulated endpoint. + * - dest_addr is the MC object index. + * - hwroot_nic is the MC object index for the MC object to serve as the + * simulated hardware root. + * - The PTE will use the EP source NIC address and process PID, with a PID_IDX + * of 16 + dest_addr (MC object index). + * - Sending to the node's own address with a PID_IDX of 16 + MC index will + * target the appropriate MC object. + * - Simulation is limited to 32 simulated endpoints. + */ +enum cxip_comm_key_type { + COMM_KEY_NONE = 0, + COMM_KEY_MULTICAST, + COMM_KEY_UNICAST, + COMM_KEY_RANK, + COMM_KEY_MAX +}; + +typedef unsigned int cxip_coll_op_t; // CXI collective opcode + +struct cxip_coll_mcast_key { + uint32_t hwroot_idx; // index of hwroot in av_set list + uint32_t mcast_addr; // 13-bit multicast address id +}; + +struct cxip_coll_unicast_key { + uint32_t hwroot_idx; // index of hwroot in av_set list + uint32_t mcast_addr; // 13-bit simulated multcast address +}; + +struct cxip_coll_rank_key { + uint32_t hwroot_idx; // index of hwroot in av_set list + uint32_t rank; // rank of this object + bool rx_discard; // clear to report RX events +}; + +struct cxip_comm_key { + enum cxip_comm_key_type keytype; + union { + struct cxip_coll_mcast_key mcast; + struct cxip_coll_unicast_key ucast; + struct cxip_coll_rank_key rank; + }; +}; + +/* Extended reduction opcodes. + * + * Only the following standard FI_ATOMIC operations are supported: + * - FI_MIN : INT or FLT + * - FI_MAX : INT or FLT + * - FI_SUM : INT or FLT + * - FI_BOR : INT + * - FI_BAND : INT + * - FI_BXOR : INT + * + * The codes below extend this standard FI_ATOMIC set to explicitly take + * advantage of extended hardware operations. These can be used as opcodes for + * any of the collective operations, just like FI_MIN or FI_SUM. + * + * Note that the current FI_ATOMIC set ends at opcode == 19. We start this one + * at 32, to accommodate possible expansion of the FI_ATOMIC set, and check for + * overlap during initialization. + */ +enum cxip_coll_op { + FI_CXI_MINMAXLOC = 32, // FLT or INT + FI_CXI_REPSUM, // FLT only + FI_CXI_OP_LAST +}; + +/* Extended accelerated reduction structures. 
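+ *
+ * A hypothetical FI_CXI_MINMAXLOC reduction over double data using the
+ * libfabric collective API (coll_addr is the address produced by a
+ * completed fi_join_collective(); the datatype and count conventions
+ * for the extended opcodes are assumptions here):
+ *
+ *   struct cxip_coll_fltminmax in, out;
+ *
+ *   fi_allreduce(ep, &in, 1, NULL, &out, NULL, coll_addr, FI_DOUBLE,
+ *                (enum fi_op)FI_CXI_MINMAXLOC, 0, NULL);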
+ */ +struct cxip_coll_intminmax { + int64_t minval; + uint64_t minidx; + int64_t maxval; + uint64_t maxidx; +}; + +struct cxip_coll_fltminmax { + double minval; + uint64_t minidx; + double maxval; + uint64_t maxidx; +}; + +/* opaque export of struct cxip_coll_data */ +struct cxip_coll_accumulator { + uint8_t accum[64]; +}; + +#endif /* _FI_CXI_EXT_H_ */ diff --git a/prov/cxi/libfabric-cxi.spec.in b/prov/cxi/libfabric-cxi.spec.in new file mode 100644 index 00000000000..77a0b6613ff --- /dev/null +++ b/prov/cxi/libfabric-cxi.spec.in @@ -0,0 +1,52 @@ +%{!?configopts: %global configopts LDFLAGS=-Wl,--build-id} +%{!?provider: %define provider cxi} +%{!?provider_formal: %define provider_formal cxi} + +Name: libfabric-%{provider} +Version: @VERSION@ +Release: 1%{?dist} +Summary: Dynamic %{provider_formal} provider for user-space Open Fabric Interfaces +Group: System Environment/Libraries +License: GPLv2 or BSD +Url: http://www.github.com/ofiwg/libfabric +Source: http://www.github.org/ofiwg/%{name}/releases/download/v{%version}/libfabric-%{version}.tar.bz2 +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +Requires: libfabric +BuildRequires: libfabric + +%description +libfabric provides a user-space API to access high-performance fabric +services, such as RDMA. + +This RPM provides the %{provider_formal} provider as a "plugin" to an existing +libfabric installation. This plugin will override older %{provider_formal} +provider functionality in the existing libfabric installation. + +%prep +%setup -q -n libfabric-%{version} + +%build +%configure %{configopts} --enable-%{provider}=dl +make %{?_smp_mflags} + +%install +rm -rf %{buildroot} +%makeinstall installdirs + +%clean +rm -rf %{buildroot} + +%files +%defattr(-,root,root,-) +%{_libdir}/libfabric/*.so + +%exclude %{_libdir}/libfabric.* +%exclude %{_libdir}/libfabric/*.la +%exclude %{_libdir}/pkgconfig +%exclude %{_bindir} +%exclude %{_mandir} +%exclude %{_includedir} + +%changelog +* Wed May 24 2017 Open Fabrics Interfaces Working Group +- First release of specfile for packaging a single dl provider. diff --git a/prov/cxi/src/cxip_atomic.c b/prov/cxi/src/cxip_atomic.c new file mode 100644 index 00000000000..c71a762f02a --- /dev/null +++ b/prov/cxi/src/cxip_atomic.c @@ -0,0 +1,1744 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2018,2021-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* Cassini supports ONLY 1-element vectors, and this code presumes that the + * value is 1. + */ +_Static_assert(CXIP_AMO_MAX_IOV == 1, "Unexpected max IOV #"); + +/* Cassini supports ONLY 1-element packed IOVs. + */ +#define CXIP_AMO_MAX_PACKED_IOV (1) + +/** + * Data type codes for all of the supported fi_datatype values. 
+ */ +static enum c_atomic_type _cxip_amo_type_code[FI_DATATYPE_LAST] = { + [FI_INT8] = C_AMO_TYPE_INT8_T, + [FI_UINT8] = C_AMO_TYPE_UINT8_T, + [FI_INT16] = C_AMO_TYPE_INT16_T, + [FI_UINT16] = C_AMO_TYPE_UINT16_T, + [FI_INT32] = C_AMO_TYPE_INT32_T, + [FI_UINT32] = C_AMO_TYPE_UINT32_T, + [FI_INT64] = C_AMO_TYPE_INT64_T, + [FI_UINT64] = C_AMO_TYPE_UINT64_T, + [FI_FLOAT] = C_AMO_TYPE_FLOAT_T, + [FI_DOUBLE] = C_AMO_TYPE_DOUBLE_T, + [FI_FLOAT_COMPLEX] = C_AMO_TYPE_FLOAT_COMPLEX_T, + [FI_DOUBLE_COMPLEX] = C_AMO_TYPE_DOUBLE_COMPLEX_T, +}; +//TODO: C_AMO_TYPE_UINT128_T + +/** + * AMO operation codes for all of the fi_op values. + */ +static enum c_atomic_op _cxip_amo_op_code[FI_ATOMIC_OP_LAST] = { + [FI_MIN] = C_AMO_OP_MIN, + [FI_MAX] = C_AMO_OP_MAX, + [FI_SUM] = C_AMO_OP_SUM, + [FI_LOR] = C_AMO_OP_LOR, + [FI_LAND] = C_AMO_OP_LAND, + [FI_BOR] = C_AMO_OP_BOR, + [FI_BAND] = C_AMO_OP_BAND, + [FI_LXOR] = C_AMO_OP_LXOR, + [FI_BXOR] = C_AMO_OP_BXOR, + [FI_ATOMIC_READ] = C_AMO_OP_SUM, + + /* ATOMIC_WRITE is implemented as a CSWAP NE instead of SWAP. This + * allows for SWAP to be remapped to PCIe fadd. + */ + [FI_ATOMIC_WRITE] = C_AMO_OP_CSWAP, + [FI_CSWAP] = C_AMO_OP_CSWAP, + [FI_CSWAP_NE] = C_AMO_OP_CSWAP, + [FI_CSWAP_LE] = C_AMO_OP_CSWAP, + [FI_CSWAP_LT] = C_AMO_OP_CSWAP, + [FI_CSWAP_GE] = C_AMO_OP_CSWAP, + [FI_CSWAP_GT] = C_AMO_OP_CSWAP, + [FI_MSWAP] = C_AMO_OP_AXOR, /* special handling */ +}; + +/** + * AMO swap operation codes for the CSWAP comparison conditions. + */ +static enum c_cswap_op _cxip_amo_swpcode[FI_ATOMIC_OP_LAST] = { + [FI_CSWAP] = C_AMO_OP_CSWAP_EQ, + [FI_CSWAP_NE] = C_AMO_OP_CSWAP_NE, + [FI_CSWAP_LE] = C_AMO_OP_CSWAP_LE, + [FI_CSWAP_LT] = C_AMO_OP_CSWAP_LT, + [FI_CSWAP_GE] = C_AMO_OP_CSWAP_GE, + [FI_CSWAP_GT] = C_AMO_OP_CSWAP_GT, +}; + +/** + * Multi-dimensional array defining supported/unsupported operations. Bits + * correspond to the 14 possible fi_datatype values. The OP_VALID() macro will + * return a 1 if the (request,op,dt) triple is supported by Cassini. + */ +static uint16_t _cxip_amo_valid[CXIP_RQ_AMO_LAST][FI_ATOMIC_OP_LAST] = { + + [CXIP_RQ_AMO] = { + [FI_MIN] = 0x03ff, + [FI_MAX] = 0x03ff, + [FI_SUM] = 0x0fff, + [FI_LOR] = 0x00ff, + [FI_LAND] = 0x00ff, + [FI_LXOR] = 0x00ff, + [FI_BOR] = 0x00ff, + [FI_BAND] = 0x00ff, + [FI_BXOR] = 0x00ff, + [FI_ATOMIC_WRITE] = 0x0fff, + }, + + [CXIP_RQ_AMO_FETCH] = { + [FI_MIN] = 0x03ff, + [FI_MAX] = 0x03ff, + [FI_SUM] = 0x0fff, + [FI_LOR] = 0x00ff, + [FI_LAND] = 0x00ff, + [FI_LXOR] = 0x00ff, + [FI_BOR] = 0x00ff, + [FI_BAND] = 0x00ff, + [FI_BXOR] = 0x00ff, + [FI_ATOMIC_WRITE] = 0x0fff, + [FI_ATOMIC_READ] = 0x0fff, + }, + + [CXIP_RQ_AMO_SWAP] = { + [FI_CSWAP] = 0x0fff, + [FI_CSWAP_NE] = 0x0fff, + [FI_CSWAP_LE] = 0x03ff, + [FI_CSWAP_LT] = 0x03ff, + [FI_CSWAP_GE] = 0x03ff, + [FI_CSWAP_GT] = 0x03ff, + [FI_MSWAP] = 0x00ff, + }, + + [CXIP_RQ_AMO_PCIE_FETCH] = { + [FI_MIN] = 0x0, + [FI_MAX] = 0x0, + [FI_SUM] = 0xf0, + [FI_LOR] = 0x0, + [FI_LAND] = 0x0, + [FI_LXOR] = 0x0, + [FI_BOR] = 0x0, + [FI_BAND] = 0x0, + [FI_BXOR] = 0x0, + [FI_ATOMIC_WRITE] = 0x0, + [FI_ATOMIC_READ] = 0x0, + }, + +}; +#define OP_VALID(rq, op, dt) (_cxip_amo_valid[rq][op] & (1 << dt)) + +/** + * Supply opcodes for a request, and determine if the operation is supported. 
+ * + * @param req_type basic, fetch, or swap + * @param dt data type for operation + * @param op operation + * @param amo_remap_to_pcie_fadd NIC AMO operation which is remapped as PCIe + * fetch add + * @param cop Cassini code for operation + * @param cdt Cassini code for data type + * @param copswp Cassini code for cswap operation + * @param cdtlen Length of datatype in bytes + * + * @return int 0 on success, -FI_EOPNOTSUPP if operation is not supported + */ +int _cxip_atomic_opcode(enum cxip_amo_req_type req_type, enum fi_datatype dt, + enum fi_op op, int amo_remap_to_pcie_fadd, + enum c_atomic_op *cop, enum c_atomic_type *cdt, + enum c_cswap_op *copswp, unsigned int *cdtlen) +{ + int opcode; + int dtcode; + + if (dt < 0 || dt >= FI_DATATYPE_LAST || + op < 0 || op >= FI_ATOMIC_OP_LAST) + return -FI_EINVAL; + + if (!OP_VALID(req_type, op, dt)) + return -FI_EOPNOTSUPP; + + /* If the request is a PCIe fetching AMO, then the remap opcode is + * used. + * + * Note: Only fetching FI_SUM is supported as a PCIe AMO. + */ + if (req_type == CXIP_RQ_AMO_PCIE_FETCH) { + if (amo_remap_to_pcie_fadd >= 0) + opcode = amo_remap_to_pcie_fadd; + else + return -FI_EOPNOTSUPP; + } else { + opcode = _cxip_amo_op_code[op]; + if (opcode == amo_remap_to_pcie_fadd) + return -FI_EOPNOTSUPP; + } + + /* For fetching FI_SUMs done as a PCIe AMO, force signed data types to + * unsigned. This is required by the NIC to allow libfabric to support + * signed PCIe fetching FI_SUMs. + */ + dtcode = _cxip_amo_type_code[dt]; + if (req_type == CXIP_RQ_AMO_PCIE_FETCH) { + if (dtcode == C_AMO_TYPE_INT32_T) + dtcode = C_AMO_TYPE_UINT32_T; + else if (dtcode == C_AMO_TYPE_INT64_T) + dtcode = C_AMO_TYPE_UINT64_T; + } + + if (cop) + *cop = opcode; + if (cdt) + *cdt = dtcode; + if (cdtlen) + *cdtlen = ofi_datatype_size(dt); + if (copswp) { + if (op == FI_ATOMIC_WRITE) + *copswp = C_AMO_OP_CSWAP_NE; + else + *copswp = _cxip_amo_swpcode[op]; + } + + return 0; +} + +/** + * Implementation of the provider *_atomic_valid() functions. + * + * The returned count is the maximum number of atomic objects on which a single + * atomic call can operate. For Cassini, this is 1. + * + * @param ep endpoint + * @param req_type request type + * @param datatype datatype + * @param op operation + * @param count returns count of operations supported + * + * @return int 0 on success, -FI_EOPNOTSUPP if operation not supported + */ +static inline int _cxip_ep_valid(struct fid_ep *fid_ep, + enum cxip_amo_req_type req_type, + enum fi_datatype datatype, + enum fi_op op, + size_t *count) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + int ret; + + /* Check for a valid opcode */ + ret = _cxip_atomic_opcode(req_type, datatype, op, + ep->ep_obj->domain->amo_remap_to_pcie_fadd, + NULL, NULL, NULL, NULL); + if (ret < 0) + return ret; + + /* "Cassini implements single element atomics. There is no hardware + * support for packed atomics or IOVECs." -- CSDG + */ + if (count) + *count = CXIP_AMO_MAX_IOV; + + return 0; +} + +/* + * cxip_amo_inject_cb() - AMO inject event callback. + */ +static int cxip_amo_inject_cb(struct cxip_req *req, const union c_event *event) +{ + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. 
+ */ + if (event->hdr.event_type == C_EVENT_SEND) { + CXIP_WARN(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + int event_rc; + + event_rc = cxi_init_event_rc(event); + int ret_err; + + ret_err = proverr2errno(event_rc); + return cxip_cq_req_error(req, 0, ret_err, + cxi_event_rc(event), NULL, 0, + FI_ADDR_UNSPEC); +} + +/* + * cxip_amo_selective_completion_req() - Return request state associated with + * all AMO inject transactions on the transmit context. + * + * The request is freed when the TXC send CQ is closed. + */ +static struct cxip_req *cxip_amo_selective_completion_req(struct cxip_txc *txc) +{ + if (!txc->amo_selective_completion_req) { + struct cxip_req *req; + bool free_request = false; + + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) + return NULL; + + req->cb = cxip_amo_inject_cb; + req->context = (uint64_t)txc->context; + req->flags = FI_ATOMIC | FI_WRITE; + req->addr = FI_ADDR_UNSPEC; + + if (!txc->amo_selective_completion_req) + txc->amo_selective_completion_req = req; + else + free_request = true; + + if (free_request) + cxip_evtq_req_free(req); + } + + return txc->amo_selective_completion_req; +} + +/* + * cxip_amo_fetching_selective_completion_req() - Return request state + * associated with all fetching AMO inject transactions on the transmit context. + * + * The request is freed when the TXC send CQ is closed. + */ +static struct cxip_req * +cxip_amo_fetching_selective_completion_req(struct cxip_txc *txc) +{ + if (!txc->amo_fetch_selective_completion_req) { + struct cxip_req *req; + bool free_request = false; + + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) + return NULL; + + req->cb = cxip_amo_inject_cb; + req->context = (uint64_t)txc->context; + req->flags = FI_ATOMIC | FI_READ; + req->addr = FI_ADDR_UNSPEC; + + if (!txc->amo_fetch_selective_completion_req) + txc->amo_fetch_selective_completion_req = req; + else + free_request = true; + + if (free_request) + cxip_evtq_req_free(req); + } + + return txc->amo_fetch_selective_completion_req; +} + +/** + * Callback for non-fetching AMO operations. + * + * @param req AMO request structure + * @param event resulting event + */ +static int _cxip_amo_cb(struct cxip_req *req, const union c_event *event) +{ + int ret; + int event_rc; + int ret_err; + int success_event = (req->flags & FI_COMPLETION); + struct cxip_txc *txc = req->amo.txc; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + if (event->hdr.event_type == C_EVENT_SEND) { + TXC_WARN(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + /* Fetching AMO with flush requires two events. Only once two events are + * processed can the user-generated completion queue event be + * generated. In addition, since multiple initiator events are + * generated and zero assumptions can be made about the event order, + * counters cannot be incremented until both events are processed. + * This means that software must modify the counter (i/e it cannot be + * offloaded to hardware). 
+ */ + if (req->amo.fetching_amo_flush) { + req->amo.fetching_amo_flush_event_count++; + + if (event->hdr.event_type == C_EVENT_REPLY) + req->amo.fetching_amo_flush_event_rc = + cxi_init_event_rc(event); + + if (req->amo.fetching_amo_flush_event_count != 2) + return FI_SUCCESS; + + event_rc = req->amo.fetching_amo_flush_event_rc; + + if (req->amo.fetching_amo_flush_cntr) { + if (event_rc == C_RC_OK) + ret = cxip_cntr_mod(req->amo.fetching_amo_flush_cntr, + 1, false, false); + else + ret = cxip_cntr_mod(req->amo.fetching_amo_flush_cntr, + 1, false, true); + + if (ret != FI_SUCCESS) { + req->amo.fetching_amo_flush_event_count--; + return ret; + } + } + } else { + event_rc = cxi_init_event_rc(event); + } + + if (req->amo.result_md) + cxip_unmap(req->amo.result_md); + + if (req->amo.oper1_md) + cxip_unmap(req->amo.oper1_md); + + if (req->amo.ibuf) + cxip_txc_ibuf_free(txc, req->amo.ibuf); + + req->flags &= (FI_ATOMIC | FI_READ | FI_WRITE); + + if (event_rc == C_RC_OK) { + if (success_event) { + ret = cxip_cq_req_complete(req); + if (ret != FI_SUCCESS) + TXC_WARN_RET(txc, ret, + "Failed to report completion\n"); + } + } else { + ret_err = proverr2errno(event_rc); + + ret = cxip_cq_req_error(req, 0, ret_err, + event_rc, NULL, 0, + FI_ADDR_UNSPEC); + + if (ret != FI_SUCCESS) + TXC_WARN_RET(txc, ret, "Failed to report error\n"); + } + + ofi_atomic_dec32(&req->amo.txc->otx_reqs); + cxip_evtq_req_free(req); + + return FI_SUCCESS; +} + +/** + * Return true if vector specification is valid. + * + * vn must be > 0 and <= 1 (CXIP_AMO_MAX_IOV). Formally, we could do this test, + * but formally we would have to loop (once) over the vectors, and test each + * count for being > 0 and <= 1 (CXIP_AMO_MAX_PACKED_IOV). Instead, we just test + * to ensure that each is 1. + * + * @param vn vector element count + * @param v vector pointer + * + * @return bool true if vector is valid, false otherwise + */ +static inline bool _vector_valid(size_t vn, const struct fi_ioc *v) +{ + return (vn == CXIP_AMO_MAX_IOV && v && + v[0].count == CXIP_AMO_MAX_PACKED_IOV && + v[0].addr); +} + +/** + * Return true if RMA vector specification is valid. Note that the address is + * treated as an offset into an RMA MR window, so a value of zero is valid. + * + * @param vn vector element count + * @param v vector pointer + * + * @return bool true if RMA vector is valid, false otherwise + */ +static inline bool _rma_vector_valid(size_t vn, const struct fi_rma_ioc *v) +{ + return (vn == CXIP_AMO_MAX_IOV && v && + v[0].count == CXIP_AMO_MAX_PACKED_IOV); +} + +static bool cxip_amo_emit_idc_req_needed(uint64_t flags, void *result, + struct cxip_mr *result_mr, + bool fetching_amo_flush) +{ + /* User completion events always require a tracking structure. */ + if (flags & FI_COMPLETION) + return true; + + /* If a fetching operation (i.e. result buffer is valid) and the user + * did not provide an MR for the result arg, internal memory + * registration needs to occur. This requires tracking. + */ + if (result && !result_mr) + return true; + + /* Fetching AMO with flush always requires a request struct since two + * operations are required to implement it. + */ + if (fetching_amo_flush) + return true; + + return false; +} + +/* TODO: Update HMEM buf type for 128-bit AMOs. 
*/ +static int cxip_amo_emit_idc(struct cxip_txc *txc, + enum cxip_amo_req_type req_type, + const struct fi_msg_atomic *msg, void *buf, + void *compare, void *result, + struct cxip_mr *result_mr, uint64_t key, + uint64_t remote_offset, union c_fab_addr *dfa, + uint8_t *idx_ext, uint16_t vni, + enum c_atomic_op atomic_op, + enum c_cswap_op cswap_op, + enum c_atomic_type atomic_type, + unsigned int atomic_type_len, uint64_t flags, + uint32_t tclass) +{ + struct cxip_domain *dom = txc->domain; + struct cxip_md *result_md = NULL; + struct c_cstate_cmd cstate_cmd = {}; + struct c_idc_amo_cmd idc_amo_cmd = {}; + struct cxip_req *req = NULL; + bool flush = !!(flags & (FI_DELIVERY_COMPLETE | FI_MATCH_COMPLETE)); + bool fetching = result != NULL; + bool fetching_amo_flush = fetching && flush; + bool restricted = !!(flags & FI_CXI_UNRELIABLE); + int ret; + void *selective_completion_req; + enum cxi_traffic_class_type tc_type; + uint64_t hmem_buf; + uint64_t hmem_compare; + bool tgt_events = cxip_generic_is_mr_key_events(txc->ep_obj->caps, + key); + + /* MR desc cannot be value unless hybrid MR desc is enabled. */ + if (!dom->hybrid_mr_desc) + result_mr = NULL; + + /* Restricted AMOs must target optimized MRs without target events */ + if (restricted && tgt_events) { + TXC_WARN(txc, + "Restricted AMOs with FI_RMA_EVENT not supported\n"); + return -FI_EINVAL; + } + + /* Usage of the FI_CXI_HRP requires FI_CXI_UNRELIABLE. */ + if (flags & FI_CXI_HRP && !(flags & FI_CXI_UNRELIABLE)) { + TXC_WARN(txc, "FI_CXI_HRP requires FI_CXI_UNRELIABLE\n"); + return -FI_EINVAL; + } + + /* Since fetching AMO with flush results in two commands, if + * FI_RMA_EVENT is enabled, this would results in two remote MR counter + * increments. Thus, this functionality cannot be supported. + */ + if (fetching_amo_flush && tgt_events) { + TXC_WARN(txc, + "Fetching AMO with FI_DELIVERY_COMPLETE not supported with FI_RMA_EVENT\n"); + return -FI_EINVAL; + } + + /* Work around for silent drops at the target for non-fetching + * FI_UNIT32 atomic operations when using FI_CXI_HRP. Force + * switching out of HRP if necessary. + */ + if (txc->hrp_war_req && (flags & FI_CXI_HRP) && + req_type == CXIP_RQ_AMO && msg->datatype == FI_UINT32) + flags &= ~FI_CXI_HRP; + + ofi_genlock_lock(&txc->ep_obj->lock); + if (cxip_amo_emit_idc_req_needed(flags, result, result_mr, + fetching_amo_flush)) { + /* if (result && !result_mr) we end up in this branch */ + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) { + TXC_WARN(txc, "Failed to allocate request\n"); + ret = -FI_EAGAIN; + goto err; + } + + /* Values set here are passed back to the user through the CQ */ + if (flags & FI_COMPLETION) + req->context = (uint64_t)msg->context; + else + req->context = (uint64_t)txc->context; + req->flags = FI_ATOMIC; + req->flags |= (req_type == CXIP_RQ_AMO ? FI_WRITE : FI_READ); + req->flags |= (flags & FI_COMPLETION); + req->cb = _cxip_amo_cb; + req->amo.txc = txc; + req->amo.fetching_amo_flush = fetching_amo_flush; + req->type = CXIP_REQ_AMO; + + /* For fetching AMOs, the result buffer (i.e. fetch buffer) must + * always be registered. 
+ */ + if (result) { + if (result_mr) { + result_md = result_mr->md; + } else { + ret = cxip_map(dom, result, atomic_type_len, 0, + &req->amo.result_md); + if (ret) { + TXC_WARN_RET(txc, ret, + "Failed to map result buffer\n"); + goto err_free_req; + } + + result_md = req->amo.result_md; + } + } + } else if (result_mr) { + result_md = result_mr->md; + } + /* else {result == false} */ + + /* Identify the correct traffic class sub-type. */ + if (flags & FI_CXI_HRP) + tc_type = CXI_TC_TYPE_HRP; + else if (flags & FI_CXI_UNRELIABLE) + tc_type = CXI_TC_TYPE_RESTRICTED; + else + tc_type = CXI_TC_TYPE_DEFAULT; + + /* Prepare the c-state command for the AMO IDC operation. */ + if (result) + cstate_cmd.write_lac = result_md->md->lac; + + cstate_cmd.event_send_disable = 1; + cstate_cmd.index_ext = *idx_ext; + cstate_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cstate_cmd.restricted = restricted; + + /* If a request structure is not allocated, success events will be + * disabled. But, if for some reason the operation completes with an + * error, an event will occur. For this case, a TXC inject request is + * allocated. This request enables the reporting of failed operation to + * the completion queue. This request is freed when the TXC is closed. + */ + if (req) { + cstate_cmd.user_ptr = (uint64_t)req; + } else { + if (req_type == CXIP_RQ_AMO) + selective_completion_req = + cxip_amo_selective_completion_req(txc); + else + selective_completion_req = + cxip_amo_fetching_selective_completion_req(txc); + + if (!selective_completion_req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate selective completion request\n"); + goto err_unmap_result_buf; + } + + cstate_cmd.user_ptr = (uint64_t)selective_completion_req; + cstate_cmd.event_success_disable = 1; + } + + /* Fetching AMO with flushes requires a trailing zero-byte put with + * flush. Normal AMOs can use the operation flush functionality. + */ + if (!fetching_amo_flush) { + if (flush) + cstate_cmd.flush = 1; + + if (req_type == CXIP_RQ_AMO) { + if (txc->write_cntr) { + cstate_cmd.event_ct_ack = 1; + cstate_cmd.ct = txc->write_cntr->ct->ctn; + } + } else { + if (txc->read_cntr) { + cstate_cmd.event_ct_reply = 1; + cstate_cmd.ct = txc->read_cntr->ct->ctn; + } + } + } + + /* Prepare the IDC AMO command. */ + idc_amo_cmd.idc_header.dfa = *dfa; + idc_amo_cmd.idc_header.remote_offset = remote_offset; + idc_amo_cmd.atomic_op = atomic_op; + idc_amo_cmd.atomic_type = atomic_type; + idc_amo_cmd.cswap_op = cswap_op; + + /* if (result) {result_md is set} */ + if (result) + idc_amo_cmd.local_addr = CXI_VA_TO_IOVA(result_md->md, result); + + switch (msg->op) { + case FI_MSWAP: + ret = cxip_txc_copy_from_hmem(txc, NULL, &hmem_buf, buf, + atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, &hmem_compare, compare, + atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + hmem_buf &= hmem_compare; + + /* Note: 16-byte value will overflow into op2_word2 */ + memcpy(&idc_amo_cmd.op1_word1, &hmem_buf, atomic_type_len); + break; + + /* FI_ATOMIC_READ is implemented as a sum of zero. Thus, only copy over + * the buffer contents for non-FI_ATOMIC_READ operations. + */ + case FI_ATOMIC_READ: + break; + + /* FI_ATOMIC_WRITE is implemented as a CSWAP NE operation. For this to + * work, the compare buffer (i.e. 
operand 2) needs to have the same + * contents as the write payload (i.e. operand 1). + */ + case FI_ATOMIC_WRITE: + assert(compare == NULL); + + ret = cxip_txc_copy_from_hmem(txc, NULL, &idc_amo_cmd.op2_word1, + buf, atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + /* Fall through. */ + default: + ret = cxip_txc_copy_from_hmem(txc, NULL, &idc_amo_cmd.op1_word1, + buf, atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + } + + if (compare) { + /* Note: 16-byte value will overflow into op2_word2 */ + ret = cxip_txc_copy_from_hmem(txc, NULL, &idc_amo_cmd.op2_word1, + compare, atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + } + + /* Optionally configure the flushing command used for fetching AMOs. */ + if (fetching_amo_flush) { + assert(req != NULL); + if (req_type == CXIP_RQ_AMO) + req->amo.fetching_amo_flush_cntr = txc->write_cntr; + else + req->amo.fetching_amo_flush_cntr = txc->read_cntr; + } + + ret = cxip_txc_emit_idc_amo(txc, vni, cxip_ofi_to_cxi_tc(tclass), + tc_type, &cstate_cmd, &idc_amo_cmd, flags, + fetching, flush); + if (ret) { + TXC_WARN_RET(txc, ret, "Failed to emit IDC amo\n"); + goto err_unmap_result_buf; + } + + ofi_genlock_unlock(&txc->ep_obj->lock); + + return FI_SUCCESS; + +err_unmap_result_buf: + if (req && req->amo.result_md) + cxip_unmap(req->amo.result_md); +err_free_req: + if (req) + cxip_evtq_req_free(req); +err: + ofi_genlock_unlock(&txc->ep_obj->lock); + + TXC_WARN_RET(txc, ret, + "%s IDC %s failed: atomic_op=%u cswap_op=%u atomic_type=%u buf=%p compare=%p result=%p len=%u roffset=%#lx nid=%#x ep=%u idx_ext=%u\n", + restricted ? "Restricted" : "Unrestricted", + fetching ? "FAMO" : "AMO", atomic_op, cswap_op, + atomic_type, buf, compare, result, atomic_type_len, + remote_offset, dfa->unicast.nid, + dfa->unicast.endpoint_defined, *idx_ext); + + return ret; +} + +static bool cxip_amo_emit_dma_req_needed(const struct fi_msg_atomic *msg, + uint64_t flags, void *result, + struct cxip_mr *buf_mr, + struct cxip_mr *result_mr, + bool fetching_amo_flush) +{ + /* To support FI_INJECt + DMA operations, an internal bounce buffer is + * needed. This buffer is tracked in the request structure. + */ + if (flags & FI_INJECT) + return true; + + /* User completion events always require a tracking structure. */ + if (flags & FI_COMPLETION) + return true; + + /* If the user did not provide an MR for the buffer arg, internal memory + * registration needs to occur. This requires tracking. + */ + if (!buf_mr) + return true; + + /* If a fetching operation (i.e. result buffer is valid) and the user + * did not provide an MR for the result arg, internal memory + * registration needs to occur. This requires tracking. + */ + if (result && !result_mr) + return true; + + /* FI_ATOMIC_READ and FI_MSWAP are require the use of an internal bounce + * buffer. This requires tracking. + */ + if (msg->op == FI_ATOMIC_READ || msg->op == FI_MSWAP) + return true; + + /* Fetching AMO with flush always requires a request struct since two + * operations are required to implement it. + */ + if (fetching_amo_flush) + return true; + + return false; +} + +/* TODO: Update HMEM buf type for 128-bit AMOs. 
*/ +static int cxip_amo_emit_dma(struct cxip_txc *txc, + enum cxip_amo_req_type req_type, + const struct fi_msg_atomic *msg, void *buf, + void *compare, void *result, + struct cxip_mr *buf_mr, struct cxip_mr *result_mr, + uint64_t key, uint64_t remote_offset, + union c_fab_addr *dfa, uint8_t *idx_ext, + uint16_t vni, enum c_atomic_op atomic_op, + enum c_cswap_op cswap_op, + enum c_atomic_type atomic_type, + unsigned int atomic_type_len, uint64_t flags, + uint32_t tclass, bool triggered, + uint64_t trig_thresh, struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr) +{ + struct cxip_domain *dom = txc->domain; + struct c_dma_amo_cmd dma_amo_cmd = {}; + bool flush = !!(flags & (FI_DELIVERY_COMPLETE | FI_MATCH_COMPLETE)); + bool fetching = result != NULL; + bool fetching_amo_flush = fetching && flush; + struct cxip_req *req; + struct cxip_cntr *cntr; + int ret; + uint64_t hmem_buf; + uint64_t hmem_compare; + struct cxip_md *buf_md; + struct cxip_md *result_md = NULL; + void *selective_completion_req; + + /* MR desc cannot be value unless hybrid MR desc is enabled. */ + if (!dom->hybrid_mr_desc) { + buf_mr = NULL; + result_mr = NULL; + } + + /* Since fetching AMO with flush results in two commands, if the + * target MR needs events, this would results in two remote MR counter + * increments. Thus, this functionality cannot be supported. + */ + if (fetching_amo_flush && + cxip_generic_is_mr_key_events(txc->ep_obj->caps, key)) { + TXC_WARN(txc, + "Fetching AMO with FI_DELIVERY_COMPLETE not supported with FI_RMA_EVENT\n"); + return -FI_EINVAL; + } + + ofi_genlock_lock(&txc->ep_obj->lock); + if (cxip_amo_emit_dma_req_needed(msg, flags, result, buf_mr, result_mr, + fetching_amo_flush)) { + /* if (result && !result_mr) we end up in this branch */ + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) { + ret = -FI_EAGAIN; + TXC_WARN_RET(txc, ret, "Failed to allocate request\n"); + goto err; + } + + /* Values set here are passed back to the user through the CQ */ + if (flags & FI_COMPLETION) + req->context = (uint64_t)msg->context; + else + req->context = (uint64_t)txc->context; + req->flags = FI_ATOMIC; + req->flags |= (req_type == CXIP_RQ_AMO ? FI_WRITE : FI_READ); + req->flags |= (flags & FI_COMPLETION); + req->cb = _cxip_amo_cb; + req->amo.txc = txc; + req->amo.fetching_amo_flush = fetching_amo_flush; + req->type = CXIP_REQ_AMO; + req->trig_cntr = trig_cntr; + + /* Optionally register result MR. */ + if (result) { + if (!result_mr) { + ret = cxip_map(dom, result, atomic_type_len, 0, + &req->amo.result_md); + if (ret) { + TXC_WARN(txc, + "Failed to map result buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_req; + } + + result_md = req->amo.result_md; + } else { + result_md = result_mr->md; + } + } + + if ((flags & FI_INJECT) || msg->op == FI_ATOMIC_READ || + msg->op == FI_MSWAP) { + /* To support FI_INJECT ot FI_ATOMIC_READ with matching + * AMO commands, an internal buffer is needed to store + * the payload. + */ + req->amo.ibuf = cxip_txc_ibuf_alloc(txc); + if (!req->amo.ibuf) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate ibuf: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + switch (msg->op) { + /* FI_ATOMIC_READ is implemented as a sum of zero. Thus, + * zero internal buffer which is used for the sum + * operand. 
+ */ + case FI_ATOMIC_READ: + memset(req->amo.ibuf, 0, atomic_type_len); + break; + + case FI_MSWAP: + ret = cxip_txc_copy_from_hmem(txc, NULL, + &hmem_buf, buf, + atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, + &hmem_compare, + compare, + atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + + hmem_buf &= hmem_compare; + + memcpy(req->amo.ibuf, &hmem_buf, + atomic_type_len); + break; + + /* Copy over user payload for FI_INJECT operation. */ + default: + ret = cxip_txc_copy_from_hmem(txc, NULL, + req->amo.ibuf, + buf, + atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + } + + buf = req->amo.ibuf; + buf_md = cxip_txc_ibuf_md(req->amo.ibuf); + } else if (buf_mr) { + buf_md = buf_mr->md; + } else { + /* Map user operand buffer for DMA command. */ + ret = cxip_map(dom, buf, atomic_type_len, 0, + &req->amo.oper1_md); + if (ret) { + TXC_WARN(txc, + "Failed to map operand buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + buf_md = req->amo.oper1_md; + } + } else { + req = NULL; + + if (result) + result_md = result_mr->md; + + buf_md = buf_mr->md; + } + + /* Build up the matching AMO command. */ + dma_amo_cmd.dfa = *dfa; + dma_amo_cmd.index_ext = *idx_ext; + dma_amo_cmd.event_send_disable = 1; + dma_amo_cmd.remote_offset = remote_offset; + dma_amo_cmd.request_len = atomic_type_len; + dma_amo_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + dma_amo_cmd.match_bits = CXIP_KEY_MATCH_BITS(key); + dma_amo_cmd.atomic_op = atomic_op; + dma_amo_cmd.atomic_type = atomic_type; + dma_amo_cmd.cswap_op = cswap_op; + dma_amo_cmd.local_read_addr = CXI_VA_TO_IOVA(buf_md->md, buf); + dma_amo_cmd.lac = buf_md->md->lac; + + if (req) { + dma_amo_cmd.user_ptr = (uint64_t)req; + } else { + if (req_type == CXIP_RQ_AMO) + selective_completion_req = + cxip_amo_selective_completion_req(txc); + else + selective_completion_req = + cxip_amo_fetching_selective_completion_req(txc); + + if (!selective_completion_req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate selective completion request\n"); + goto err_unmap_operand_buf; + } + + dma_amo_cmd.user_ptr = (uint64_t)selective_completion_req; + dma_amo_cmd.event_success_disable = 1; + } + + /* FI_ATOMIC_WRITE is implemented as a CSWAP NE operation. For this to + * work, the compare buffer (i.e. operand 2) needs to have the same + * contents as the write payload (i.e. operand 1). 
+ */ + if (msg->op == FI_ATOMIC_WRITE) { + assert(compare == NULL); + + ret = cxip_txc_copy_from_hmem(txc, NULL, &dma_amo_cmd.op2_word1, + buf, atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + } else if (compare) { + /* Note: 16-byte value will overflow into op2_word2 */ + ret = cxip_txc_copy_from_hmem(txc, NULL, &dma_amo_cmd.op2_word1, + compare, atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + } + + /* if (result) {result_md is set} */ + if (result) { + dma_amo_cmd.local_write_addr = + CXI_VA_TO_IOVA(result_md->md, result); + dma_amo_cmd.write_lac = result_md->md->lac; + } + + /* Fetching AMO with flushes requires a trailing zero-byte put with + * Normal AMOs can use the operation flush functionality. + */ + if (!fetching_amo_flush) { + dma_amo_cmd.flush = flush; + + if (req_type == CXIP_RQ_AMO) { + cntr = triggered ? comp_cntr : txc->write_cntr; + + if (cntr) { + dma_amo_cmd.event_ct_ack = 1; + dma_amo_cmd.ct = cntr->ct->ctn; + } + } else { + cntr = triggered ? comp_cntr : txc->read_cntr; + + if (cntr) { + dma_amo_cmd.event_ct_reply = 1; + dma_amo_cmd.ct = cntr->ct->ctn; + } + } + } + + /* Optionally configure the flushing command used for fetching AMOs. */ + if (fetching_amo_flush) { + assert(req != NULL); + + if (req_type == CXIP_RQ_AMO) + req->amo.fetching_amo_flush_cntr = txc->write_cntr; + else + req->amo.fetching_amo_flush_cntr = txc->read_cntr; + } + + ret = cxip_txc_emit_dma_amo(txc, vni, cxip_ofi_to_cxi_tc(tclass), + CXI_TC_TYPE_DEFAULT, trig_cntr, trig_thresh, + &dma_amo_cmd, flags, fetching, flush); + if (ret) { + TXC_WARN_RET(txc, ret, "Failed to emit AMO\n"); + goto err_unmap_operand_buf; + } + + ofi_genlock_unlock(&txc->ep_obj->lock); + + return FI_SUCCESS; + +err_unmap_operand_buf: + if (req) { + if (req->amo.ibuf) + cxip_txc_ibuf_free(txc, req->amo.ibuf); + else + cxip_unmap(req->amo.oper1_md); + } +err_unmap_result_buf: + if (req && req->amo.result_md) + cxip_unmap(req->amo.result_md); +err_free_req: + if (req) + cxip_evtq_req_free(req); +err: + ofi_genlock_unlock(&txc->ep_obj->lock); + + TXC_WARN_RET(txc, ret, + "%s %s failed: atomic_op=%u cswap_op=%u atomic_type=%u buf=%p compare=%p result=%p len=%u rkey=%#lx roffset=%#lx nid=%#x ep=%u idx_ext=%u\n", + triggered ? "Triggered" : "DMA", fetching ? "FAMO" : "AMO", + atomic_op, cswap_op, atomic_type, buf, compare, result, + atomic_type_len, key, remote_offset, dfa->unicast.nid, + dfa->unicast.endpoint_defined, *idx_ext); + + return ret; +} + +static bool cxip_amo_is_idc(struct cxip_txc *txc, uint64_t key, bool triggered) +{ + /* Triggered AMOs can never be IDCs. */ + if (triggered) + return false; + + /* Only optimized MR can be used for IDCs. 
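+ * Standard MRs rely on match bits, which the DMA AMO command carries
+ * (see cxip_amo_emit_dma()) but the IDC AMO command does not.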
*/ + return cxip_generic_is_mr_key_opt(key); +} + +int cxip_amo_common(enum cxip_amo_req_type req_type, struct cxip_txc *txc, + uint32_t tclass, const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **comparedesc, + size_t compare_count, const struct fi_ioc *resultv, + void **resultdesc, size_t result_count, uint64_t flags, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, struct cxip_cntr *comp_cntr) +{ + void *buf; + void *compare = NULL; + void *result = NULL; + uint64_t remote_offset; + uint64_t key; + bool idc; + enum c_atomic_op atomic_op; + enum c_cswap_op cswap_op; + enum c_atomic_type atomic_type; + unsigned int atomic_type_len; + struct cxip_addr caddr; + int ret; + union c_fab_addr dfa; + uint8_t idx_ext; + uint32_t pid_idx; + struct cxip_mr *buf_mr = NULL; + struct cxip_mr *result_mr = NULL; + uint16_t vni; + + if (!msg) { + TXC_WARN(txc, "NULL fi_msg_atomic"); + return -FI_EINVAL; + } + + switch (req_type) { + case CXIP_RQ_AMO_SWAP: + /* Must have a valid compare address */ + if (!_vector_valid(compare_count, comparev)) { + TXC_WARN(txc, "compare IOV invalid\n"); + return -FI_EINVAL; + } + + compare = comparev[0].addr; + + /* FALLTHRU */ + case CXIP_RQ_AMO_FETCH: + case CXIP_RQ_AMO_PCIE_FETCH: + /* Must have a valid result address */ + if (!_vector_valid(result_count, resultv)) { + TXC_WARN(txc, "result IOV invalid\n"); + return -FI_EINVAL; + } + + result = resultv[0].addr; + if (resultdesc && resultdesc[0]) + result_mr = resultdesc[0]; + + /* FALLTHRU */ + case CXIP_RQ_AMO: + if (msg->op != FI_ATOMIC_READ) { + if (!_vector_valid(msg->iov_count, msg->msg_iov)) { + TXC_WARN(txc, "msg IOV invalid\n"); + return -FI_EINVAL; + } + buf = msg->msg_iov[0].addr; + if (msg->desc && msg->desc[0]) + buf_mr = msg->desc[0]; + } else { + buf = NULL; + buf_mr = NULL; + } + + /* The supplied RMA address is actually an offset into a + * registered MR. A value of 0 is valid. 
+ */ + if (!_rma_vector_valid(msg->rma_iov_count, msg->rma_iov)) { + TXC_WARN(txc, "RMA IOV invalid\n"); + return -FI_EINVAL; + } + + remote_offset = msg->rma_iov[0].addr; + key = msg->rma_iov[0].key; + + ret = cxip_adjust_remote_offset(&remote_offset, key); + if (ret) { + TXC_WARN(txc, "RMA IOV address overflow\n"); + return -FI_EINVAL; + } + break; + + default: + TXC_WARN(txc, "Invalid AMO request type: %d\n", req_type); + return -FI_EINVAL; + } + + if (!cxip_generic_is_valid_mr_key(key)) { + TXC_WARN(txc, "Invalid remote key: 0x%lx\n", key); + return -FI_EKEYREJECTED; + } + + idc = cxip_amo_is_idc(txc, key, triggered); + + /* Convert FI to CXI codes, fail if operation not supported */ + ret = _cxip_atomic_opcode(req_type, msg->datatype, msg->op, + txc->domain->amo_remap_to_pcie_fadd, + &atomic_op, &atomic_type, &cswap_op, + &atomic_type_len); + if (ret < 0) { + TXC_WARN_RET(txc, ret, "Failed to generate CXI AMO opcodes\n"); + return ret; + } + + /* Look up target CXI address */ + ret = cxip_av_lookup_addr(txc->ep_obj->av, msg->addr, &caddr); + if (ret != FI_SUCCESS) { + TXC_WARN_RET(txc, ret, "Failed to look up dst FI addr\n"); + return ret; + } + + if (txc->ep_obj->av_auth_key) + vni = caddr.vni; + else + vni = txc->ep_obj->auth_key.vni; + + pid_idx = cxip_generic_mr_key_to_ptl_idx(txc->domain, key, !result); + cxi_build_dfa(caddr.nic, caddr.pid, txc->pid_bits, pid_idx, &dfa, + &idx_ext); + if (idc) + ret = cxip_amo_emit_idc(txc, req_type, msg, buf, compare, + result, result_mr, key, remote_offset, + &dfa, &idx_ext, vni, atomic_op, + cswap_op, atomic_type, atomic_type_len, + flags, tclass); + else + ret = cxip_amo_emit_dma(txc, req_type, msg, buf, compare, + result, buf_mr, result_mr, key, + remote_offset, &dfa, &idx_ext, vni, + atomic_op, cswap_op, atomic_type, + atomic_type_len, flags, tclass, + triggered, trig_thresh, trig_cntr, + comp_cntr); + if (ret) + TXC_WARN_RET(txc, ret, + "%s AMO failed: op=%u buf=%p compare=%p result=%p len=%u rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u triggered=%u", + idc ? "IDC" : "DMA", msg->op, buf, compare, result, + atomic_type_len, key, remote_offset, caddr.nic, + caddr.pid, pid_idx, triggered); + else + TXC_DBG(txc, + "%s AMO emitted: op=%u buf=%p compare=%p result=%p len=%u rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u triggered=%u", + idc ? 
"IDC" : "DMA", msg->op, buf, compare, result, + atomic_type_len, key, remote_offset, caddr.nic, + caddr.pid, pid_idx, triggered); + + return ret; +} + +/* + * Libfabric APIs + */ +static ssize_t cxip_ep_atomic_write(struct fid_ep *fid_ep, const void *buf, + size_t count, void *desc, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_ioc oper1 = { + .addr = (void *)buf, + .count = count + }; + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = &oper1, + .desc = &desc, + .iov_count = 1, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, + NULL, NULL, 0, ep->tx_attr.op_flags, false, + 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_writev(struct fid_ep *fid_ep, + const struct fi_ioc *iov, void **desc, + size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = iov, + .desc = desc, + .iov_count = count, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, NULL, + NULL, 0, ep->tx_attr.op_flags, false, 0, + NULL, NULL); +} + +static ssize_t cxip_ep_atomic_writemsg(struct fid_ep *fid_ep, + const struct fi_msg_atomic *msg, + uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + + if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | + FI_CXI_UNRELIABLE | + FI_CXI_HRP | FI_CXI_WEAK_FENCE)) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_amo_common(CXIP_RQ_AMO, txc, ep->tx_attr.tclass, msg, + NULL, NULL, 0, NULL, NULL, 0, flags, false, 0, + NULL, NULL); +} + +static ssize_t cxip_ep_atomic_inject(struct fid_ep *fid_ep, const void *buf, + size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_ioc oper1 = { + .addr = (void *)buf, + .count = count + }; + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = &oper1, + .desc = NULL, + .iov_count = 1, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = NULL + }; + + return cxip_amo_common(CXIP_RQ_AMO, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, NULL, + NULL, 0, FI_INJECT, false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_readwrite(struct fid_ep *fid_ep, const void *buf, + size_t count, void *desc, void *result, + void *result_desc, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_ioc oper1 = { + .addr = (void *)buf, + .count = count + }; + struct fi_ioc resultv = { + .addr = result, + .count = count + }; + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = &oper1, + .desc = &desc, + .iov_count = 1, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO_FETCH, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, + &resultv, &result_desc, 1, ep->tx_attr.op_flags, + false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_readwritev(struct fid_ep *fid_ep, + const struct fi_ioc *iov, + void **desc, size_t count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = iov, + .desc = desc, + .iov_count = count, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO_FETCH, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, resultv, + result_desc, result_count, ep->tx_attr.op_flags, + false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_readwritemsg(struct fid_ep *fid_ep, + const struct fi_msg_atomic *msg, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + enum cxip_amo_req_type req_type; + + if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | + FI_CXI_UNRELIABLE | FI_CXI_WEAK_FENCE | + FI_CXI_PCIE_AMO)) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
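+ *
+ * For reference, a hypothetical caller-side PCIe fetch-add using the
+ * FI_CXI_PCIE_AMO flag accepted above (per _cxip_atomic_opcode(), only
+ * fetching FI_SUM is remapped to a PCIe AMO):
+ *
+ *   msg.op = FI_SUM;
+ *   fi_fetch_atomicmsg(ep, &msg, resultv, NULL, 1, FI_CXI_PCIE_AMO);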
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + if (flags & FI_CXI_PCIE_AMO) + req_type = CXIP_RQ_AMO_PCIE_FETCH; + else + req_type = CXIP_RQ_AMO_FETCH; + + return cxip_amo_common(req_type, txc, ep->tx_attr.tclass, msg, NULL, + NULL, 0, resultv, result_desc, result_count, + flags, false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_compwrite(struct fid_ep *fid_ep, const void *buf, + size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_ioc oper1 = { + .addr = (void *)buf, + .count = count + }; + struct fi_ioc comparev = { + .addr = (void *)compare, + .count = count + }; + struct fi_ioc resultv = { + .addr = result, + .count = count + }; + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = &oper1, + .desc = &desc, + .iov_count = 1, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO_SWAP, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, &comparev, + &result_desc, 1, &resultv, &result_desc, 1, + ep->tx_attr.op_flags, false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_compwritev(struct fid_ep *fid_ep, + const struct fi_ioc *iov, void **desc, + size_t count, + const struct fi_ioc *comparev, + void **compare_desc, + size_t compare_count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = iov, + .desc = desc, + .iov_count = count, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO_SWAP, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, comparev, compare_desc, + compare_count, resultv, result_desc, + result_count, ep->tx_attr.op_flags, false, 0, + NULL, NULL); +} + +static ssize_t +cxip_ep_atomic_compwritemsg(struct fid_ep *fid_ep, + const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **compare_desc, + size_t compare_count, struct fi_ioc *resultv, + void **result_desc, size_t result_count, + uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + + if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | + FI_CXI_UNRELIABLE | FI_CXI_WEAK_FENCE)) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_amo_common(CXIP_RQ_AMO_SWAP, txc, ep->tx_attr.tclass, msg, + comparev, compare_desc, compare_count, resultv, + result_desc, result_count, flags, false, 0, + NULL, NULL); +} + +static int cxip_ep_atomic_valid(struct fid_ep *ep, + enum fi_datatype datatype, + enum fi_op op, + size_t *count) +{ + return _cxip_ep_valid(ep, CXIP_RQ_AMO, datatype, op, count); +} + +static int cxip_ep_fetch_atomic_valid(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, + size_t *count) +{ + return _cxip_ep_valid(ep, CXIP_RQ_AMO_FETCH, datatype, op, count); +} + +static int cxip_ep_comp_atomic_valid(struct fid_ep *ep, + enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + return _cxip_ep_valid(ep, CXIP_RQ_AMO_SWAP, datatype, op, count); +} + +struct fi_ops_atomic cxip_ep_atomic_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = cxip_ep_atomic_write, + .writev = cxip_ep_atomic_writev, + .writemsg = cxip_ep_atomic_writemsg, + .inject = cxip_ep_atomic_inject, + .readwrite = cxip_ep_atomic_readwrite, + .readwritev = cxip_ep_atomic_readwritev, + .readwritemsg = cxip_ep_atomic_readwritemsg, + .compwrite = cxip_ep_atomic_compwrite, + .compwritev = cxip_ep_atomic_compwritev, + .compwritemsg = cxip_ep_atomic_compwritemsg, + .writevalid = cxip_ep_atomic_valid, + .readwritevalid = cxip_ep_fetch_atomic_valid, + .compwritevalid = cxip_ep_comp_atomic_valid, +}; + +struct fi_ops_atomic cxip_ep_atomic_no_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = fi_no_atomic_write, + .writev = fi_no_atomic_writev, + .writemsg = fi_no_atomic_writemsg, + .inject = fi_no_atomic_inject, + .readwrite = fi_no_atomic_readwrite, + .readwritev = fi_no_atomic_readwritev, + .readwritemsg = fi_no_atomic_readwritemsg, + .compwrite = fi_no_atomic_compwrite, + .compwritev = fi_no_atomic_compwritev, + .compwritemsg = fi_no_atomic_compwritemsg, + .writevalid = fi_no_atomic_writevalid, + .readwritevalid = fi_no_atomic_readwritevalid, + .compwritevalid = fi_no_atomic_compwritevalid, +}; diff --git a/prov/cxi/src/cxip_av.c b/prov/cxi/src/cxip_av.c new file mode 100644 index 00000000000..6dd4aa4e415 --- /dev/null +++ b/prov/cxi/src/cxip_av.c @@ -0,0 +1,947 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2018,2020 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxip.h" + +#include "ofi_osd.h" +#include "ofi_util.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_AV, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_AV, __VA_ARGS__) + +/* + * cxip_parse_cxi_addr() - Parse node and service arguments representing a CXI + * address. 
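+ * The node string may be an Ethernet MAC (converted with ether_aton()) or a
+ * numeric NIC address; the service string, when given, is parsed as a numeric
+ * PID, otherwise C_PID_ANY is used.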
+ */ +static int cxip_parse_cxi_addr(const char *node, const char *service, + struct cxip_addr *addr) +{ + struct ether_addr *mac; + uint32_t scan_nic; + uint32_t scan_pid; + + if (!node) + return -FI_ENODATA; + + mac = ether_aton(node); + if (mac) { + addr->nic = cxip_mac_to_nic(mac); + } else if (sscanf(node, "%i", &scan_nic) == 1) { + addr->nic = scan_nic; + } else { + return -FI_ENODATA; + } + + if (!service) + addr->pid = C_PID_ANY; + else if (sscanf(service, "%i", &scan_pid) == 1) + addr->pid = scan_pid; + else + return -FI_ENODATA; + + return FI_SUCCESS; +} + +static inline void cxip_av_read_lock(struct cxip_av *av) +{ + if (!av->lockless) + pthread_rwlock_rdlock(&av->lock); +} + +static inline void cxip_av_write_lock(struct cxip_av *av) +{ + if (!av->lockless) + pthread_rwlock_wrlock(&av->lock); +} + +static inline void cxip_av_unlock(struct cxip_av *av) +{ + if (!av->lockless) + pthread_rwlock_unlock(&av->lock); +} + +static int cxip_av_insert_addr(struct cxip_av *av, struct cxip_addr *addr, + fi_addr_t *fi_addr, uint64_t flags) +{ + struct cxip_av_entry *entry; + struct cxip_av_auth_key_entry *auth_key_entry = NULL; + struct cxip_addr auth_key_addr = { + .nic = addr->nic, + .pid = addr->pid + }; + + if (flags & FI_AUTH_KEY) { + auth_key_entry = + ofi_bufpool_get_ibuf(av->auth_key_entry_pool, *fi_addr); + if (!auth_key_entry) { + CXIP_WARN("Failed to find auth_key entry\n"); + return -FI_EINVAL; + } + + auth_key_addr.vni = auth_key_entry->key.vni; + } + + CXIP_DBG("Inserting nid=%#x pid=%d vni=%d\n", auth_key_addr.nic, + auth_key_addr.pid, auth_key_addr.vni); + + HASH_FIND(hh, av->av_entry_hash, &auth_key_addr, sizeof(auth_key_addr), + entry); + if (entry) { + if (fi_addr) + *fi_addr = ofi_buf_index(entry); + if (ofi_atomic_inc32(&entry->use_cnt) > 1) + CXIP_WARN("nid=%#x pid=%d inserted multiple times\n", + addr->nic, addr->pid); + + return FI_SUCCESS; + } + + entry = ofi_ibuf_alloc(av->av_entry_pool); + if (!entry) { + CXIP_WARN("Failed to allocated AV entry memory\n"); + if (fi_addr) + *fi_addr = FI_ADDR_NOTAVAIL; + return -FI_ENOMEM; + } + + memcpy(&entry->addr, &auth_key_addr, sizeof(auth_key_addr)); + ofi_atomic_initialize32(&entry->use_cnt, 1); + HASH_ADD(hh, av->av_entry_hash, addr, sizeof(entry->addr), entry); + + if (flags & FI_AV_USER_ID) + entry->fi_addr = *fi_addr; + else if (av->av_user_id) + entry->fi_addr = FI_ADDR_UNSPEC; + else + entry->fi_addr = ofi_buf_index(entry); + + if (fi_addr) + *fi_addr = ofi_buf_index(entry); + + if (auth_key_entry) { + entry->auth_key = auth_key_entry; + ofi_atomic_inc32(&auth_key_entry->ref_cnt); + } + + ofi_atomic_inc32(&av->av_entry_cnt); + + return FI_SUCCESS; +} + +#define AV_INSERT_VALID_FLAGS (FI_MORE | FI_AV_USER_ID | FI_AUTH_KEY) + +static int cxip_av_insert_validate_args(struct fid_av *fid, const void *addr_in, + size_t count, fi_addr_t *fi_addr, + uint64_t flags, void *context) +{ + uint64_t unsupported_flags = flags & ~AV_INSERT_VALID_FLAGS; + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + + if (!addr_in && count) { + CXIP_WARN("NULL addr buffer\n"); + return -FI_EINVAL; + } + + if (unsupported_flags) { + CXIP_WARN("Unsupported AV insert flags: %#lx\n", + unsupported_flags); + return -FI_EINVAL; + } + + /* FI_SYMMETRIC is an optimization using logical matching. This avoids + * doing a reverse lookup for support FI_SOURCE. Since no lookup + * occurs, FI_AV_USER_ID cannot be support. 
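+ * The validation below therefore rejects FI_AV_USER_ID when the AV was
+ * opened with FI_SYMMETRIC.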
+ */ + if (av->symmetric && (flags & FI_AV_USER_ID)) { + CXIP_WARN("FI_SYMMETRIC not supported with FI_AV_USER_ID\n"); + return -FI_EINVAL; + } + + if (av->av_user_id && (flags & FI_AV_USER_ID)) { + CXIP_WARN("FI_AV_USER_ID insert flags not supported with AV opened with FI_AV_USER_ID\n"); + return -FI_EINVAL; + } + + if (!fi_addr && (flags & FI_AV_USER_ID)) { + CXIP_WARN("NULL fi_addr with FI_AV_USER_ID\n"); + return -FI_EINVAL; + } + + if (!av->av_auth_key && (flags & FI_AUTH_KEY)) { + CXIP_WARN("FI_AUTH_KEY requested without FI_AV_AUTH_KEY domain config\n"); + return -FI_EINVAL; + } + + if (av->av_auth_key && !(flags & FI_AUTH_KEY)) { + CXIP_WARN("FI_AUTH_KEY must be used for AVs configured with FI_AV_AUTH_KEY\n"); + return -FI_EINVAL; + } + + if ((flags & FI_AUTH_KEY) && (flags & FI_AV_USER_ID)) { + CXIP_WARN("FI_AUTH_KEY and FI_AV_USER_ID are not supported together\n"); + return -FI_EINVAL; + } + + if ((flags & FI_AUTH_KEY) && !fi_addr) { + CXIP_WARN("NULL fi_addr array used with FI_AUTH_KEY\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +/* NETSIM collectives simulation reqires many-to-one fi_addr to cxip_addr, + * i.e., multiple fi_addr values that resolve to the same target address. The + * new reverse-lookup model requires unique one-to-one, i.e. every cxip_addr + * must be unique. These filter functions allow insert/lookup modifications + * of the values by replacing these functions in the test code. + */ +static struct cxip_addr *insert_in(const void *addr_in) +{ + return (struct cxip_addr *)addr_in; +} +static void insert_out(struct cxip_addr *addr_out, + struct cxip_addr *addr_in) +{ + *addr_out = *addr_in; +} +struct cxip_addr *(*cxip_av_addr_in)(const void *addr) = insert_in; +void (*cxip_av_addr_out)(struct cxip_addr *addr_out, + struct cxip_addr *addr) = insert_out; + +static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, + fi_addr_t *fi_addr, uint64_t flags, void *context) +{ + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + size_t i; + size_t success_cnt = 0; + int ret; + + ret = cxip_av_insert_validate_args(fid, addr_in, count, fi_addr, flags, + context); + if (ret != FI_SUCCESS) + return ret; + + cxip_av_write_lock(av); + + for (i = 0; i < count; i++) { + ret = cxip_av_insert_addr(av, cxip_av_addr_in(addr_in) + i, + fi_addr ? 
&fi_addr[i] : NULL, flags); + if (ret == FI_SUCCESS) + success_cnt++; + } + + cxip_av_unlock(av); + + return success_cnt; +} + +static int cxip_av_insertsvc_validate_args(struct fid_av *fid, const char *node, + const char *service, + fi_addr_t *fi_addr, uint64_t flags, + void *context) +{ + if (!node) { + CXIP_WARN("NULL node\n"); + return -FI_EINVAL; + } + + if (!service) { + CXIP_WARN("NULL service\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int cxip_av_insertsvc(struct fid_av *fid, const char *node, + const char *service, fi_addr_t *fi_addr, + uint64_t flags, void *context) +{ + struct cxip_addr addr = {}; + int ret; + + ret = cxip_av_insertsvc_validate_args(fid, node, service, fi_addr, + flags, context); + if (ret != FI_SUCCESS) + return ret; + + ret = cxip_parse_cxi_addr(node, service, &addr); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to parse node %s and service %s\n", node, + service); + return ret; + } + + return cxip_av_insert(fid, &addr, 1, fi_addr, flags, context); +} + +int cxip_av_lookup_addr(struct cxip_av *av, fi_addr_t fi_addr, + struct cxip_addr *addr) +{ + struct cxip_av_entry *entry; + + cxip_av_read_lock(av); + + entry = ofi_bufpool_get_ibuf(av->av_entry_pool, fi_addr); + if (entry && addr) + cxip_av_addr_out(addr, &entry->addr); + + cxip_av_unlock(av); + + if (entry) + return FI_SUCCESS; + + CXIP_WARN("Invalid fi_addr %#lx\n", fi_addr); + + return -FI_EINVAL; +} + +fi_addr_t cxip_av_lookup_fi_addr(struct cxip_av *av, + const struct cxip_addr *addr) +{ + struct cxip_av_entry *entry; + struct cxip_addr lookup_addr = *addr; + fi_addr_t fi_addr; + + /* Non-zero VNIs being inserted into the auth_key is ONLY supported with + * FI_AV_AUTH_KEY. + */ + if (!av->av_auth_key) + lookup_addr.vni = 0; + + cxip_av_read_lock(av); + + HASH_FIND(hh, av->av_entry_hash, &lookup_addr, sizeof(lookup_addr), + entry); + fi_addr = entry ? 
entry->fi_addr : FI_ADDR_NOTAVAIL; + + cxip_av_unlock(av); + + return fi_addr; +} + +int cxip_av_bind_ep(struct cxip_av *av, struct cxip_ep *ep) +{ + int ret; + + if (av->domain != ep->ep_obj->domain) { + CXIP_WARN("EP belongs to different domain\n"); + return -FI_EINVAL; + } + + cxip_av_write_lock(av); + ret = fid_list_insert(&av->ep_list, NULL, &ep->ep.fid); + cxip_av_unlock(av); + + if (ret != FI_SUCCESS) { + CXIP_WARN("EP bind failed: %d\n", ret); + return ret; + } + + ofi_atomic_inc32(&av->ref); + return FI_SUCCESS; +} + +void cxip_av_unbind_ep(struct cxip_av *av, struct cxip_ep *ep) +{ + cxip_av_write_lock(av); + fid_list_remove(&av->ep_list, NULL, &ep->ep.fid); + cxip_av_unlock(av); + + ofi_atomic_dec32(&av->ref); +} + +static int cxip_av_lookup(struct fid_av *fid, fi_addr_t fi_addr, void *addr_out, + size_t *addrlen) +{ + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + struct cxip_addr addr; + int ret; + + ret = cxip_av_lookup_addr(av, fi_addr, &addr); + if (ret != FI_SUCCESS) + return ret; + + memcpy(addr_out, &addr, MIN(*addrlen, sizeof(addr))); + *addrlen = sizeof(addr); + + return FI_SUCCESS; +} + +static void cxip_av_remove_addr(struct cxip_av *av, fi_addr_t fi_addr) +{ + struct cxip_av_entry *entry; + int use_cnt; + + entry = ofi_bufpool_get_ibuf(av->av_entry_pool, fi_addr); + if (!entry) { + CXIP_WARN("Invalid fi_addr: %#lx\n", fi_addr); + return; + } + + use_cnt = ofi_atomic_dec32(&entry->use_cnt); + if (use_cnt) + return; + + CXIP_DBG("Removing nid=%#x pid=%d\n", entry->addr.nic, + entry->addr.pid); + + if (entry->auth_key) + ofi_atomic_dec32(&entry->auth_key->ref_cnt); + + ofi_atomic_dec32(&av->av_entry_cnt); + HASH_DELETE(hh, av->av_entry_hash, entry); + ofi_ibuf_free(entry); +} + +static int cxip_av_remove_auth_key(struct cxip_av *av, fi_addr_t fi_addr) +{ + struct cxip_av_auth_key_entry *entry; + int use_cnt; + + entry = ofi_bufpool_get_ibuf(av->auth_key_entry_pool, fi_addr); + if (!entry) { + CXIP_WARN("Invalid fi_addr: %#lx\n", fi_addr); + return -FI_EINVAL; + } + + if (ofi_atomic_get32(&entry->ref_cnt)) { + CXIP_WARN("AV auth key still in use\n"); + return -FI_EBUSY; + } + + use_cnt = ofi_atomic_dec32(&entry->use_cnt); + if (use_cnt) + return FI_SUCCESS; + + CXIP_DBG("vni=%d\n", entry->key.vni); + + ofi_atomic_dec32(&av->auth_key_entry_cnt); + dlist_remove(&entry->entry); + HASH_DELETE(hh, av->auth_key_entry_hash, entry); + ofi_ibuf_free(entry); + + return FI_SUCCESS; +} + +#define AV_REMOVE_VALID_FLAGS FI_AUTH_KEY + +static int cxip_av_remove(struct fid_av *fid, fi_addr_t *fi_addr, + size_t count, uint64_t flags) +{ + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + uint64_t unsupported_flags = flags & ~AV_REMOVE_VALID_FLAGS; + size_t i; + int ret; + + if (unsupported_flags) { + CXIP_WARN("Unsupported flags: %#lx\n", unsupported_flags); + return -FI_EINVAL; + } + + cxip_av_write_lock(av); + + for (i = 0; i < count; i++) { + if (flags & FI_AUTH_KEY) { + ret = cxip_av_remove_auth_key(av, fi_addr[i]); + if (ret != FI_SUCCESS) + return ret; + } else { + cxip_av_remove_addr(av, fi_addr[i]); + } + } + + cxip_av_unlock(av); + + return FI_SUCCESS; +} + +static const char *cxip_av_straddr(struct fid_av *fid, const void *addr, + char *buf, size_t *len) +{ + return ofi_straddr(buf, len, FI_ADDR_CXI, addr); +} + +static int cxip_av_close(struct fid *fid) +{ + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + struct cxip_domain *dom = av->domain; + + if (ofi_atomic_get32(&av->ref)) + return -FI_EBUSY; + + 
HASH_CLEAR(hh, av->auth_key_entry_hash); + ofi_bufpool_destroy(av->auth_key_entry_pool); + HASH_CLEAR(hh, av->av_entry_hash); + ofi_bufpool_destroy(av->av_entry_pool); + free(av); + + ofi_atomic_dec32(&dom->ref); + + return FI_SUCCESS; +} + +static int cxip_av_insert_auth_key_validate_args(struct cxip_av *cxi_av, + const void *auth_key, + size_t auth_key_size, + fi_addr_t *fi_addr, + uint64_t flags) +{ + struct cxi_auth_key *key = (struct cxi_auth_key *)auth_key; + + if (!cxi_av->av_auth_key) { + CXIP_WARN("Domain not configured with FI_AV_AUTH_KEY\n"); + return -FI_EINVAL; + } + + if (!auth_key) { + CXIP_WARN("NULL auth_key\n"); + return -FI_EINVAL; + } + + if (auth_key_size != sizeof(struct cxi_auth_key)) { + CXIP_WARN("Bad auth_key_size\n"); + return -FI_EINVAL; + } + + if (!fi_addr) { + CXIP_WARN("NULL fi_addr\n"); + return -FI_EINVAL; + } + + if (flags) { + CXIP_WARN("Invalid flags\n"); + return -FI_EINVAL; + } + + if (ofi_atomic_get32(&cxi_av->auth_key_entry_cnt) >= + cxi_av->auth_key_entry_max) { + CXIP_WARN("AV EP max auth key count limit reached\n"); + return -FI_ENOSPC; + } + + return cxip_domain_valid_vni(cxi_av->domain, key->vni); +} + +static int cxip_av_insert_auth_key(struct fid_av *av, const void *auth_key, + size_t auth_key_size, fi_addr_t *fi_addr, + uint64_t flags) +{ + struct cxip_av *cxi_av = container_of(av, struct cxip_av, av_fid); + struct cxip_av_auth_key_entry *entry; + struct cxi_auth_key key; + int ret; + + ret = cxip_av_insert_auth_key_validate_args(cxi_av, auth_key, + auth_key_size, fi_addr, + flags); + if (ret != FI_SUCCESS) + return ret; + + /* Use a bounce buffer for authorization key to clear the service field. + * The service field is not needed for this AV auth key. + */ + memcpy(&key, auth_key, sizeof(key)); + key.svc_id = 0; + + CXIP_DBG("Inserting VNI=%d\n", key.vni); + + cxip_av_write_lock(cxi_av); + + HASH_FIND(hh, cxi_av->auth_key_entry_hash, &key, sizeof(key), entry); + if (entry) { + *fi_addr = ofi_buf_index(entry); + if (ofi_atomic_inc32(&entry->use_cnt) > 1) + CXIP_WARN("vni=%d inserted multiple times\n", key.vni); + + return FI_SUCCESS; + } + + entry = ofi_ibuf_alloc(cxi_av->auth_key_entry_pool); + if (!entry) { + CXIP_WARN("Failed to allocated AV auth key entry memory\n"); + return -FI_ENOMEM; + } + + memcpy(&entry->key, &key, sizeof(key)); + ofi_atomic_initialize32(&entry->use_cnt, 1); + ofi_atomic_initialize32(&entry->ref_cnt, 0); + HASH_ADD(hh, cxi_av->auth_key_entry_hash, key, sizeof(key), entry); + dlist_insert_tail(&entry->entry, &cxi_av->auth_key_entry_list); + ofi_atomic_inc32(&cxi_av->auth_key_entry_cnt); + + if (cxi_av->av_user_id) + entry->fi_addr = FI_ADDR_UNSPEC; + else + entry->fi_addr = ofi_buf_index(entry); + + *fi_addr = ofi_buf_index(entry); + + cxip_av_unlock(cxi_av); + + return FI_SUCCESS; +} + +static int cxip_av_lookup_auth_key_validate_args(struct cxip_av *cxi_av, + fi_addr_t addr, void *auth_key, + size_t *auth_key_size) +{ + if (!cxi_av->av_auth_key) { + CXIP_WARN("Domain not configured with FI_AV_AUTH_KEY\n"); + return -FI_EINVAL; + } + + if (!auth_key) { + CXIP_WARN("NULL auth_key\n"); + return -FI_EINVAL; + } + + if (!auth_key_size) { + CXIP_WARN("NULL auth_key_size\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int cxip_av_lookup_auth_key(struct fid_av *av, fi_addr_t addr, + void *auth_key, size_t *auth_key_size) +{ + struct cxip_av *cxi_av = container_of(av, struct cxip_av, av_fid); + struct cxip_av_auth_key_entry *entry; + int ret; + + ret = cxip_av_lookup_auth_key_validate_args(cxi_av, addr, 
auth_key, + auth_key_size); + if (ret != FI_SUCCESS) + return ret; + + cxip_av_read_lock(cxi_av); + + entry = ofi_bufpool_get_ibuf(cxi_av->auth_key_entry_pool, addr); + if (entry) { + *auth_key_size = MIN(sizeof(entry->key), *auth_key_size); + memcpy(auth_key, &entry->key, *auth_key_size); + *auth_key_size = sizeof(entry->key); + } + + cxip_av_unlock(cxi_av); + + if (entry) + return FI_SUCCESS; + + CXIP_WARN("Invalid fi_addr %#lx\n", addr); + + return -FI_EINVAL; +} + +fi_addr_t cxip_av_lookup_auth_key_fi_addr(struct cxip_av *av, unsigned int vni) +{ + struct cxip_av_auth_key_entry *entry; + struct cxi_auth_key lookup = { + .vni = vni, + }; + fi_addr_t addr; + + if (!av->av_auth_key) + return FI_ADDR_NOTAVAIL; + + cxip_av_read_lock(av); + + HASH_FIND(hh, av->auth_key_entry_hash, &lookup, sizeof(lookup), entry); + addr = entry ? entry->fi_addr : FI_ADDR_NOTAVAIL; + + cxip_av_unlock(av); + + return addr; +} + +int cxip_av_auth_key_get_vnis(struct cxip_av *av, uint16_t **vni, + size_t *vni_count) +{ + uint16_t *vnis; + size_t count; + struct cxip_av_auth_key_entry *entry; + int i; + int ret = FI_SUCCESS; + + cxip_av_read_lock(av); + + count = ofi_atomic_get32(&av->auth_key_entry_cnt); + if (count == 0) { + CXIP_WARN("AV auth key empty\n"); + ret = -FI_EINVAL; + goto unlock_out; + } + + vnis = calloc(count, sizeof(*vnis)); + if (!vnis) { + CXIP_WARN("Failed to allocate auth key VNI memory\n"); + ret = -FI_ENOMEM;; + goto unlock_out; + } + + i = 0; + dlist_foreach_container(&av->auth_key_entry_list, + struct cxip_av_auth_key_entry, entry, entry) { + ofi_atomic_inc32(&entry->ref_cnt); + vnis[i] = entry->key.vni; + i++; + } + + assert(count == i); + + *vni_count = count; + *vni = vnis; + +unlock_out: + cxip_av_unlock(av); + + return ret; +} + +void cxip_av_auth_key_put_vnis(struct cxip_av *av, uint16_t *vni, + size_t vni_count) +{ + size_t i; + struct cxip_av_auth_key_entry *entry; + + cxip_av_read_lock(av); + + for (i = 0; i < vni_count; i++) { + dlist_foreach_container(&av->auth_key_entry_list, + struct cxip_av_auth_key_entry, entry, entry) { + if (entry->key.vni == vni[i]) { + ofi_atomic_dec32(&entry->ref_cnt); + break; + } + } + } + + cxip_av_unlock(av); + + free(vni); +} + +#define AV_SET_USER_ID_VALID_FLAGS FI_AUTH_KEY + +static int cxip_av_set_user_id(struct fid_av *av, fi_addr_t fi_addr, + fi_addr_t user_id, uint64_t flags) +{ + struct cxip_av *cxi_av = container_of(av, struct cxip_av, av_fid); + struct cxip_av_entry *av_entry = NULL; + struct cxip_av_auth_key_entry *auth_key_entry = NULL; + uint64_t unsupported_flags = flags & ~AV_INSERT_VALID_FLAGS; + + if (!cxi_av->av_user_id) { + CXIP_WARN("AV not opened with FI_AV_AUTH_KEY\n"); + return -FI_EINVAL; + } + + if (unsupported_flags) { + CXIP_WARN("Unsupported AV set user id flags: %#lx\n", + unsupported_flags); + return -FI_EINVAL; + } + + cxip_av_write_lock(cxi_av); + + if (flags & FI_AUTH_KEY) { + auth_key_entry = + ofi_bufpool_get_ibuf(cxi_av->auth_key_entry_pool, + fi_addr); + if (auth_key_entry) + auth_key_entry->fi_addr = user_id; + } else { + av_entry = ofi_bufpool_get_ibuf(cxi_av->av_entry_pool, fi_addr); + if (av_entry) + av_entry->fi_addr = user_id; + } + + cxip_av_unlock(cxi_av); + + if (av_entry || auth_key_entry) + return FI_SUCCESS; + + CXIP_WARN("Invalid fi_addr %#lx\n", fi_addr); + + return -FI_EINVAL; +} + +static struct fi_ops_av cxip_av_fid_ops = { + .size = sizeof(struct fi_ops_av), + .insert = cxip_av_insert, + .insertsvc = cxip_av_insertsvc, + .insertsym = fi_no_av_insertsym, + .remove = cxip_av_remove, + .lookup = 
cxip_av_lookup, + .straddr = cxip_av_straddr, + .av_set = cxip_av_set, + .insert_auth_key = cxip_av_insert_auth_key, + .lookup_auth_key = cxip_av_lookup_auth_key, + .set_user_id = cxip_av_set_user_id, +}; + +static struct fi_ops cxip_av_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_av_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static int cxip_av_open_validate_args(struct cxip_domain *dom, + struct fi_av_attr *attr, + struct fid_av **avp, void *context) +{ + if (!attr) { + CXIP_WARN("NULL AV attributes\n"); + return -FI_EINVAL; + } + + if (!avp) { + CXIP_WARN("NULL AV\n"); + return -FI_EINVAL; + } + + if (!dom->av_user_id && (attr->flags & FI_AV_USER_ID)) { + CXIP_WARN("Domain not configured with FI_AV_USER_ID\n"); + return -FI_EINVAL; + } + + if (attr->rx_ctx_bits) { + CXIP_WARN("rx_ctx_bits non-zero. SEPs not supported.\n"); + return -FI_EINVAL; + } + + if (attr->name) { + CXIP_WARN("Shared AVs not supported\n"); + return -FI_EINVAL; + } + + if (attr->flags & FI_READ) { + CXIP_WARN("FI_READ and shared AVs not supported\n"); + return -FI_EINVAL; + } + + if (attr->flags & FI_EVENT) { + CXIP_WARN("FI_EVENT not supported\n"); + return -FI_EINVAL; + } + + switch (attr->type) { + case FI_AV_UNSPEC: + case FI_AV_MAP: + case FI_AV_TABLE: + break; + default: + CXIP_WARN("Invalid AV type: %d\n", attr->type); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +int cxip_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **avp, void *context) +{ + int ret; + struct cxip_av *av; + struct cxip_domain *dom; + struct ofi_bufpool_attr pool_attr = { + .size = sizeof(struct cxip_av_entry), + .flags = OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_INDEXED, + }; + size_t orig_size; + + dom = container_of(domain, struct cxip_domain, util_domain.domain_fid); + + ret = cxip_av_open_validate_args(dom, attr, avp, context); + if (ret != FI_SUCCESS) + goto err; + + av = calloc(1, sizeof(*av)); + if (!av) { + ret = -FI_ENOMEM; + goto err; + } + + /* Initialize embedded AV fields. */ + av->av_fid.fid.context = context; + av->av_fid.fid.fclass = FI_CLASS_AV; + av->av_fid.fid.ops = &cxip_av_fi_ops; + av->av_fid.ops = &cxip_av_fid_ops; + av->domain = dom; + dlist_init(&av->ep_list); + ofi_atomic_initialize32(&av->ref, 0); + av->lockless = dom->util_domain.threading == FI_THREAD_DOMAIN; + pthread_rwlock_init(&av->lock, NULL); + av->av_entry_hash = NULL; + av->symmetric = !!(attr->flags & FI_SYMMETRIC); + ofi_atomic_initialize32(&av->av_entry_cnt, 0); + av->av_auth_key = dom->av_auth_key; + av->auth_key_entry_hash = NULL; + dlist_init(&av->auth_key_entry_list); + ofi_atomic_initialize32(&av->auth_key_entry_cnt, 0); + av->auth_key_entry_max = dom->auth_key_entry_max; + av->av_user_id = !!(attr->flags & FI_AV_USER_ID); + + /* Cannot support symmetric with AV auth key. */ + if (av->av_auth_key) + av->symmetric = 0; + + /* Only FI_AV_TABLE is implemented. */ + av->type = attr->type == FI_AV_UNSPEC ? FI_AV_TABLE : attr->type; + + /* Allocate buffer pool and size it based on user input. */ + orig_size = attr->count ? 
attr->count : ofi_universe_size; + orig_size = roundup_power_of_two(orig_size); + pool_attr.chunk_cnt = orig_size; + ret = ofi_bufpool_create_attr(&pool_attr, &av->av_entry_pool); + if (ret) { + CXIP_WARN("Failed to allocate buffer pool: %d\n", ret); + goto err_free_av; + } + + pool_attr.size = sizeof(struct cxip_av_auth_key_entry); + pool_attr.chunk_cnt = av->auth_key_entry_max; + ret = ofi_bufpool_create_attr(&pool_attr, &av->auth_key_entry_pool); + if (ret) { + CXIP_WARN("Failed to allocate buffer pool: %d\n", ret); + goto err_free_av_buf_pool; + } + + ofi_atomic_inc32(&dom->ref); + + *avp = &av->av_fid; + + return FI_SUCCESS; + +err_free_av_buf_pool: + ofi_bufpool_destroy(av->av_entry_pool); +err_free_av: + free(av); +err: + return ret; +} diff --git a/prov/cxi/src/cxip_avset.c b/prov/cxi/src/cxip_avset.c new file mode 100644 index 00000000000..4bf7a445f25 --- /dev/null +++ b/prov/cxi/src/cxip_avset.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +/* + * Notes: + * + * To implement this as an extension of util_av_set requires that AV be an + * extension of util_av, which it currently is not. + * + * The bulk of the util code is involved with a point-to-point implementaion of + * collectives, and the util_av_set code is relatively trivial, and also has a + * bad bug in util_av_set_diff(). + * + * Our current plan is to implement only accelerated multicast operations in + * libfabric, and leave all point-to-point implementations to the regular MPI + * algorithms, which will (in general) be better optimized and tunable. + * + * At some future point, we can rework cxip_av to be an extension of util_av, + * eliminate this code in favor of the util_coll code, with custom + * implementations of the accelerated multicast operations. 
+ * + */ +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +int cxip_av_set_union(struct fid_av_set *dst, const struct fid_av_set *src) +{ + /* Must append to end */ + struct cxip_av_set *src_av_set; + struct cxip_av_set *dst_av_set; + size_t temp; + int i,j; + + src_av_set = container_of(src, struct cxip_av_set, av_set_fid); + dst_av_set = container_of(dst, struct cxip_av_set, av_set_fid); + + if (src_av_set->cxi_av != dst_av_set->cxi_av) + return -FI_EINVAL; + + if (dst_av_set->mc_obj) + return -FI_EPERM; + + /* New elements placed at end of dst */ + temp = dst_av_set->fi_addr_cnt; + for (i = 0; i < src_av_set->fi_addr_cnt; i++) { + for (j = 0; j < dst_av_set->fi_addr_cnt; j++) { + if (dst_av_set->fi_addr_ary[j] == + src_av_set->fi_addr_ary[i]) { + /* src[i] already in dst */ + break; + } + } + if (j == dst_av_set->fi_addr_cnt) { + /* src[i] gets added to end of dst */ + dst_av_set->fi_addr_ary[temp++] = + src_av_set->fi_addr_ary[i]; + } + } + /* temp >= dst_av_set->fi_addr_cnt */ + dst_av_set->fi_addr_cnt = temp; + return FI_SUCCESS; +} + +int cxip_av_set_intersect(struct fid_av_set *dst, const struct fid_av_set *src) +{ + /* Must preserve order */ + struct cxip_av_set *src_av_set; + struct cxip_av_set *dst_av_set; + int i,j, temp; + + src_av_set = container_of(src, struct cxip_av_set, av_set_fid); + dst_av_set = container_of(dst, struct cxip_av_set, av_set_fid); + + if (src_av_set->cxi_av != dst_av_set->cxi_av) + return -FI_EINVAL; + + if (dst_av_set->mc_obj) + return -FI_EPERM; + + /* Old elements removed from dst */ + temp = 0; + for (i = 0; i < dst_av_set->fi_addr_cnt; i++) { + for (j = 0; j < src_av_set->fi_addr_cnt; j++) { + if (dst_av_set->fi_addr_ary[i] == + src_av_set->fi_addr_ary[j]) { + /* dst[i] is in src, temp <= i */ + if (temp < i) { + dst_av_set->fi_addr_ary[temp] = + dst_av_set->fi_addr_ary[i]; + } + temp++; + break; + } + } + } + /* temp <= dst_av_set->fi_addr_cnt */ + dst_av_set->fi_addr_cnt = temp; + return FI_SUCCESS; +} + +int cxip_av_set_diff(struct fid_av_set *dst, const struct fid_av_set *src) +{ + /* Must preserve order */ + struct cxip_av_set *src_av_set; + struct cxip_av_set *dst_av_set; + int i,j, temp; + + src_av_set = container_of(src, struct cxip_av_set, av_set_fid); + dst_av_set = container_of(dst, struct cxip_av_set, av_set_fid); + + if (src_av_set->cxi_av != dst_av_set->cxi_av) + return -FI_EINVAL; + + if (dst_av_set->mc_obj) + return -FI_EPERM; + + /* Old elements removed from dst */ + temp = 0; + for (i = 0; i < dst_av_set->fi_addr_cnt; i++) { + for (j = 0; j < src_av_set->fi_addr_cnt; j++) { + if (dst_av_set->fi_addr_ary[i] == + src_av_set->fi_addr_ary[j]) + break; + } + if (j == src_av_set->fi_addr_cnt) { + /* temp <= i */ + if (temp < dst_av_set->fi_addr_cnt) { + dst_av_set->fi_addr_ary[temp] = + dst_av_set->fi_addr_ary[i]; + } + temp++; + } + } + /* temp <= dst_av_set->fi_addr_cnt */ + dst_av_set->fi_addr_cnt = temp; + return FI_SUCCESS; +} + +int cxip_av_set_insert(struct fid_av_set *set, fi_addr_t addr) +{ + /* Must append to end */ + struct cxip_av_set *av_set_obj; + int i; + + av_set_obj = container_of(set, struct cxip_av_set, av_set_fid); + + if (av_set_obj->mc_obj) + return -FI_EPERM; + + /* Do not insert duplicates */ + for (i = 0; i < av_set_obj->fi_addr_cnt; i++) { + if (av_set_obj->fi_addr_ary[i] == addr) + return -FI_EINVAL; + } + /* Append new value */ + av_set_obj->fi_addr_ary[av_set_obj->fi_addr_cnt++] = addr; + return FI_SUCCESS; +} + +int 
cxip_av_set_remove(struct fid_av_set *set, fi_addr_t addr) +{ + /* Must preserve ordering */ + struct cxip_av_set *av_set_obj; + int i; + + av_set_obj = container_of(set, struct cxip_av_set, av_set_fid); + + if (av_set_obj->mc_obj) + return -FI_EPERM; + + for (i = 0; i < av_set_obj->fi_addr_cnt; i++) { + if (av_set_obj->fi_addr_ary[i] == addr) + break; + } + if (i == av_set_obj->fi_addr_cnt) + return -FI_EINVAL; + + for (i++; i < av_set_obj->fi_addr_cnt; i++) + av_set_obj->fi_addr_ary[i-1] = av_set_obj->fi_addr_ary[i]; + av_set_obj->fi_addr_cnt--; + return FI_SUCCESS; +} + +int cxip_av_set_addr(struct fid_av_set *set, fi_addr_t *coll_addr) +{ + *coll_addr = FI_ADDR_NOTAVAIL; + return FI_SUCCESS; +} + +int cxip_close_av_set(struct fid *fid) +{ + struct cxip_av_set *cxi_av_set; + + cxi_av_set = container_of(fid, struct cxip_av_set, av_set_fid.fid); + if (cxi_av_set->mc_obj) + return -FI_EBUSY; + + ofi_atomic_dec32(&cxi_av_set->cxi_av->ref); + + free(cxi_av_set->fi_addr_ary); + free(cxi_av_set); + return FI_SUCCESS; +} + +static struct fi_ops_av_set cxip_av_set_ops= { + .set_union = cxip_av_set_union, + .intersect = cxip_av_set_intersect, + .diff = cxip_av_set_diff, + .insert = cxip_av_set_insert, + .remove = cxip_av_set_remove, + .addr = cxip_av_set_addr +}; + +static struct fi_ops cxip_av_set_fid_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_close_av_set, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static inline int fi_addr_is_valid(struct cxip_av *av, fi_addr_t fi_addr) +{ + return cxip_av_lookup_addr(av, fi_addr, NULL) == FI_SUCCESS; +} + +int cxip_av_set(struct fid_av *av, struct fi_av_set_attr *attr, + struct fid_av_set **av_set_fid, void *context) +{ + struct cxip_av *cxi_av; + struct cxip_av_set *cxi_set; + bool abeg, aend; + fi_addr_t start, end; + size_t count, stride; + fi_addr_t i, j; + int ret; + size_t max_size; + size_t av_entry_count; + + cxi_av = container_of(av, struct cxip_av, av_fid); + av_entry_count = cxip_av_entry_count(cxi_av); + + if (!attr) + return -FI_EINVAL; + + /* We need the AV to stick around now */ + ofi_atomic_inc32(&cxi_av->ref); + + /* May change values below, don't alter struct */ + start = attr->start_addr; + end = attr->end_addr; + count = attr->count; + stride = attr->stride; + abeg = (start != FI_ADDR_NOTAVAIL); + aend = (end != FI_ADDR_NOTAVAIL); + + /* Override everything for UNIVERSE flag */ + if (attr->flags & FI_UNIVERSE) { + start = FI_ADDR_NOTAVAIL; + end = FI_ADDR_NOTAVAIL; + count = FI_ADDR_NOTAVAIL; + stride = 1; + abeg = false; + aend = false; + } + + /* Common error for these syntax tests */ + ret = -FI_EINVAL; + + /* Must specify both, or neither */ + if (abeg != aend) + goto err0; + + /* Cannot specify a range for FI_AV_MAP */ + if (abeg && cxi_av->type == FI_AV_MAP) + goto err0; + + /* Cannot specify a range for empty AV set */ + if (abeg && count == 0) + goto err0; + + /* Comm_key data must match in our structure */ + if (attr->comm_key && attr->comm_key_size && + attr->comm_key_size != sizeof(struct cxip_comm_key)) + goto err0; + + /* Must specify a range if non-sequential stride */ + if (!abeg && stride > 1) + goto err0; + + /* Stride unspecified means sequential */ + if (stride == 0) + stride = 1; + + /* Resolve undefined range and count */ + if (start == FI_ADDR_NOTAVAIL) + start = 0; + if (end == FI_ADDR_NOTAVAIL) + end = 0; + if (count > end - start + 1) + count = end - start + 1; + + cxi_set = calloc(1,sizeof(*cxi_set)); + if (!cxi_set) { + ret = -FI_ENOMEM; + goto err0; + } + 
+ /* Allocate enough space to add all addresses */ + max_size = attr->count ? + attr->count : MAX(ofi_universe_size, av_entry_count); + cxi_set->fi_addr_ary = calloc(max_size, + sizeof(*cxi_set->fi_addr_ary)); + if (!cxi_set->fi_addr_ary) { + ret = -FI_ENOMEM; + goto err1; + } + + /* Add address indices */ + for (i=0, j=start; + i < count && j <= end && j < av_entry_count; + i++, j+=stride) { + /* Skip over invalid addresses as if not there */ + while (!fi_addr_is_valid(cxi_av, i)) { + if (++j >= av_entry_count) + break; + } + if (j >= av_entry_count) + break; + cxi_set->fi_addr_ary[i] = (fi_addr_t)j; + cxi_set->fi_addr_cnt++; + } + + /* copy comm_key from attributes, if present */ + if (attr->comm_key && attr->comm_key_size) { + memcpy(&cxi_set->comm_key, attr->comm_key, + attr->comm_key_size); + } + + cxi_set->av_set_fid.fid.fclass = FI_CLASS_AV_SET; + cxi_set->av_set_fid.fid.context = context; + cxi_set->av_set_fid.fid.ops = &cxip_av_set_fid_ops; + cxi_set->av_set_fid.ops = &cxip_av_set_ops; + cxi_set->cxi_av = cxi_av; + + *av_set_fid = &cxi_set->av_set_fid; + + return FI_SUCCESS; +err1: + free(cxi_set); +err0: + ofi_atomic_dec32(&cxi_av->ref); + return ret; +} diff --git a/prov/cxi/src/cxip_cmdq.c b/prov/cxi/src/cxip_cmdq.c new file mode 100644 index 00000000000..6d4b28efddf --- /dev/null +++ b/prov/cxi/src/cxip_cmdq.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +enum cxi_traffic_class cxip_ofi_to_cxi_tc(uint32_t ofi_tclass) +{ + switch (ofi_tclass) { + case FI_TC_BULK_DATA: + return CXI_TC_BULK_DATA; + case FI_TC_DEDICATED_ACCESS: + return CXI_TC_DEDICATED_ACCESS; + case FI_TC_LOW_LATENCY: + return CXI_TC_LOW_LATENCY; + case FI_TC_BEST_EFFORT: + case FI_TC_NETWORK_CTRL: + case FI_TC_SCAVENGER: + default: + return CXI_TC_BEST_EFFORT; + } +} + +static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxi_cp **cp) +{ + int ret; + int i; + struct cxip_remap_cp *sw_cp; + static const enum cxi_traffic_class remap_tc = CXI_TC_BEST_EFFORT; + + ofi_spin_lock(&lni->lock); + + /* Always prefer SW remapped CPs over allocating HW CP. */ + dlist_foreach_container(&lni->remap_cps, struct cxip_remap_cp, sw_cp, + remap_entry) { + if (sw_cp->remap_cp.vni == vni && sw_cp->remap_cp.tc == tc && + sw_cp->remap_cp.tc_type == tc_type) { + CXIP_DBG("Reusing SW CP: %u VNI: %u TC: %s TYPE: %s\n", + sw_cp->remap_cp.lcid, sw_cp->remap_cp.vni, + cxi_tc_to_str(sw_cp->remap_cp.tc), + cxi_tc_type_to_str(sw_cp->remap_cp.tc_type)); + *cp = &sw_cp->remap_cp; + goto success_unlock; + } + } + + /* Allocate a new SW remapped CP entry and attempt to allocate the + * user requested HW CP. + */ + sw_cp = calloc(1, sizeof(*sw_cp)); + if (!sw_cp) { + ret = -FI_ENOMEM; + goto err_unlock; + } + + ret = cxil_alloc_cp(lni->lni, vni, tc, tc_type, + &lni->hw_cps[lni->n_cps]); + if (ret) { + /* Attempt to fall back to remap traffic class with the same + * traffic class type and allocate HW CP if necessary. 
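+ * The fallback traffic class is remap_tc (CXI_TC_BEST_EFFORT, defined above).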
+ */ + CXIP_WARN("Failed to allocate CP, ret: %d VNI: %u TC: %s TYPE: %s\n", + ret, vni, cxi_tc_to_str(tc), + cxi_tc_type_to_str(tc_type)); + CXIP_WARN("Remapping original TC from %s to %s\n", + cxi_tc_to_str(tc), cxi_tc_to_str(remap_tc)); + + /* Check to see if a matching HW CP has already been allocated. + * If so, reuse the entry. + */ + for (i = 0; i < lni->n_cps; i++) { + if (lni->hw_cps[i]->vni == vni && + lni->hw_cps[i]->tc == remap_tc && + lni->hw_cps[i]->tc_type == tc_type) { + sw_cp->hw_cp = lni->hw_cps[i]; + goto found_hw_cp; + } + } + + /* Attempt to allocated a remapped HW CP. */ + ret = cxil_alloc_cp(lni->lni, vni, remap_tc, tc_type, + &lni->hw_cps[lni->n_cps]); + if (ret) { + CXIP_WARN("Failed to allocate CP, ret: %d VNI: %u TC: %s TYPE: %s\n", + ret, vni, cxi_tc_to_str(remap_tc), + cxi_tc_type_to_str(tc_type)); + ret = -FI_EINVAL; + goto err_free_sw_cp; + } + } + + CXIP_DBG("Allocated CP: %u VNI: %u TC: %s TYPE: %s\n", + lni->hw_cps[lni->n_cps]->lcid, vni, + cxi_tc_to_str(lni->hw_cps[lni->n_cps]->tc), + cxi_tc_type_to_str(lni->hw_cps[lni->n_cps]->tc_type)); + + sw_cp->hw_cp = lni->hw_cps[lni->n_cps++]; + +found_hw_cp: + sw_cp->remap_cp.vni = vni; + sw_cp->remap_cp.tc = tc; + sw_cp->remap_cp.tc_type = tc_type; + sw_cp->remap_cp.lcid = sw_cp->hw_cp->lcid; + dlist_insert_tail(&sw_cp->remap_entry, &lni->remap_cps); + + *cp = &sw_cp->remap_cp; + +success_unlock: + ofi_spin_unlock(&lni->lock); + + return FI_SUCCESS; + +err_free_sw_cp: + free(sw_cp); +err_unlock: + ofi_spin_unlock(&lni->lock); + + return ret; +} + +int cxip_txq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type) +{ + struct cxi_cp *cp; + int ret; + + if (cmdq->cur_cp->vni == vni && cmdq->cur_cp->tc == tc && + cmdq->cur_cp->tc_type == tc_type) + return FI_SUCCESS; + + ret = cxip_cp_get(cmdq->lni, vni, tc, tc_type, &cp); + if (ret != FI_SUCCESS) { + CXIP_DBG("Failed to get CP: %d\n", ret); + return -FI_EOTHER; + } + + ret = cxi_cq_emit_cq_lcid(cmdq->dev_cmdq, cp->lcid); + if (ret) { + CXIP_DBG("Failed to update CMDQ(%p) CP: %d\n", cmdq, ret); + ret = -FI_EAGAIN; + } else { + ret = FI_SUCCESS; + cmdq->cur_cp = cp; + + CXIP_DBG("Updated CMDQ(%p) CP: %d VNI: %u TC: %s TYPE: %s\n", + cmdq, cp->lcid, cp->vni, cxi_tc_to_str(cp->tc), + cxi_tc_type_to_str(cp->tc_type)); + } + + return ret; +} + +/* + * cxip_cmdq_alloc() - Allocate a command queue. + */ +int cxip_cmdq_alloc(struct cxip_lni *lni, struct cxi_eq *evtq, + struct cxi_cq_alloc_opts *cq_opts, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cmdq **cmdq) +{ + int ret; + struct cxi_cq *dev_cmdq; + struct cxip_cmdq *new_cmdq; + struct cxi_cp *cp = NULL; + + new_cmdq = calloc(1, sizeof(*new_cmdq)); + if (!new_cmdq) { + CXIP_WARN("Unable to allocate CMDQ structure\n"); + return -FI_ENOMEM; + } + + if (cq_opts->flags & CXI_CQ_IS_TX) { + ret = cxip_cp_get(lni, vni, tc, tc_type, &cp); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate CP: %d\n", ret); + return ret; + } + cq_opts->lcid = cp->lcid; + + new_cmdq->cur_cp = cp; + + /* Trig command queue can never use LL ring. 
*/ + if (cq_opts->flags & CXI_CQ_TX_WITH_TRIG_CMDS || + lni->iface->info->device_platform == CXI_PLATFORM_NETSIM) + new_cmdq->llring_mode = CXIP_LLRING_NEVER; + else + new_cmdq->llring_mode = cxip_env.llring_mode; + } else { + new_cmdq->llring_mode = CXIP_LLRING_NEVER; + } + + ret = cxil_alloc_cmdq(lni->lni, evtq, cq_opts, &dev_cmdq); + if (ret) { + CXIP_WARN("Failed to allocate %s, ret: %d\n", + cq_opts->flags & CXI_CQ_IS_TX ? "TXQ" : "TGQ", ret); + ret = -FI_ENOSPC; + goto free_cmdq; + } + + new_cmdq->dev_cmdq = dev_cmdq; + new_cmdq->lni = lni; + *cmdq = new_cmdq; + + return FI_SUCCESS; + +free_cmdq: + free(new_cmdq); + + return ret; +} + +/* + * cxip_cmdq_free() - Free a command queue. + */ +void cxip_cmdq_free(struct cxip_cmdq *cmdq) +{ + int ret; + + ret = cxil_destroy_cmdq(cmdq->dev_cmdq); + if (ret) + CXIP_WARN("cxil_destroy_cmdq failed, ret: %d\n", ret); + + free(cmdq); +} + +/* Must hold cmdq->lock. */ +int cxip_cmdq_emit_c_state(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state) +{ + int ret; + + if (memcmp(&cmdq->c_state, c_state, sizeof(*c_state))) { + ret = cxi_cq_emit_c_state(cmdq->dev_cmdq, c_state); + if (ret) { + CXIP_DBG("Failed to issue C_STATE command: %d\n", ret); + return -FI_EAGAIN; + } + + cmdq->c_state = *c_state; + } + + return FI_SUCCESS; +} + +int cxip_cmdq_emit_idc_put(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags) +{ + int ret; + + if (flags & (FI_FENCE | FI_CXI_WEAK_FENCE)) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + } + + ret = cxip_cmdq_emit_c_state(cmdq, c_state); + if (ret) { + CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxi_cq_emit_idc_put(cmdq->dev_cmdq, put, buf, len); + if (ret) { + CXIP_WARN("Failed to emit idc_put command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + + return FI_SUCCESS; +} + +int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, + uint64_t flags) +{ + int ret; + + if (flags & (FI_FENCE | FI_CXI_WEAK_FENCE)) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + } + + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, dma); + if (ret) { + CXIP_WARN("Failed to emit dma command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + + return FI_SUCCESS; +} + +int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush) +{ + struct c_full_dma_cmd flush_cmd; + bool fetching_flush = fetching && flush; + int ret; + + if (fetching_flush) { + memset(&flush_cmd, 0, sizeof(flush_cmd)); + flush_cmd.command.opcode = C_CMD_PUT; + flush_cmd.index_ext = c_state->index_ext; + flush_cmd.event_send_disable = 1; + flush_cmd.dfa = amo->idc_header.dfa; + flush_cmd.remote_offset = amo->idc_header.remote_offset; + flush_cmd.eq = c_state->eq; + flush_cmd.user_ptr = c_state->user_ptr; + flush_cmd.flush = 1; + } + + if (flags & (FI_FENCE | FI_CXI_WEAK_FENCE)) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + } + + ret = cxip_cmdq_emit_c_state(cmdq, 
c_state); + if (ret) { + CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Fetching AMO with flush requires two commands. Ensure there is enough + * space. At worse at least 16x 32-byte slots are needed. + */ + if (fetching_flush && __cxi_cq_free_slots(cmdq->dev_cmdq) < 16) { + CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n"); + return -FI_EAGAIN; + } + + ret = cxi_cq_emit_idc_amo(cmdq->dev_cmdq, amo, fetching); + if (ret) { + CXIP_WARN("Failed to emit IDC amo\n"); + return -FI_EAGAIN; + } + + if (fetching_flush) { + /* CQ space check already occurred. Thus, return code can be + * ignored. + */ + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &flush_cmd); + assert(ret == 0); + } + + return FI_SUCCESS; +} + +int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, + uint64_t flags, bool fetching, bool flush) +{ + struct c_full_dma_cmd flush_cmd; + bool fetching_flush = fetching && flush; + int ret; + + if (fetching_flush) { + memset(&flush_cmd, 0, sizeof(flush_cmd)); + flush_cmd.command.opcode = C_CMD_PUT; + flush_cmd.index_ext = amo->index_ext; + flush_cmd.event_send_disable = 1; + flush_cmd.dfa = amo->dfa; + flush_cmd.remote_offset = amo->remote_offset; + flush_cmd.eq = amo->eq; + flush_cmd.user_ptr = amo->user_ptr; + flush_cmd.flush = 1; + flush_cmd.match_bits = amo->match_bits; + } + + if (flags & (FI_FENCE | FI_CXI_WEAK_FENCE)) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + } + + /* Fetching AMO with flush requires two commands. Ensure there is enough + * space. At worse at least 16x 32-byte slots are needed. + */ + if (fetching_flush && __cxi_cq_free_slots(cmdq->dev_cmdq) < 16) { + CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n"); + return -FI_EAGAIN; + } + + ret = cxi_cq_emit_dma_amo(cmdq->dev_cmdq, amo, fetching); + if (ret) { + CXIP_WARN("Failed to emit DMA amo\n"); + return -FI_EAGAIN; + } + + if (fetching_flush) { + /* CQ space check already occurred. Thus, return code can be + * ignored. + */ + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &flush_cmd); + assert(ret == 0); + } + + return FI_SUCCESS; +} diff --git a/prov/cxi/src/cxip_cntr.c b/prov/cxi/src/cxip_cntr.c new file mode 100644 index 00000000000..2f7354330ac --- /dev/null +++ b/prov/cxi/src/cxip_cntr.c @@ -0,0 +1,865 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "cxip.h" + +#include + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_DATA, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_WARN(FI_LOG_EP_DATA, __VA_ARGS__) +#define CXIP_WARN(...) 
_CXIP_WARN(FI_LOG_EP_DATA, __VA_ARGS__) + +static int cxip_cntr_copy_ct_writeback(struct cxip_cntr *cntr, + struct c_ct_writeback *wb_copy) +{ + struct cxip_domain *dom = cntr->domain; + ssize_t ret __attribute__((unused)); + struct iovec hmem_iov; + + if (cntr->wb_iface == FI_HMEM_SYSTEM) { + memcpy(wb_copy, cntr->wb, sizeof(*cntr->wb)); + return FI_SUCCESS; + } + + if (cntr->wb_handle_valid) { + ret = ofi_hmem_dev_reg_copy_from_hmem(cntr->wb_iface, + cntr->wb_handle, wb_copy, + cntr->wb, + sizeof(*cntr->wb)); + assert(ret == FI_SUCCESS); + return FI_SUCCESS; + } + + hmem_iov.iov_base = cntr->wb; + hmem_iov.iov_len = sizeof(*cntr->wb); + + ret = dom->hmem_ops.copy_from_hmem_iov(wb_copy, sizeof(*cntr->wb), + cntr->wb_iface, cntr->wb_device, + &hmem_iov, 1, 0); + assert(ret == sizeof(*wb_copy)); + return FI_SUCCESS; +} + +static int cxip_cntr_get_ct_error(struct cxip_cntr *cntr, uint64_t *error) +{ + struct c_ct_writeback wb_copy; + int ret; + + /* Only can reference the ct_failure field directly if dealing with + * system memory. Device memory requires a memcpy of the contents into + * system memory. + */ + if (cntr->wb_iface == FI_HMEM_SYSTEM) { + *error = cntr->wb->ct_failure; + return FI_SUCCESS; + } + + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + *error = wb_copy.ct_failure; + return FI_SUCCESS; +} + +static int cxip_cntr_get_ct_success(struct cxip_cntr *cntr, uint64_t *success) +{ + struct c_ct_writeback wb_copy; + int ret; + + /* Only can reference the ct_success field directly if dealing with + * system memory. Device memory requires a memcpy of the contents into + * system memory. + */ + if (cntr->wb_iface == FI_HMEM_SYSTEM) { + *success = cntr->wb->ct_success; + return FI_SUCCESS; + } + + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + *success = wb_copy.ct_success; + return FI_SUCCESS; +} + +#define CT_WRITEBACK_OFFSET 7U + +static int cxip_cntr_clear_ct_writeback(struct cxip_cntr *cntr) +{ + struct iovec hmem_iov; + ssize_t ret __attribute__((unused)); + uint8_t ct_writeback; + + /* Only can reference the ct_success field directly if dealing with + * system memory. Device memory requires a memcpy of the contents into + * device memory. + */ + if (cntr->wb_iface == FI_HMEM_SYSTEM) { + cntr->wb->ct_writeback = 0; + return FI_SUCCESS; + } + + /* Only write to ct_writeback byte. */ + ct_writeback = 0; + hmem_iov.iov_base = (char *)cntr->wb + CT_WRITEBACK_OFFSET; + hmem_iov.iov_len = 1; + + ret = cntr->domain->hmem_ops.copy_to_hmem_iov(cntr->wb_iface, 0, + &hmem_iov, 1, 0, + &ct_writeback, 1); + assert(ret == 1); + + return FI_SUCCESS; +} + +static int cxip_cntr_get_ct_writeback(struct cxip_cntr *cntr) +{ + struct c_ct_writeback wb_copy; + int ret; + + /* Only can reference the ct_writeback field directly if dealing with + * system memory. Device memory requires a memcpy of the contents into + * system memory. 
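+ * Returns the ct_writeback flag value, or a negative value if the copy from
+ * device memory fails.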
+ */ + if (cntr->wb_iface == FI_HMEM_SYSTEM) + return cntr->wb->ct_writeback; + + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + return wb_copy.ct_writeback; +} + +#define TRIG_OP_LOCK_NAME_FMT "/.uuid%d_cxi%d_vni%d_svcid%d" +#define TRIG_OP_LOCK_NAME_SIZE 256U + +static int cxip_dom_cntr_enable(struct cxip_domain *dom) +{ + char trig_op_lock_name[TRIG_OP_LOCK_NAME_SIZE]; + struct cxi_cq_alloc_opts cq_opts = { + .policy = CXI_CQ_UPDATE_ALWAYS, + }; + int ret; + + ofi_spin_lock(&dom->lock); + + if (dom->cntr_init) { + ofi_spin_unlock(&dom->lock); + return FI_SUCCESS; + } + + assert(dom->enabled); + + ret = snprintf(trig_op_lock_name, TRIG_OP_LOCK_NAME_SIZE, + TRIG_OP_LOCK_NAME_FMT, getuid(), + dom->iface->dev->info.dev_id, dom->auth_key.vni, + dom->auth_key.svc_id); + if (ret >= TRIG_OP_LOCK_NAME_SIZE) { + CXIP_WARN("snprintf buffer too small\n"); + ret = -FI_ENOSPC; + goto err_unlock; + } else if (ret < 0) { + CXIP_WARN("snprintf failed: %d\n", ret); + goto err_unlock; + } + + dom->trig_op_lock = sem_open(trig_op_lock_name, O_CREAT, + S_IRUSR | S_IWUSR, 1); + if (dom->trig_op_lock == SEM_FAILED) { + ret = -errno; + CXIP_WARN("sem_open failed: %d\n", ret); + goto err_unlock; + } + + cq_opts.count = MAX(dom->max_trig_op_in_use, 64); + cq_opts.flags = CXI_CQ_IS_TX | CXI_CQ_TX_WITH_TRIG_CMDS; + cq_opts.policy = CXI_CQ_UPDATE_ALWAYS; + + ret = cxip_cmdq_alloc(dom->lni, NULL, &cq_opts, + dom->auth_key.vni, + cxip_ofi_to_cxi_tc(dom->tclass), + CXI_TC_TYPE_DEFAULT, + &dom->trig_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate trig_cmdq: %d\n", ret); + goto err_close_sem; + } + + if (dom->util_domain.threading == FI_THREAD_DOMAIN) + ofi_genlock_init(&dom->trig_cmdq_lock, OFI_LOCK_NONE); + else + ofi_genlock_init(&dom->trig_cmdq_lock, OFI_LOCK_SPINLOCK); + + dom->cntr_init = true; + + CXIP_DBG("Domain counters enabled: %p\n", dom); + + ofi_spin_unlock(&dom->lock); + + return FI_SUCCESS; + +err_close_sem: + sem_close(dom->trig_op_lock); +err_unlock: + ofi_spin_unlock(&dom->lock); + + return ret; +} + +void cxip_dom_cntr_disable(struct cxip_domain *dom) +{ + char trig_op_lock_name[TRIG_OP_LOCK_NAME_SIZE]; + int ret; + + if (dom->cntr_init) { + ofi_genlock_destroy(&dom->trig_cmdq_lock); + + sem_close(dom->trig_op_lock); + + ret = snprintf(trig_op_lock_name, TRIG_OP_LOCK_NAME_SIZE, + TRIG_OP_LOCK_NAME_FMT, getuid(), + dom->iface->dev->info.dev_id, dom->auth_key.vni, + dom->auth_key.svc_id); + if (ret >= TRIG_OP_LOCK_NAME_SIZE) + CXIP_WARN("snprintf buffer too small\n"); + else if (ret < 0) + CXIP_WARN("snprintf failed: %d\n", ret); + else + sem_unlink(trig_op_lock_name); + + cxip_cmdq_free(dom->trig_cmdq); + } +} + +const struct fi_cntr_attr cxip_cntr_attr = { + .events = FI_CNTR_EVENTS_COMP, + .wait_obj = FI_WAIT_YIELD, + .wait_set = NULL, + .flags = 0, +}; + +/* + * cxip_cntr_mod() - Modify counter value. + * + * Set or increment the success or failure value of a counter by 'value'. 
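+ * Increments and zero resets use the counting-event doorbells; setting a
+ * non-zero value requires a C_CMD_CT_SET command on the domain's triggered
+ * command queue.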
+ */ +int cxip_cntr_mod(struct cxip_cntr *cxi_cntr, uint64_t value, bool set, + bool err) +{ + struct c_ct_cmd cmd; + struct cxip_cmdq *cmdq; + int ret; + + if (!set) { + /* Doorbell supports counter increment */ + if (err) + cxi_ct_inc_failure(cxi_cntr->ct, value); + else + cxi_ct_inc_success(cxi_cntr->ct, value); + } else { + /* Doorbell supports counter reset */ + if (!value) { + if (err) + cxi_ct_reset_failure(cxi_cntr->ct); + else + cxi_ct_reset_success(cxi_cntr->ct); + } else { + memset(&cmd, 0, sizeof(cmd)); + cmdq = cxi_cntr->domain->trig_cmdq; + + /* Use CQ to set a specific counter value */ + cmd.ct = cxi_cntr->ct->ctn; + if (err) { + cmd.set_ct_failure = 1; + cmd.ct_failure = value; + } else { + cmd.set_ct_success = 1; + cmd.ct_success = value; + } + ofi_genlock_lock(&cxi_cntr->domain->trig_cmdq_lock); + + ret = cxi_cq_emit_ct(cmdq->dev_cmdq, C_CMD_CT_SET, + &cmd); + if (ret) { + ofi_genlock_unlock(&cxi_cntr->domain->trig_cmdq_lock); + return -FI_EAGAIN; + } + cxi_cq_ring(cmdq->dev_cmdq); + ofi_genlock_unlock(&cxi_cntr->domain->trig_cmdq_lock); + } + } + + return FI_SUCCESS; +} + +static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) +{ + int ret; + + /* The calling thread which changes CT writeback bit from 1 to 0 must + * issue a CT get command. + */ + ofi_mutex_lock(&cntr->lock); + + ret = cxip_cntr_get_ct_writeback(cntr); + if (ret < 0) { + CXIP_WARN("Failed to read counter writeback: rc=%d\n", ret); + goto err_unlock; + } + + if (ret) { + ret = cxip_cntr_clear_ct_writeback(cntr); + if (ret) { + CXIP_WARN("Failed to clear counter writeback bit: rc=%d\n", + ret); + goto err_unlock; + } + + *issue_ct_get = true; + } else { + *issue_ct_get = false; + } + + ofi_mutex_unlock(&cntr->lock); + + return FI_SUCCESS; + +err_unlock: + ofi_mutex_unlock(&cntr->lock); + + *issue_ct_get = false; + return ret; +} + +/* + * cxip_cntr_get() - Schedule a counter write-back. + * + * Schedule hardware to write the value of a counter to memory. Avoid + * scheduling multiple write-backs at once. The counter value will appear in + * memory a small amount of time later. + */ +static int cxip_cntr_get(struct cxip_cntr *cxi_cntr, bool force) +{ + struct c_ct_cmd cmd; + struct cxip_cmdq *cmdq; + int ret; + bool issue_ct_get; + + if (!force) { + ret = cxip_cntr_issue_ct_get(cxi_cntr, &issue_ct_get); + if (ret) { + CXIP_WARN("cxip_cntr_issue_ct_get() error: rc=%d\n", + ret); + return ret; + } + + if (!issue_ct_get) + return FI_SUCCESS; + } + + memset(&cmd, 0, sizeof(cmd)); + cmdq = cxi_cntr->domain->trig_cmdq; + + /* Request a write-back */ + cmd.ct = cxi_cntr->ct->ctn; + + ofi_genlock_lock(&cxi_cntr->domain->trig_cmdq_lock); + ret = cxi_cq_emit_ct(cmdq->dev_cmdq, C_CMD_CT_GET, &cmd); + if (ret) { + ofi_genlock_unlock(&cxi_cntr->domain->trig_cmdq_lock); + return -FI_EAGAIN; + } + cxi_cq_ring(cmdq->dev_cmdq); + ofi_genlock_unlock(&cxi_cntr->domain->trig_cmdq_lock); + + return FI_SUCCESS; +} + +/* + * cxip_cntr_progress() - Make CQ progress on bound endpoint. + */ +static void cxip_cntr_progress(struct cxip_cntr *cntr) +{ + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + + /* Lock is used to protect bound context list. Note that + * CQ processing updates counters via doorbells, use of + * cntr->lock is not required by CQ processing. 
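+ * Progress is made by calling cxip_ep_progress() on every bound endpoint.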
+ */ + ofi_mutex_lock(&cntr->lock); + + dlist_foreach(&cntr->ctx_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + cxip_ep_progress(fid_entry->fid); + } + ofi_mutex_unlock(&cntr->lock); +} + +/* + * cxip_cntr_read() - fi_cntr_read() implementation. + */ +static uint64_t cxip_cntr_read(struct fid_cntr *fid_cntr) +{ + struct cxip_cntr *cxi_cntr; + uint64_t success = 0; + int ret; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + cxip_cntr_progress(cxi_cntr); + cxip_cntr_get(cxi_cntr, false); + + /* TODO: Fall back to reading register on error? */ + ret = cxip_cntr_get_ct_success(cxi_cntr, &success); + if (ret != FI_SUCCESS) + CXIP_WARN("Failed to read counter success: rc=%d\n", ret); + + return success; +} + +/* + * cxip_cntr_readerr() - fi_cntr_readerr() implementation. + */ +static uint64_t cxip_cntr_readerr(struct fid_cntr *fid_cntr) +{ + struct cxip_cntr *cxi_cntr; + uint64_t error = 0; + int ret; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + cxip_cntr_progress(cxi_cntr); + cxip_cntr_get(cxi_cntr, false); + + /* TODO: Fall back to reading register on error? */ + ret = cxip_cntr_get_ct_error(cxi_cntr, &error); + if (ret != FI_SUCCESS) + CXIP_WARN("Failed to read counter error: rc=%d\n", ret); + + return error; +} + +/* + * cxip_cntr_add() - fi_cntr_add() implementation. + */ +static int cxip_cntr_add(struct fid_cntr *fid_cntr, uint64_t value) +{ + struct cxip_cntr *cxi_cntr; + + if (value > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + return cxip_cntr_mod(cxi_cntr, value, false, false); +} + +/* + * cxip_cntr_set() - fi_cntr_set() implementation. + */ +static int cxip_cntr_set(struct fid_cntr *fid_cntr, uint64_t value) +{ + struct cxip_cntr *cxi_cntr; + + if (value > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + return cxip_cntr_mod(cxi_cntr, value, true, false); +} + +/* + * cxip_cntr_adderr() - fi_cntr_adderr() implementation. + */ +static int cxip_cntr_adderr(struct fid_cntr *fid_cntr, uint64_t value) +{ + struct cxip_cntr *cxi_cntr; + + if (value > FI_CXI_CNTR_FAILURE_MAX) + return -FI_EINVAL; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + return cxip_cntr_mod(cxi_cntr, value, false, true); +} + +/* + * cxip_cntr_seterr() - fi_cntr_seterr() implementation. + */ +static int cxip_cntr_seterr(struct fid_cntr *fid_cntr, uint64_t value) +{ + struct cxip_cntr *cxi_cntr; + + if (value > FI_CXI_CNTR_FAILURE_MAX) + return -FI_EINVAL; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + return cxip_cntr_mod(cxi_cntr, value, true, true); +} + +static int cxip_cntr_emit_trig_event_cmd(struct cxip_cntr *cntr, + uint64_t threshold) +{ + struct c_ct_cmd cmd = { + .trig_ct = cntr->ct->ctn, + .threshold = threshold, + .eq = C_EQ_NONE, + }; + struct cxip_cmdq *cmdq = cntr->domain->trig_cmdq; + int ret; + + /* TODO: Need to handle TLE exhaustion. */ + ofi_genlock_lock(&cntr->domain->trig_cmdq_lock); + ret = cxi_cq_emit_ct(cmdq->dev_cmdq, C_CMD_CT_TRIG_EVENT, &cmd); + if (!ret) + cxi_cq_ring(cmdq->dev_cmdq); + ofi_genlock_unlock(&cntr->domain->trig_cmdq_lock); + + if (ret) + return -FI_EAGAIN; + return FI_SUCCESS; +} + +/* + * cxip_cntr_wait() - fi_cntr_wait() implementation. 
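+ *
+ * Caller-side sketch (illustrative names only; 'cntr' is a fid_cntr opened
+ * with a wait object other than FI_WAIT_NONE; the timeout is in
+ * milliseconds):
+ *
+ *   int ret = fi_cntr_wait(cntr, threshold, 1000);
+ *   if (ret == -FI_ETIMEDOUT)
+ *       ret = fi_cntr_wait(cntr, threshold, 1000);   retry or recover
+ *   if (ret == FI_SUCCESS)
+ *       assert(fi_cntr_read(cntr) >= threshold);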
+ */ +static int cxip_cntr_wait(struct fid_cntr *fid_cntr, uint64_t threshold, + int timeout) +{ + struct cxip_cntr *cntr = + container_of(fid_cntr, struct cxip_cntr, cntr_fid); + uint64_t success = 0; + int ret; + uint64_t endtime; + + + if (cntr->attr.wait_obj == FI_WAIT_NONE || + threshold > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + endtime = ofi_timeout_time(timeout); + + /* Use a triggered list entry setup to fire at the user's threshold. + * This will cause a success/error writeback to occur at the desired + * threshold. + */ + ret = cxip_cntr_emit_trig_event_cmd(cntr, threshold); + if (ret) { + CXIP_INFO("Failed to emit trig cmd: %d\n", ret); + return ret; + } + + /* Spin until the trigger list entry fires which updates the CT success + * field. + */ + do { + ret = cxip_cntr_get_ct_success(cntr, &success); + if (ret) { + CXIP_WARN("Failed to read counter success: %d\n", ret); + return ret; + } + + if (success >= threshold) + return FI_SUCCESS; + + if (ofi_adjust_timeout(endtime, &timeout)) + return -FI_ETIMEDOUT; + + /* Only FI_WAIT_YIELD is supported. */ + sched_yield(); + + cxip_cntr_progress(cntr); + + } while (1); + + /* TODO: Triggered operation may get leaked on timeout and threshold + * never met. + */ +} + +/* + * cxip_cntr_control() - fi_control() implementation for counter objects. + */ +static int cxip_cntr_control(struct fid *fid, int command, void *arg) +{ + int ret = FI_SUCCESS; + struct cxip_cntr *cntr; + + cntr = container_of(fid, struct cxip_cntr, cntr_fid); + + switch (command) { + case FI_GETWAIT: + if (cntr->wait) + ret = fi_control(&cntr->wait->fid, + FI_GETWAIT, arg); + else + ret = -FI_EINVAL; + break; + + case FI_GETOPSFLAG: + memcpy(arg, &cntr->attr.flags, sizeof(uint64_t)); + break; + + case FI_SETOPSFLAG: + memcpy(&cntr->attr.flags, arg, sizeof(uint64_t)); + break; + + default: + ret = -FI_EINVAL; + break; + } + + return ret; +} + +/* + * cxip_cntr_enable() - Assign hardware resources to the Counter. + */ +static int cxip_cntr_enable(struct cxip_cntr *cxi_cntr) +{ + int ret; + + ret = cxip_dom_cntr_enable(cxi_cntr->domain); + if (ret != FI_SUCCESS) + return ret; + + cxi_cntr->wb = &cxi_cntr->lwb; + cxi_cntr->wb_iface = FI_HMEM_SYSTEM; + cxi_cntr->wb_handle_valid = false; + + ret = cxil_alloc_ct(cxi_cntr->domain->lni->lni, + cxi_cntr->wb, &cxi_cntr->ct); + if (ret) { + CXIP_WARN("Failed to allocate CT, ret: %d\n", ret); + return -FI_EDOMAIN; + } + + /* Zero the success and failure values. In addition, this will force a + * writeback into the writeback buffer. + */ + cxi_ct_reset_failure(cxi_cntr->ct); + cxi_ct_reset_success(cxi_cntr->ct); + + CXIP_DBG("Counter enabled: %p (CT: %d)\n", cxi_cntr, cxi_cntr->ct->ctn); + + return FI_SUCCESS; +} + +/* + * cxip_cntr_close() - fi_close() implementation for counter objects. + */ +static int cxip_cntr_close(struct fid *fid) +{ + struct cxip_cntr *cntr; + int ret; + + cntr = container_of(fid, struct cxip_cntr, cntr_fid.fid); + if (ofi_atomic_get32(&cntr->ref)) + return -FI_EBUSY; + + assert(dlist_empty(&cntr->ctx_list)); + + if (cntr->wb_iface != FI_HMEM_SYSTEM && + cntr->wb_handle_valid) + ofi_hmem_dev_unregister(cntr->wb_iface, cntr->wb_handle); + + ret = cxil_destroy_ct(cntr->ct); + if (ret) + CXIP_WARN("Failed to free CT, ret: %d\n", ret); + else + CXIP_DBG("Counter disabled: %p\n", cntr); + + ofi_mutex_destroy(&cntr->lock); + + cxip_domain_remove_cntr(cntr->domain, cntr); + + free(cntr); + return 0; +} + +/* Set the counter writeback address to a client provided address. 
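+ *
+ * Reached by clients through the FI_CXI_COUNTER_OPS extension (see
+ * cxip_cntr_ops_open() below). A hedged client-side sketch, where 'cntr' is
+ * an open fid_cntr and 'wb' is a persistent struct c_ct_writeback owned by
+ * the caller:
+ *
+ *   struct fi_cxi_cntr_ops *ops;
+ *   int ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0,
+ *                         (void **)&ops, NULL);
+ *   if (!ret)
+ *       ret = ops->set_wb_buffer(&cntr->fid, &wb, sizeof(wb));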
*/ +int cxip_set_wb_buffer(struct fid *fid, void *buf, size_t len) +{ + int ret; + struct cxip_cntr *cntr; + uint64_t flags; + + if (!buf) + return -FI_EINVAL; + + if (len < sizeof(struct c_ct_writeback)) + return -FI_EINVAL; + + cntr = container_of(fid, struct cxip_cntr, cntr_fid.fid); + + ret = cxil_ct_wb_update(cntr->ct, buf); + if (ret) + return ret; + + if (cntr->wb_iface != FI_HMEM_SYSTEM && + cntr->wb_handle_valid) + ofi_hmem_dev_unregister(cntr->wb_iface, cntr->wb_handle); + + cntr->wb = buf; + cntr->wb_iface = ofi_get_hmem_iface(buf, &cntr->wb_device, &flags); + + if (cntr->wb_iface != FI_HMEM_SYSTEM) { + ret = ofi_hmem_dev_register(cntr->wb_iface, cntr->wb, + sizeof(*cntr->wb), + &cntr->wb_handle); + cntr->wb_handle_valid = (ret == FI_SUCCESS); + } + + /* Force a counter writeback into the user's provider buffer. */ + do { + ret = cxip_cntr_get(cntr, true); + } while (ret == -FI_EAGAIN); + + return ret; +} + +/* Get the counter MMIO region. */ +int cxip_get_mmio_addr(struct fid *fid, void **addr, size_t *len) +{ + struct cxip_cntr *cntr; + + cntr = container_of(fid, struct cxip_cntr, cntr_fid.fid); + + if (!cntr || !cntr->ct) + return -FI_EINVAL; + + *addr = cntr->ct->doorbell; + *len = sizeof(cntr->ct->doorbell); + + return FI_SUCCESS; +} + +static struct fi_cxi_cntr_ops cxip_cntr_ext_ops = { + .set_wb_buffer = cxip_set_wb_buffer, + .get_mmio_addr = cxip_get_mmio_addr, +}; + +static int cxip_cntr_ops_open(struct fid *fid, const char *ops_name, + uint64_t flags, void **ops, void *context) +{ + if (!strcmp(ops_name, FI_CXI_COUNTER_OPS)) { + *ops = &cxip_cntr_ext_ops; + return FI_SUCCESS; + } + + return -FI_EINVAL; +} + +static struct fi_ops_cntr cxip_cntr_ops = { + .size = sizeof(struct fi_ops_cntr), + .readerr = cxip_cntr_readerr, + .read = cxip_cntr_read, + .add = cxip_cntr_add, + .set = cxip_cntr_set, + .wait = cxip_cntr_wait, + .adderr = cxip_cntr_adderr, + .seterr = cxip_cntr_seterr, +}; + +static struct fi_ops cxip_cntr_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_cntr_close, + .bind = fi_no_bind, + .control = cxip_cntr_control, + .ops_open = cxip_cntr_ops_open, +}; + +/* + * cxip_cntr_verify_attr() - Verify counter creation attributes. + */ +static int cxip_cntr_verify_attr(struct fi_cntr_attr *attr) +{ + if (!attr) + return FI_SUCCESS; + + if (attr->events != FI_CNTR_EVENTS_COMP) + return -FI_ENOSYS; + + switch (attr->wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + case FI_WAIT_YIELD: + break; + default: + return -FI_ENOSYS; + } + + if (attr->flags) + return -FI_ENOSYS; + + return FI_SUCCESS; +} + +/* + * cxip_cntr_open() - fi_cntr_open() implementation. 
+ */ +int cxip_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, + struct fid_cntr **cntr, void *context) +{ + int ret; + struct cxip_domain *dom; + struct cxip_cntr *_cntr; + + dom = container_of(domain, struct cxip_domain, util_domain.domain_fid); + + ret = cxip_cntr_verify_attr(attr); + if (ret != FI_SUCCESS) + return ret; + + _cntr = calloc(1, sizeof(*_cntr)); + if (!_cntr) + return -FI_ENOMEM; + + if (!attr) + memcpy(&_cntr->attr, &cxip_cntr_attr, sizeof(cxip_cntr_attr)); + else + memcpy(&_cntr->attr, attr, sizeof(cxip_cntr_attr)); + + ofi_atomic_initialize32(&_cntr->ref, 0); + dlist_init(&_cntr->ctx_list); + + ofi_mutex_init(&_cntr->lock); + + _cntr->cntr_fid.fid.fclass = FI_CLASS_CNTR; + _cntr->cntr_fid.fid.context = context; + _cntr->cntr_fid.fid.ops = &cxip_cntr_fi_ops; + _cntr->cntr_fid.ops = &cxip_cntr_ops; + _cntr->domain = dom; + + ret = cxip_cntr_enable(_cntr); + if (ret) + goto err_free_cntr; + + cxip_domain_add_cntr(dom, _cntr); + + *cntr = &_cntr->cntr_fid; + + return FI_SUCCESS; + +err_free_cntr: + free(_cntr); + + return ret; +} diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c new file mode 100644 index 00000000000..b58983bf62c --- /dev/null +++ b/prov/cxi/src/cxip_coll.c @@ -0,0 +1,3814 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP + * Support for accelerated collective reductions. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cxip.h" + +#ifndef _MM_GET_FLUSH_ZERO_MODE +#define _MM_GET_FLUSH_ZERO_MODE() ({0;}) +#endif + +#define TRACE_PKT(fmt, ...) CXIP_TRACE(CXIP_TRC_COLL_PKT, fmt, \ + ##__VA_ARGS__) +#define TRACE_JOIN(fmt, ...) CXIP_TRACE(CXIP_TRC_COLL_JOIN, fmt, \ + ##__VA_ARGS__) +#define TRACE_DEBUG(fmt, ...) CXIP_TRACE(CXIP_TRC_COLL_DEBUG, fmt, \ + ##__VA_ARGS__) + +// TODO regularize usage of these +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) 
_CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* must all be 0 in production code */ +#define __chk_pkts 1 +#define __trc_pkts 1 +#define __trc_data 1 + +#define MAGIC 0x677d + +/**************************************************************************** + * Reduction packet for hardware accelerated collectives: + * + * +----------------------------------------------------------+ + * | BYTES | Mnemonic | Definition | + * +----------------------------------------------------------+ + * | 48:17 | RED_PAYLOAD | Reduction payload, always 32 bytes | + * | 16:5 | RED_HDR | Reduction Header (below) | + * | 4:0 | RED_PADDING | Padding | + * +----------------------------------------------------------+ + * + * Reduction header format: + * -------------------------------------------------------- + * | Field | Description | Bit | Size (bits) + * -------------------------------------------------------- + * | rt_seqno | Sequence number | 0 | 10 | + * | rt_arm | Multicast arm command | 10 | 1 | + * | rt_op | Reduction operation | 11 | 6 | + * | rt_count | Number of contributions | 17 | 20 | + * | rt_resno | Result number | 37 | 10 | + * | rt_rc | result code | 47 | 4 | + * | rt_repsum_m | Reproducible sum M value | 51 | 8 | + * | rt_repsum_ovfl | Reproducible sum M ovfl | 59 | 2 | + * | rt_pad | Pad to 64 bits | 61 | 3 | + * | rt_cookie | Cookie value | 64 | 32 | + * -------------------------------------------------------- + * + * Note that this header is a 12-byte object, and "network-defined order" means + * big-endian for the entire 12-byte object. Thus, bytes must be swapped so + * that the MSByte of rt_cookie appears at byte 0, and the LS 8 bits of + * rt_seqno appear in byte 11. + * + * The cookie is ignored by reduction hardware, and is used as follows: + * + * mcast_id is the 13-bit multicast address used to disambiguate multiple + * multicast trees, since all incoming collective traffic is received by a + * single PTE bound to the endpoint. + * + * red_id is used to disambiguate packets delivered for different concurrent + * reductions. + * + * magic is a magic number used to positively identify this packet as a + * reduction packet. The basic send/receive code could be used for other kinds + * of restricted IDC packets. At present, all such packets are discarded. + * + * retry is a control bit that can be invoked by the hw root node to initiate a + * retransmission of the data from the leaves, if packets are lost. + */ +struct cxip_coll_cookie { + uint32_t mcast_id:13; + uint32_t red_id:3; + uint32_t magic: 15; + uint32_t retry: 1; +} __attribute__((__packed__)); /* size 4b */ + +/* Packed header bits and cookie from above */ +struct cxip_coll_hdr { + uint64_t seqno:10; + uint64_t arm:1; + uint64_t op:6; + uint64_t redcnt:20; + uint64_t resno:10; + uint64_t red_rc:4; + uint64_t repsum_m:8; + uint64_t repsum_ovflid:2; + uint64_t pad:3; + struct cxip_coll_cookie cookie; +} __attribute__((__packed__)); /* size 12b */ + +/* The following structure is 49 bytes in size, and all of the fields align + * properly for network transmission. + */ +struct red_pkt { + uint8_t pad[5]; /* size 5b offset 0b */ + struct cxip_coll_hdr hdr; /* size 12b offset 5b */ + uint8_t data[32]; /* size 32b offset 17b */ +} __attribute__((__packed__)); /* size 49b */ + +/* Swap byte order in an object of any size. 
Works for even or odd counts */ +static inline +void _swapbyteorder(void *ptr, int count) +{ + uint8_t *p1 = (uint8_t *)ptr; + uint8_t *p2 = p1 + count - 1; + uint8_t swp; + while (p1 < p2) { + swp = *p1; + *p1 = *p2; + *p2 = swp; + p1++; + p2--; + } +} + +/** + * Reformat the packet to accommodate network-ordering (big-endian) Rosetta + * expectations, versus little-endian Intel processing. + * + * Note in particular that the header bytes are treated as a single 12-byte + * object, rather than an 8-byte followed by a 4-byte, i.e. the last byte of the + * cookie is the first byte of the data processed by Rosetta. Note also that + * there is a 5-byte pad at the beginning of the packet, not included in the + * byte-swapping. + * + * This is done in-place for convenience. For reductions, it is copied to a + * properly-aligned data structure for mathematical operations. + */ +static inline +void _swappkt(struct red_pkt *pkt) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + uint64_t *data = (uint64_t *)pkt->data; + int i; + + _swapbyteorder(&pkt->hdr, sizeof(pkt->hdr)); + for (i = 0; i < 4; i++) + _swapbyteorder(&data[i], 8); +#else +#error "Unsupported processor byte ordering" +#endif +} + +/** + * Verificaton of the packet structure, normally disabled. Sizes and offsets + * cannot be checked at compile time. If the structure is wrong, this will + * call abort(). + */ +#define FLDOFFSET(base, fld) ((uint8_t *)&base.fld - (uint8_t *)&base) +__attribute__((unused)) static inline +void check_red_pkt(void) +{ +#if __chk_pkts + static int checked = 0; + struct red_pkt pkt; + uint64_t len, exp; + uint8_t *ptr, offset; + int i, err = 0; + + if (checked) + return; + checked = 1; + + len = sizeof(pkt); + exp = 49; + if (len != exp) { + TRACE_PKT("sizeof(pkt) = %ld, exp %ld\n", len, exp); + err++; + } + len = sizeof(pkt.pad); + exp = 5; + if (len != exp) { + TRACE_PKT("sizeof(pkt.pad) = %ld, exp %ld\n", len, exp); + err++; + } + len = sizeof(pkt.hdr); + exp = 12; + if (len != exp) { + TRACE_PKT("sizeof(pkt.hdr) = %ld, exp %ld\n", len, exp); + err++; + } + len = sizeof(pkt.data); + exp = 32; + if (len != exp) { + TRACE_PKT("sizeof(pkt.data) = %ld, exp %ld\n", len, exp); + err++; + } + len = FLDOFFSET(pkt, hdr); + exp = 5; + if (len != exp) { + TRACE_PKT("offset(pkt.hdr) = %ld, exp %ld\n", len, exp); + err++; + } + len = FLDOFFSET(pkt, data); + exp = 17; + if (len != exp) { + TRACE_PKT("offset(pkt.data) = %ld, exp %ld\n", len, exp); + err++; + } + + /* Arbitrary value between 1,15 inclusive, ensure non-zero fill */ + offset = 13; + + /* Fill, swap, and confirm integrity of all 49 bytes */ + ptr = (uint8_t *)&pkt; + for (i = 0; i < sizeof(pkt); i++) + ptr[i] = i + offset; + _swappkt(&pkt); + _swappkt(&pkt); + for (i = 0; i < sizeof(pkt); i++) + if (ptr[i] != i + offset) { + TRACE_PKT("pkt[%d] = %d, exp %d\n", i, ptr[i], i + offset); + err++; + } + + if (err) { + TRACE_PKT("*** INVALID STRUCTURE see above ***\n"); + abort(); + } +#endif +} + +__attribute__((unused)) static inline +void _dump_red_pkt(struct red_pkt *pkt, char *dir) +{ +#if __trc_pkts + __attribute__((__unused__)) const uint64_t *data + = (const uint64_t *)pkt->data; + __attribute__((__unused__)) int i; + + TRACE_PKT("---------------\n"); + TRACE_PKT("Reduction packet (%s):\n", dir); + TRACE_PKT(" seqno = %d\n", pkt->hdr.seqno); + TRACE_PKT(" arm = %d\n", pkt->hdr.arm); + TRACE_PKT(" op = %d\n", pkt->hdr.op); + TRACE_PKT(" redcnt = %d\n", pkt->hdr.redcnt); + TRACE_PKT(" resno = %d\n", pkt->hdr.resno); + TRACE_PKT(" red_rc = %d\n", pkt->hdr.red_rc); + 
TRACE_PKT(" repsum_m = %d\n", pkt->hdr.repsum_m); + TRACE_PKT(" repsum_ovflid= %d\n", pkt->hdr.repsum_ovflid); + TRACE_PKT(" cookie --\n"); + TRACE_PKT(" .mcast_id = %08x\n", pkt->hdr.cookie.mcast_id); + TRACE_PKT(" .red_id = %08x\n", pkt->hdr.cookie.red_id); + TRACE_PKT(" .magic = %08x\n", pkt->hdr.cookie.magic); + TRACE_PKT(" .retry = %08x\n", pkt->hdr.cookie.retry); + for (i = 0; i < 4; i++) + TRACE_PKT(" ival[%d] = %016lx\n", i, data[i]); + TRACE_PKT("---------------\n"); +#endif +} + +/**************************************************************************** + * Reduction operators for accelerated collectives. + * + * The array lookup is faster than a switch. Non-static initialization makes + * this adaptive to changes in header files (e.g. new opcodes in FI). + */ +#define COLL_OPCODE_BARRIER 0x00 +#define COLL_OPCODE_BIT_AND 0x01 +#define COLL_OPCODE_BIT_OR 0x02 +#define COLL_OPCODE_BIT_XOR 0x03 +#define COLL_OPCODE_INT_MIN 0x10 +#define COLL_OPCODE_INT_MAX 0x11 +#define COLL_OPCODE_INT_MINMAXLOC 0x12 +#define COLL_OPCODE_INT_SUM 0x14 +#define COLL_OPCODE_FLT_MINNUM 0x24 +#define COLL_OPCODE_FLT_MAXNUM 0x25 +#define COLL_OPCODE_FLT_MINMAXNUMLOC 0x26 +#define COLL_OPCODE_FLT_SUM_NOFTZ_RND0 0x28 +#define COLL_OPCODE_FLT_SUM_NOFTZ_RND1 0x29 +#define COLL_OPCODE_FLT_SUM_NOFTZ_RND2 0x2a +#define COLL_OPCODE_FLT_SUM_NOFTZ_RND3 0x2b +#define COLL_OPCODE_FLT_SUM_FTZ_RND0 0x2c +#define COLL_OPCODE_FLT_SUM_FTZ_RND1 0x2d +#define COLL_OPCODE_FLT_SUM_FTZ_RND2 0x2e +#define COLL_OPCODE_FLT_SUM_FTZ_RND3 0x2f +#define COLL_OPCODE_FLT_REPSUM 0x30 +#define COLL_OPCODE_MAX 0x31 + +/* Convert exported op values to Rosetta opcodes */ +static cxip_coll_op_t _int8_16_32_op_to_opcode[FI_CXI_OP_LAST]; +static cxip_coll_op_t _uint8_16_32_op_to_opcode[FI_CXI_OP_LAST]; +static cxip_coll_op_t _int64_op_to_opcode[FI_CXI_OP_LAST]; +static cxip_coll_op_t _uint64_op_to_opcode[FI_CXI_OP_LAST]; +static cxip_coll_op_t _flt_op_to_opcode[FI_CXI_OP_LAST]; +static enum c_return_code _cxip_rc_to_cxi_rc[16]; +static enum cxip_coll_redtype _cxi_op_to_redtype[COLL_OPCODE_MAX]; + +/* One-time dynamic initialization of FI to CXI opcode. 
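+ *
+ * After this runs, translation is a table lookup, e.g. (sketch):
+ *
+ *   cxip_fi2cxi_opcode(FI_SUM, FI_INT64)  resolves to COLL_OPCODE_INT_SUM
+ *   cxip_fi2cxi_opcode(FI_BOR, FI_UINT32) resolves to COLL_OPCODE_BIT_OR
+ *   cxip_fi2cxi_opcode(FI_SUM, FI_DOUBLE) is chosen at call time from the
+ *   current rounding/FTZ mode (see flt_op_to_opcode())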
+ */ +void cxip_coll_populate_opcodes(void) +{ + int i; + + if ((int)FI_CXI_MINMAXLOC < (int)FI_ATOMIC_OP_LAST) { + CXIP_FATAL("Invalid CXI_FMINMAXLOC value\n"); + } + for (i = 0; i < FI_CXI_OP_LAST; i++) { + _int8_16_32_op_to_opcode[i] = -FI_EOPNOTSUPP; + _uint8_16_32_op_to_opcode[i] = -FI_EOPNOTSUPP; + _int64_op_to_opcode[i] = -FI_EOPNOTSUPP; + _uint64_op_to_opcode[i] = -FI_EOPNOTSUPP; + _flt_op_to_opcode[i] = -FI_EOPNOTSUPP; + _cxi_op_to_redtype[i] = REDTYPE_BYT; + } + /* operations supported by 32, 16, and 8 bit signed int operands */ + /* NOTE: executed as packed 64-bit quantities */ + _int8_16_32_op_to_opcode[FI_BOR] = COLL_OPCODE_BIT_OR; + _int8_16_32_op_to_opcode[FI_BAND] = COLL_OPCODE_BIT_AND; + _int8_16_32_op_to_opcode[FI_BXOR] = COLL_OPCODE_BIT_XOR; + + /* operations supported by 32, 16, and 8 bit unsigned int operands */ + _uint8_16_32_op_to_opcode[FI_BOR] = COLL_OPCODE_BIT_OR; + _uint8_16_32_op_to_opcode[FI_BAND] = COLL_OPCODE_BIT_AND; + _uint8_16_32_op_to_opcode[FI_BXOR] = COLL_OPCODE_BIT_XOR; + + /* operations supported by 64 bit signed int operands */ + _int64_op_to_opcode[FI_MIN] = COLL_OPCODE_INT_MIN; + _int64_op_to_opcode[FI_MAX] = COLL_OPCODE_INT_MAX; + _int64_op_to_opcode[FI_SUM] = COLL_OPCODE_INT_SUM; + _int64_op_to_opcode[FI_CXI_MINMAXLOC] = COLL_OPCODE_INT_MINMAXLOC; + + /* operations supported by 64 bit unsigned int operands */ + _uint64_op_to_opcode[FI_BOR] = COLL_OPCODE_BIT_OR; + _uint64_op_to_opcode[FI_BAND] = COLL_OPCODE_BIT_AND; + _uint64_op_to_opcode[FI_BXOR] = COLL_OPCODE_BIT_XOR; + + /* operations supported by 64 bit double operands */ + _flt_op_to_opcode[FI_MIN] = COLL_OPCODE_FLT_MINNUM; + _flt_op_to_opcode[FI_MAX] = COLL_OPCODE_FLT_MAXNUM; + _flt_op_to_opcode[FI_CXI_MINMAXLOC] = COLL_OPCODE_FLT_MINMAXNUMLOC; + _flt_op_to_opcode[FI_CXI_REPSUM] = COLL_OPCODE_FLT_REPSUM; + /* NOTE: FI_SUM handled in flt_op_to_opcode() function */ + + /* cxi_opcode to redtype translation */ + _cxi_op_to_redtype[COLL_OPCODE_BIT_OR] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_BIT_AND] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_BIT_XOR] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_INT_MIN] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_INT_MAX] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_INT_SUM] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_MINNUM] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_MAXNUM] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_NOFTZ_RND0] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_NOFTZ_RND1] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_NOFTZ_RND2] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_NOFTZ_RND3] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_FTZ_RND0] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_FTZ_RND1] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_FTZ_RND2] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_FTZ_RND3] = REDTYPE_FLT; + + _cxi_op_to_redtype[COLL_OPCODE_INT_MINMAXLOC] = REDTYPE_IMINMAX; + _cxi_op_to_redtype[COLL_OPCODE_FLT_MINMAXNUMLOC] = REDTYPE_FMINMAX; + _cxi_op_to_redtype[COLL_OPCODE_FLT_REPSUM] = REDTYPE_REPSUM; + + for (i = 0; i < 16; i++) + _cxip_rc_to_cxi_rc[i] = C_RC_AMO_ALIGN_ERROR; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_SUCCESS] = C_RC_OK; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_INEXACT] = C_RC_AMO_FP_INEXACT; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_OVERFLOW] = C_RC_AMO_FP_OVERFLOW; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_INVALID] = C_RC_AMO_FP_INVALID; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_REP_INEXACT] = C_RC_AMO_FP_INEXACT; + 
_cxip_rc_to_cxi_rc[CXIP_COLL_RC_INT_OVERFLOW] = C_RC_AMO_FP_OVERFLOW; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_CONTR_OVERFLOW] = C_RC_AMO_LENGTH_ERROR; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_OP_MISMATCH] = C_RC_AMO_INVAL_OP_ERROR; +} + +static inline int int8_16_32_op_to_opcode(int op) +{ + return _int8_16_32_op_to_opcode[op]; +} + +static inline int uint8_16_32_op_to_opcode(int op) +{ + return _uint8_16_32_op_to_opcode[op]; +} + +static inline int int64_op_to_opcode(int op) +{ + return _int64_op_to_opcode[op]; +} + +static inline int uint64_op_to_opcode(int op) +{ + return _uint64_op_to_opcode[op]; +} + +static inline int flt_op_to_opcode(int op) +{ + if (op != FI_SUM) + return _flt_op_to_opcode[op]; + + switch (fegetround()) { + case FE_TONEAREST: + return (_MM_GET_FLUSH_ZERO_MODE()) ? + COLL_OPCODE_FLT_SUM_FTZ_RND0 : + COLL_OPCODE_FLT_SUM_NOFTZ_RND0; + case FE_UPWARD: + return (_MM_GET_FLUSH_ZERO_MODE()) ? + COLL_OPCODE_FLT_SUM_FTZ_RND1 : + COLL_OPCODE_FLT_SUM_NOFTZ_RND1; + case FE_DOWNWARD: + return (_MM_GET_FLUSH_ZERO_MODE()) ? + COLL_OPCODE_FLT_SUM_FTZ_RND2 : + COLL_OPCODE_FLT_SUM_NOFTZ_RND2; + case FE_TOWARDZERO: + return (_MM_GET_FLUSH_ZERO_MODE()) ? + COLL_OPCODE_FLT_SUM_FTZ_RND3 : + COLL_OPCODE_FLT_SUM_NOFTZ_RND3; + } + return -FI_EOPNOTSUPP; +} + +/* Convert CXI opcode to reduction data type */ +static inline +enum cxip_coll_redtype _opcode_to_redtype(cxip_coll_op_t cxi_opcode) +{ + return _cxi_op_to_redtype[cxi_opcode]; +} + +/* Convert FI opcode to CXI opcode, depending on FI data type */ +static inline +int cxip_fi2cxi_opcode(enum fi_op op, enum fi_datatype datatype) +{ + switch ((int)datatype) { + case FI_INT8: + case FI_INT16: + case FI_INT32: + return int8_16_32_op_to_opcode(op); + case FI_UINT8: + case FI_UINT16: + case FI_UINT32: + return uint8_16_32_op_to_opcode(op); + case FI_INT64: + return int64_op_to_opcode(op); + case FI_UINT64: + return uint64_op_to_opcode(op); + case FI_DOUBLE: + return flt_op_to_opcode(op); + } + return -FI_EOPNOTSUPP; +} + +/* Determine FI datatype size */ +static inline +int _get_cxi_data_bytcnt(cxip_coll_op_t cxi_opcode, + enum fi_datatype datatype, size_t count) +{ + int size; + + switch (datatype) { + case FI_INT8: + case FI_UINT8: + size = sizeof(uint8_t); + break; + case FI_INT16: + case FI_UINT16: + size = sizeof(uint16_t); + break; + case FI_INT32: + case FI_UINT32: + size = sizeof(uint32_t); + break; + case FI_INT64: + case FI_UINT64: + size = sizeof(uint64_t); + break; + case FI_FLOAT: + size = sizeof(float); + break; + case FI_DOUBLE: + size = sizeof(double); + break; + default: + return -FI_EOPNOTSUPP; + } + switch (cxi_opcode) { + case COLL_OPCODE_INT_MINMAXLOC: + case COLL_OPCODE_FLT_MINMAXNUMLOC: + case COLL_OPCODE_FLT_REPSUM: + size *= 4; + break; + default: + // do nothing, size is correct + break; + } + size *= count; + if (size > CXIP_COLL_MAX_DATA_SIZE) + return -FI_EINVAL; + return size; +} + +/**************************************************************************** + * SEND operation (restricted Put to a remote PTE) + */ + +/* Forward references */ +static void _progress_coll(struct cxip_coll_reduction *reduction, + struct red_pkt *pkt); +static ssize_t _coll_append_buffer(struct cxip_coll_pte *coll_pte, + struct cxip_coll_buf *buf); + +/* Generate a dfa and index extension for a reduction */ +static int _gen_tx_dfa(struct cxip_coll_reduction *reduction, + int av_set_idx, union c_fab_addr *dfa, + uint8_t *index_ext, bool *is_mcast) +{ + struct cxip_ep_obj *ep_obj; + struct cxip_av_set *av_set_obj; + struct cxip_addr dest_caddr; + 
fi_addr_t dest_addr; + int pid_bits; + int idx_ext; + int ret; + + ep_obj = reduction->mc_obj->ep_obj; + av_set_obj = reduction->mc_obj->av_set_obj; + + /* Send address */ + switch (av_set_obj->comm_key.keytype) { + case COMM_KEY_MULTICAST: + /* - destination == multicast ID + * - idx_ext == 0 + * - dfa == multicast destination + * - index_ext == 0 + */ + if (is_netsim(ep_obj)) { + CXIP_WARN("NETSIM does not support mcast\n"); + return -FI_EINVAL; + } + idx_ext = 0; + cxi_build_mcast_dfa(av_set_obj->comm_key.mcast.mcast_addr, + reduction->red_id, idx_ext, + dfa, index_ext); + *is_mcast = true; + break; + case COMM_KEY_UNICAST: + /* - destination == remote node in av_set_obj + * - idx_ext == CXIP_PTL_IDX_COLL + * - dfa = remote nic + * - index_ext == CXIP_PTL_IDX_COLL + */ + if (av_set_idx >= av_set_obj->fi_addr_cnt) { + CXIP_WARN("av_set_idx out-of-range\n"); + return -FI_EINVAL; + } + dest_addr = av_set_obj->fi_addr_ary[av_set_idx]; + ret = cxip_av_lookup_addr(ep_obj->av, dest_addr, &dest_caddr); + if (ret != FI_SUCCESS) + return ret; + pid_bits = ep_obj->domain->iface->dev->info.pid_bits; + cxi_build_dfa(dest_caddr.nic, dest_caddr.pid, pid_bits, + CXIP_PTL_IDX_COLL, dfa, index_ext); + *is_mcast = false; + break; + case COMM_KEY_RANK: + /* - destination == source NIC + * - idx_ext == extended PID + * - dfa == source NIC + * - index_ext == idx_ext offset beyond RXCs (5-bit range) + */ + if (av_set_idx >= av_set_obj->fi_addr_cnt) { + CXIP_WARN("av_set_idx out-of-range\n"); + return -FI_EINVAL; + } + dest_caddr = ep_obj->src_addr; + pid_bits = ep_obj->domain->iface->dev->info.pid_bits; + idx_ext = CXIP_PTL_IDX_COLL + av_set_idx; + cxi_build_dfa(dest_caddr.nic, dest_caddr.pid, pid_bits, + idx_ext, dfa, index_ext); + *is_mcast = false; + break; + default: + CXIP_WARN("unexpected comm_key type: %d\n", + av_set_obj->comm_key.keytype); + return -FI_EINVAL; + } + return FI_SUCCESS; +} + +/** + * Issue a restricted Put to the destination address. + * If md is NULL, this performs an IDC Put, otherwise it issues a DMA Put. + * + * Exported for unit testing. + * + * This will return -FI_EAGAIN on transient errors. 
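+ *
+ * Typical caller pattern (sketch; cxip_coll_send_red_pkt() below is the
+ * production caller, via _send_pkt()), retrying while the command or event
+ * queues are saturated. Passing md == NULL selects the IDC Put path:
+ *
+ *   int ret;
+ *   do {
+ *       ret = cxip_coll_send(reduction, 0, pkt, sizeof(*pkt), NULL);
+ *   } while (ret == -FI_EAGAIN);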
+ */ +int cxip_coll_send(struct cxip_coll_reduction *reduction, + int av_set_idx, const void *buffer, size_t buflen, + struct cxi_md *md) +{ + union c_cmdu cmd = {}; + struct cxip_coll_mc *mc_obj; + struct cxip_ep_obj *ep_obj; + struct cxip_cmdq *cmdq; + union c_fab_addr dfa; + uint8_t index_ext; + bool is_mcast; + int ret; + + if (!buffer) { + CXIP_INFO("no buffer\n"); + return -FI_EINVAL; + } + + mc_obj = reduction->mc_obj; + ep_obj = mc_obj->ep_obj; + cmdq = ep_obj->coll.tx_cmdq; + + ret = _gen_tx_dfa(reduction, av_set_idx, &dfa, &index_ext, &is_mcast); + if (ret) + return ret; + + if (cxip_evtq_saturated(ep_obj->coll.tx_evtq)) { + CXIP_DBG("TX HW EQ saturated\n"); + return -FI_EAGAIN; + } + +#if ENABLE_DEBUG + if (reduction->drop_send) { + reduction->drop_send = false; + goto drop_pkt; + } +#endif + + if (md) { + cmd.full_dma.command.opcode = C_CMD_PUT; + cmd.full_dma.event_send_disable = 1; + cmd.full_dma.event_success_disable = 1; + cmd.full_dma.restricted = 1; + cmd.full_dma.reduction = is_mcast; + cmd.full_dma.index_ext = index_ext; + cmd.full_dma.eq = cxip_evtq_eqn(ep_obj->coll.tx_evtq); + cmd.full_dma.dfa = dfa; + cmd.full_dma.lac = md->lac; + cmd.full_dma.local_addr = CXI_VA_TO_IOVA(md, buffer); + cmd.full_dma.request_len = buflen; + + /* this uses cached values, returns -FI_EAGAIN if queue full */ + ret = cxip_txq_cp_set(cmdq, ep_obj->auth_key.vni, + mc_obj->tc, mc_obj->tc_type); + if (ret) + goto err; + + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &cmd.full_dma); + } else { + cmd.c_state.event_send_disable = 1; + cmd.c_state.event_success_disable = 1; + cmd.c_state.restricted = 1; + cmd.c_state.reduction = is_mcast; + cmd.c_state.index_ext = index_ext; + cmd.c_state.eq = cxip_evtq_eqn(ep_obj->coll.tx_evtq); + cmd.c_state.initiator = CXI_MATCH_ID( + ep_obj->domain->iface->dev->info.pid_bits, + ep_obj->src_addr.pid, ep_obj->src_addr.nic); + + /* this uses cached values, returns -FI_EAGAIN if queue full */ + ret = cxip_txq_cp_set(cmdq, ep_obj->auth_key.vni, + mc_obj->tc, mc_obj->tc_type); + if (ret) + goto err; + + /* returns -FI_EAGAIN on failure */ + ret = cxip_cmdq_emit_c_state(cmdq, &cmd.c_state); + if (ret) { + ret = -FI_EAGAIN; + goto err; + } + + memset(&cmd.idc_put, 0, sizeof(cmd.idc_put)); + cmd.idc_put.idc_header.dfa = dfa; + ret = cxi_cq_emit_idc_put(cmdq->dev_cmdq, &cmd.idc_put, + buffer, buflen); + if (ret) { + ret = -FI_EAGAIN; + goto err; + } + } + + if (ret) { + /* Return error according to Domain Resource Management */ + ret = -FI_EAGAIN; + goto err; + } + + cxi_cq_ring(cmdq->dev_cmdq); + +#if ENABLE_DEBUG +drop_pkt: +#endif + ret = FI_SUCCESS; + ofi_atomic_inc32(&reduction->mc_obj->send_cnt); + +err: + return ret; +} + +/**************************************************************************** + * RECV operation (of restricted Put to a local PTE) + * + * Collectives use a dedicated EP and PTE for each MC object. + * + * Packet space is allocated and linked to the PTE with a request. When a + * packet is received, CXI hardware puts the request pointer and incoming + * packet offset into a hardware-managed CXI event queue. When the CXI evtq + * is progressed, completed hardware events are harvested, and the request + * pointer (along with completion data) is inserted into an OFI CQ for the + * endpont. Reading any OFI CQ bound to that endpoint will harvest all CXI + * (hardware) evtqs bound to that endpoint, but will return only events + * associated with the specified CQ, IF there are multiple CQs. 
+ * + * Collectives services two CXI (hardware) evtqs for each MC object. + * + * The tx_evtq is only used to detect hardware buffer overflow, which + * reflects -FI_EAGAIN back to the client. + * + * The rx_evtq manages PTE events for the collective endpoint. Buffer link + * and unlink events are consumed silently: buffer exhaustion is checked on + * every packet receipt, and will automatically recycle exhausted buffers. + * PUT events are filtered for correct format and passed into the collective + * state machine for processing. All other received packets are discarded. + * + * cxip_cq_req_complete() is used internally for PTE events, and externally + * to report collective operation completions. The internal events are useful + * for certain bench test models, where we need to count the packets received + * as well as the collective completion. In production, we want to disable + * the internal events. This is done independently for each MC object with + * the mc->rx_discard flag. + */ + +/* Report success/error results of an RX event through CQ/counters, and roll + * over the buffers if appropriate. + * + * NOTE: req may be invalid after this call. + * + * Caller must hold ep_obj->lock. + */ +static void _coll_rx_req_report(struct cxip_req *req) +{ + size_t overflow; + int err, ret; + + req->flags &= (FI_RECV | FI_COMPLETION); + + /* Interpret results */ + overflow = req->coll.hw_req_len - req->data_len; + if (req->coll.cxi_rc == C_RC_OK && req->coll.isred && !overflow) { + /* receive success */ + if (req->flags & FI_COMPLETION) { + /* failure means progression is hung */ + ret = cxip_cq_req_complete(req); + if (ret) + CXIP_FATAL( + "cxip_cq_req_complete failed: %d\n", ret); + } + + if (req->coll.coll_pte->ep_obj->coll.rx_cntr) { + /* failure means counts cannot be trusted */ + ret = cxip_cntr_mod( + req->coll.coll_pte->ep_obj->coll.rx_cntr, 1, + false, false); + if (ret) + CXIP_WARN( + "Failed success cxip_cntr_mod: %d\n", + ret); + } + } else { + /* failure */ + if (req->coll.cxi_rc != C_RC_OK) { + /* real network error of some sort */ + err = proverr2errno(req->coll.cxi_rc); + CXIP_WARN("Request error: %p (err: %d, %s)\n", + req, err, cxi_rc_to_str(err)); + } else if (overflow) { + /* can only happen on very large packet (> 64 bytes) */ + err = FI_EMSGSIZE; + CXIP_WARN("Request truncated: %p (err: %d, %s)\n", + req, err, cxi_rc_to_str(err)); + } else { + /* non-reduction packet */ + err = FI_ENOMSG; + CXIP_INFO("Not reduction pkt: %p (err: %d, %s)\n", + req, err, cxi_rc_to_str(err)); + } + + /* failure means progression is hung */ + ret = cxip_cq_req_error(req, overflow, err, + req->coll.cxi_rc, + NULL, 0, FI_ADDR_UNSPEC); + if (ret) + CXIP_FATAL("cxip_cq_req_error: %d\n", ret); + + if (req->coll.coll_pte->ep_obj->coll.rx_cntr) { + /* failure means counts cannot be trusted */ + ret = cxip_cntr_mod( + req->coll.coll_pte->ep_obj->coll.rx_cntr, 1, + false, true); + if (ret) + CXIP_WARN("cxip_cntr_mod: %d\n", ret); + } + } + + /* manage buffer rollover */ + if (req->coll.mrecv_space < + req->coll.coll_pte->ep_obj->coll.min_multi_recv) { + struct cxip_coll_pte *coll_pte = req->coll.coll_pte; + struct cxip_coll_buf *buf = req->coll.coll_buf; + int cnt; + + /* Will be re-incremented when LINK is received */ + cnt = ofi_atomic_dec32(&coll_pte->buf_cnt); + if (req->coll.coll_pte->buf_low_water > cnt) + req->coll.coll_pte->buf_low_water = cnt; + if (cnt <= 0) { + CXIP_WARN("COLL buffers exhausted\n"); + // TODO set flag to shut this down + } + ofi_atomic_inc32(&coll_pte->buf_swap_cnt); + + 
/* Re-use this buffer in the hardware */ + ret = _coll_append_buffer(coll_pte, buf); + if (ret != FI_SUCCESS) + CXIP_WARN("Re-link buffer failed: %d\n", ret); + + /* Hardware has silently unlinked this */ + cxip_evtq_req_free(req); + } +} + +/* Evaluate PUT receive request to see if this is a reduction packet */ +static void _coll_rx_progress(struct cxip_req *req, + const union c_event *event) +{ + struct cxip_coll_mc *mc_obj; + struct cxip_coll_reduction *reduction; + struct red_pkt *pkt; + + /* Raw packet of some sort received */ + ofi_atomic_inc32(&req->coll.coll_pte->recv_cnt); + + /* If not the right size, don't swap bytes */ + if (req->data_len != sizeof(struct red_pkt)) { + CXIP_INFO("Bad coll packet size: %ld\n", req->data_len); + return; + } + + /* If swap doesn't look like reduction packet, swap back and discard */ + pkt = (struct red_pkt *)req->buf; + _swappkt(pkt); + if (pkt->hdr.cookie.magic != MAGIC) + { + CXIP_INFO("Bad coll MAGIC: %x\n", pkt->hdr.cookie.magic); + _swappkt(pkt); + return; + } + /* This is a reduction packet */ + + /* The coll.coll_pte->mc_obj is defined only for COMM_KEY_RANK */ + mc_obj = req->coll.coll_pte->mc_obj; + if (!mc_obj) + mc_obj = ofi_idm_lookup( + &req->coll.coll_pte->ep_obj->coll.mcast_map, + pkt->hdr.cookie.mcast_id); + if (!mc_obj) { + TRACE_PKT("Bad coll lookup: %x\n", pkt->hdr.cookie.mcast_id); + return; + } + /* This is a valid reduction packet */ + ofi_atomic_inc32(&mc_obj->recv_cnt); + req->coll.isred = true; + req->discard = mc_obj->rx_discard; + reduction = &mc_obj->reduction[pkt->hdr.cookie.red_id]; + TRACE_PKT("Valid reduction packet\n"); + +#if ENABLE_DEBUG + /* Test case, simulate packet dropped in-flight */ + if (reduction->drop_recv) { + reduction->drop_recv = false; + return; + } +#endif + + /* Progress the reduction */ + _dump_red_pkt(pkt, "recv"); + ofi_atomic_inc32(&mc_obj->pkt_cnt); + _progress_coll(reduction, pkt); +} + +/* Event-handling callback for posted receive buffers */ +static int _coll_recv_cb(struct cxip_req *req, const union c_event *event) +{ + req->coll.cxi_rc = cxi_tgt_event_rc(event); + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* Enabled */ + if (req->coll.cxi_rc != C_RC_OK) { + CXIP_WARN("LINK error rc: %d\n", req->coll.cxi_rc); + break; + } + CXIP_DBG("LINK event seen\n"); + ofi_atomic_inc32(&req->coll.coll_pte->buf_cnt); + break; + case C_EVENT_UNLINK: + /* Normally disabled, errors only */ + req->coll.cxi_rc = cxi_tgt_event_rc(event); + if (req->coll.cxi_rc != C_RC_OK) { + CXIP_WARN("UNLINK error rc: %d\n", req->coll.cxi_rc); + break; + } + CXIP_DBG("UNLINK event seen\n"); + break; + case C_EVENT_PUT: + req->coll.isred = false; + req->coll.cxi_rc = cxi_tgt_event_rc(event); + if (req->coll.cxi_rc != C_RC_OK) { + CXIP_WARN("PUT error rc: %d\n", req->coll.cxi_rc); + break; + } + CXIP_DBG("PUT event seen\n"); + req->buf = (uint64_t)(CXI_IOVA_TO_VA( + req->coll.coll_buf->cxi_md->md, + event->tgt_long.start)); + req->coll.mrecv_space -= event->tgt_long.mlength; + req->coll.hw_req_len = event->tgt_long.rlength; + req->data_len = event->tgt_long.mlength; + _coll_rx_progress(req, event); + _coll_rx_req_report(req); + break; + default: + req->coll.cxi_rc = cxi_tgt_event_rc(event); + CXIP_WARN(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(req->coll.cxi_rc)); + break; + } + + return FI_SUCCESS; +} + +/* Inject a hardware LE append. Does not generate HW LINK event unless error. 
*/ +static int _hw_coll_recv(struct cxip_coll_pte *coll_pte, struct cxip_req *req) +{ + uint32_t le_flags; + uint64_t recv_iova; + int ret; + + /* C_LE_MANAGE_LOCAL makes Cassini ignore initiator remote_offset in all + * Puts, and causes automatic UNLINK when buffer capacity drops below + * CXIP_COLL_MIN_MULTI_RECV. + * + * C_LE_EVENT_UNLINK_DISABLE prevents generation of UNLINK events. We + * detect UNLINK by counting packets, and presume automatic UNLINK drops + * below CXIP_COLL_MIN_MULTI_RECV. + * + * C_LE_EVENT_UNLINK_DISABLE prevents UNLINK events from being + * generated. Hardware performs UNLINK automatically when buffer + * capacity is below CXIP_COLL_MIN_MULTI_RECV. + * + * C_LE_OP_PUT indicates this is an input buffer that responses to PUT. + * + * C_LE_NO_TRUNCATE is not used, because all packets are a fixed size, + * and CXIP_COLL_MIN_MULTI_RECV is sufficient to guarantee space for one new + * reduction packet. + */ + le_flags = C_LE_EVENT_UNLINK_DISABLE | C_LE_OP_PUT | C_LE_MANAGE_LOCAL; + + recv_iova = CXI_VA_TO_IOVA(req->coll.coll_buf->cxi_md->md, + (uint64_t)req->coll.coll_buf->buffer); + + ret = cxip_pte_append(coll_pte->pte, + recv_iova, + req->coll.coll_buf->bufsiz, + req->coll.coll_buf->cxi_md->md->lac, + C_PTL_LIST_PRIORITY, + req->req_id, + 0, 0, 0, + req->coll.coll_pte->ep_obj->coll.min_multi_recv, + le_flags, coll_pte->ep_obj->coll.rx_cntr, + coll_pte->ep_obj->coll.rx_cmdq, + true); + if (ret != FI_SUCCESS) { + CXIP_WARN("PTE append inject failed: %d\n", ret); + return ret; + } + + return FI_SUCCESS; +} + +/* Append a receive buffer to the PTE, with callback to handle receives. + * + * Caller must hold ep_obj->lock. + */ +static ssize_t _coll_append_buffer(struct cxip_coll_pte *coll_pte, + struct cxip_coll_buf *buf) +{ + struct cxip_req *req; + int ret; + + if (buf->bufsiz && !buf->buffer) { + CXIP_INFO("no buffer\n"); + return -FI_EINVAL; + } + + /* Allocate and populate a new request + * Sets: + * - req->cq + * - req->req_id to request index + * - req->req_ctx to passed context (buf) + * - req->discard to false + * - Inserts into the cq->req_list + */ + req = cxip_evtq_req_alloc(coll_pte->ep_obj->coll.rx_evtq, 1, buf); + if (!req) { + ret = -FI_ENOMEM; + goto recv_unmap; + } + + /* CQ event fields, set according to fi_cq.3 + * - set by provider + * - returned to user in completion event + * uint64_t context; // operation context + * uint64_t flags; // operation flags + * uint64_t data_len; // received data length + * uint64_t buf; // receive buf offset + * uint64_t data; // receive REMOTE_CQ_DATA + * uint64_t tag; // receive tag value on matching interface + * fi_addr_t addr; // sender address (if known) ??? + */ + + /* Request parameters */ + req->type = CXIP_REQ_COLL; + req->flags = (FI_RECV | FI_COMPLETION); + req->cb = _coll_recv_cb; + req->triggered = false; + req->trig_thresh = 0; + req->trig_cntr = NULL; + req->context = (uint64_t)buf; + req->data_len = 0; + req->buf = (uint64_t)buf->buffer; + req->data = 0; + req->tag = 0; + req->coll.coll_pte = coll_pte; + req->coll.coll_buf = buf; + req->coll.mrecv_space = req->coll.coll_buf->bufsiz; + + /* Returns FI_SUCCESS or FI_EAGAIN */ + ret = _hw_coll_recv(coll_pte, req); + if (ret != FI_SUCCESS) + goto recv_dequeue; + + return FI_SUCCESS; + +recv_dequeue: + cxip_evtq_req_free(req); + +recv_unmap: + cxip_unmap(buf->cxi_md); + return ret; +} + +/**************************************************************************** + * PTE management functions. 
+ */ + +/* PTE state-change callback */ + __attribute__((__unused__)) +static void _coll_pte_cb(struct cxip_pte *pte, const union c_event *event) +{ + switch (pte->state) { + case C_PTLTE_ENABLED: + case C_PTLTE_DISABLED: + break; + default: + CXIP_FATAL("Unexpected state received: %u\n", pte->state); + } +} + +/* Enable a collective PTE. Wait for completion. */ +static inline +int _coll_pte_enable(struct cxip_coll_pte *coll_pte, uint32_t drop_count) +{ + return cxip_pte_set_state_wait(coll_pte->pte, + coll_pte->ep_obj->coll.rx_cmdq, + coll_pte->ep_obj->coll.rx_evtq, + C_PTLTE_ENABLED, drop_count); +} + +/* Disable a collective PTE. Wait for completion */ +static inline +int _coll_pte_disable(struct cxip_coll_pte *coll_pte) +{ + return cxip_pte_set_state_wait(coll_pte->pte, + coll_pte->ep_obj->coll.rx_cmdq, + coll_pte->ep_obj->coll.rx_evtq, + C_PTLTE_DISABLED, 0); +} + +/* Destroy and unmap all buffers used by the collectives PTE. + * + * Caller must hold ep_obj->lock. + */ +static void _coll_destroy_buffers(struct cxip_coll_pte *coll_pte) +{ + struct dlist_entry *list = &coll_pte->buf_list; + struct cxip_coll_buf *buf; + + while (!dlist_empty(list)) { + dlist_pop_front(list, struct cxip_coll_buf, buf, buf_entry); + cxip_unmap(buf->cxi_md); + free(buf); + } +} + +/* Adds 'count' buffers of 'size' bytes to the collecives PTE. This succeeds + * fully, or it fails and removes all added buffers. + */ +static int _coll_add_buffers(struct cxip_coll_pte *coll_pte, size_t size, + size_t count) +{ + struct cxip_coll_buf *buf; + int ret, i; + + if (count < CXIP_COLL_MIN_RX_BUFS) { + CXIP_INFO("Buffer count %ld < minimum (%d)\n", + count, CXIP_COLL_MIN_RX_BUFS); + return -FI_EINVAL; + } + + if (size < CXIP_COLL_MIN_RX_SIZE) { + CXIP_INFO("Buffer size %ld < minimum (%d)\n", + size, CXIP_COLL_MIN_RX_SIZE); + return -FI_EINVAL; + } + + CXIP_DBG("Adding %ld buffers of size %ld\n", count, size); + for (i = 0; i < count; i++) { + buf = calloc(1, sizeof(*buf) + size); + if (!buf) { + ret = -FI_ENOMEM; + goto out; + } + ret = cxip_map(coll_pte->ep_obj->domain, (void *)buf->buffer, + size, 0, &buf->cxi_md); + if (ret) + goto del_msg; + buf->bufsiz = size; + dlist_insert_tail(&buf->buf_entry, &coll_pte->buf_list); + + ret = _coll_append_buffer(coll_pte, buf); + if (ret) { + CXIP_WARN("Add buffer %d of %ld: %d\n", + i, count, ret); + goto out; + } + } + /* Block until PTE completes buffer appends */ + do { + sched_yield(); + cxip_evtq_progress(coll_pte->ep_obj->coll.rx_evtq); + } while (ofi_atomic_get32(&coll_pte->buf_cnt) < count); + coll_pte->buf_low_water = (int)count; + + return FI_SUCCESS; +del_msg: + free(buf); +out: + _coll_destroy_buffers(coll_pte); + return ret; +} + +/**************************************************************************** + * Mathematical routines used for collective reductions. + */ + +/* Set RC only if new is higher priority than old */ +// TODO avoid branch: +// http://geeksforgeeks.org/ +// compute-the-minimum-or-maximum-of-two-integers-without-branching +#define SET_RED_RC(redrc, rc) do {if ((redrc)<(rc)) (redrc)=(rc);} while(0) + +static inline +bool cxip_is_snan64(double d) +{ + /* This detection is universal IEEE */ + return isnan(d) && !(_dbl2bits(d) & 0x0008000000000000); +} + +/* convert signalling NaN to quiet NaN */ +static inline +bool _quiesce_nan(double *d) +{ + if (!cxip_is_snan64(*d)) + return false; + *d = NAN; + return true; +} + +/** + * Implement NaN comparison in RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM + * + * Only associative mode is supported. 
The old IEEE mode is incorrect, and has + * been deprecated. + * + * Compares two doubles, replaces *d1 as appropriate, and indicates swap. + * + * If the values are normal doubles, less=true indicates we are looking for the + * lesser of the two values, while less=false indicates we are looking for the + * greater of the two values. The appropriate value will be swapped into *d1 if + * necessary. + * + * In general, this will give preference to real values over NaN, which is the + * opposite of swpnan1() above. This will only return NaN if BOTH values in + * the comparison are NaN. + * + * If either NaN is sNaN, this will set the CXIP_COLL_RC_FLT_INVALID error. + * + * The return value can be used when associating an index with the value. + * + * Note that since this quiets any signalling NaNs, we need set the + * CXIP_COLL_RC_FLT_INVALID error. + * + * - return 0 indicates the values are equivalent, so use the smallest index. + * - return +1 indicates the values were swapped, so use the second index. + * - return -1 indicates no swap, so use the first index. + */ +static int swpnan2(double *d1, double d2, bool less, cxip_coll_rc_t *rc) +{ + bool nan1, nan2, snan1, snan2; + + // isnan() does not distinguish sNaN from qNaN + nan1 = isnan(*d1); + nan2 = isnan(d2); + // Neither is NaN, so simple comparison + if (!nan1 && !nan2) { + if (*d1 == d2) + return 0; + if (less && (*d1 > d2)) { + *d1 = d2; + return 1; + } + if (!less && (*d1 < d2)) { + *d1 = d2; + return 1; + } + return -1; + } + + // ----- FLT_MINNUM and FLT_MAXNUM rules + // At least one is NaN, check for sNaN + snan1 = _quiesce_nan(d1); + snan2 = _quiesce_nan(&d2); + if (snan1 || snan2) + SET_RED_RC(*rc, CXIP_COLL_RC_FLT_INVALID); + + // return qNaN only if both are NaN + if (nan1 && nan2) + return 0; + + // Prefers number + if (nan1) { + *d1 = d2; + return 1; + } + // Prefers number + return -1; +} + +/* Companion to swpnan1() and swpnan2() to swap associated indices */ +static inline +void swpidx(uint64_t *i1, uint64_t i2, int swp) +{ + if (swp >= 0 && (swp > 0 || *i1 > i2)) + *i1 = i2; +} + +/* Determine if double precision sum is exact. This shifts the value with the + * lower exponent toward the MSBit by the amount of the bitwise overlap between + * the final sum and the value that resulted in that sum. If any non-zero bits + * remain in that smaller value, they were discarded during the summation, and + * the result is inexact. 
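+ *
+ * Worked example (IEEE-754 double, round-to-nearest): 1.0 + 0x1p-53 rounds
+ * back to 1.0, so the addend's only set bit is discarded and the sum is
+ * inexact; 1.0 + 0x1p-52 is exactly the next representable double, so that
+ * sum is exact.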
+ */ +static inline +bool exact(double rslt, double d) +{ + // TODO verify sign and shift + unsigned long m1, m2; + int s1, e1, s2, e2; + int shft, dlte; + bool ret; + + _decompose_dbl(rslt, &s1, &e1, &m1); + _decompose_dbl(d, &s2, &e2, &m2); + dlte = e1 - e2; + + if (dlte < 0) { + shft = MIN(52 + dlte, 0); + ret = !(m1 << shft); + } else { + shft= MIN(52 - dlte, 0); + ret = !(m2 << shft); + } + return ret; +} + +static inline +void _dump_coll_data(const char *tag, const struct cxip_coll_data *coll_data) +{ +#if __trc_data + int i; + + TRACE_PKT("=== Coll data: %s\n", tag); + TRACE_PKT(" init = %d\n", coll_data->initialized); + TRACE_PKT(" red_op = %d\n", coll_data->red_op); + TRACE_PKT(" rec_rc = %d\n", coll_data->red_rc); + TRACE_PKT(" red_cnt = %d\n", coll_data->red_cnt); + TRACE_PKT(" data:\n"); + for (i = 0; i < 4; i++) + TRACE_PKT(" %016lx\n", coll_data->intval.ival[i]); + TRACE_PKT("\n"); + TRACE_PKT("===================\n"); +#endif +} + +/* initialize coll_data structure from raw user data */ +static void _init_coll_data(struct cxip_coll_data *coll_data, int opcode, + const void *user_data, int bytcnt) +{ + double d; + int i; + + /* NOTE: snan can be directly injected here */ + memset(coll_data, 0, sizeof(*coll_data)); + if (user_data) + memcpy(coll_data->databuf, user_data, bytcnt); + coll_data->red_rc = 0; + coll_data->red_cnt = 1; + coll_data->red_op = opcode; + switch (coll_data->red_op) { + case COLL_OPCODE_FLT_MINNUM: + case COLL_OPCODE_FLT_MAXNUM: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND0: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND1: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND2: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND3: + case COLL_OPCODE_FLT_SUM_FTZ_RND0: + case COLL_OPCODE_FLT_SUM_FTZ_RND1: + case COLL_OPCODE_FLT_SUM_FTZ_RND2: + case COLL_OPCODE_FLT_SUM_FTZ_RND3: + /* evaluate all four doubles */ + for (i = 0; i < 4; i++) { + if (cxip_is_snan64(coll_data->fltval.fval[i])) + SET_RED_RC(coll_data->red_rc, + CXIP_COLL_RC_FLT_INVALID); + if (isnan(coll_data->fltval.fval[i]) || + isinf(coll_data->fltval.fval[i])) + SET_RED_RC(coll_data->red_rc, + CXIP_COLL_RC_FLT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_MINMAXNUMLOC: + /* evaluate the two doubles */ + for (i = 0; i < 4; i += 2) { + if (cxip_is_snan64(coll_data->fltval.fval[i])) + SET_RED_RC(coll_data->red_rc, + CXIP_COLL_RC_FLT_INVALID); + if (isinf(coll_data->fltval.fval[i])) + SET_RED_RC(coll_data->red_rc, + CXIP_COLL_RC_FLT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_REPSUM: + /* perform the conversion */ + d = coll_data->fltval.fval[0]; + cxip_dbl_to_rep(&coll_data->repsum, d); + break; + } + coll_data->initialized = true; +} + +/* reduce data into accumulator - can be used on uninitialized accumulator */ +static void _reduce(struct cxip_coll_data *accum, + const struct cxip_coll_data *coll_data, + bool pre_reduce) +{ + int i, swp; + + TRACE_DEBUG("%s entry\n", __func__); + /* Initialize with new data */ + if (!accum->initialized) { + memcpy(accum, coll_data, sizeof(*accum)); + return; + } + + /* copy new error (if any) to accumulator */ + SET_RED_RC(accum->red_rc, coll_data->red_rc); + + /* Real reduction (send or receive) must count contributions. 
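+ *
+ * A pre-reduce only folds in the operand data and error code; red_cnt is
+ * advanced below only when pre_reduce is false.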
+ */ + if (!pre_reduce) + accum->red_cnt += coll_data->red_cnt; + + /* ops must always match, else don't apply data */ + if (accum->red_op != coll_data->red_op) { + SET_RED_RC(accum->red_rc, CXIP_COLL_RC_OP_MISMATCH); + return; + } + + /* Perform the reduction in software */ + switch (accum->red_op) { + case COLL_OPCODE_BARRIER: + break; + case COLL_OPCODE_BIT_AND: + for (i = 0; i < 4; i++) + accum->intval.ival[i] &= coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_BIT_OR: + for (i = 0; i < 4; i++) + accum->intval.ival[i] |= coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_BIT_XOR: + for (i = 0; i < 4; i++) + accum->intval.ival[i] ^= coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_INT_MIN: + for (i = 0; i < 4; i++) + if (accum->intval.ival[i] > coll_data->intval.ival[i]) + accum->intval.ival[i] = coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_INT_MAX: + for (i = 0; i < 4; i++) + if (accum->intval.ival[i] < coll_data->intval.ival[i]) + accum->intval.ival[i] = coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_INT_MINMAXLOC: + /* RSDG 4.5.9.2.2 MINMAXLOC */ + /* return smallest value and its index */ + if (accum->intminmax.iminval > coll_data->intminmax.iminval) { + accum->intminmax.iminval = coll_data->intminmax.iminval; + accum->intminmax.iminidx = coll_data->intminmax.iminidx; + } else + /* return smallest index if values equal */ + if (accum->intminmax.iminval == coll_data->intminmax.iminval && + accum->intminmax.iminidx > coll_data->intminmax.iminidx) { + accum->intminmax.iminidx = coll_data->intminmax.iminidx; + } + + /* return largest value and its index */ + if (accum->intminmax.imaxval < coll_data->intminmax.imaxval) { + accum->intminmax.imaxval = coll_data->intminmax.imaxval; + accum->intminmax.imaxidx = coll_data->intminmax.imaxidx; + } else + /* return smallest (yes) index if values equal */ + if (accum->intminmax.imaxval == coll_data->intminmax.imaxval && + accum->intminmax.imaxidx > coll_data->intminmax.imaxidx) { + accum->intminmax.imaxidx = coll_data->intminmax.imaxidx; + } + /* overflow not possible */ + break; + case COLL_OPCODE_INT_SUM: + for (i = 0; i < 4; i++) { + bool newneg = (coll_data->intval.ival[i] < 0); + bool oldneg = (accum->intval.ival[i] < 0); + bool sumneg; + accum->intval.ival[i] += coll_data->intval.ival[i]; + sumneg = (accum->intval.ival[i] < 0); + /* if sum changed sign, and doesn't match new sign */ + if (sumneg != oldneg && sumneg != newneg) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_INT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_MINNUM: + /* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */ + for (i = 0; i < 4; i++) { + swpnan2(&accum->fltval.fval[i], coll_data->fltval.fval[i], 1, + &accum->red_rc); + } + break; + case COLL_OPCODE_FLT_MAXNUM: + /* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */ + for (i = 0; i < 4; i++) { + swpnan2(&accum->fltval.fval[i], coll_data->fltval.fval[i], 0, + &accum->red_rc); + } + break; + case COLL_OPCODE_FLT_MINMAXNUMLOC: + /* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */ + swp = swpnan2(&accum->fltminmax.fminval, + coll_data->fltminmax.fminval, 1, &accum->red_rc); + swpidx(&accum->fltminmax.fminidx, coll_data->fltminmax.fminidx, swp); + swp = swpnan2(&accum->fltminmax.fmaxval, + coll_data->fltminmax.fmaxval, 0, &accum->red_rc); + swpidx(&accum->fltminmax.fmaxidx, coll_data->fltminmax.fmaxidx, swp); + break; + case COLL_OPCODE_FLT_SUM_NOFTZ_RND0: + 
case COLL_OPCODE_FLT_SUM_NOFTZ_RND1: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND2: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND3: + /* Rosetta opcode has been chosen according to the current + * rounding mode for this application, so all we need to do is + * add the numbers. + */ + for (i = 0; i < 4; i++) { + /* NOTE: arithmetic operations will quiesce snan */ + accum->fltval.fval[i] += coll_data->fltval.fval[i]; + + if (!exact(accum->fltval.fval[i], + coll_data->fltval.fval[i])) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_FLT_INEXACT); + if (isinf(accum->fltval.fval[i])) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_FLT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_SUM_FTZ_RND0: + case COLL_OPCODE_FLT_SUM_FTZ_RND1: + case COLL_OPCODE_FLT_SUM_FTZ_RND2: + case COLL_OPCODE_FLT_SUM_FTZ_RND3: + /* Rosetta opcode has been chosen according to the current + * rounding mode for this application, so all we need to do is + * add the numbers. + */ + for (i = 0; i < 4; i++) { + /* NOTE: arithmetic operations will quiesce snan */ + accum->fltval.fval[i] += coll_data->fltval.fval[i]; + + if (!exact(accum->fltval.fval[i], + coll_data->fltval.fval[i])) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_FLT_INEXACT); + if (isinf(accum->fltval.fval[i])) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_FLT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_REPSUM: + cxip_rep_add(&accum->repsum, &coll_data->repsum); + break; + } +} + +/**************************************************************************** + * Reduction packet management. + */ + +/** + * Prevent setting the ARM bit on a root packet. + * + * This is used in testing to suppress Rosetta collective operations, forcing + * all leaf packets to arrive at the root, creating an incast. + */ +int cxip_coll_arm_disable(struct fid_mc *mc, bool disable) +{ + struct cxip_coll_mc *mc_obj = (struct cxip_coll_mc *)mc; + int old = mc_obj->arm_disable; + + mc_obj->arm_disable = disable; + + return old; +} + +/** + * Limit the reduction ID values. + * + * Reduction ID values do round-robin over an adjustable range of values. This + * is useful in testing to force all reductions to use reduction id zero (set + * max_red_id to 1), but could be used in production to use only a subset of + * reduction IDs to limit fabric resource exhaustion when concurrent reductions + * are used. 
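+ *
+ * For example, a test harness can force every reduction onto red_id 0
+ * ('mc' being the fid_mc produced by the collective join):
+ *
+ *   cxip_coll_limit_red_id(mc, 1);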
+ */ +void cxip_coll_limit_red_id(struct fid_mc *mc, int max_red_id) +{ + struct cxip_coll_mc *mc_obj = (struct cxip_coll_mc *)mc; + + if (max_red_id < 1) + max_red_id = 1; + if (max_red_id > CXIP_COLL_MAX_CONCUR) + max_red_id = CXIP_COLL_MAX_CONCUR; + mc_obj->max_red_id = max_red_id; +} + +/* drop the next packet sent */ +void cxip_coll_drop_send(struct cxip_coll_reduction *reduction) +{ + reduction->drop_send = true; +} + +/* drop the next packet received */ +void cxip_coll_drop_recv(struct cxip_coll_reduction *reduction) +{ + reduction->drop_recv = true; +} + +/* Return true if this node is the hwroot node */ +static inline +bool is_hw_root(struct cxip_coll_mc *mc_obj) +{ + return (mc_obj->hwroot_idx == mc_obj->mynode_idx); +} + +/* Simulated unicast send of multiple packets as root node to leaf nodes */ +static inline +ssize_t _send_pkt_as_root(struct cxip_coll_reduction *reduction, bool retry) +{ + int i, ret, err; + + err = 0; + for (i = 0; i < reduction->mc_obj->av_set_obj->fi_addr_cnt; i++) { + if (i == reduction->mc_obj->mynode_idx && + reduction->mc_obj->av_set_obj->fi_addr_cnt > 1) { + TRACE_DEBUG("root: skip=%d\n", i); + continue; + } + ret = cxip_coll_send(reduction, i, + reduction->tx_msg, + sizeof(struct red_pkt), + reduction->mc_obj->reduction_md); + TRACE_DEBUG("root: send=%d ret=%d\n", i, ret); + if (!err) + err = ret; + } + return err; +} + +/* Simulated unicast send of single packet as leaf node to root node */ +static inline +ssize_t _send_pkt_as_leaf(struct cxip_coll_reduction *reduction, bool retry) +{ + int ret; + + ret = cxip_coll_send(reduction, reduction->mc_obj->hwroot_idx, + reduction->tx_msg, sizeof(struct red_pkt), + reduction->mc_obj->reduction_md); + TRACE_DEBUG("leaf: send=%d ret=%d\n", 1, ret); + return ret; +} + +/* Multicast send of single packet from root or leaf node */ +static inline +ssize_t _send_pkt_mc(struct cxip_coll_reduction *reduction, bool retry) +{ + return cxip_coll_send(reduction, 0, + reduction->tx_msg, + sizeof(struct red_pkt), + reduction->mc_obj->reduction_md); +} + +/* Send packet from root or leaf node as appropriate */ +static inline +ssize_t _send_pkt(struct cxip_coll_reduction *reduction, bool retry) +{ + int ret; + + if (reduction->mc_obj->av_set_obj->comm_key.keytype == + COMM_KEY_MULTICAST) { + ret = _send_pkt_mc(reduction, retry); + } else if (is_hw_root(reduction->mc_obj)) { + ret = _send_pkt_as_root(reduction, retry); + } else { + ret = _send_pkt_as_leaf(reduction, retry); + } + return ret; +} + +/* prepare and issue the reduction packet */ +int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, + const struct cxip_coll_data *coll_data, + bool arm, bool retry) +{ + struct red_pkt *pkt; + int ret; + + pkt = (struct red_pkt *)reduction->tx_msg; + + memset(&pkt->hdr, 0, sizeof(pkt->hdr)); + pkt->hdr.arm = arm; + pkt->hdr.seqno = reduction->seqno; + pkt->hdr.resno = reduction->resno; + pkt->hdr.cookie.mcast_id = reduction->mc_obj->mcast_addr; + pkt->hdr.cookie.red_id = reduction->red_id; + pkt->hdr.cookie.retry = retry; + pkt->hdr.cookie.magic = MAGIC; + + if (coll_data) { + pkt->hdr.redcnt = coll_data->red_cnt; + pkt->hdr.op = coll_data->red_op; + pkt->hdr.red_rc = coll_data->red_rc; + /* repsum has some additional information that must be set */ + if (_opcode_to_redtype(coll_data->red_op) == REDTYPE_REPSUM) { + pkt->hdr.repsum_m = coll_data->repsum.M; + pkt->hdr.repsum_ovflid = coll_data->repsum.overflow_id; + } + memcpy(pkt->data, &coll_data->databuf, CXIP_COLL_MAX_DATA_SIZE); + } else { + pkt->hdr.redcnt = 0; + 
pkt->hdr.op = 0; + pkt->hdr.red_rc = 0; + pkt->hdr.repsum_m = 0; + pkt->hdr.repsum_ovflid = 0; + memset(pkt->data, 0, CXIP_COLL_MAX_DATA_SIZE); + } + _dump_red_pkt(pkt, "send"); + _swappkt(pkt); + + /* -FI_EAGAIN means HW queue is full, should self-clear */ + do { + ret = _send_pkt(reduction, retry); + } while (ret == -FI_EAGAIN); + /* any other error is a serious config/hardware issue */ + if (ret) + CXIP_WARN("Fatal send error = %d\n", ret); + + return ret; +} + +/* Post a reduction completion request to the collective completion queue */ +static void _post_coll_complete(struct cxip_coll_reduction *reduction) +{ + struct cxip_req *req; + int ret; + + /* Indicates collective completion by writing to the endpoint TX CQ */ + req = reduction->op_inject_req; + if (!req) + return; + + if (reduction->accum.red_rc == CXIP_COLL_RC_SUCCESS) { + ret = cxip_cq_req_complete(req); + } else { + ret = cxip_cq_req_error(req, 0, + _cxip_rc_to_cxi_rc[reduction->accum.red_rc], + reduction->accum.red_rc, NULL, 0, FI_ADDR_UNSPEC); + } + if (ret) { + /* Is this possible? The only error is -FI_ENOMEM. It looks like + * send is blocked with -FI_EAGAIN until we are guaranteed EQ + * space in the queue. Display and ignore. + */ + CXIP_WARN("Attempt to post completion failed %s\n", + fi_strerror(-ret)); + } + + /* req structure no longer needed */ + cxip_evtq_req_free(req); + + /* restore reduction object to usable state */ + reduction->accum.initialized = false; + reduction->in_use = false; + reduction->completed = false; + reduction->pktsent = false; + reduction->accum.initialized = false; + reduction->accum.red_rc = CXIP_COLL_RC_SUCCESS; + reduction->op_inject_req = NULL; +} + +/* unpack reduction data from a reduction packet */ +static void _unpack_red_data(struct cxip_coll_data *coll_data, + const struct red_pkt *pkt) +{ + memcpy(coll_data->databuf, pkt->data, 32); + coll_data->repsum.M = pkt->hdr.repsum_m; + coll_data->repsum.overflow_id = pkt->hdr.repsum_ovflid; + coll_data->red_op = pkt->hdr.op; + coll_data->red_cnt = pkt->hdr.redcnt; + coll_data->red_rc = pkt->hdr.red_rc; + coll_data->initialized = true; +} + +/**************************************************************************** + * Collective State Machine + * + * The basic flow is: + * - all nodes reach a common reduction call (at different times) + * - leaf nodes send their data, to be reduced, and block, polling CQ + * - root node prepares for the reduction, and blocks, polling CQ + * - root node receives leaf packets and reduces them, until all received + * - root node sends Arm Packet with final result, and unblocks + * - leaf nodes receive Arm Packet with final result, and unblock + * + * The Rosetta acceleration comes from the Arm Packet, which speculatively arms + * the Rosetta tree for the NEXT operation. This persists until a timeout + * expires. The timeout is specified when the multicast tree is created by the + * Rosetta configuration service, and cannot be modified after join is complete. + * + * If the next collective operation occurs within the timeout, the leaf results + * will be reduced in reduction engines by Rosetta as they move up the tree, + * reducing the number of packets received by the root. + * + * If the reduction engine times out with partial results, it forwards the + * partial results, and all subsequent results are passed directly to the next + * Rosetta. + * + * The first leaf contribution to reach a reduction engine establishes the + * reduction operation. 
+ * reduction operation. All subsequent contributions must use the same
+ * operation, or Rosetta returns an error.
+ *
+ * There are eight reduction_id values, which can be used to acquire and use up
+ * to eight independent reduction engines (REs) at each upstream port of each
+ * Rosetta switch in the collective tree.
+ *
+ * We use a round-robin selection of reduction id values. There is a small race
+ * condition among the leaf nodes as the result is distributed from the root. If
+ * another reduction were to be initiated during this race, the leaf nodes would
+ * be in disagreement as to which reduction IDs were free for the new reduction.
+ * To avoid this, we use a deterministic algorithm (round-robin) so that the
+ * "next" reduction id is always predetermined for each reduction.
+ *
+ * Ordering of requests and responses will be the same on all nodes.
+ *
+ * Consistent ordering of requests is the responsibility of the application. If
+ * requests are ordered differently on different nodes, results are undefined,
+ * and it is considered an application error.
+ *
+ * Ordering of responses is guaranteed by the mc_obj->tail_red_id value, which
+ * is advanced after the reduction completes. This ordering is required to
+ * ensure that the round-robin is observed.
+ */
+
+/* modular increment/decrement */
+#define INCMOD(val, mod) do {(val)=((val)+1)%(mod);} while (0)
+#define DECMOD(val, mod) do {(val)=((val)+(mod)-1)%(mod);} while (0)
+
+/* MONOTONIC timestamp operations for timeouts/retries */
+static inline
+void _tsget(struct timespec *ts)
+{
+	clock_gettime(CLOCK_MONOTONIC, ts);
+}
+
+static inline
+void _tsadd(struct timespec *ts, const struct timespec *dt)
+{
+	ts->tv_sec += dt->tv_sec;
+	ts->tv_nsec += dt->tv_nsec;
+	if (ts->tv_nsec >= 1000000000L) {
+		ts->tv_sec += 1;
+		ts->tv_nsec -= 1000000000L;
+	}
+}
+
+/* Set a timespec at expiration time (future) */
+static inline
+void _tsset(struct cxip_coll_reduction *reduction)
+{
+	_tsget(&reduction->tv_expires);
+	_tsadd(&reduction->tv_expires, &reduction->mc_obj->timeout);
+}
+
+/* Used to prevent first-use incast */
+static inline
+bool _is_red_first_time(struct cxip_coll_reduction *reduction)
+{
+	return (reduction->tv_expires.tv_sec == 0L &&
+		reduction->tv_expires.tv_nsec == 0L);
+}
+
+/* Used to reduce incast congestion during run */
+static inline
+bool _is_red_timed_out(struct cxip_coll_reduction *reduction)
+{
+	struct timespec tsnow;
+
+	if (_is_red_first_time(reduction)) {
+		TRACE_DEBUG("=== root first time, retry\n");
+		return true;
+	}
+	_tsget(&tsnow);
+	if (tsnow.tv_sec < reduction->tv_expires.tv_sec)
+		return false;
+	if (tsnow.tv_sec == reduction->tv_expires.tv_sec &&
+	    tsnow.tv_nsec < reduction->tv_expires.tv_nsec)
+		return false;
+	TRACE_DEBUG("=== root timeout, retry\n");
+	return true;
+}
+
+/* Root node state machine progress.
+ * !pkt means this is progressing from injection call (e.g.
fi_reduce()) + * pkt means this is progressing from event callback (leaf packet) + */ +static void _progress_root(struct cxip_coll_reduction *reduction, + struct red_pkt *pkt) +{ + struct cxip_coll_mc *mc_obj = reduction->mc_obj; + struct cxip_coll_data coll_data; + ssize_t ret; + + /* State machine disabled for testing */ + if (reduction->coll_state != CXIP_COLL_STATE_READY) + return; + + /* Injection or packet arrival after root timeout initiates a retry */ + if (_is_red_timed_out(reduction)) { + /* reset reduction for retry send */ + reduction->seqno = mc_obj->seqno; + INCMOD(mc_obj->seqno, CXIP_COLL_MAX_SEQNO); + ofi_atomic_inc32(&mc_obj->tmout_cnt); + + ret = cxip_coll_send_red_pkt(reduction, NULL, + !mc_obj->arm_disable, true); + _tsset(reduction); + if (ret) { + SET_RED_RC(reduction->accum.red_rc, + CXIP_COLL_RC_TX_FAILURE); + reduction->completed = true; + goto post_complete; + } + return; + } + + /* Process received packet */ + if (pkt) { + /* Root has received a leaf packet */ + _dump_red_pkt(pkt, "Rrcv"); + + /* Drop out-of-date packets */ + if (pkt->hdr.resno != reduction->seqno) { + TRACE_DEBUG("bad seqno, exp=%d saw=%d\n", + reduction->seqno, pkt->hdr.resno); + ofi_atomic_inc32(&mc_obj->seq_err_cnt); + return; + } + + /* capture and reduce packet information */ + _unpack_red_data(&coll_data, pkt); + _reduce(&reduction->accum, &coll_data, false); + _dump_coll_data("after leaf contrib to root", &reduction->accum); + } + + /* check for reduction complete */ + if (reduction->accum.red_cnt == mc_obj->av_set_obj->fi_addr_cnt) { + /* copy reduction result to user result buffer */ + if (reduction->op_rslt_data && reduction->op_data_bytcnt) { + memcpy(reduction->op_rslt_data, + reduction->accum.databuf, + reduction->op_data_bytcnt); + } + + /* send reduction result to leaves, arm new seqno */ + reduction->seqno = mc_obj->seqno; + INCMOD(mc_obj->seqno, CXIP_COLL_MAX_SEQNO); + reduction->completed = true; + + ret = cxip_coll_send_red_pkt(reduction, &reduction->accum, + !mc_obj->arm_disable, false); + _tsset(reduction); + if (ret) + SET_RED_RC(reduction->accum.red_rc, + CXIP_COLL_RC_TX_FAILURE); + } + +post_complete: + /* Post completions in injection order */ + reduction = &mc_obj->reduction[mc_obj->tail_red_id]; + while (reduction->in_use && reduction->completed) { + /* Reduction completed on root */ + _post_coll_complete(reduction); + + /* Advance to the next reduction */ + INCMOD(mc_obj->tail_red_id, mc_obj->max_red_id); + reduction = &mc_obj->reduction[mc_obj->tail_red_id]; + } +} + +/* Leaf node state machine progress. + * !pkt means this is progressing from injection call (e.g. 
fi_reduce()) + * pkt means this is progressing from event callback (receipt of packet) + */ +static void _progress_leaf(struct cxip_coll_reduction *reduction, + struct red_pkt *pkt) +{ + struct cxip_coll_mc *mc_obj = reduction->mc_obj; + struct cxip_coll_data coll_data; + int ret; + + /* state machine disabled for testing */ + if (reduction->coll_state != CXIP_COLL_STATE_READY) + return; + + /* if reduction packet, reset timer, seqno, honor retry */ + if (pkt) { + _dump_red_pkt(pkt, "Lrcv"); + _tsset(reduction); + reduction->seqno = pkt->hdr.seqno; + reduction->resno = pkt->hdr.seqno; + if (pkt->hdr.cookie.retry) + reduction->pktsent = false; + } + + /* leaves lead with sending a packet */ + if (!reduction->pktsent) { + /* Avoid first-use incast, retry guaranteed */ + if (_is_red_first_time(reduction)) { + TRACE_DEBUG("=== leaf first time, wait\n"); + return; + } + + /* Don't send if nothing to send yet */ + if (!reduction->accum.initialized) + return; + + /* Send leaf data */ + ret = cxip_coll_send_red_pkt(reduction, &reduction->accum, + false, false); + if (ret) { + SET_RED_RC(reduction->accum.red_rc, + CXIP_COLL_RC_TX_FAILURE); + reduction->completed = true; + goto post_complete; + } + reduction->pktsent = true; + } + + /* If no incoming reduction packet, we are done */ + if (!pkt) + return; + + /* If packet has no reduction count (retry), done */ + if (!pkt->hdr.redcnt) + return; + + /* Capture final reduction data in user-pointer */ + SET_RED_RC(reduction->accum.red_rc, pkt->hdr.red_rc); + if (reduction->op_rslt_data) { + _unpack_red_data(&coll_data, pkt); + memcpy(reduction->op_rslt_data, + &coll_data.databuf, + reduction->op_data_bytcnt); + } + /* Reduction completed on leaf */ + reduction->completed = true; + +post_complete: + /* Post completions in injection order */ + reduction = &mc_obj->reduction[mc_obj->tail_red_id]; + while (reduction->in_use && reduction->completed) { + _post_coll_complete(reduction); + INCMOD(mc_obj->tail_red_id, mc_obj->max_red_id); + reduction = &mc_obj->reduction[mc_obj->tail_red_id]; + } +} + +/* Root or leaf progress state machine. + */ +static void _progress_coll(struct cxip_coll_reduction *reduction, + struct red_pkt *pkt) +{ + if (is_hw_root(reduction->mc_obj)) + _progress_root(reduction, pkt); + else + _progress_leaf(reduction, pkt); +} + +/* Debugging only */ +static int *_injected_red_id_buf; +void cxip_capture_red_id(int *red_id_buf) +{ + _injected_red_id_buf = red_id_buf; +} + +/* Generic collective pre-reduction into cxip_coll_data structure */ +static void +_cxip_coll_prereduce(int cxi_opcode, const void *op_send_data, + void *accum, size_t sendcnt, uint64_t flags) +{ + const struct cxip_coll_data *coll_data_ptr; + struct cxip_coll_data coll_data; + + /* Convert user data to local coll_data structure */ + if (flags & FI_CXI_PRE_REDUCED) { + coll_data_ptr = op_send_data; + } else { + _init_coll_data(&coll_data, cxi_opcode, op_send_data, + sendcnt); + coll_data_ptr = &coll_data; + } + _dump_coll_data("coll_data initialized pre", coll_data_ptr); + + /* pre-reduce data into accumulator */ + _reduce(accum, coll_data_ptr, true); +} + +/* Generic collective injection into fabric. + * + * Reduction ID is normally hidden. Can be exposed by calling _capture_red_id() + * just before calling a reduction operation. + * + * - Acquires next available reduction structure in MC, or returns -FI_EAGAIN. + * - Acquires evtq request, or return -FI_EAGAIN. + * - Marks reduction structure in-use. + * - Advances next available reduction pointer. 
+ * - Initializes: + * - result data pointer + * - source data (pre-reduced or raw) + * - data byte count + * - Reduces user data into reduction accumulator (may already contain data) + * - Progresses reduction (no packet supplied) + */ +static ssize_t +_cxip_coll_inject(struct cxip_coll_mc *mc_obj, int cxi_opcode, + const void *op_send_data, void *op_rslt_data, + size_t bytcnt, uint64_t flags, void *context) +{ + struct cxip_coll_reduction *reduction; + struct cxip_coll_data coll_data; + struct cxip_req *req; + int ret; + + TRACE_DEBUG("%s entry\n", __func__); + TRACE_DEBUG("%s bytecnt=%ld\n", __func__, bytcnt); + ofi_genlock_lock(&mc_obj->ep_obj->lock); + + /* must observe strict round-robin across all nodes */ + reduction = &mc_obj->reduction[mc_obj->next_red_id]; + if (reduction->in_use) { + ret = -FI_EAGAIN; + goto quit; + } + + /* acquire a request structure */ + req = cxip_evtq_req_alloc(mc_obj->ep_obj->coll.tx_evtq, 1, NULL); + if (!req) { + ret = -FI_EAGAIN; + goto quit; + } + + /* Used for debugging */ + if (_injected_red_id_buf) { + *_injected_red_id_buf = reduction->red_id; + _injected_red_id_buf = NULL; + } + + /* advance next_red_id, reserving this one for us */ + INCMOD(mc_obj->next_red_id, mc_obj->max_red_id); + reduction->in_use = true; + + /* Set up the reduction structure */ + reduction->op_rslt_data = op_rslt_data; + reduction->op_data_bytcnt = bytcnt; + reduction->op_context = context; + reduction->op_inject_req = req; + reduction->op_inject_req->context = (uint64_t)context; + + /* Convert user data to local coll_data structure */ + if (flags & FI_CXI_PRE_REDUCED) + memcpy(&coll_data, op_send_data, sizeof(coll_data)); + else + _init_coll_data(&coll_data, cxi_opcode, op_send_data, bytcnt); + + /* reduce data into accumulator */ + _reduce(&reduction->accum, &coll_data, false); + _dump_coll_data("coll_data initialized inj", &coll_data); + + /* Progress the collective */ + _progress_coll(reduction, NULL); + ret = FI_SUCCESS; + +quit: + ofi_genlock_unlock(&mc_obj->ep_obj->lock); + TRACE_DEBUG("%s return %d\n", __func__, ret); + return ret; +} + +/* Get the mc_obj from ep/coll_addr and check for consistency */ +static inline +ssize_t _get_mc_obj(struct fid_ep *ep, fi_addr_t coll_addr, + struct cxip_coll_mc **mc_obj) +{ + struct cxip_ep *cxi_ep; + + if (!ep) { + CXIP_WARN("Collective requires ep\n"); + return -FI_EINVAL; + } + + if (!coll_addr) { + CXIP_WARN("Collective requires coll_addr\n"); + return -FI_EINVAL; + } + + cxi_ep = container_of(ep, struct cxip_ep, ep.fid); + *mc_obj = (struct cxip_coll_mc *)((uintptr_t)coll_addr); + + if ((*mc_obj)->ep_obj != cxi_ep->ep_obj) { + CXIP_WARN("Multicast does not belong to ep\n"); + return -FI_EINVAL; + } + + if (!(*mc_obj)->is_joined) { + CXIP_WARN("Multicast collective not joined\n"); + return -FI_EOPBADSTATE; + } + + return FI_SUCCESS; +} + +/* get payload byte count and check for consistency */ +static inline +ssize_t _get_bytcnt(int cxi_opcode, enum fi_datatype datatype, + const void *buf, size_t count) +{ + ssize_t bytcnt; + + if (cxi_opcode < 0) { + CXIP_WARN("opcode not supported\n"); + return -FI_EINVAL; + } + + if (!buf || count <= 0L) { + CXIP_WARN("buffer required\n"); + return -FI_EINVAL; + } + + bytcnt = _get_cxi_data_bytcnt(cxi_opcode, datatype, count); + if (bytcnt < 0) + CXIP_WARN("opcode does not support datatype\n"); + + return bytcnt; +} + +ssize_t cxip_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context) +{ + struct cxip_coll_mc *mc_obj; + int cxi_opcode; + ssize_t ret; + + /* barrier requires mc_obj 
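+	 * (coll_addr is the fi_addr published in the fid_mc returned by
+	 * fi_join_collective(); a caller would pass mc->fi_addr, which
+	 * _get_mc_obj() converts back to the cxip_coll_mc pointer set up
+	 * in _initialize_mc())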
*/ + ret = _get_mc_obj(ep, coll_addr, &mc_obj); + if (ret) + return ret; + + cxi_opcode = COLL_OPCODE_BARRIER; + + return _cxip_coll_inject(mc_obj, cxi_opcode, NULL, NULL, 0, 0, context); +} + +ssize_t cxip_broadcast(struct fid_ep *ep, void *buf, size_t count, + void *desc, fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, + void *context) +{ + struct cxip_coll_mc *mc_obj; + int cxi_opcode, bytcnt; + ssize_t ret; + + if (flags & (FI_MORE|FI_CXI_PRE_REDUCED)) { + CXIP_WARN("Illegal flags for broadcast\n"); + return -FI_EINVAL; + } + + cxi_opcode = COLL_OPCODE_BIT_OR; + bytcnt = _get_bytcnt(cxi_opcode, datatype, buf, count); + if (bytcnt < 0) + return -FI_EINVAL; + + /* broadcast requires mc_obj */ + ret = _get_mc_obj(ep, coll_addr, &mc_obj); + if (ret) + return ret; + + /* only root node contributes data, others contribute 0 */ + if (root_addr != mc_obj->mynode_fiaddr) + memset(buf, 0, bytcnt); + + /* buf serves as source and result */ + return _cxip_coll_inject(mc_obj, cxi_opcode, buf, buf, bytcnt, + flags, context); +} + +ssize_t cxip_reduce(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, enum fi_op op, uint64_t flags, + void *context) +{ + struct cxip_coll_mc *mc_obj; + int cxi_opcode; + ssize_t bytcnt, ret; + + TRACE_DEBUG("%s entry\n", __func__); + cxi_opcode = cxip_fi2cxi_opcode(op, datatype); + bytcnt = _get_bytcnt(cxi_opcode, datatype, buf, count); + if (bytcnt < 0) + return (ssize_t)bytcnt; + + /* FI_MORE requires result buffer, succeeds immediately */ + if (flags & FI_MORE) { + if (!result) { + CXIP_WARN("result required with FI_MORE\n"); + return -FI_EINVAL; + } + _cxip_coll_prereduce(cxi_opcode, buf, result, bytcnt, flags); + return FI_SUCCESS; + } + + /* otherwise reduce requires mc_obj */ + ret = _get_mc_obj(ep, coll_addr, &mc_obj); + if (ret) + return ret; + + /* root requires a result buffer */ + if (!result && (mc_obj->mynode_fiaddr == root_addr)) { + CXIP_WARN("reduce root result required\n"); + return -FI_EINVAL; + } + + return _cxip_coll_inject(mc_obj, cxi_opcode, buf, result, bytcnt, + flags, context); +} + +ssize_t cxip_allreduce(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, enum fi_datatype datatype, + enum fi_op op, uint64_t flags, void *context) +{ + struct cxip_coll_mc *mc_obj; + int cxi_opcode, bytcnt; + ssize_t ret; + + TRACE_DEBUG("%s entry\n", __func__); + cxi_opcode = cxip_fi2cxi_opcode(op, datatype); + TRACE_DEBUG("%s cxi_opcode = %d\n", __func__, cxi_opcode); + bytcnt = _get_bytcnt(cxi_opcode, datatype, buf, count); + TRACE_DEBUG("%s bytcnt = %d\n", __func__, bytcnt); + if (bytcnt < 0) + return bytcnt; + + /* result required in all cases */ + if (!result) { + CXIP_WARN("result required with FI_MORE\n"); + return -FI_EINVAL; + } + + /* FI_MORE succeeds immediately */ + if (flags & FI_MORE) { + _cxip_coll_prereduce(cxi_opcode, buf, result, bytcnt, flags); + return FI_SUCCESS; + } + + /* otherwise reduce requires mc_obj */ + ret = _get_mc_obj(ep, coll_addr, &mc_obj); + if (ret) + return ret; + + return _cxip_coll_inject(mc_obj, cxi_opcode, buf, result, bytcnt, + flags, context); +} + +/**************************************************************************** + * JOIN COLLECTIVE STATE MACHINE + */ + +/* Packed structure to fit information into zbcoll broadcast payload */ +union pack_mcast { + uint64_t uint64; + struct { + uint64_t 
mcast_addr: 16;// maximum anticipated multicast + uint64_t hwroot_idx: 27;// 128M endpoints in tree + uint64_t valid: 1; // success flag + uint64_t pad: 20; // needed by zbcoll + } __attribute__((__packed__)); + struct { + uint64_t error_bits: 43;// up to 43 independent errors + uint64_t valid1: 1; // unused/reserved + uint64_t pad1: 20; // unused/reserved + + } __attribute__((__packed__)); +}; + +/* State structure for carrying data through the join sequence */ +struct cxip_join_state { + struct cxip_ep_obj *ep_obj; // ep object + struct cxip_av_set *av_set_obj; // av set for this collective + struct cxip_coll_mc *mc_obj; // mc object for this collective + struct cxip_zbcoll_obj *zb; // zb object associated with state + struct fid_mc **mc; // user pointer to return mc_obj + void *context; // user context for concurrent joins + uint64_t join_flags; // user-supplied libfabric join flags + union pack_mcast bcast_data; // packed multicast data + bool rx_discard; // set if RX events should be discarded + bool is_rank; // set if using COLL_RANK simulation model + bool is_mcast; // set if using Rosetta multicast tree + bool create_mcast; // set to create Rosetta multicast tree + bool creating_mcast; // set once CURL has been initiated + bool finished_mcast; // set once CURL has been completed + bool created_ptlte; // set once PtlTE is initialized + int mynode_idx; // index within the fi_addr[] list + int mynode_fiaddr; // fi_addr of this node + int simrank; // simulated rank of NIC + int pid_idx; // pid_idx used by ptl_te + int prov_errno; // collective provider error + int sched_state; // scheduled operation + int join_idx; // unique join index for diagnostics + struct dlist_entry sched_link; // link to scheduled actions +}; + +/* State structure for recovering data from CURL response */ +struct cxip_curl_mcast_usrptr { + struct cxip_join_state *jstate; // join state + int mcast_id; // multicast address + int hwroot_rank; // hardware root index +}; + +/* pack provider errors into AND bitmask - address data */ +void _proverr_to_bits(struct cxip_join_state *jstate) +{ + int bitno; + + /* record error as a bit for this endpoint */ + jstate->bcast_data.error_bits = 0L; + if (!jstate->bcast_data.valid) { + bitno = -jstate->prov_errno; + jstate->bcast_data.error_bits |= (1L << bitno); + } + /* invert bits, zbcoll reduce does AND */ + jstate->bcast_data.error_bits ^= -1L; +} + +/* unpack AND bitmask into dominant provider error */ +void _bits_to_proverr(struct cxip_join_state *jstate) +{ + int bitno; + + /* zbcoll reduce does AND, invert bits */ + jstate->bcast_data.error_bits ^= -1L; + + /* if data is valid, bits do not represent errors */ + if (jstate->bcast_data.valid) { + jstate->prov_errno = CXIP_PROV_ERRNO_OK; + return; + } + + /* bits set represent multiple errors from endpoints */ + for (bitno = -CXIP_PROV_ERRNO_OK; bitno < -CXIP_PROV_ERRNO_LAST; bitno++) { + if (jstate->bcast_data.error_bits & (1 << bitno)) { + jstate->prov_errno = -bitno; + CXIP_WARN("join error %d seen\n", jstate->prov_errno); + } + } + /* returns most significant of multiple errors as jstate->prov_errno */ +} + +/* Close collective pte object - ep_obj->lock must be held */ +static void _close_pte(struct cxip_coll_pte *coll_pte) +{ + int ret; + + if (!coll_pte) + return; + do { + ret = _coll_pte_disable(coll_pte); + } while (ret == -FI_EAGAIN); + _coll_destroy_buffers(coll_pte); + cxip_pte_free(coll_pte->pte); + free(coll_pte); +} + +/* pid_idx == CXIP_PTL_IDX_COLL+rank for NETSIM + * pid_idx == CXIP_PTL_IDX_COLL for all 
other cases + */ +static int _acquire_pte(struct cxip_ep_obj *ep_obj, int pid_idx, + bool is_mcast, struct cxip_coll_pte **coll_pte_ret) +{ + struct cxi_pt_alloc_opts pt_opts = { + .use_long_event = 1, + .do_space_check = 1, + .en_restricted_unicast_lm = 1, + }; + struct cxip_coll_pte *coll_pte; + int ret; + + *coll_pte_ret = NULL; + coll_pte = calloc(1, sizeof(*coll_pte)); + if (!coll_pte) + return -FI_ENOMEM; + + /* initialize coll_pte */ + coll_pte->ep_obj = ep_obj; + dlist_init(&coll_pte->buf_list); + ofi_atomic_initialize32(&coll_pte->buf_cnt, 0); + ofi_atomic_initialize32(&coll_pte->buf_swap_cnt, 0); + ofi_atomic_initialize32(&coll_pte->recv_cnt, 0); + + /* bind PTE to domain */ + ret = cxip_pte_alloc(ep_obj->ptable, ep_obj->coll.rx_evtq->eq, + pid_idx, is_mcast, &pt_opts, _coll_pte_cb, + coll_pte, &coll_pte->pte); + if (ret) + goto fail; + + /* enable the PTE */ + ret = _coll_pte_enable(coll_pte, CXIP_PTE_IGNORE_DROPS); + if (ret) + goto fail; + + /* add buffers to the PTE */ + ret = _coll_add_buffers(coll_pte, + ep_obj->coll.buffer_size, + ep_obj->coll.buffer_count); + if (ret) + goto fail; + + *coll_pte_ret = coll_pte; + return FI_SUCCESS; + +fail: + _close_pte(coll_pte); + return ret; +} + +/* Close multicast collective object */ +static void _close_mc(struct cxip_coll_mc *mc_obj) +{ + int count; + + if (!mc_obj) + return; + /* clear the mcast_addr -> mc_obj reference*/ + ofi_idm_clear(&mc_obj->ep_obj->coll.mcast_map, mc_obj->mcast_addr); + mc_obj->ep_obj->coll.is_hwroot = false; + + /* clear the avset alteration lockout */ + mc_obj->av_set_obj->mc_obj = NULL; + + /* unmap the reduction mem descriptor for DMA */ + if (mc_obj->reduction_md) + cxil_unmap(mc_obj->reduction_md); + + /* close any PTE associated with mc_obj (NETSIM) */ + if (mc_obj->coll_pte != mc_obj->ep_obj->coll.coll_pte) + _close_pte(mc_obj->coll_pte); + + /* decrement multicast count (real), close PTE if unused */ + count = ofi_atomic_dec32(&mc_obj->ep_obj->coll.num_mc); + count = ofi_atomic_get32(&mc_obj->ep_obj->coll.num_mc); + if (!count && mc_obj->ep_obj->coll.coll_pte) { + _close_pte(mc_obj->ep_obj->coll.coll_pte); + mc_obj->ep_obj->coll.coll_pte = NULL; + } + free(mc_obj); +} + +static int _fi_close_mc(struct fid *fid) +{ + struct cxip_coll_mc *mc_obj; + + mc_obj = container_of(fid, struct cxip_coll_mc, mc_fid.fid); + _close_mc(mc_obj); + return FI_SUCCESS; +} + +/* multicast object operational functions */ +static struct fi_ops mc_ops = { + .size = sizeof(struct fi_ops), + .close = _fi_close_mc, +}; + +/** + * Utility routine to set up the collective framework in response to calls to + * fi_join_collective(). + * + * If jstate->is_rank is true, this is a NETSIM model, which opens a PTE for + * each call to fi_join_collective() that is bound to the multicast object + * created by that call. This allows simulated multicast traffic through the + * NETSIM loopback port by using different pte_idx values for each PTE to + * disambiguate traffic intended for different simulated hardware endpoints. + * This model does not support multiple MC objects at an endpoint: there is + * exactly one MC address. Progressing the single endpoint will progress all + * of the simulated MC objects. Extending this model to support multiple MC + * objects is not a priority at this time. + * + * If jstate->is_rank is false, this is a multinode model. The first call to + * fi_join_collective() creates a single PTE which is bound to the EP, and + * creates the first multicast object for that endpoint. 
Every subsequent + * join will create an additional multicast object that shares the PTE for + * that endpoint. Multiple NICs on the node are represented by separate EP + * objects, which are functionally distinct: all endpoints must be progressed + * independently, and if any endpoint is not progressed, it will stall the + * collective. + * + * Caller must hold ep_obj->lock. + */ +static int _initialize_mc(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_ep_obj *ep_obj = jstate->ep_obj; + struct cxip_av_set *av_set_obj = jstate->av_set_obj; + struct cxip_coll_mc *mc_obj; + struct cxip_coll_pte *coll_pte; + struct cxip_cmdq *cmdq; + int red_id; + int ret; + + TRACE_JOIN("%s entry\n", __func__); + + mc_obj = calloc(1, sizeof(*mc_obj)); + if (!mc_obj) + return -FI_ENOMEM; + + /* COMM_KEY_RANK model needs a distinct PTE for every MC object. + * All other models share a single PTE for all MCs using an EP. + */ + coll_pte = ep_obj->coll.coll_pte; + if (!coll_pte) { + TRACE_DEBUG("acqiring PTE\n"); + ret = _acquire_pte(ep_obj, jstate->pid_idx, jstate->is_mcast, + &coll_pte); + if (ret) { + TRACE_DEBUG("acquiring PTE failed %d\n", ret); + free(mc_obj); + return ret; + } + if (!jstate->is_rank) { + TRACE_DEBUG("assigned PTE to ep_obj\n"); + ep_obj->coll.coll_pte = coll_pte; + } + /* else leave ep_obj->coll.coll_pte == NULL */ + } + /* copy coll_pte to mc_obj */ + mc_obj->coll_pte = coll_pte; + + /* if COMM_KEY_RANK model, PTE must know the mc_obj */ + coll_pte->mc_obj = (jstate->is_rank) ? mc_obj : NULL; + + /* link ep_obj to mc_obj (1 to many) */ + mc_obj->ep_obj = ep_obj; + ofi_atomic_inc32(&ep_obj->coll.num_mc); + + /* link av_set_obj to mc_obj (one to one) */ + av_set_obj->mc_obj = mc_obj; + mc_obj->av_set_obj = av_set_obj; + + /* initialize remainder of mc_obj */ + mc_obj->mc_fid.fid.fclass = FI_CLASS_MC; + mc_obj->mc_fid.fid.context = mc_obj; + mc_obj->mc_fid.fid.ops = &mc_ops; + mc_obj->mc_fid.fi_addr = (fi_addr_t)(uintptr_t)mc_obj; + mc_obj->hwroot_idx = jstate->bcast_data.hwroot_idx; + mc_obj->mcast_addr = jstate->bcast_data.mcast_addr; + mc_obj->mynode_idx = jstate->mynode_idx; + mc_obj->mynode_fiaddr = jstate->mynode_fiaddr; + mc_obj->max_red_id = CXIP_COLL_MAX_CONCUR; + mc_obj->arm_disable = false; + mc_obj->rx_discard = jstate->rx_discard; + mc_obj->timeout.tv_sec = + cxip_env.coll_retry_usec/1000000L; + mc_obj->timeout.tv_nsec = + (cxip_env.coll_retry_usec%1000000L)*1000L; + for (red_id = 0; red_id < CXIP_COLL_MAX_CONCUR; red_id++) { + struct cxip_coll_reduction *reduction; + + reduction = &mc_obj->reduction[red_id]; + reduction->coll_state = CXIP_COLL_STATE_READY; + reduction->mc_obj = mc_obj; + reduction->red_id = red_id; + reduction->in_use = false; + reduction->completed = false; + } + TRACE_DEBUG("Initializing mc_obj=%p counters\n", mc_obj); + ofi_spin_init(&mc_obj->lock); + ofi_atomic_initialize32(&mc_obj->send_cnt, 0); + ofi_atomic_initialize32(&mc_obj->recv_cnt, 0); + ofi_atomic_initialize32(&mc_obj->pkt_cnt, 0); + ofi_atomic_initialize32(&mc_obj->seq_err_cnt, 0); + ofi_atomic_initialize32(&mc_obj->tmout_cnt, 0); + + /* map entire reduction block if using DMA */ + if (cxip_env.coll_use_dma_put) { + /* EXPERIMENTAL */ + ret = cxil_map(ep_obj->domain->lni->lni, + mc_obj->reduction, + sizeof(mc_obj->reduction), + CXI_MAP_PIN | CXI_MAP_READ | CXI_MAP_WRITE, + NULL, &mc_obj->reduction_md); + if (ret) + goto fail; + } + + /* define the traffic class */ + // TODO revisit for LOW_LATENCY + if (is_netsim(ep_obj)) { + /* NETSIM RANK model */ + mc_obj->tc = 
CXI_TC_BEST_EFFORT; + mc_obj->tc_type = CXI_TC_TYPE_DEFAULT; + } else if (!jstate->is_mcast) { + /* UNICAST model */ + mc_obj->tc = CXI_TC_BEST_EFFORT; + mc_obj->tc_type = CXI_TC_TYPE_DEFAULT; + } else if (is_hw_root(mc_obj)) { + /* MULTICAST model, hw_root */ + mc_obj->tc = CXI_TC_BEST_EFFORT; + mc_obj->tc_type = CXI_TC_TYPE_DEFAULT; + } else { + /* MULTICAST model, leaves */ + mc_obj->tc = CXI_TC_LOW_LATENCY; + mc_obj->tc_type = CXI_TC_TYPE_COLL_LEAF; + } + /* Set this now to instantiate cmdq CP */ + cmdq = ep_obj->coll.tx_cmdq; + ret = cxip_txq_cp_set(cmdq, ep_obj->auth_key.vni, + mc_obj->tc, mc_obj->tc_type); + if (ret) { + TRACE_JOIN("%s: cxip_txq_cp_set() = %d\n", __func__, ret); + goto fail; + } + + /* index mc_obj by mcast_addr for fast lookup */ + TRACE_JOIN("%s: mc addr=%d obj=%p\n", __func__, mc_obj->mcast_addr, mc_obj); + ret = ofi_idm_set(&ep_obj->coll.mcast_map, + mc_obj->mcast_addr, mc_obj); + if (ret < 0) { + TRACE_JOIN("%s: idm set failed %d\n", __func__, ret); + goto fail; + } + /* lock out reuse of this endpoint as hw_root for any multicast addr */ + if (mc_obj->hwroot_idx == mc_obj->mynode_idx) { + TRACE_JOIN("%s: set is_hwroot\n", __func__); + ep_obj->coll.is_hwroot = true; + } +#if ENABLE_DEBUG + struct cxip_coll_mc *mc_obj_chk; + + mc_obj_chk = ofi_idm_lookup(&ep_obj->coll.mcast_map, + mc_obj->mcast_addr); + if (mc_obj_chk != mc_obj) { + TRACE_JOIN("%s: mcast set=%p get=%p\n", + __func__, mc_obj, mc_obj_chk); + } +#endif + /* Last field to set */ + mc_obj->is_joined = true; + + /* Return information to the caller */ + jstate->mc_obj = mc_obj; + *jstate->mc = &mc_obj->mc_fid; + TRACE_JOIN("%s: initialized mc[%d] to %p\n", + __func__, jstate->mynode_idx, *jstate->mc); + + return FI_SUCCESS; + +fail: + _close_mc(mc_obj); + return ret; +} + +/** + * CURL callback function upon completion of a request. + * + * This sets jstate->finished_mcast, even if the operation fails. + * This sets jstate->bcast_data.valid if the address is valid. 
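+ *
+ * Illustrative response body (example values; only the two fields parsed
+ * below are shown, the fabric manager response may carry more):
+ *
+ *   { "mcastID": 1234, "hwRoot": "0:00:2A" }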
+ */
+static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle)
+{
+	struct cxip_curl_mcast_usrptr *curl_usrptr = handle->usrptr;
+	struct cxip_join_state *jstate = curl_usrptr->jstate;
+	struct json_object *json_obj;
+	struct cxip_addr caddr;
+	const char *hwrootstr;
+	int mcaddr, hwroot;
+	uint32_t b2, b1, b0, n;
+	int i, ret;
+
+	/* Creation process is done */
+	TRACE_JOIN("CURL COMPLETED!\n");
+	jstate->finished_mcast = true;
+
+	switch (handle->status) {
+	case 200:
+	case 201:
+		/* CURL succeeded, parse response */
+		TRACE_JOIN("CURL PARSE RESPONSE:\n%s\n", handle->response);
+		if (!(json_obj = json_tokener_parse(handle->response)))
+			break;
+		if (cxip_json_int("mcastID", json_obj, &mcaddr))
+			break;
+		if (cxip_json_string("hwRoot", json_obj, &hwrootstr))
+			break;
+
+		n = sscanf(hwrootstr, "%x:%x:%x", &b2, &b1, &b0);
+		if (n < 3 || b2 > 0xf || b1 > 0xff || b0 > 0xff)
+			break;
+		hwroot = (b2 << 16) + (b1 << 8) + b0;
+
+		TRACE_JOIN("mcastID=%d hwRoot='%s'=%x\n", mcaddr, hwrootstr,
+			   hwroot);
+		for (i = 0; i < jstate->av_set_obj->fi_addr_cnt; i++) {
+			ret = cxip_av_lookup_addr(
+					jstate->av_set_obj->cxi_av,
+					jstate->av_set_obj->fi_addr_ary[i],
+					&caddr);
+			if (ret < 0)
+				continue;
+			TRACE_JOIN("test %d == %d\n", hwroot, caddr.nic);
+			if (hwroot == caddr.nic)
+				break;
+		}
+		TRACE_JOIN("final index=%d\n", i);
+		if (i >= jstate->av_set_obj->fi_addr_cnt) {
+			TRACE_JOIN("multicast HWroot not found in av_set\n");
+			jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INVALID;
+			break;
+		}
+		/* Production MCAST address */
+		jstate->bcast_data.valid = true;
+		jstate->bcast_data.hwroot_idx = i;
+		jstate->bcast_data.mcast_addr = (uint32_t)mcaddr;
+		jstate->is_mcast = true;
+		/* This succeeded */
+		TRACE_JOIN("curl: mcaddr =%08x\n",
+			   jstate->bcast_data.mcast_addr);
+		TRACE_JOIN("curl: hwrootidx=%d\n",
+			   jstate->bcast_data.hwroot_idx);
+		break;
+	default:
+		TRACE_JOIN("ERRMSK SET CURL error %ld!\n", handle->status);
+		if (handle->response)
+			TRACE_JOIN("ERROR RESPONSE:\n%s\n", handle->response);
+		// TODO finer error differentiation from CURL errors
+		jstate->prov_errno = CXIP_PROV_ERRNO_CURL;
+		break;
+	}
+	free(curl_usrptr);
+	TRACE_JOIN("CURL COMPLETED!\n");
+	jstate->finished_mcast = true;
+}
+
+/**
+ * Start a CURL request for a multicast address.
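+ *
+ * The POST body is built from json_fmt below and passed through
+ * single_to_double_quote(); with illustrative values it looks roughly like:
+ *
+ *   {"macs":["0:00:2A","0:00:2B"],"jobID":"<job>","jobStepID":"<step>",
+ *    "timeout":<coll_timeout_usec>}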
+ */ +static void _start_curl(void *ptr) +{ + struct cxip_curl_mcast_usrptr *curl_usrptr; + struct cxip_join_state *jstate = ptr; + static const char *json_fmt = + "{'macs':[%s],'jobID':'%s','jobStepID':'%s','timeout':%ld}"; + struct cxip_addr caddr; + char *jsonreq, *mac, *url, *p; + int i, ret; + + /* early exit will attempt to free these */ + curl_usrptr = NULL; + jsonreq = NULL; + mac = NULL; + url = NULL; + + /* acquire the environment variables needed */ + TRACE_JOIN("jobid = %s\n", cxip_env.coll_job_id); + TRACE_JOIN("stepid = %s\n", cxip_env.coll_job_step_id); + TRACE_JOIN("fmurl = %s\n", cxip_env.coll_fabric_mgr_url); + TRACE_JOIN("token = %s\n", cxip_env.coll_mcast_token); + TRACE_JOIN("maxadrs = %ld\n", cxip_env.hwcoll_addrs_per_job); + TRACE_JOIN("minnodes= %ld\n", cxip_env.hwcoll_min_nodes); + TRACE_JOIN("retry = %ld\n", cxip_env.coll_retry_usec); + TRACE_JOIN("tmout = %ld\n", cxip_env.coll_timeout_usec); + + /* Generic error for any preliminary failures */ + jstate->prov_errno = CXIP_PROV_ERRNO_CURL; + if (!cxip_env.coll_job_id || + !cxip_env.coll_fabric_mgr_url || + !cxip_env.coll_mcast_token) { + TRACE_JOIN("Check environment variables\n"); + ret = -FI_EINVAL; + goto quit; + } + + ret = asprintf(&url, "%s/fabric/collectives/multicast", + cxip_env.coll_fabric_mgr_url); + if (ret < 0) { + TRACE_JOIN("Failed to construct CURL address\n"); + ret = -FI_ENOMEM; + goto quit; + } + + /* five hex digits per mac, two colons, two quotes, comma */ + p = mac = malloc(10*jstate->av_set_obj->fi_addr_cnt + 1); + if (!mac) { + TRACE_JOIN("Failed to allocate mac list\n"); + ret = -FI_ENOMEM; + goto quit; + } + for (i = 0; i < jstate->av_set_obj->fi_addr_cnt; i++) { + ret = cxip_av_lookup_addr( + jstate->av_set_obj->cxi_av, + jstate->av_set_obj->fi_addr_ary[i], &caddr); + if (ret < 0) { + TRACE_JOIN("failed to find address[%d]=%ld\n", + i, jstate->av_set_obj->fi_addr_ary[i]); + goto quit; + } + p += sprintf(p, "'%01X:%02X:%02X',", + (caddr.nic >> 16) & 0xf, + (caddr.nic >> 8) & 0xff, + (caddr.nic) & 0xff); + + } + *(--p) = 0; + + /* generate the CURL JSON request */ + ret = asprintf(&jsonreq, json_fmt, mac, + cxip_env.coll_job_id, + cxip_env.coll_job_step_id, + cxip_env.coll_timeout_usec); + if (ret < 0) { + TRACE_JOIN("Creating JSON request = %d\n", ret); + ret = -FI_ENOMEM; + goto quit; + } + single_to_double_quote(jsonreq); + TRACE_JOIN("JSON = %s\n", jsonreq); + + /* create the mcast address */ + curl_usrptr = calloc(1, sizeof(*curl_usrptr)); + if (!curl_usrptr) { + TRACE_JOIN("curl_usrptr calloc() error\n"); + ret = -FI_ENOMEM; + goto quit; + } + /* dispatch CURL request */ + curl_usrptr->jstate = jstate; + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_CURLSND, &ret)) + goto quit; + ret = cxip_curl_perform(url, jsonreq, cxip_env.coll_mcast_token, 0, + CURL_POST, false, _cxip_create_mcast_cb, + curl_usrptr); +quit: + free(url); + free(mac); + free(jsonreq); + if (ret < 0) { + TRACE_JOIN("CURL execution failed\n"); + free(curl_usrptr); + jstate->finished_mcast = true; + } +} + +/**************************************************************************** + * State machine for performing fi_join_collective() + * + * The zbcoll operations use unrestricted packets, will re-route dynamically, + * and manage NAK retries automatically, so they are resistant to dropped + * packets and other transient errors. They will not (and should not) time out: + * a persistently unresponsive endpoint in the collective tree will cause the + * collective join to block indefinitely. 
+ * + * Each state operation returns without doing any retries. The state machine + * progress table will decide whether to retry the operation. + * + * Each state operation must set zb->error as follows: + * - FI_SUCCESS - continues the state machine + * - FI_EAGAIN - retries the same state + * - other - fails the join operation + * + * The bcast_data value is used to carry 64 bits of data. + * The prov_errno value records a local (speculative) error + * prov_errno is ignored if bcast_data.valid == true + * + * getgroup: + * acquires a group ID for zbcoll collectives + * broadcast (zbcoll rank 0): + * if appropriate, starts CURL request, evaluates return + * otherwise, assumes static initialization, sets return + * on broadcast completion + * - all endpoints share bcast_data from zbcoll rank 0 + * - prov_errno indicates an error if bcast_data.valid is false + * - if bcast_data.valid, initializes a new MC object, new PTE if needed + * - creation errors set bcast_data.valid false, set prov_errno + * reduce: + * converts this endpoint prov_errno to bitmask + * overwrites mcast_addr and hwcoll_idx in bcast_data with bitmask + * bcast_data.valid remains unchanged + * on reduce completion + * - bitmask is bitwise OR of all error bits and address valid bit + * - prov_errno is set to prioritized error code (0 if bcast_data.valid) + * - all endpoints report the same completion status and error + */ + +/** + * Join state machine. + * + * The state machine walks through the following functions top-to-bottom. + * If the return code is success, it advances to the next state. + * If the return code is -FI_EAGAIN, it repeats the current state. + * If the return code is anything else, the join operation fails. + */ + +/* append a jstate to the zbcoll scheduler */ +static void _append_sched(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct cxip_ep_coll_obj *coll_obj = &zb->ep_obj->coll; + struct cxip_join_state *jstate = usrptr; + + dlist_ts_insert_tail(&coll_obj->sched_list, &jstate->sched_link); +} + +static void _noop(void *ptr) +{ + TRACE_JOIN("%s: entry\n", __func__); +} + +/* get a zbcoll group identifier */ +static void _start_getgroup(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + TRACE_JOIN("%s on %d: entry\n", __func__, jstate->mynode_idx); + + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_GETGRP, &zb->error)) + goto quit; + /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ + zb->error = cxip_zbcoll_getgroup(zb); +quit: + TRACE_JOIN("getgroup error = %d\n", zb->error); + if (zb->error) + _append_sched(zb, jstate); +} + +static void _finish_getgroup(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + TRACE_JOIN("%s on %d: entry\n", __func__, jstate->mynode_idx); + _append_sched(zb, jstate); // _start_bcast +} + +/* Create a multicast address and broadcast it to all endpoints. + * If jstate->create_mcast is set, this will use CURL to get an address. + * Otherwise, this presumes static initialization, and sets bcast_data.valid. 
+ */ +static void _start_bcast(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + TRACE_JOIN("%s: entry\n", __func__); + + /* error will indicate that the multicast request fails */ + jstate->prov_errno = C_RC_INVALID_DFA_FORMAT; + /* rank 0 always does the work here */ + if (jstate->mynode_idx == 0) { + if (jstate->create_mcast) { + /* first call (only) initiates CURL request */ + if (!jstate->creating_mcast) { + jstate->creating_mcast = true; + _start_curl(jstate); + } + /* every retry call checks to see if CURL is complete */ + if (!jstate->finished_mcast) { + zb->error = -FI_EAGAIN; + goto quit; + } + /* bcast_data.valid is set by curl callback */ + } else { + /* static bcast data is presumed correct */ + jstate->bcast_data.valid = true; + } + } + /* speculative prov_errno for trap */ + jstate->prov_errno = CXIP_PROV_ERRNO_CURL; + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_BCAST, &zb->error)) + goto quit; + /* rank > 0 endpoints overwritten by rank = 0 data */ + /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ + zb->error = cxip_zbcoll_broadcast(zb, &jstate->bcast_data.uint64); +quit: + if (zb->error) + _append_sched(zb, jstate); +} + +/* Check broadcast validity, and if valid, set up the MC object */ +static void _finish_bcast(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + bool is_hwroot; + int ret; + + TRACE_JOIN("%s: mc addr=%d hw_root=%d valid=%d\n", __func__, + jstate->bcast_data.mcast_addr, + jstate->bcast_data.hwroot_idx, + jstate->bcast_data.valid); + /* all NICs now have same mc_addr data, if invalid, fail */ + /* jstate->prov_errno is presumed set if not valid */ + if (!jstate->bcast_data.valid) + goto quit; + /* error indicates that attempt to configure fails */ + + /* check for invalid hwroot index */ + TRACE_JOIN("check hwroot\n"); + if (jstate->bcast_data.hwroot_idx >= + jstate->av_set_obj->fi_addr_cnt) { + TRACE_JOIN("%s: reject invalid hwroot_idx\n", __func__); + jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INVALID; + ret = -FI_EINVAL; + goto quit; + } + + /* check for hwroot overlap on this node */ + is_hwroot = (jstate->bcast_data.hwroot_idx == jstate->mynode_idx); + if (is_hwroot && jstate->ep_obj->coll.is_hwroot) { + TRACE_JOIN("%s: reject join, hwroot in use\n", __func__); + jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INUSE; + ret = -FI_EINVAL; + goto quit; + + } + /* check for mcast_addr overlap */ + TRACE_JOIN("check mcast addr\n"); + if (!jstate->is_rank && + ofi_idm_lookup(&jstate->ep_obj->coll.mcast_map, + jstate->bcast_data.mcast_addr)) { + TRACE_JOIN("%s: reject join, mcast %d in use\n", __func__, + jstate->bcast_data.mcast_addr); + jstate->prov_errno = CXIP_PROV_ERRNO_MCAST_INUSE; + ret = -FI_EINVAL; + goto quit; + } + /* speculative prov_errno for trap */ + jstate->prov_errno = CXIP_PROV_ERRNO_PTE; + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_INITPTE, &ret)) + goto quit; + TRACE_JOIN("%s: continuing to configure\n", __func__); + ret = _initialize_mc(jstate); +quit: + /* if initialization fails, invalidate bcast_data */ + if (ret != FI_SUCCESS) + jstate->bcast_data.valid = false; + /* represent prov_errno values as inverted bitmask */ + _proverr_to_bits(jstate); + _append_sched(zb, jstate); // _start_reduce +} + +/* Accumulate composite errors from different endpoints */ +static void _start_reduce(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + /* reduce ANDs inverted bcast_data, if 
any invalid, all become invalid */ + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_REDUCE, &zb->error)) + goto quit; + /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ + zb->error = cxip_zbcoll_reduce(zb, &jstate->bcast_data.uint64); +quit: + if (zb->error) + _append_sched(zb, jstate); +} + +/* process error bits (if any) to produce an error condition */ +static void _finish_reduce(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + TRACE_JOIN("%s: entry\n", __func__); + + /* re-invert bitmap, select common reported error */ + _bits_to_proverr(jstate); + + TRACE_JOIN("%s: prov_errno=0x%x\n", __func__, jstate->prov_errno); + _append_sched(zb, jstate); // _start_cleanup +} + +/* state machine cleanup */ +static void _start_cleanup(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct fi_eq_err_entry entry = {}; + size_t size = sizeof(entry); + uint64_t flags = 0L; + int ret; + + TRACE_JOIN("%s: entry\n", __func__); + if (jstate) { + entry.fid = (jstate->mc_obj) ? + &jstate->mc_obj->mc_fid.fid : NULL; + entry.context = jstate->context; + + if (jstate->prov_errno != CXIP_PROV_ERRNO_OK) { + size = sizeof(struct fi_eq_err_entry); + entry.data = FI_JOIN_COMPLETE; + entry.err = -FI_EAVAIL; + entry.prov_errno = jstate->prov_errno; + flags |= UTIL_FLAG_ERROR; + } + ret = ofi_eq_write(&jstate->ep_obj->eq->util_eq.eq_fid, + FI_JOIN_COMPLETE, &entry, + size, flags); + if (ret < 0) + CXIP_INFO("FATAL ERROR: cannot post to EQ\n"); + cxip_zbcoll_free(jstate->zb); + jstate->ep_obj->coll.join_busy = false; + } + free(jstate); +} + +typedef void (*sched_func)(void *ptr); + +enum state_code { + state_init, + start_getgroup, + finish_getgroup, + start_bcast, + finish_bcast, + start_reduce, + finish_reduce, + start_cleanup, + state_done +}; + +const char *state_name[] = { + "state_init", + "start_getgroup", + "finish_getgroup", + "start_bcast", + "finish_bcast", + "start_reduce", + "finish_reduce", + "start_cleanup", + "state_done" +}; +sched_func state_func[] = { + _noop, + _start_getgroup, + _finish_getgroup, + _start_bcast, + _finish_bcast, + _start_reduce, + _finish_reduce, + _start_cleanup, + _noop, +}; + +/** + * State progression table + * + * Row is the current state. + * Col contains states reachable from this state on success/again/fail. 
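+ *
+ * Success path (read from the table below):
+ *   state_init -> start_getgroup -> finish_getgroup -> start_bcast ->
+ *   finish_bcast -> start_reduce -> finish_reduce -> start_cleanup ->
+ *   state_done
+ * -FI_EAGAIN/-FI_EBUSY re-enters the same start_* state; any other error
+ * routes to start_cleanup.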
+ */ +static enum state_code progress_state[][3] = { + /* STATE SUCCESS EAGAIN FAIL */ + /* state_init */ {start_getgroup, start_cleanup, start_cleanup}, + /* start_getgroup */ {finish_getgroup,start_getgroup,start_cleanup}, + /* finish_getgroup*/ {start_bcast, start_cleanup, start_cleanup}, + /* start_bcast */ {finish_bcast, start_bcast, start_cleanup}, + /* finish_bcast */ {start_reduce, start_cleanup, start_cleanup}, + /* start_reduce */ {finish_reduce, start_reduce, start_cleanup}, + /* finish_reduce */ {start_cleanup, start_cleanup, start_cleanup}, + /* start_cleanup */ {state_done, state_done, state_done}, + /* state_done */ {state_done, state_done, state_done}, +}; + +/* Advance the state and run scheduled operations */ +static void _progress_sched(struct cxip_join_state *jstate) +{ + struct cxip_zbcoll_obj *zb = jstate->zb; + enum state_code *codes; + + TRACE_JOIN("entry jstate[%d,%d]=%s, error=%d\n", + jstate->join_idx, jstate->mynode_idx, + state_name[jstate->sched_state], zb->error); + + /* acquire the success/again/fail state codes for current state */ + codes = progress_state[jstate->sched_state]; + switch (zb->error) { + case FI_SUCCESS: + /* last operation succeeded */ + TRACE_JOIN("%s: success\n", __func__); + jstate->sched_state = codes[0]; + break; + case -FI_EBUSY: + case -FI_EAGAIN: + /* last operation needs a retry */ + TRACE_JOIN("%s: busy retry\n", __func__); + jstate->sched_state = codes[1]; + break; + default: + /* last operation failed */ + TRACE_JOIN("%s: fail zberr=%d\n", __func__, zb->error); + jstate->sched_state = codes[2]; + break; + } + TRACE_JOIN("----> jstate[%d,%d]=%s\n", + jstate->join_idx, jstate->mynode_idx, + state_name[jstate->sched_state]); + + /* execute the new state function */ + state_func[jstate->sched_state](jstate); +} + +/* Process the schedule list and dispatch next scheduled operation */ +static void _progress_join(struct cxip_ep_obj *ep_obj) +{ + struct cxip_ep_coll_obj *coll_obj = &ep_obj->coll; + struct cxip_join_state *jstate = NULL; + + dlist_ts_pop_front(&coll_obj->sched_list, + struct cxip_join_state, + jstate, sched_link); + + if (jstate) + _progress_sched(jstate); +} + +/* During join, determine my index position in the av_set_obj */ +static unsigned int _caddr_to_idx(struct cxip_av_set *av_set_obj, + struct cxip_addr caddr) +{ + struct cxip_addr addr; + size_t size = sizeof(addr); + int i, ret; + + for (i = 0; i < av_set_obj->fi_addr_cnt; i++) { + ret = fi_av_lookup(&av_set_obj->cxi_av->av_fid, + av_set_obj->fi_addr_ary[i], + &addr, &size); + if (ret) + return ret; + if (CXIP_ADDR_EQUAL(addr, caddr)) + return i; + } + return -FI_EADDRNOTAVAIL; +} + +/** + * fi_join_collective() implementation. + * + * Calling syntax is defined by libfabric. + * + * This is a multi-stage collective operation, progressed by calling TX/RX CQs + * and the EQ for the endpoint. Upon completion of the state machine, the EQ + * will return an EQ event structure. + * + * We go through the following steps: + * + * 1) allocate a join state for this operation + * 2) allocate zbcoll object + * 3) get a collective group identifier + * 4) generate a multicast tree from NIC 0 + * 5) broadcast multicast address from NIC 0 + * 6) reduce error mask across all NICs + * 7) cleanup + * + * Joins are non-concurrent, and return FI_EAGAIN until any active join + * completes. The final return code of a join is not known to all nodes until + * the final state completes. 
+ * + * Joins are progressed by polling TX/RX CQs, and completion status is + * returned by polling the endpoint EQ. + * + * CPU errors like -FI_ENOMEM will likely occur on individual endpoints, + * and the correct response is to exit the application. There is no + * reasonable way to re-enter the state machine once any participant has + * unexpectedly failed. + * + * Internal errors, such as inability to acquire a multicast address, are + * are represented by a CXIP_PROV_ERRNO value, which is returned through the + * EQ polling with an error of -FI_EAVAIL, and the CXIP_PROV_ERRNO value. + * These values are ranked, and if multiple nodes show different errors, the + * returned error will be the most-significant (most-negative) value. + * + * There are four operational models, one for production, and three for testing. + * + * In all cases, there must be one join for every NIC address in the av_set_obj + * fi_addr_ary, and the collective proceeds among these joined endpoints. + * + * COMM_KEY_RANK tests using a single process on a single Cassini, which + * supplies the src/tgt, but different pid_idx values, representing different + * PTLTE objects, each with its own buffers. The zbcoll operations are performed + * using linked zb objects, which represent a single zbcoll collective, so each + * zb callback function is called only once for the entire set, yet must provide + * a unique mc return value and FI_COLL_COMPLETE event for each joined object. + * We manage this with the simstates array, which associates the simulated rank + * with the state pointer, so that upon completion, we can provide all of the + * return pointers and events. + * + * COMM_KEY_UNICAST tests on multiple nodes on a real network, but without any + * multicast support. It initializes one mc object on each node, and designates + * the first node in the multicast list, fiaddr[0], as the hardware root node. + * fiaddr[1..N] send directly to fiaddr[0], and fiaddr[0] sends to each of the + * other addresses in a simulated broadcast. This is not expected to be + * performant, but it does exercise a necessary incast edge case, and it fully + * exercises the collectives software across multiple nodes. + * + * COMM_KEY_MULTICAST is a fully-functioning model, but requires that an + * external application prepare the multicast address on the fabric before + * calling fi_join_collective() on any node. This information must be supplied + * through the av_set_obj->comm_key structure. + * + * COMM_KEY_NONE is the production model, in which fi_join_collective() creates + * the multicast address by making a CURL call to the fabric manager REST API. + * fiaddr[0] manages the CURL call, and broadcasts the results to all of the + * other objects across the collective group. + */ +int cxip_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, + const struct fid_av_set *coll_av_set, + uint64_t flags, struct fid_mc **mc, void *context) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_av_set *av_set_obj; + struct cxip_join_state *jstate; + struct cxip_zbcoll_obj *zb; + bool link_zb; + int ret; + + check_red_pkt(); + + TRACE_JOIN("%s: entry\n", __func__); + /* Validate arguments */ + if (!ep || !coll_av_set || !mc || coll_addr != FI_ADDR_NOTAVAIL) + return -FI_EINVAL; + /* flags are ignored, per util_coll.c example code + * Only FI_SCATTER is documented, and applies to fi_query_collective(). 
+ */ + + cxip_ep = container_of(ep, struct cxip_ep, ep.fid); + av_set_obj = container_of(coll_av_set, struct cxip_av_set, av_set_fid); + jstate = NULL; + zb = NULL; + *mc = NULL; + + ep_obj = cxip_ep->ep_obj; + + /* join must be serialized through to completion */ + ofi_genlock_lock(&ep_obj->lock); + if (ep_obj->coll.join_busy) { + ofi_genlock_unlock(&ep_obj->lock); + return -FI_EAGAIN; + } + /* SHORT-TERM HACK see NETCASSINI-5771 */ + if (av_set_obj->comm_key.keytype != COMM_KEY_RANK) + ep_obj->coll.join_busy = true; + ofi_genlock_unlock(&ep_obj->lock); + + /* allocate state to pass arguments through callbacks */ + jstate = calloc(1, sizeof(*jstate)); + if (!jstate) { + ret = -FI_ENOMEM; + goto fail; + } + + jstate->ep_obj = ep_obj; + jstate->av_set_obj = av_set_obj; + jstate->mc = mc; + jstate->context = context; + jstate->join_flags = flags; + jstate->sched_state = state_init; + jstate->join_idx = ofi_atomic_inc32(&ep_obj->coll.join_cnt); + + /* rank 0 (av_set_obj->fi_addr_cnt[0]) does zb broadcast, so all nodes + * will share whatever bcast_data rank 0 ends up with. + */ + + ret = -FI_EINVAL; + switch (av_set_obj->comm_key.keytype) { + case COMM_KEY_NONE: + /* Production case, acquire multicast from FM */ + if (is_netsim(ep_obj)) { + CXIP_INFO("NETSIM COMM_KEY_NONE not supported\n"); + goto fail; + } + TRACE_JOIN("%s: MULTICAST CURL model setup\n", __func__); + jstate->mynode_idx = + _caddr_to_idx(av_set_obj, ep_obj->src_addr); + jstate->mynode_fiaddr = + av_set_obj->fi_addr_ary[jstate->mynode_idx]; + jstate->simrank = ZB_NOSIM; + jstate->pid_idx = CXIP_PTL_IDX_COLL; + jstate->bcast_data.hwroot_idx = 0; + jstate->bcast_data.mcast_addr = 0; + jstate->bcast_data.valid = false; + jstate->is_rank = false; + jstate->is_mcast = true; + jstate->create_mcast = (jstate->mynode_idx == 0); + jstate->rx_discard = true; + link_zb = false; + break; + case COMM_KEY_MULTICAST: + /* Real network test with predefined multicast address */ + if (is_netsim(ep_obj)) { + CXIP_INFO("NETSIM COMM_KEY_MULTICAST not supported\n"); + goto fail; + } + TRACE_JOIN("%s: MULTICAST prefab model setup\n", __func__); + jstate->mynode_idx = + _caddr_to_idx(av_set_obj, ep_obj->src_addr); + jstate->mynode_fiaddr = + av_set_obj->fi_addr_ary[jstate->mynode_idx]; + jstate->simrank = ZB_NOSIM; + jstate->pid_idx = CXIP_PTL_IDX_COLL; + jstate->bcast_data.hwroot_idx = + av_set_obj->comm_key.mcast.hwroot_idx; + jstate->bcast_data.mcast_addr = + av_set_obj->comm_key.mcast.mcast_addr; + jstate->bcast_data.valid = true; + jstate->is_rank = false; + jstate->is_mcast = true; + jstate->create_mcast = false; + jstate->rx_discard = true; + link_zb = false; + break; + case COMM_KEY_UNICAST: + /* Real network test without multicast address */ + if (is_netsim(ep_obj)) { + CXIP_INFO("NETSIM COMM_KEY_UNICAST not supported\n"); + goto fail; + } + TRACE_JOIN("%s: UNICAST model setup\n", __func__); + jstate->mynode_idx = + _caddr_to_idx(av_set_obj, ep_obj->src_addr); + jstate->mynode_fiaddr = + av_set_obj->fi_addr_ary[jstate->mynode_idx]; + jstate->simrank = ZB_NOSIM; + jstate->pid_idx = CXIP_PTL_IDX_COLL; + jstate->bcast_data.hwroot_idx = + av_set_obj->comm_key.ucast.hwroot_idx; + jstate->bcast_data.mcast_addr = + av_set_obj->comm_key.ucast.mcast_addr; + jstate->bcast_data.valid = false; + jstate->is_rank = false; + jstate->is_mcast = false; + jstate->create_mcast = false; + jstate->rx_discard = true; + link_zb = false; + break; + case COMM_KEY_RANK: + /* Single process simulation, can run under NETSIM */ + TRACE_JOIN("%s: COMM_KEY_RANK model 
setup\n", __func__); + jstate->mynode_idx = av_set_obj->comm_key.rank.rank; + jstate->mynode_fiaddr = (fi_addr_t)jstate->mynode_idx; + jstate->simrank = jstate->mynode_idx; + jstate->pid_idx = CXIP_PTL_IDX_COLL + jstate->simrank; + jstate->bcast_data.hwroot_idx = 0; + jstate->bcast_data.mcast_addr = ep_obj->src_addr.nic; + jstate->bcast_data.valid = true; + jstate->is_rank = true; + jstate->is_mcast = false; + jstate->create_mcast = false; + jstate->rx_discard = av_set_obj->comm_key.rank.rx_discard; + link_zb = true; + break; + default: + CXIP_INFO("unexpected comm_key keytype: %d\n", + av_set_obj->comm_key.keytype); + goto fail; + } + + /* Reject if a rank tries to join a group it doesn't belong to */ + ret = jstate->mynode_idx; + if (ret < 0) { + TRACE_JOIN("May not participate\n"); + goto fail; + } + + /* Acquire a zbcoll identifier */ + TRACE_JOIN("%s: allocate zb\n", __func__); + ret = cxip_zbcoll_alloc(jstate->ep_obj, + jstate->av_set_obj->fi_addr_cnt, + jstate->av_set_obj->fi_addr_ary, + jstate->simrank, &zb); + TRACE_JOIN("%s: returned=%d\n", __func__, ret); + if (ret) + goto fail; + + /* Install the callback function for zb collectives */ + TRACE_JOIN("%s: cxip_zbcoll_set_user_cb\n", __func__); + cxip_zbcoll_set_user_cb(zb, _append_sched, jstate); + + /* If COMM_KEY_RANK, join is called for each rank */ + if (link_zb) { + static struct cxip_zbcoll_obj *zb0 = NULL; + static int zb0_count = 0; + int rank = av_set_obj->comm_key.rank.rank; + + /* first call sets the zb0 simulated endpoint */ + TRACE_JOIN("%s: rank = %d, zb0_count=%d\n", __func__, rank, zb0_count); + if (!zb0_count++) { + /* first must be rank 0 */ + if (rank != 0) { + TRACE_JOIN("%s: rank %d not 0\n", __func__, rank); + ret = -FI_EINVAL; + goto fail; + } + zb0 = zb; + TRACE_JOIN("%s: zb0=%p zb=%p\n", __func__, zb0, zb); + } + /* link this zb to zb0 */ + ret = cxip_zbcoll_simlink(zb0, zb); + if (ret) { + TRACE_JOIN("%s: return=%d\n", __func__, ret); + return ret; + } + /* after the last, we need to reset this */ + if (zb0_count == av_set_obj->fi_addr_cnt) { + zb0_count = 0; + zb0 = NULL; + } + } + + jstate->zb = zb; + _append_sched(zb, jstate); + + return FI_SUCCESS; + +fail: + /* this path returns error, does not post to EQ */ + TRACE_JOIN("cxip_join_collective, ret=%d\n", ret); + cxip_zbcoll_free(zb); + free(jstate); + ep_obj->coll.join_busy = false; + + return ret; +} + +/* Exported to be called by EQ read function */ +void cxip_coll_progress_join(struct cxip_ep_obj *ep_obj) +{ + ofi_genlock_lock(&ep_obj->lock); + + /* progress the work schedule */ + _progress_join(ep_obj); + + /* don't want handle returned, callback function manages it */ + cxip_curl_progress(NULL); + + /* progress the underlying zbcoll */ + cxip_ep_zbcoll_progress(ep_obj); + + ofi_genlock_unlock(&ep_obj->lock); +} + +/* Reset all of the diagnostic counters */ +void cxip_coll_reset_mc_ctrs(struct fid_mc *mc) +{ + struct cxip_coll_mc *mc_obj = (struct cxip_coll_mc *)mc; + + ofi_atomic_set32(&mc_obj->coll_pte->recv_cnt, 0); + ofi_atomic_set32(&mc_obj->send_cnt, 0); + ofi_atomic_set32(&mc_obj->recv_cnt, 0); + ofi_atomic_set32(&mc_obj->pkt_cnt, 0); + ofi_atomic_set32(&mc_obj->seq_err_cnt, 0); + ofi_atomic_set32(&mc_obj->tmout_cnt, 0); +} + +/**************************************************************************** + * Manage the static coll structure in the EP. Because of its specialized + * nature, it made sense to manage it here, rather than in the EP module. 
+ */ +struct fi_ops_collective cxip_collective_ops = { + .size = sizeof(struct fi_ops_collective), + .barrier = cxip_barrier, + .broadcast = cxip_broadcast, + .alltoall = fi_coll_no_alltoall, + .allreduce = cxip_allreduce, + .allgather = fi_coll_no_allgather, + .reduce_scatter = fi_coll_no_reduce_scatter, + .reduce = cxip_reduce, + .scatter = fi_coll_no_scatter, + .gather = fi_coll_no_gather, + .msg = fi_coll_no_msg, +}; + +struct fi_ops_collective cxip_collective_no_ops = { + .size = sizeof(struct fi_ops_collective), + .barrier = fi_coll_no_barrier, + .broadcast = fi_coll_no_broadcast, + .alltoall = fi_coll_no_alltoall, + .allreduce = fi_coll_no_allreduce, + .allgather = fi_coll_no_allgather, + .reduce_scatter = fi_coll_no_reduce_scatter, + .reduce = fi_coll_no_reduce, + .scatter = fi_coll_no_scatter, + .gather = fi_coll_no_gather, + .msg = fi_coll_no_msg, +}; + +/* Close collectives - call during EP close, ep_obj->lock is held */ +void cxip_coll_close(struct cxip_ep_obj *ep_obj) +{ + struct cxip_coll_mc *mc_obj; + + while (!dlist_empty(&ep_obj->coll.mc_list)) { + dlist_pop_front(&ep_obj->coll.mc_list, + struct cxip_coll_mc, mc_obj, entry); + _close_mc(mc_obj); + } +} + +/** + * Initialize collectives - call during EP init */ +void cxip_coll_init(struct cxip_ep_obj *ep_obj) +{ + cxip_coll_populate_opcodes(); + + memset(&ep_obj->coll.mcast_map, 0, sizeof(ep_obj->coll.mcast_map)); + dlist_ts_init(&ep_obj->coll.sched_list); + dlist_init(&ep_obj->coll.mc_list); + ep_obj->coll.rx_cmdq = NULL; + ep_obj->coll.tx_cmdq = NULL; + ep_obj->coll.rx_cntr = NULL; + ep_obj->coll.tx_cntr = NULL; + ep_obj->coll.rx_evtq = NULL; + ep_obj->coll.tx_evtq = NULL; + ep_obj->coll.min_multi_recv = CXIP_COLL_MIN_MULTI_RECV; + ep_obj->coll.buffer_count = CXIP_COLL_MIN_RX_BUFS; + ep_obj->coll.buffer_size = CXIP_COLL_MIN_RX_SIZE; + + ofi_atomic_initialize32(&ep_obj->coll.num_mc, 0); + ofi_atomic_initialize32(&ep_obj->coll.join_cnt, 0); +} + +/** + * Enable collectives - call from EP enable. 
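+ *
+ * The functional cxip_collective_ops table is only installed when the
+ * endpoint was opened with FI_COLLECTIVE in its capabilities; without that
+ * capability this call is effectively a no-op. A caller-side sketch
+ * (standard hints flow assumed):
+ *
+ *    hints->caps |= FI_COLLECTIVE;
+ *    // ... then the usual fi_getinfo() / fi_endpoint() / fi_enable() path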
+ */ +int cxip_coll_enable(struct cxip_ep *ep) +{ + struct cxip_ep_obj *ep_obj = ep->ep_obj; + + if (ep_obj->coll.enabled) + return FI_SUCCESS; + + if (!(ep_obj->caps & FI_COLLECTIVE)) { + CXIP_INFO("FI_COLLECTIVE not requested\n"); + return FI_SUCCESS; + } + + /* A read-only or write-only endpoint is legal */ + if (!(ofi_recv_allowed(ep_obj->rxc.attr.caps) && + ofi_send_allowed(ep_obj->txc.attr.caps))) { + CXIP_INFO("EP not recv/send, collectives not enabled\n"); + return FI_SUCCESS; + } + + /* Sanity checks */ + if (ep_obj->coll.buffer_size == 0) + return -FI_EINVAL; + if (ep_obj->coll.buffer_count == 0) + return -FI_EINVAL; + if (ep_obj->coll.min_multi_recv == 0) + return -FI_EINVAL; + if (ep_obj->coll.min_multi_recv >= ep_obj->coll.buffer_size) + return -FI_EINVAL; + + /* Bind all STD EP objects to the coll object */ + ep_obj->coll.rx_cmdq = ep_obj->rxc.rx_cmdq; + ep_obj->coll.tx_cmdq = ep_obj->txc.tx_cmdq; + ep_obj->coll.rx_cntr = ep_obj->rxc.recv_cntr; + ep_obj->coll.tx_cntr = ep_obj->txc.send_cntr; + ep_obj->coll.rx_evtq = &ep_obj->rxc.rx_evtq; + ep_obj->coll.tx_evtq = &ep_obj->txc.tx_evtq; + ep_obj->coll.eq = ep_obj->eq; + + ep->ep.collective = &cxip_collective_ops; + ep_obj->coll.enabled = true; + + return FI_SUCCESS; +} + +/* Disable collectives - call from EP disable */ +int cxip_coll_disable(struct cxip_ep_obj *ep_obj) +{ + if (!ep_obj->coll.enabled) + return FI_SUCCESS; + + ep_obj->coll.enabled = false; + ep_obj->coll.rx_cmdq = NULL; + ep_obj->coll.tx_cmdq = NULL; + ep_obj->coll.rx_cntr = NULL; + ep_obj->coll.tx_cntr = NULL; + ep_obj->coll.rx_evtq = NULL; + ep_obj->coll.tx_evtq = NULL; + ep_obj->coll.eq = NULL; + + return FI_SUCCESS; +} diff --git a/prov/cxi/src/cxip_cq.c b/prov/cxi/src/cxip_cq.c new file mode 100644 index 00000000000..675d91eeb56 --- /dev/null +++ b/prov/cxi/src/cxip_cq.c @@ -0,0 +1,436 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_CQ, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_CQ, __VA_ARGS__) + +/* + * cxip_cq_req_complete() - Generate a completion event for the request. + */ +int cxip_cq_req_complete(struct cxip_req *req) +{ + if (req->discard) { + CXIP_DBG("Event discarded: %p\n", req); + return FI_SUCCESS; + } + + return ofi_cq_write(&req->cq->util_cq, (void *)req->context, + req->flags, req->data_len, (void *)req->buf, + req->data, req->tag); +} + +/* + * cxip_cq_req_complete() - Generate a completion event with source address for + * the request. + */ +int cxip_cq_req_complete_addr(struct cxip_req *req, fi_addr_t src) +{ + if (req->discard) { + CXIP_DBG("Event discarded: %p\n", req); + return FI_SUCCESS; + } + + return ofi_cq_write_src(&req->cq->util_cq, (void *)req->context, + req->flags, req->data_len, (void *)req->buf, + req->data, req->tag, src); +} + +/* + * proverr2errno() - Match NIC errno to Linux errno. + */ +int proverr2errno(int err) +{ + if (err == C_RC_UNDELIVERABLE) + return FI_EHOSTUNREACH; + else if (err == C_RC_VNI_NOT_FOUND) + return FI_ENOTCONN; + return FI_EIO; +} + +/* + * cxip_cq_req_error() - Generate an error event for the request. 
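+ *
+ * On the application side the failure surfaces as -FI_EAVAIL from CQ reads;
+ * prov_errno can then be decoded with fi_cq_strerror(). A minimal consumer
+ * sketch (error handling trimmed):
+ *
+ *    struct fi_cq_tagged_entry cqe;
+ *    struct fi_cq_err_entry err = {};
+ *
+ *    if (fi_cq_read(cq, &cqe, 1) == -FI_EAVAIL) {
+ *            fi_cq_readerr(cq, &err, 0);
+ *            fprintf(stderr, "cq error %d: %s\n", err.err,
+ *                    fi_cq_strerror(cq, err.prov_errno, err.err_data,
+ *                                   NULL, 0));
+ *    }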
+ */ +int cxip_cq_req_error(struct cxip_req *req, size_t olen, + int err, int prov_errno, void *err_data, + size_t err_data_size, fi_addr_t src_addr) +{ + struct fi_cq_err_entry err_entry; + + if (req->discard) { + CXIP_DBG("Event discarded: %p\n", req); + return FI_SUCCESS; + } + + err_entry.err = err; + err_entry.olen = olen; + err_entry.err_data = err_data; + err_entry.err_data_size = err_data_size; + err_entry.len = req->data_len; + err_entry.prov_errno = prov_errno; + err_entry.flags = req->flags; + err_entry.data = req->data; + err_entry.tag = req->tag; + err_entry.op_context = (void *)(uintptr_t)req->context; + err_entry.buf = (void *)(uintptr_t)req->buf; + err_entry.src_addr = src_addr; + + return ofi_cq_write_error(&req->cq->util_cq, &err_entry); +} + +/* + * cxip_cq_progress() - Progress the CXI Completion Queue. + * + * The CQ lock must not be held and this function can not be + * called from within event queue callback processing. + */ +void cxip_cq_progress(struct cxip_cq *cq) +{ + cxip_util_cq_progress(&cq->util_cq); +} + +/* + * cxip_util_cq_progress() - Progress function wrapper for utility CQ. + */ +void cxip_util_cq_progress(struct util_cq *util_cq) +{ + struct cxip_cq *cq = container_of(util_cq, struct cxip_cq, util_cq); + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + + ofi_genlock_lock(&cq->ep_list_lock); + dlist_foreach(&util_cq->ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + cxip_ep_progress(fid_entry->fid); + } + ofi_genlock_unlock(&cq->ep_list_lock); +} + +/* + * cxip_cq_strerror() - Converts provider specific error information into a + * printable string. + */ +static const char *cxip_cq_strerror(struct fid_cq *cq, int prov_errno, + const void *err_data, char *buf, + size_t len) +{ + switch (prov_errno) { + case CXIP_PROV_ERRNO_OK: + return "CXIP_COLL_OK"; + case CXIP_PROV_ERRNO_PTE: + return "CXIP_COLL_PTE_ERROR"; + case CXIP_PROV_ERRNO_MCAST_INUSE: + return "CXIP_COLL_MCAST_IN_USE"; + case CXIP_PROV_ERRNO_HWROOT_INUSE: + return "CXIP_COLL_HWROOT_IN_USE"; + case CXIP_PROV_ERRNO_MCAST_INVALID: + return "CXIP_COLL_MCAST_INVALID"; + case CXIP_PROV_ERRNO_HWROOT_INVALID: + return "CXIP_COLL_HWROOT_INVALID"; + case CXIP_PROV_ERRNO_CURL: + return "CXIP_COLL_CURL_ERROR"; + } + return cxi_rc_to_str(prov_errno); +} + +/* + * cxip_cq_trywait - Return success if able to block waiting for CQ events. + */ +static int cxip_cq_trywait(void *arg) +{ + struct cxip_cq *cq = (struct cxip_cq *)arg; + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + + assert(cq->util_cq.wait); + + if (!cq->priv_wait) { + CXIP_WARN("No CXI wait object\n"); + return -FI_EINVAL; + } + + ofi_genlock_lock(&cq->ep_list_lock); + dlist_foreach(&cq->util_cq.ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + if (cxip_ep_peek(fid_entry->fid)) { + ofi_genlock_unlock(&cq->ep_list_lock); + + return -FI_EAGAIN; + } + } + + /* Clear wait, and check for any events */ + cxil_clear_wait_obj(cq->priv_wait); + dlist_foreach(&cq->util_cq.ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + if (cxip_ep_peek(fid_entry->fid)) { + ofi_genlock_unlock(&cq->ep_list_lock); + + return -FI_EAGAIN; + } + } + ofi_genlock_unlock(&cq->ep_list_lock); + + return FI_SUCCESS; +} + +/* + * cxip_cq_flush_trig_reqs() - Flush all triggered requests on the CQ. + * + * This function will free all triggered requests associated with the + * CQ. 
This should only be called after canceling triggered operations + * against all counters in use and verifying the cancellations have + * completed successfully. + */ +void cxip_cq_flush_trig_reqs(struct cxip_cq *cq) +{ + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + struct cxip_ep *ep; + + ofi_genlock_lock(&cq->ep_list_lock); + dlist_foreach(&cq->util_cq.ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + ep = container_of(fid_entry->fid, struct cxip_ep, ep.fid); + + cxip_ep_flush_trig_reqs(ep->ep_obj); + } + ofi_genlock_unlock(&cq->ep_list_lock); +} + +/* + * cxip_cq_close() - Destroy the Completion Queue object. + */ +static int cxip_cq_close(struct fid *fid) +{ + struct cxip_cq *cq = container_of(fid, struct cxip_cq, + util_cq.cq_fid.fid); + int ret; + + if (ofi_atomic_get32(&cq->util_cq.ref)) + return -FI_EBUSY; + + if (cq->priv_wait) { + ret = ofi_wait_del_fd(cq->util_cq.wait, + cxil_get_wait_obj_fd(cq->priv_wait)); + if (ret) + CXIP_WARN("Wait FD delete error: %d\n", ret); + + ret = cxil_destroy_wait_obj(cq->priv_wait); + if (ret) + CXIP_WARN("Release CXI wait object failed: %d\n", ret); + } + + ofi_cq_cleanup(&cq->util_cq); + ofi_genlock_destroy(&cq->ep_list_lock); + cxip_domain_remove_cq(cq->domain, cq); + + free(cq); + + return 0; +} + +static struct fi_ops cxip_cq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_cq_close, + .bind = fi_no_bind, + .control = ofi_cq_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_cq_attr cxip_cq_def_attr = { + .flags = 0, + .format = FI_CQ_FORMAT_CONTEXT, + .wait_obj = FI_WAIT_NONE, + .signaling_vector = 0, + .wait_cond = FI_CQ_COND_NONE, + .wait_set = NULL, +}; + +/* + * cxip_cq_verify_attr() - Verify input Completion Queue attributes. + */ +static int cxip_cq_verify_attr(struct fi_cq_attr *attr) +{ + if (!attr) + return FI_SUCCESS; + + switch (attr->format) { + case FI_CQ_FORMAT_CONTEXT: + case FI_CQ_FORMAT_MSG: + case FI_CQ_FORMAT_DATA: + case FI_CQ_FORMAT_TAGGED: + break; + case FI_CQ_FORMAT_UNSPEC: + attr->format = cxip_cq_def_attr.format; + break; + default: + CXIP_WARN("Unsupported CQ attribute format: %d\n", + attr->format); + return -FI_ENOSYS; + } + + /* Applications should set wait_obj == FI_WAIT_NONE for best + * performance. However, if a wait_obj is required and not + * specified, default to FI_WAIT_FD. + */ + switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: + attr->wait_obj = FI_WAIT_FD; + break; + case FI_WAIT_NONE: + case FI_WAIT_FD: + case FI_WAIT_POLLFD: + break; + default: + CXIP_WARN("Unsupported CQ wait object: %d\n", + attr->wait_obj); + return -FI_ENOSYS; + } + + /* Use environment variable to allow for dynamic setting of default CQ + * size. + */ + if (!attr->size) + attr->size = cxip_env.default_cq_size; + + return FI_SUCCESS; +} + +/* + * cxip_cq_alloc_priv_wait - Allocate an internal wait channel for the CQ. 
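+ *
+ * Only used when the CQ was opened with a wait object (FI_WAIT_FD,
+ * FI_WAIT_POLLFD, or FI_WAIT_UNSPEC, which defaults to FI_WAIT_FD above).
+ * A caller-side sketch of the blocking path this enables:
+ *
+ *    struct fi_cq_attr attr = {
+ *            .format   = FI_CQ_FORMAT_TAGGED,
+ *            .wait_obj = FI_WAIT_FD,
+ *    };
+ *    struct fi_cq_tagged_entry cqe;
+ *
+ *    fi_cq_open(domain, &attr, &cq, NULL);
+ *    fi_cq_sread(cq, &cqe, 1, NULL, -1);   // blocks on the wait FD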
+ */ +static int cxip_cq_alloc_priv_wait(struct cxip_cq *cq) +{ + int ret; + int wait_fd; + + assert(cq->domain); + + /* Not required or already created */ + if (!cq->util_cq.wait || cq->priv_wait) + return FI_SUCCESS; + + ret = cxil_alloc_wait_obj(cq->domain->lni->lni, &cq->priv_wait); + if (ret) { + CXIP_WARN("Allocation of internal wait object failed %d\n", + ret); + return ret; + } + + wait_fd = cxil_get_wait_obj_fd(cq->priv_wait); + ret = fi_fd_nonblock(wait_fd); + if (ret) { + CXIP_WARN("Unable to set CQ wait non-blocking mode: %d\n", ret); + goto destroy_wait; + } + + ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, POLLIN, + cxip_cq_trywait, cq, &cq->util_cq.cq_fid.fid); + if (ret) { + CXIP_WARN("Add FD of internal wait object failed: %d\n", ret); + goto destroy_wait; + } + + CXIP_DBG("Add CQ private wait object, CQ intr FD: %d\n", wait_fd); + + return FI_SUCCESS; + +destroy_wait: + cxil_destroy_wait_obj(cq->priv_wait); + cq->priv_wait = NULL; + + return ret; +} + +/* + * cxip_cq_open() - Allocate a new Completion Queue object. + */ +int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context) +{ + struct cxip_domain *cxi_dom; + struct cxip_cq *cxi_cq; + int ret; + + if (!domain || !cq) + return -FI_EINVAL; + + cxi_dom = container_of(domain, struct cxip_domain, + util_domain.domain_fid); + + ret = cxip_cq_verify_attr(attr); + if (ret != FI_SUCCESS) + return ret; + + cxi_cq = calloc(1, sizeof(*cxi_cq)); + if (!cxi_cq) + return -FI_ENOMEM; + + if (!attr) { + cxi_cq->attr = cxip_cq_def_attr; + cxi_cq->attr.size = cxip_env.default_cq_size; + } else { + cxi_cq->attr = *attr; + } + + ret = ofi_cq_init(&cxip_prov, domain, &cxi_cq->attr, &cxi_cq->util_cq, + cxip_util_cq_progress, context); + if (ret != FI_SUCCESS) { + CXIP_WARN("ofi_cq_init() failed: %d\n", ret); + goto err_util_cq; + } + + cxi_cq->util_cq.cq_fid.ops->strerror = &cxip_cq_strerror; + cxi_cq->util_cq.cq_fid.fid.ops = &cxip_cq_fi_ops; + + cxi_cq->domain = cxi_dom; + cxi_cq->ack_batch_size = cxip_env.eq_ack_batch_size; + + /* Optimize locking when possible */ + if (cxi_dom->util_domain.threading == FI_THREAD_DOMAIN || + cxi_dom->util_domain.threading == FI_THREAD_COMPLETION) + ofi_genlock_init(&cxi_cq->ep_list_lock, OFI_LOCK_NONE); + else + ofi_genlock_init(&cxi_cq->ep_list_lock, OFI_LOCK_SPINLOCK); + + if (cxi_cq->util_cq.wait) { + ret = cxip_cq_alloc_priv_wait(cxi_cq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate CXI wait obj: %d\n", + ret); + goto err_wait_alloc; + } + } + + cxip_domain_add_cq(cxi_dom, cxi_cq); + *cq = &cxi_cq->util_cq.cq_fid; + + return FI_SUCCESS; + +err_wait_alloc: + ofi_cq_cleanup(&cxi_cq->util_cq); +err_util_cq: + free(cxi_cq); + + return ret; +} diff --git a/prov/cxi/src/cxip_ctrl.c b/prov/cxi/src/cxip_ctrl.c new file mode 100644 index 00000000000..3d484dcdccd --- /dev/null +++ b/prov/cxi/src/cxip_ctrl.c @@ -0,0 +1,789 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2017 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include + +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* + * cxip_ctrl_msg_cb() - Process control message target events. 
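+ *
+ * Incoming C_EVENT_PUT events are dispatched on the ctrl_msg_type carried in
+ * the match bits:
+ *   - CXIP_CTRL_MSG_FC_NOTIFY: a peer reports dropped messages (flow control)
+ *   - CXIP_CTRL_MSG_FC_RESUME: a peer signals that sends may resume
+ *   - CXIP_CTRL_MSG_ZB_DATA:   zero-buffer collective payload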
+ */ +int cxip_ctrl_msg_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + uint32_t pid_bits = req->ep_obj->domain->iface->dev->info.pid_bits; + uint32_t nic_addr; + uint32_t pid; + union cxip_match_bits mb = { + .raw = event->tgt_long.match_bits, + }; + uint32_t init = event->tgt_long.initiator.initiator.process; + int ret __attribute__((unused)); + + switch (event->hdr.event_type) { + case C_EVENT_MATCH: + break; + case C_EVENT_PUT: + assert(cxi_event_rc(event) == C_RC_OK); + + nic_addr = CXI_MATCH_ID_EP(pid_bits, init); + pid = CXI_MATCH_ID_PID(pid_bits, init); + + switch (mb.ctrl_msg_type) { + case CXIP_CTRL_MSG_FC_NOTIFY: + ret = cxip_fc_process_drops(req->ep_obj, nic_addr, pid, + mb.drops); + assert(ret == FI_SUCCESS); + + break; + case CXIP_CTRL_MSG_FC_RESUME: + ret = cxip_fc_resume(req->ep_obj, nic_addr, pid); + assert(ret == FI_SUCCESS); + + break; + case CXIP_CTRL_MSG_ZB_DATA: + ret = cxip_zbcoll_recv_cb(req->ep_obj, nic_addr, pid, + mb.raw); + assert(ret == FI_SUCCESS); + break; + default: + CXIP_FATAL("Unexpected msg type: %d\n", + mb.ctrl_msg_type); + } + + break; + default: + CXIP_FATAL(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + CXIP_DBG("got event: %s rc: %s (req: %p)\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event)), + req); + + return FI_SUCCESS; +} + +/* + * cxip_ctrl_msg_send() - Send a control message. + * + * Caller should hold req->ep_obj->lock. + */ +int cxip_ctrl_msg_send(struct cxip_ctrl_req *req) +{ + struct cxip_cmdq *txq; + union c_fab_addr dfa; + uint8_t idx_ext; + uint32_t pid_bits; + union c_cmdu cmd = {}; + uint32_t match_id; + int ret; + + txq = req->ep_obj->ctrl_txq; + pid_bits = req->ep_obj->domain->iface->dev->info.pid_bits; + cxi_build_dfa(req->send.nic_addr, req->send.pid, pid_bits, + CXIP_PTL_IDX_CTRL, &dfa, &idx_ext); + match_id = CXI_MATCH_ID(pid_bits, req->ep_obj->src_addr.pid, + req->ep_obj->src_addr.nic); + + cmd.c_state.event_send_disable = 1; + cmd.c_state.index_ext = idx_ext; + cmd.c_state.eq = req->ep_obj->ctrl_tx_evtq->eqn; + cmd.c_state.initiator = match_id; + + if (!req->ep_obj->ctrl_tx_credits) { + CXIP_WARN("Control TX credits exhausted\n"); + return -FI_EAGAIN; + } + + req->ep_obj->ctrl_tx_credits--; + + ret = cxip_cmdq_emit_c_state(txq, &cmd.c_state); + if (ret) { + CXIP_DBG("Failed to issue C_STATE command: %d\n", ret); + goto err_return_credit; + } + + memset(&cmd.idc_msg, 0, sizeof(cmd.idc_msg)); + cmd.idc_msg.dfa = dfa; + cmd.idc_msg.match_bits = req->send.mb.raw; + cmd.idc_msg.user_ptr = (uint64_t)req; + + ret = cxi_cq_emit_idc_msg(txq->dev_cmdq, &cmd.idc_msg, NULL, 0); + if (ret) { + CXIP_DBG("Failed to write IDC: %d\n", ret); + + /* Return error according to Domain Resource Management + */ + ret = -FI_EAGAIN; + goto err_return_credit; + } + + cxi_cq_ring(txq->dev_cmdq); + + CXIP_DBG("Queued control message: %p\n", req); + + return FI_SUCCESS; + +err_return_credit: + req->ep_obj->ctrl_tx_credits++; + + return ret; +} + +/* + * cxip_ctrl_msg_init() - Initialize control messaging resources. + * + * Caller must hold ep_obj->lock. 
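+ *
+ * Appends a persistent LE for control messages to the control PtlTE and
+ * spins (sched_yield) until the matching C_EVENT_LINK arrives, so it is
+ * intended for the initialization path rather than the progress path.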
+ */ +int cxip_ctrl_msg_init(struct cxip_ep_obj *ep_obj) +{ + const union c_event *event; + int ret; + uint32_t le_flags; + union cxip_match_bits mb = { + .ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG, + }; + union cxip_match_bits ib = { + .raw = ~0, + }; + + ret = cxip_domain_ctrl_id_alloc(ep_obj->domain, &ep_obj->ctrl_msg_req); + if (ret) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", ret); + return -FI_ENOSPC; + } + ep_obj->ctrl_msg_req.ep_obj = ep_obj; + ep_obj->ctrl_msg_req.cb = cxip_ctrl_msg_cb; + + le_flags = C_LE_UNRESTRICTED_BODY_RO | C_LE_UNRESTRICTED_END_RO | + C_LE_OP_PUT; + + ib.ctrl_le_type = 0; + + ret = cxip_pte_append(ep_obj->ctrl_pte, 0, 0, 0, + C_PTL_LIST_PRIORITY, ep_obj->ctrl_msg_req.req_id, + mb.raw, ib.raw, CXI_MATCH_ID_ANY, 0, le_flags, + NULL, ep_obj->ctrl_tgq, true); + if (ret) { + CXIP_DBG("Failed to write Append command: %d\n", ret); + goto err_free_id; + } + + /* Wait for link EQ event */ + while (!(event = cxi_eq_get_event(ep_obj->ctrl_tgt_evtq))) + sched_yield(); + + if (event->hdr.event_type != C_EVENT_LINK || + event->tgt_long.buffer_id != ep_obj->ctrl_msg_req.req_id) { + /* This is a device malfunction */ + CXIP_WARN("Invalid Link EQE %u %u %u %u\n", + event->hdr.event_type, + event->tgt_long.return_code, + event->tgt_long.buffer_id, + ep_obj->ctrl_msg_req.req_id); + ret = -FI_EIO; + goto err_free_id; + } + + if (cxi_event_rc(event) != C_RC_OK) { + CXIP_WARN("Append failed: %s\n", + cxi_rc_to_str(cxi_event_rc(event))); + ret = -FI_ENOSPC; + goto err_free_id; + } + + cxi_eq_ack_events(ep_obj->ctrl_tgt_evtq); + + CXIP_DBG("Control messaging initialized: %p\n", ep_obj); + + return FI_SUCCESS; + +err_free_id: + cxip_domain_ctrl_id_free(ep_obj->domain, &ep_obj->ctrl_msg_req); + + return ret; +} + +/* + * cxip_ctrl_msg_fini() - Finalize control messaging resources. + * + * Caller must hold ep_obj->lock. + */ +void cxip_ctrl_msg_fini(struct cxip_ep_obj *ep_obj) +{ + cxip_domain_ctrl_id_free(ep_obj->domain, &ep_obj->ctrl_msg_req); + + CXIP_DBG("Control messaging finalized: %p\n", ep_obj); +} + +/* + * cxip_ep_ctrl_event_req() - Look up a control request using Cassini event. + */ +static struct cxip_ctrl_req *cxip_ep_ctrl_event_req(struct cxip_ep_obj *ep_obj, + const union c_event *event) +{ + struct cxip_ctrl_req *req; + int event_rc; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + req = (struct cxip_ctrl_req *)event->init_short.user_ptr; + break; + case C_EVENT_LINK: + case C_EVENT_UNLINK: + case C_EVENT_MATCH: + req = cxip_domain_ctrl_id_at(ep_obj->domain, + event->tgt_long.buffer_id); + if (!req) + CXIP_WARN("Invalid buffer_id: %d (%s)\n", + event->tgt_long.buffer_id, + cxi_event_to_str(event)); + break; + case C_EVENT_PUT: + case C_EVENT_GET: + case C_EVENT_ATOMIC: + case C_EVENT_FETCH_ATOMIC: + event_rc = cxi_event_rc(event); + + if (event_rc != C_RC_ENTRY_NOT_FOUND && + event_rc != C_RC_MST_CANCELLED) { + req = cxip_domain_ctrl_id_at(ep_obj->domain, + event->tgt_long.buffer_id); + if (!req) + CXIP_WARN("Invalid buffer_id: %d (%s)\n", + event->tgt_long.buffer_id, + cxi_event_to_str(event)); + break; + } + + req = NULL; + + /* Silently drop any invalidated LE events. Since the control + * PtlTE is used for non-optimized MRs, it is possible to + * trigger a target error event if an invalid MR key was + * specified. For such operations, it is safe to just log the + * bad access attempt and drop the EQ event, the error will be + * reported to the initiator. 
+ */ + if (event_rc != C_RC_MST_CANCELLED) + CXIP_WARN("Unexpected %s event rc: %s\n", + cxi_event_to_str(event), + cxi_rc_to_str(event_rc)); + + break; + case C_EVENT_STATE_CHANGE: + cxip_pte_state_change(ep_obj->domain->iface, event); + + req = NULL; + break; + case C_EVENT_COMMAND_FAILURE: + CXIP_FATAL("Command failure: cq=%u target=%u fail_loc=%u cmd_type=%u cmd_size=%u opcode=%u\n", + event->cmd_fail.cq_id, event->cmd_fail.is_target, + event->cmd_fail.fail_loc, + event->cmd_fail.fail_command.cmd_type, + event->cmd_fail.fail_command.cmd_size, + event->cmd_fail.fail_command.opcode); + default: + CXIP_FATAL("Invalid event type: %d\n", event->hdr.event_type); + } + + CXIP_DBG("got control event: %s rc: %s (req: %p)\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event)), + req); + + return req; +} + +/* Caller must hold ep_obj->lock. */ +static void cxip_ep_return_ctrl_tx_credits(struct cxip_ep_obj *ep_obj, + unsigned int credits) +{ + ep_obj->ctrl_tx_credits += credits; +} + +void cxip_ep_ctrl_eq_progress(struct cxip_ep_obj *ep_obj, + struct cxi_eq *ctrl_evtq, bool tx_evtq, + bool ep_obj_locked) +{ + const union c_event *event; + struct cxip_ctrl_req *req; + int ret; + + /* The Control EQ is shared by a SEP. Avoid locking. */ + if (!cxi_eq_peek_event(ctrl_evtq)) + return; + + if (!ep_obj_locked) + ofi_genlock_lock(&ep_obj->lock); + + while ((event = cxi_eq_peek_event(ctrl_evtq))) { + req = cxip_ep_ctrl_event_req(ep_obj, event); + if (req) { + ret = req->cb(req, event); + if (ret != FI_SUCCESS) + break; + } + + /* Consume and ack event. */ + cxi_eq_next_event(ctrl_evtq); + + cxi_eq_ack_events(ctrl_evtq); + + if (tx_evtq) + cxip_ep_return_ctrl_tx_credits(ep_obj, 1); + + } + + if (cxi_eq_get_drops(ctrl_evtq)) + CXIP_FATAL("Control EQ drops detected\n"); + + if (!ep_obj_locked) + ofi_genlock_unlock(&ep_obj->lock); +} + +void cxip_ep_tx_ctrl_progress(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tx_evtq, true, false); +} + +void cxip_ep_tx_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tx_evtq, true, true); +} + +/* + * cxip_ep_ctrl_progress() - Progress operations using the control EQ. + */ +void cxip_ep_ctrl_progress(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, false); + cxip_ep_tx_ctrl_progress(ep_obj); +} + +/* + * cxip_ep_ctrl_progress_locked() - Progress operations using the control EQ. + */ +void cxip_ep_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, true); + cxip_ep_tx_ctrl_progress_locked(ep_obj); +} + +/* + * cxip_ep_tgt_ctrl_progress() - Progress TGT operations using the control EQ. + */ +void cxip_ep_tgt_ctrl_progress(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, false); +} + +/* + * cxip_ep_tgt_ctrl_progress_locked() - Progress operations using the control + * EQ. + */ +void cxip_ep_tgt_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, true); +} + +/* + * cxip_ep_ctrl_trywait() - Return 0 if no events need to be progressed. 
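+ *
+ * Uses a check / clear-wait-object / re-check sequence: events that race
+ * with cxil_clear_wait_obj() are caught by the second peek, so the caller
+ * never blocks while work is pending.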
+ */ +int cxip_ep_ctrl_trywait(void *arg) +{ + struct cxip_ep_obj *ep_obj = (struct cxip_ep_obj *)arg; + + if (!ep_obj->ctrl_wait) { + CXIP_WARN("No CXI ep_obj wait object\n"); + return -FI_EINVAL; + } + + if (cxi_eq_peek_event(ep_obj->ctrl_tgt_evtq) || + cxi_eq_peek_event(ep_obj->ctrl_tx_evtq)) + return -FI_EAGAIN; + + ofi_genlock_lock(&ep_obj->lock); + cxil_clear_wait_obj(ep_obj->ctrl_wait); + + if (cxi_eq_peek_event(ep_obj->ctrl_tgt_evtq) || + cxi_eq_peek_event(ep_obj->ctrl_tx_evtq)) { + ofi_genlock_unlock(&ep_obj->lock); + + return -FI_EAGAIN; + } + ofi_genlock_unlock(&ep_obj->lock); + + return FI_SUCCESS; +} + +static void cxip_eq_ctrl_eq_free(void *eq_buf, struct cxi_md *eq_md, + struct cxi_eq *eq) +{ + int ret; + + ret = cxil_destroy_evtq(eq); + if (ret) + CXIP_WARN("Failed to free CXI EQ: ret=%d", ret); + + ret = cxil_unmap(eq_md); + if (ret) + CXIP_WARN("Failed to unmap EQ buffer: ret=%d", ret); + + free(eq_buf); +} + +static int cxip_ep_ctrl_eq_alloc(struct cxip_ep_obj *ep_obj, size_t len, + void **eq_buf, struct cxi_md **eq_md, + struct cxi_eq **eq) +{ + struct cxi_eq_attr eq_attr = { + .flags = CXI_EQ_TGT_LONG, + }; + int ret; + int unmap_ret __attribute__((unused)); + int page_size; + + page_size = ofi_get_page_size(); + if (page_size < 0) + return -ofi_syserr(); + + len = ofi_get_aligned_size(len, page_size); + *eq_buf = aligned_alloc(page_size, len); + if (!eq_buf) { + ret = -FI_ENOMEM; + goto err; + } + + ret = cxil_map(ep_obj->domain->lni->lni, *eq_buf, len, + CXIP_EQ_MAP_FLAGS, NULL, eq_md); + if (ret) + goto err_free_eq_buf; + + eq_attr.queue = *eq_buf; + eq_attr.queue_len = len; + + /* ep_obj->ctrl_wait will be NULL if not required */ + ret = cxil_alloc_evtq(ep_obj->domain->lni->lni, *eq_md, &eq_attr, + ep_obj->ctrl_wait, NULL, eq); + if (ret) + goto err_free_eq_md; + + return FI_SUCCESS; + +err_free_eq_md: + unmap_ret = cxil_unmap(*eq_md); + assert(unmap_ret == 0); + +err_free_eq_buf: + free(*eq_buf); +err: + return ret; +} + +/* + * cxip_ep_wait_required() - return true if base EP wait object is required. 
+ */ +static bool cxip_ctrl_wait_required(struct cxip_ep_obj *ep_obj) +{ + if (ep_obj->rxc.recv_cq && ep_obj->rxc.recv_cq->priv_wait) + return true; + + if (ep_obj->txc.send_cq && ep_obj->txc.send_cq->priv_wait) + return true; + + return false; +} + +/* + * cxip_ep_ctrl_del_wait() - Delete control FD object + */ +void cxip_ep_ctrl_del_wait(struct cxip_ep_obj *ep_obj) +{ + int wait_fd; + + wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl_wait); + + if (ep_obj->txc.send_cq) { + ofi_wait_del_fd(ep_obj->txc.send_cq->util_cq.wait, wait_fd); + CXIP_DBG("Deleted control HW EQ FD: %d from CQ: %p\n", + wait_fd, ep_obj->txc.send_cq); + } + + if (ep_obj->rxc.recv_cq && ep_obj->rxc.recv_cq != ep_obj->txc.send_cq) { + ofi_wait_del_fd(ep_obj->rxc.recv_cq->util_cq.wait, wait_fd); + CXIP_DBG("Deleted control HW EQ FD: %d from CQ %p\n", + wait_fd, ep_obj->rxc.recv_cq); + } +} + +/* + * cxip_ep_ctrl_add_wait() - Add control FD to CQ object + */ +int cxip_ep_ctrl_add_wait(struct cxip_ep_obj *ep_obj) +{ + struct cxip_cq *cq; + int wait_fd; + int ret; + + ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, + &ep_obj->ctrl_wait); + if (ret) { + CXIP_WARN("Control wait object allocation failed: %d\n", ret); + return -FI_ENOMEM; + } + + wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl_wait); + ret = fi_fd_nonblock(wait_fd); + if (ret) { + CXIP_WARN("Unable to set control wait non-blocking: %d, %s\n", + ret, fi_strerror(-ret)); + goto err; + } + + cq = ep_obj->txc.send_cq; + if (cq) { + ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, + POLLIN, cxip_ep_ctrl_trywait, ep_obj, + &cq->util_cq.cq_fid.fid); + if (ret) { + CXIP_WARN("TX CQ add FD failed: %d, %s\n", + ret, fi_strerror(-ret)); + goto err; + } + } + + if (ep_obj->rxc.recv_cq && ep_obj->rxc.recv_cq != cq) { + cq = ep_obj->rxc.recv_cq; + + ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, + POLLIN, cxip_ep_ctrl_trywait, ep_obj, + &cq->util_cq.cq_fid.fid); + if (ret) { + CXIP_WARN("RX CQ add FD failed: %d, %s\n", + ret, fi_strerror(-ret)); + goto err_add_fd; + } + } + + CXIP_DBG("Added control EQ private wait object, intr FD: %d\n", + wait_fd); + + return FI_SUCCESS; + +err_add_fd: + if (ep_obj->txc.send_cq) + ofi_wait_del_fd(ep_obj->txc.send_cq->util_cq.wait, wait_fd); +err: + cxil_destroy_wait_obj(ep_obj->ctrl_wait); + ep_obj->ctrl_wait = NULL; + + return ret; +} + +/* + * cxip_ep_ctrl_init() - Initialize endpoint control resources. + * + * Caller must hold ep_obj->lock. + */ +int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) +{ + struct cxi_pt_alloc_opts pt_opts = { + .use_long_event = 1, + .is_matching = 1, + }; + const union c_event *event; + int ret; + size_t rx_eq_size = MIN(cxip_env.ctrl_rx_eq_max_size, + ofi_universe_size * 64 + + ep_obj->domain->mr_match_events * 256 * 64); + + /* When MR event counting has been requested turn on + * delivery of match events. + */ + if (ep_obj->domain->mr_match_events) + pt_opts.en_event_match = 1; + + /* If CQ(s) are using a wait object, then control event + * queues need to unblock CQ poll as well. CQ will add the + * associated FD to the CQ FD list. 
+ */ + if (cxip_ctrl_wait_required(ep_obj)) { + ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, + &ep_obj->ctrl_wait); + if (ret) { + CXIP_WARN("EP ctrl wait object alloc failed: %d\n", + ret); + return ret; + } + } + + ret = cxip_ep_ctrl_eq_alloc(ep_obj, 4 * s_page_size, + &ep_obj->ctrl_tx_evtq_buf, + &ep_obj->ctrl_tx_evtq_buf_md, + &ep_obj->ctrl_tx_evtq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate TX EQ resources, ret: %d\n", ret); + goto err; + } + + ret = cxip_ep_ctrl_eq_alloc(ep_obj, rx_eq_size, + &ep_obj->ctrl_tgt_evtq_buf, + &ep_obj->ctrl_tgt_evtq_buf_md, + &ep_obj->ctrl_tgt_evtq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate TGT EQ resources, ret: %d\n", + ret); + goto free_tx_evtq; + } + + ret = cxip_ep_cmdq(ep_obj, true, ep_obj->domain->tclass, + ep_obj->ctrl_tx_evtq, &ep_obj->ctrl_txq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate control TXQ, ret: %d\n", ret); + ret = -FI_EDOMAIN; + goto free_tgt_evtq; + } + + ret = cxip_ep_cmdq(ep_obj, false, ep_obj->domain->tclass, + ep_obj->ctrl_tgt_evtq, &ep_obj->ctrl_tgq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate control TGQ, ret: %d\n", ret); + ret = -FI_EDOMAIN; + goto free_txq; + } + + ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl_tgt_evtq, + &pt_opts, NULL, NULL, &ep_obj->ctrl_pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate control PTE: %d\n", ret); + goto free_tgq; + } + + /* CXIP_PTL_IDX_WRITE_MR_STD is shared with CXIP_PTL_IDX_CTRL. */ + ret = cxip_pte_map(ep_obj->ctrl_pte, CXIP_PTL_IDX_WRITE_MR_STD, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write PTE: %d\n", ret); + goto free_pte; + } + + ret = cxip_pte_map(ep_obj->ctrl_pte, CXIP_PTL_IDX_READ_MR_STD, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map read PTE: %d\n", ret); + goto free_pte; + } + + ret = cxip_pte_set_state(ep_obj->ctrl_pte, ep_obj->ctrl_tgq, + C_PTLTE_ENABLED, 0); + if (ret) { + /* This is a bug, we have exclusive access to this CMDQ. */ + CXIP_WARN("Failed to enqueue command: %d\n", ret); + goto free_pte; + } + + /* Wait for Enable event */ + while (!(event = cxi_eq_get_event(ep_obj->ctrl_tgt_evtq))) + sched_yield(); + + switch (event->hdr.event_type) { + case C_EVENT_STATE_CHANGE: + if (event->tgt_long.return_code != C_RC_OK || + event->tgt_long.initiator.state_change.ptlte_state != + C_PTLTE_ENABLED || + event->tgt_long.ptlte_index != ep_obj->ctrl_pte->pte->ptn) + CXIP_FATAL("Invalid PtlTE enable event\n"); + break; + case C_EVENT_COMMAND_FAILURE: + CXIP_FATAL("Command failure: cq=%u target=%u fail_loc=%u cmd_type=%u cmd_size=%u opcode=%u\n", + event->cmd_fail.cq_id, event->cmd_fail.is_target, + event->cmd_fail.fail_loc, + event->cmd_fail.fail_command.cmd_type, + event->cmd_fail.fail_command.cmd_size, + event->cmd_fail.fail_command.opcode); + default: + CXIP_FATAL("Invalid event type: %d\n", event->hdr.event_type); + } + + cxi_eq_ack_events(ep_obj->ctrl_tgt_evtq); + + ret = cxip_ctrl_msg_init(ep_obj); + if (ret != FI_SUCCESS) + goto free_pte; + + /* Reserve 4 event queue slots to prevent EQ overrun. + * 1. One slot for EQ status writeback + * 2. One slot for default reserved_fc value + * 3. One slot for EQ overrun detection. + * 4. TODO: Determine why an additional slot needs to be reserved. 
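+ *
+ * Rough worked example of the credit computation below (the page size and
+ * C_EE_CFG_ECB_SIZE values are assumptions): with 4 KiB pages the TX EQ
+ * buffer is 4 * 4096 = 16384 bytes; with a 64-byte event slot that yields
+ * 16384 / 64 - 4 = 252 TX credits.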
+ */ + ep_obj->ctrl_tx_credits = + ep_obj->ctrl_tx_evtq->byte_size / C_EE_CFG_ECB_SIZE - 4; + + CXIP_DBG("EP control initialized: %p\n", ep_obj); + + return FI_SUCCESS; + +free_pte: + cxip_pte_free(ep_obj->ctrl_pte); +free_tgq: + cxip_ep_cmdq_put(ep_obj, false); +free_txq: + cxip_ep_cmdq_put(ep_obj, true); +free_tgt_evtq: + cxip_eq_ctrl_eq_free(ep_obj->ctrl_tgt_evtq_buf, + ep_obj->ctrl_tgt_evtq_buf_md, + ep_obj->ctrl_tgt_evtq); +free_tx_evtq: + cxip_eq_ctrl_eq_free(ep_obj->ctrl_tx_evtq_buf, + ep_obj->ctrl_tx_evtq_buf_md, ep_obj->ctrl_tx_evtq); +err: + if (ep_obj->ctrl_wait) { + cxil_destroy_wait_obj(ep_obj->ctrl_wait); + ep_obj->ctrl_wait = NULL; + } + + return ret; +} + +/* + * cxip_ep_ctrl_fini() - Finalize endpoint control resources. + * + * Caller must hold ep_obj->lock. + */ +void cxip_ep_ctrl_fini(struct cxip_ep_obj *ep_obj) +{ + cxip_ctrl_mr_cache_flush(ep_obj); + cxip_ctrl_msg_fini(ep_obj); + cxip_pte_free(ep_obj->ctrl_pte); + cxip_ep_cmdq_put(ep_obj, false); + cxip_ep_cmdq_put(ep_obj, true); + + cxip_eq_ctrl_eq_free(ep_obj->ctrl_tgt_evtq_buf, + ep_obj->ctrl_tgt_evtq_buf_md, + ep_obj->ctrl_tgt_evtq); + cxip_eq_ctrl_eq_free(ep_obj->ctrl_tx_evtq_buf, + ep_obj->ctrl_tx_evtq_buf_md, ep_obj->ctrl_tx_evtq); + + if (ep_obj->ctrl_wait) { + cxil_destroy_wait_obj(ep_obj->ctrl_wait); + ep_obj->ctrl_wait = NULL; + + CXIP_DBG("Deleted control EQ wait object\n"); + } + + CXIP_DBG("EP control finalized: %p\n", ep_obj); +} diff --git a/prov/cxi/src/cxip_curl.c b/prov/cxi/src/cxip_curl.c new file mode 100644 index 00000000000..225512dcaa8 --- /dev/null +++ b/prov/cxi/src/cxip_curl.c @@ -0,0 +1,599 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cxip.h" + +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_CURL, fmt, ##__VA_ARGS__) + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_FABRIC, __VA_ARGS__) + +#define CHUNK_SIZE 4096 +#define CHUNK_MASK (CHUNK_SIZE-1) + +/** + * Expandable buffer that can receive data in arbitrary-sized chunks. + */ +struct curl_buffer { + char *buffer; + size_t size; + size_t offset; +}; + +/** + * Allocate an expandable CURL buffer. + * + * This expands as necessary to accommodate the data, which may be delivered in + * chunks over the network. If you know in advance the approximate size of the + * return data on a large transfer, you can avoid repeated calls to realloc(). + * + * @param rsp_init_size : initial size of buffer area (> 0), default 4k + * + * @return struct curl_buffer* : returned CURL buffer + */ +static inline struct curl_buffer *init_curl_buffer(size_t rsp_init_size) +{ + struct curl_buffer *buf; + + if (rsp_init_size == 0) + rsp_init_size = 4096; + buf = calloc(1, sizeof(*buf)); + if (!buf) + return NULL; + + buf->buffer = malloc(rsp_init_size); + if (!buf->buffer) { + free(buf); + buf = NULL; + } + + return buf; +} + +/** + * Free a curl_buffer and all its data. + * + * @param buf : curl buffer to delete + */ +static inline void free_curl_buffer(struct curl_buffer *buf) +{ + if (buf) + free(buf->buffer); + free(buf); +} + +/** + * Curl is curl-centric, facing the application (not the server). For Curl, a + * "write" is a write to application memory. This is the function that fills + * the user's curl buffer with data returned by the server. + * + * Buffer expands as needed to accommodate data. 
Note that this means the buffer + * itself must be treated as uninitialized memory beyond buf->offset (see + * realloc()). + * + * If the return value does not match the number of bytes requested, it will + * abort the transfer and the curl function will return CURLE_WRITE_ERROR. + * + * @param curl_rcvd : poiter to data received from server + * @param size : size of member + * @param nmemb : number of members + * @param userp : (void *)-cast curl_buffer + * + * @return ssize_t : number of bytes added + */ +static size_t write_callback(void *curl_rcvd, size_t size, size_t nmemb, + void *userp) +{ + struct curl_buffer *curl_buf = (struct curl_buffer *)userp; + size_t sz = size * nmemb; + size_t need = curl_buf->offset + sz; + + if (need >= curl_buf->size) { + curl_buf->size = (need + CHUNK_MASK) & ~CHUNK_MASK; + curl_buf->buffer = realloc(curl_buf->buffer, curl_buf->size); + if (!curl_buf->buffer) + return 0; + } + memcpy(&curl_buf->buffer[curl_buf->offset], curl_rcvd, sz); + + curl_buf->offset += sz; + return sz; +} + +/* + * The CURL library must be explicitly initialized. It is application-global, + * and the initialization is not thread-safe, according to the documentation. We + * do not protect this call, because it is running under CXI_INIT (see + * cxip_info.c), which is single-threaded. The curl_global_init() call can be + * issued multiple times (non-concurrently) and has the same end result as + * calling it once. + */ +static CURLM *cxip_curlm; +static int cxip_curl_count; + +/** + * Initialize CURL globally for the application, enabling multi-curl + * (concurrent calls). + */ +int cxip_curl_init(void) +{ + int ret = FI_SUCCESS; + CURLcode res; + + if (!cxip_curlm) { + res = curl_global_init(CURL_GLOBAL_DEFAULT); + if (res == CURLE_OK) { + cxip_curlm = curl_multi_init(); + if (!cxip_curlm) { + curl_global_cleanup(); + ret = -FI_EINVAL; + } + } else + ret = -FI_EINVAL; + } + return ret; +} + +/** + * Globally terminate this module. + */ +void cxip_curl_fini(void) +{ + cxip_curl_count = 0; + if (cxip_curlm) { + curl_multi_cleanup(cxip_curlm); + curl_global_cleanup(); + cxip_curlm = NULL; + } +} + +/** + * Return a name for an opcode. + * + * @param op : curl operation + * @return const char* : printable name for curl operation + */ +const char *cxip_curl_opname(enum curl_ops op) +{ + static char * const curl_opnames[] = { + "GET", + "PUT", + "POST", + "PATCH", + "DELETE", + }; + return (op >= 0 && op < CURL_MAX) ? curl_opnames[op] : NULL; +} + +/** + * Free a handle created by cxip_curl_perform(). + * + * @param handle : handle created by cxip_curl_perform() + */ +void cxip_curl_free(struct cxip_curl_handle *handle) +{ + if (!handle) + return; + + free((void *)handle->endpoint); + free((void *)handle->request); + /* do not directly free handle->response (== handle->recv->buffer) */ + free_curl_buffer((struct curl_buffer *)handle->recv); + free(handle); + cxip_curl_count -= 1; +} + +/** + * Dispatch a CURL request. + * + * This is a general-purpose CURL multi (async) JSON format curl request. + * + * Note that this function only dispatches the request. cxip_curl_progress() + * must be called to progress the dispatched operations and retrieve data. + * + * The usrfunc is called in cxip_curl_progress() when the request completes, + * and receives the handle as its sole argument. The handle also contains an + * arbitrary usrptr supplied by the caller. This usrptr can contain specific + * information to identify which of multiple concurrent requests has completed. 
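+ *
+ * A minimal completion callback sketch (assumes curlcomplete_t takes only
+ * the handle and returns void; my_req, parse_response and mark_failed are
+ * illustrative):
+ *
+ *    static void my_done(struct cxip_curl_handle *handle)
+ *    {
+ *            struct my_req *req = handle->usrptr;
+ *
+ *            if (handle->status == 200)
+ *                    parse_response(req, handle->response);
+ *            else
+ *                    mark_failed(req, handle->status);
+ *    }
+ *
+ *    cxip_curl_perform(endpoint, request, NULL, 0, CURL_GET, false,
+ *                      my_done, req);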
+ * + * There are no "normal" REST errors from this call. REST errors are instead + * returned on attempts to progress the dispatched operation. + * + * @param endpoint : HTTP server endpoint address + * @param request : JSON-formatted request + * @param rsp_init_size : initial size of response buffer (can be 0) + * @param op : curl operation + * @param verbose : use to display sent HTTP headers + * @param userfunc : user-defined completion function + * @param usrptr : user-defined data pointer + * + * @return int : 0 on success, -1 on failure + */ +int cxip_curl_perform(const char *endpoint, const char *request, + const char *sessionToken, size_t rsp_init_size, + enum curl_ops op, bool verbose, + curlcomplete_t usrfunc, void *usrptr) +{ + struct cxip_curl_handle *handle; + struct curl_slist *headers; + char *token; + CURLMcode mres; + CURL *curl; + int running; + int ret; + + TRACE("%s: usrptr=%p\n", __func__, usrptr); + ret = -FI_ENOMEM; + handle = calloc(1, sizeof(*handle)); + if (!handle) + goto fail; + TRACE("%s: handle=%p\n", __func__, handle); + + /* libcurl is fussy about NULL requests */ + handle->endpoint = strdup(endpoint); + if (!handle->endpoint) + goto fail; + handle->request = strdup(request ? request : ""); + if (!handle->request) + goto fail; + handle->response = NULL; + handle->recv = (void *)init_curl_buffer(rsp_init_size); + if (!handle->recv) + goto fail; + /* add user completion function and pointer */ + handle->usrfunc = usrfunc; + handle->usrptr = usrptr; + TRACE("%s: handle->usrfnc=%p\n", __func__, handle->usrfunc); + TRACE("%s: handle->usrptr=%p\n", __func__, handle->usrptr); + + ret = -FI_EACCES; + curl = curl_easy_init(); + if (!curl) { + CXIP_WARN("curl_easy_init() failed\n"); + goto fail; + } + + /* HTTP 1.1 assumed */ + headers = NULL; + headers = curl_slist_append(headers, "Expect:"); + headers = curl_slist_append(headers, "Accept: application/json"); + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, "charset: utf-8"); + token = NULL; + if (sessionToken) { + ret = asprintf(&token, "x-xenon-auth-token: %s", sessionToken); + if (ret < 0) { + CXIP_WARN("x-xenon-auth-token create failed\n"); + goto fail; + } + headers = curl_slist_append(headers, token); + } + handle->headers = (void *)headers; + + curl_easy_setopt(curl, CURLOPT_URL, handle->endpoint); + if (op == CURL_GET) { + curl_easy_setopt(curl, CURLOPT_HTTPGET, 1L); + } else { + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, handle->request); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, + strlen(handle->request)); + } + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, handle->recv); + curl_easy_setopt(curl, CURLOPT_PRIVATE, (void *)handle); + curl_easy_setopt(curl, CURLOPT_VERBOSE, (long)verbose); + curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, cxip_curl_opname(op)); + + curl_multi_add_handle(cxip_curlm, curl); + mres = curl_multi_perform(cxip_curlm, &running); + if (mres != CURLM_OK) { + CXIP_WARN("curl_multi_perform() failed: %s\n", + curl_multi_strerror(mres)); + goto fail; + } + cxip_curl_count += 1; + return FI_SUCCESS; + +fail: + CXIP_WARN("%s failed %d\n", __func__, ret); + cxip_curl_free(handle); + return ret; +} + +/** + * Progress the CURL requests. 
+ * + * This progresses concurrent CURL requests, and returns the following: + * + * - 0 indicates an operation completed + * - -FI_EAGAIN indicates operations are pending, none completed + * - -FI_ENODATA indicates no operations are pending + * - -errorcode a fatal error + * + * Repeated calls will return additional completions, until there are no more + * pending and -FI_ENODATA is returned. + * + * Note that a CURL request will succeed if the server is not reachable. It will + * return a handle->status value of 0, which is an invalid HTTP status, and + * indicates that it could not connect to a server. + * + * For unit testing, it is useful for the test to be able to inspect the handle + * directly, and it can be obtained by specifying a non-null handleptr value. If + * handleptr is supplied, the caller is responsible for calling cxip_curl_free() + * on the returned handle. In normal usage, handleptr is NULL, and this routine + * will clean up the handle after the operation completes. + * + * The user should provide a callback routine to examine the final state of the + * CURL request, as well as any data it returns: see cxip_curl_perform(). This + * user callback is called after completion of the request, before the handle is + * destroyed. + * + * The callback routine has read-only access to the handle, and read-write + * access to its own data area, available as handle->usrptr. + * + * The handle contains the following documented fields: + * + * - status = HTTP status of the op, or 0 if the endpoint could not be reached + * - endpoint = copy of the endpoint address supplied for the post + * - request = copy of the JSON request data supplied for the post + * - response = pointer to the JSON response returned by the endpoint + * - usrptr = arbitrary user pointer supplied during CURL request + * + * @param handleptr : if not NULL, returns the request handle + * @return int : return code, see above + */ +int cxip_curl_progress(struct cxip_curl_handle **handleptr) +{ + struct cxip_curl_handle *handle; + struct CURLMsg *msg; + CURLMcode mres; + CURLcode res; + int running; + int messages; + long status; + struct curl_buffer *recv; + + + /* This needs to be quick if nothing is pending */ + if (!cxip_curl_count) + return -FI_ENODATA; + + handle = NULL; + + /* running returns the number of curls running */ + mres = curl_multi_perform(cxip_curlm, &running); + if (mres != CURLM_OK) { + CXIP_WARN("curl_multi_perform() failed: %s\n", + curl_multi_strerror(mres)); + return -FI_EOTHER; + } + + /* messages returns the number of additional curls finished */ + msg = curl_multi_info_read(cxip_curlm, &messages); + if (!msg || msg->msg != CURLMSG_DONE) { + return (running) ? 
-FI_EAGAIN : -FI_ENODATA; + } + + /* retrieve our handle from the private pointer */ + res = curl_easy_getinfo(msg->easy_handle, + CURLINFO_PRIVATE, (char **)&handle); + if (res != CURLE_OK) { + CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", + "CURLINFO_PRIVATE", + curl_easy_strerror(res)); + return -FI_EOTHER; + } + /* handle is now valid, must eventually be freed */ + TRACE("%s: handle=%p\n", __func__, handle); + + /* retrieve the status code, should not fail */ + res = curl_easy_getinfo(msg->easy_handle, + CURLINFO_RESPONSE_CODE, &status); + if (res != CURLE_OK) { + CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", + "CURLINFO_RESPONSE_CODE", + curl_easy_strerror(res)); + /* continue, handle->status should show zero */ + } + + /* we can recover resources now */ + curl_slist_free_all((struct curl_slist *)handle->headers); + curl_easy_cleanup(msg->easy_handle); + handle->headers = NULL; + + /* make sure response string is terminated */ + recv = (struct curl_buffer *)handle->recv; + recv->buffer[recv->offset] = 0; + handle->response = recv->buffer; + handle->status = status; + + /* call the user function */ + TRACE("%s: handle->usrfnc=%p\n", __func__, handle->usrfunc); + TRACE("%s: handle->usrptr=%p\n", __func__, handle->usrptr); + if (handle->usrfunc) + handle->usrfunc(handle); + TRACE("%s: returned from usrfnc\n", __func__); + + /* return the handle, or free it */ + if (handleptr) { + *handleptr = handle; + } else { + cxip_curl_free(handle); + } + return FI_SUCCESS; +} + +/** + * @brief Simplified search for JSON objects. + * + * Simplified object search using a descriptor like the following: + * Example: "firstkey.secondkey.arraykey[3].thirdkey" + * + * The first character is '.' or '['. If omitted, it is assumed to be '.'. + * + * The appearance of '.' indicates that the current object is expected to be + * a json_type_object, and the text that follows is a key within the object. + * + * The appearance of '[' must be part of a '[]' construction, and + * indicates that the current object is expected to be a json_type_array, and + * the specified integer value is an index into the array. + * + * The descriptor allows you to dive into the structure and return the endpoint + * of the dive in the returned jval pointer, and returns the type of this + * endpoint object. + * + * Note that this is a convenience method, primarily for testing. Results are + * undefined if the '.' or '[' or ']' characters appear in a key. + * + * Note that the returned jval is a json_object. You can use the following + * libjson functions to directly extract values: + * + * - json_object_get_boolean() + * - json_object_get_int() + * - json_object_get_int64() + * - json_object_get_uint64() + * - json_object_get_double() + * - json_object_get_string() + * + * Note also that these functions are used in the variants below. + * + * All memory is managed by json, so on 'put' of the head object, all memory is + * recovered. + * + * This returns json_type_null on any error. 
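+ *
+ * Usage sketch (json-c assumed; the JSON text and keys are illustrative):
+ *
+ *    struct json_object *jobj, *jval;
+ *    int port = 0;
+ *
+ *    jobj = json_tokener_parse("{\"svc\": {\"ports\": [8080, 8443]}}");
+ *    if (cxip_json_extract("svc.ports[1]", jobj, &jval) == json_type_int)
+ *            port = json_object_get_int(jval);   // 8443
+ *    json_object_put(jobj);                      // releases jval as well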
+ * + * @param desc - string descriptor of endpoint argument + * @param jobj - starting object + * @param jval - final endpoint object, or NULL + * @return enum json_type - type of the endpoint object + */ +enum json_type cxip_json_extract(const char *desc, struct json_object *jobj, + struct json_object **jval) +{ + const char *beg; + struct json_object *jo; + enum json_type jt; + + *jval = NULL; + + beg = desc; + jo = jobj; + jt = json_object_get_type(jo); + while (*beg) { + if (*beg == '[') { + /* expect "[]" */ + size_t idx = 0; + size_t len; + + if (jt != json_type_array) + return json_type_null; + /* skip '[' and ensure index is not empty */ + if (*(++beg) == ']') + return json_type_null; + idx = strtoul(beg, (char **)&beg, 10); + /* ensure strtol consumed index */ + if (*(beg++) != ']') + return json_type_null; + /* check index validity */ + len = json_object_array_length(jo); + if (idx >= len) + return json_type_null; + /* get the indexed object and continue */ + jo = json_object_array_get_idx(jo, idx); + jt = json_object_get_type(jo); + continue; + } + if (beg == desc || *beg == '.') { + /* expect ".key" */ + char key[256], *p = key; + size_t len = sizeof(key); + + if (jt != json_type_object) + return json_type_null; + /* skip leading '.' */ + if (*beg == '.') + beg++; + /* copy key from descriptor to local storage */ + while (*beg && *beg != '.' && *beg != '[' && --len > 0) + *p++ = *beg++; + *p = 0; + /* extract the associated value */ + if (!json_object_object_get_ex(jo, key, &jo)) + return json_type_null; + jt = json_object_get_type(jo); + continue; + } + } + + /* return the final object */ + *jval = jo; + return jt; +} + +/** + * @brief Simplified search for JSON terminal type values. + * + * @param desc : search descriptor for cxip_json_extract() + * @param jobj : starting object + * @param val : return value + * @return int : 0 on success, -EINVAL on error + */ +int cxip_json_bool(const char *desc, struct json_object *jobj, bool *val) +{ + struct json_object *jval; + if (json_type_boolean != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_boolean(jval); + return 0; +} + +int cxip_json_int(const char *desc, struct json_object *jobj, int *val) +{ + struct json_object *jval; + if (json_type_int != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_int(jval); + return 0; +} + +int cxip_json_int64(const char *desc, struct json_object *jobj, int64_t *val) +{ + struct json_object *jval; + if (json_type_int != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_int64(jval); + return 0; +} + +int cxip_json_double(const char *desc, struct json_object *jobj, double *val) +{ + struct json_object *jval; + if (json_type_double != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_double(jval); + return 0; +} + +int cxip_json_string(const char *desc, struct json_object *jobj, + const char **val) +{ + struct json_object *jval; + if (json_type_string != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_string(jval); + return 0; +} diff --git a/prov/cxi/src/cxip_dom.c b/prov/cxi/src/cxip_dom.c new file mode 100644 index 00000000000..8aa6831b0c8 --- /dev/null +++ b/prov/cxi/src/cxip_dom.c @@ -0,0 +1,1676 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. 
+ * Copyright (c) 2018,2020-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include + +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +extern struct fi_ops_mr cxip_dom_mr_ops; + +/* + * cxip_domain_req_alloc() - Allocate a domain control buffer ID + */ +int cxip_domain_ctrl_id_alloc(struct cxip_domain *dom, + struct cxip_ctrl_req *req) +{ + int buffer_id; + + ofi_spin_lock(&dom->ctrl_id_lock); + buffer_id = ofi_idx_insert(&dom->req_ids, req); + if (buffer_id < 0 || buffer_id >= CXIP_BUFFER_ID_MAX) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", + buffer_id); + ofi_spin_unlock(&dom->ctrl_id_lock); + return -FI_ENOSPC; + } + + ofi_spin_unlock(&dom->ctrl_id_lock); + req->req_id = buffer_id; + + return FI_SUCCESS; +} + +/* + * cxip_domain_ctrl_id_free() - Free a domain wide control buffer id. + */ +void cxip_domain_ctrl_id_free(struct cxip_domain *dom, + struct cxip_ctrl_req *req) +{ + /* Non-remote MR will not have a buffer ID assigned */ + if (req->req_id < 0) + return; + + ofi_spin_lock(&dom->ctrl_id_lock); + ofi_idx_remove(&dom->req_ids, req->req_id); + ofi_spin_unlock(&dom->ctrl_id_lock); +} + +/* + * cxip_domain_prov_mr_key_alloc() - Allocate a domain unique + * non-cached FI_MR_PROV_KEY key ID. + */ +int cxip_domain_prov_mr_id_alloc(struct cxip_domain *dom, + struct cxip_mr *mr) +{ + struct cxip_mr_key key = {}; + int buffer_id; + + /* Allocations favor optimized MR range (if enabled) */ + ofi_spin_lock(&dom->ctrl_id_lock); + buffer_id = ofi_idx_insert(&dom->mr_ids, mr); + if (buffer_id < 0 || buffer_id >= CXIP_BUFFER_ID_MAX) { + CXIP_WARN("Failed to allocate FI_MR_PROV_KEY MR ID: %d\n", + buffer_id); + ofi_spin_unlock(&dom->ctrl_id_lock); + return -FI_ENOSPC; + } + + /* IDX 0 is reserved and should never be returned */ + assert(buffer_id > 0); + buffer_id = buffer_id - 1; + + mr->mr_id = buffer_id; + key.is_prov = 1; + key.id = buffer_id; + key.seqnum = ++dom->prov_key_seqnum; + + /* Let the source know events are required and it should use + * unrestricted operations. + */ + key.events = mr->count_events || mr->rma_events || mr->cntr; + + key.opt = cxip_env.optimized_mrs && + key.id < CXIP_PTL_IDX_PROV_MR_OPT_CNT; + mr->key = key.raw; + ofi_spin_unlock(&dom->ctrl_id_lock); + + return FI_SUCCESS; +} + +/* + * cxip_domain_prov_mr_id_free() - Free a domain wide FI_MR_PROV_KEY MR id. + */ +void cxip_domain_prov_mr_id_free(struct cxip_domain *dom, + struct cxip_mr *mr) +{ + struct cxip_mr_key key = { + .raw = mr->key, + }; + + /* Only non-cached FI_MR_PROV_KEY MR require MR ID */ + if (mr->mr_id < 0) + return; + + ofi_spin_lock(&dom->ctrl_id_lock); + ofi_idx_remove(&dom->mr_ids, key.id + 1); + ofi_spin_unlock(&dom->ctrl_id_lock); +} + +#define TLE_RESERVED 8U + +/* + * cxip_domain_enable() - Enable an FI Domain for use. + * + * Allocate hardware resources and initialize software to prepare the Domain + * for use. 
+ */
+static int cxip_domain_enable(struct cxip_domain *dom)
+{
+	int ret = FI_SUCCESS;
+	struct cxi_svc_desc svc_desc;
+
+	ofi_spin_lock(&dom->lock);
+
+	if (dom->enabled)
+		goto unlock;
+
+	ret = cxip_get_if(dom->nic_addr, &dom->iface);
+	if (ret != FI_SUCCESS) {
+		CXIP_WARN("Unable to get IF\n");
+		ret = -FI_ENODEV;
+		goto unlock;
+	}
+
+	ret = cxil_get_svc(dom->iface->dev, dom->auth_key.svc_id, &svc_desc);
+	if (ret) {
+		CXIP_WARN("cxil_get_svc with %s and svc_id %d failed: %d:%s\n",
+			  dom->iface->dev->info.device_name,
+			  dom->auth_key.svc_id, ret, strerror(-ret));
+		ret = -FI_EINVAL;
+		goto put_if;
+	}
+
+	if (!svc_desc.restricted_members)
+		CXIP_WARN("Security Issue: Using unrestricted service ID %d for %s. "
+			  "Please provide a service ID via auth_key fields.\n",
+			  dom->auth_key.svc_id,
+			  dom->iface->dev->info.device_name);
+	if (!svc_desc.restricted_vnis)
+		CXIP_WARN("Security Issue: Using service ID %d with unrestricted VNI access on %s. "
+			  "Please provide a service ID via auth_key fields.\n",
+			  dom->auth_key.svc_id,
+			  dom->iface->dev->info.device_name);
+
+	/* Need to reserve TLEs to prevent stalling. */
+	dom->max_trig_op_in_use =
+		svc_desc.limits.type[CXI_RSRC_TYPE_TLE].res - TLE_RESERVED;
+
+	ret = cxip_alloc_lni(dom->iface, dom->auth_key.svc_id, &dom->lni);
+	if (ret) {
+		CXIP_WARN("cxip_alloc_lni returned: %d\n", ret);
+		ret = -FI_ENODEV;
+		goto put_if;
+	}
+
+	ret = cxip_iomm_init(dom);
+	if (ret != FI_SUCCESS) {
+		CXIP_WARN("Failed to initialize IOMM: %d\n", ret);
+		assert(ret == -FI_ENOMEM);
+		goto free_lni;
+	}
+
+	ret = cxil_get_amo_remap_to_pcie_fadd(dom->iface->dev,
+					      &dom->amo_remap_to_pcie_fadd);
+	if (ret) {
+		CXIP_WARN("Failed to get amo_remap_to_pcie_fadd value: %d\n",
+			  ret);
+		goto iomm_fini;
+	}
+
+	cxip_mr_domain_init(&dom->mr_domain);
+
+	dom->enabled = true;
+	ofi_spin_unlock(&dom->lock);
+
+	DOM_INFO(dom, "Domain enabled\n");
+
+	/* Telemetry is considered optional and will not stop domain
+	 * allocation.
+	 */
+	ret = cxip_telemetry_alloc(dom, &dom->telemetry);
+	if (ret == FI_SUCCESS)
+		DOM_INFO(dom, "Telemetry collection enabled\n");
+
+	return FI_SUCCESS;
+
+iomm_fini:
+	cxip_iomm_fini(dom);
+free_lni:
+	cxip_free_lni(dom->lni);
+	dom->lni = NULL;
+put_if:
+	cxip_put_if(dom->iface);
+	dom->iface = NULL;
+unlock:
+	ofi_spin_unlock(&dom->lock);
+
+	return ret;
+}
+
+/*
+ * cxip_domain_disable() - Disable an FI Domain.
+ */
+static void cxip_domain_disable(struct cxip_domain *dom)
+{
+	ofi_spin_lock(&dom->lock);
+
+	if (!dom->enabled)
+		goto unlock;
+
+	DOM_INFO(dom, "Domain disabled\n");
+
+	cxip_mr_domain_fini(&dom->mr_domain);
+	cxip_dom_cntr_disable(dom);
+	cxip_iomm_fini(dom);
+	cxip_free_lni(dom->lni);
+	cxip_put_if(dom->iface);
+
+	dom->enabled = false;
+
+unlock:
+	ofi_spin_unlock(&dom->lock);
+}
+
+/*
+ * cxip_dom_close() - Provider fi_close implementation for an FI Domain object.
+ */
+static int cxip_dom_close(struct fid *fid)
+{
+	struct cxip_domain *dom;
+
+	dom = container_of(fid, struct cxip_domain,
+			   util_domain.domain_fid.fid);
+	if (ofi_atomic_get32(&dom->ref))
+		return -FI_EBUSY;
+
+	if (dom->telemetry) {
+		cxip_telemetry_dump_delta(dom->telemetry);
+		cxip_telemetry_free(dom->telemetry);
+	}
+
+	cxip_domain_disable(dom);
+
+	ofi_spin_destroy(&dom->lock);
+	ofi_spin_destroy(&dom->ctrl_id_lock);
+	ofi_idx_reset(&dom->req_ids);
+	ofi_idx_reset(&dom->mr_ids);
+	ofi_domain_close(&dom->util_domain);
+	free(dom);
+
+	return 0;
+}
+
+/*
+ * cxip_dom_bind() - Provider fi_domain_bind implementation.
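+ *
+ * Illustrative application-side usage (a sketch of the generic libfabric
+ * call sequence, not provider code): binding an event queue so that memory
+ * registration events are reported through it:
+ *
+ *   struct fi_eq_attr eq_attr = { .wait_obj = FI_WAIT_NONE };
+ *   struct fid_eq *eq;
+ *
+ *   fi_eq_open(fabric, &eq_attr, &eq, NULL);
+ *   fi_domain_bind(domain, &eq->fid, FI_REG_MR);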
+ */ +static int cxip_dom_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct cxip_domain *dom; + struct cxip_eq *eq; + + dom = container_of(fid, struct cxip_domain, util_domain.domain_fid.fid); + eq = container_of(bfid, struct cxip_eq, util_eq.eq_fid.fid); + + if (dom->eq) + return -FI_EINVAL; + + dom->eq = eq; + if (flags & FI_REG_MR) + dom->mr_eq = eq; + + return 0; +} + +static int cxip_dom_dwq_op_send(struct cxip_domain *dom, struct fi_op_msg *msg, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(msg->ep, struct cxip_ep, ep); + const void *buf; + size_t len; + int ret; + + if (!msg || msg->msg.iov_count > 1) + return -FI_EINVAL; + + /* FI_INJECT is not supported for triggered sends */ + if (msg->flags & FI_INJECT) { + CXIP_WARN("FI_INJECT not supported for triggered op\n"); + return -FI_EINVAL; + } + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + msg->flags &= ~FI_MORE; + + buf = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_base : NULL; + len = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_len : 0; + + ret = cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + NULL, msg->msg.data, msg->msg.addr, 0, + msg->msg.context, msg->flags, false, true, + trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit message triggered op, ret=%d\n", ret); + else + CXIP_DBG("Queued triggered message op with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_tsend(struct cxip_domain *dom, + struct fi_op_tagged *tagged, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(tagged->ep, struct cxip_ep, ep); + const void *buf; + size_t len; + int ret; + + if (!tagged || tagged->msg.iov_count > 1) + return -FI_EINVAL; + + /* FI_INJECT is not supported for triggered tsends */ + if (tagged->flags & FI_INJECT) { + CXIP_WARN("FI_INJECT not supported for triggered op\n"); + return -FI_EINVAL; + } + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + tagged->flags &= ~FI_MORE; + + buf = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_base : NULL; + len = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_len : 0; + + ret = cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + NULL, tagged->msg.data, tagged->msg.addr, + tagged->msg.tag, tagged->msg.context, + tagged->flags, true, true, trig_thresh, + trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit tagged msg triggered op, ret=%d\n", + ret); + else + CXIP_DBG("Queued triggered tagged msg op with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_rma(struct cxip_domain *dom, struct fi_op_rma *rma, + enum fi_op_type op, struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(rma->ep, struct cxip_ep, ep); + const void *buf; + size_t len; + int ret; + + if (!rma || !rma->msg.msg_iov || rma->msg.iov_count > 1 || + !rma->msg.rma_iov || rma->msg.rma_iov_count != 1) + return -FI_EINVAL; + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + rma->flags &= ~FI_MORE; + + buf = rma->msg.iov_count ? rma->msg.msg_iov[0].iov_base : NULL; + len = rma->msg.iov_count ? 
rma->msg.msg_iov[0].iov_len : 0; + + ret = cxip_rma_common(op, &ep->ep_obj->txc, buf, len, NULL, + rma->msg.addr, rma->msg.rma_iov[0].addr, + rma->msg.rma_iov[0].key, rma->msg.data, + rma->flags, ep->tx_attr.tclass, + ep->tx_attr.msg_order, rma->msg.context, true, + trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit RMA triggered op, ret=%d\n", ret); + else + CXIP_DBG("Queued triggered RMA operation with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_atomic(struct cxip_domain *dom, + struct fi_op_atomic *amo, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(amo->ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + int ret; + + if (!amo) + return -FI_EINVAL; + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + amo->flags &= ~FI_MORE; + + ret = cxip_amo_common(CXIP_RQ_AMO, txc, txc->tclass, &amo->msg, + NULL, NULL, 0, NULL, NULL, 0, amo->flags, + true, trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit AMO triggered op, ret=%d\n", ret); + else + CXIP_DBG("Queued triggered AMO operation with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_fetch_atomic(struct cxip_domain *dom, + struct fi_op_fetch_atomic *fetch_amo, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(fetch_amo->ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + int ret; + + if (!fetch_amo) + return -FI_EINVAL; + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + fetch_amo->flags &= ~FI_MORE; + + ret = cxip_amo_common(CXIP_RQ_AMO_FETCH, txc, txc->tclass, + &fetch_amo->msg, NULL, NULL, 0, + fetch_amo->fetch.msg_iov, fetch_amo->fetch.desc, + fetch_amo->fetch.iov_count, fetch_amo->flags, + true, trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit fetching AMO triggered op, ret=%d\n", + ret); + else + CXIP_DBG("Queued triggered fetching AMO op with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_comp_atomic(struct cxip_domain *dom, + struct fi_op_compare_atomic *comp_amo, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(comp_amo->ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + int ret; + + if (!comp_amo) + return -FI_EINVAL; + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + comp_amo->flags &= ~FI_MORE; + + ret = cxip_amo_common(CXIP_RQ_AMO_SWAP, txc, txc->tclass, + &comp_amo->msg, comp_amo->compare.msg_iov, + comp_amo->compare.desc, + comp_amo->compare.iov_count, + comp_amo->fetch.msg_iov, comp_amo->fetch.desc, + comp_amo->fetch.iov_count, comp_amo->flags, true, + trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit compare AMO triggered op, ret=%d\n", + ret); + else + CXIP_DBG("Queued triggered compare AMO op with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_cntr(struct cxip_domain *dom, + struct fi_op_cntr *cntr, enum fi_op_type op, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh, + bool cntr_wb) +{ + struct cxip_cntr *op_cntr; + int ret; + unsigned opcode; + struct c_ct_cmd cmd = {}; + + /* Completion counter must be NULL. 
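+	 *
+	 * For reference, this handler is reached through the deferred work
+	 * API. A minimal application-side sketch (assumed usage, not provider
+	 * code) that queues a triggered counter increment:
+	 *
+	 *   struct fi_op_cntr op = { .cntr = target_cntr, .value = 1 };
+	 *   struct fi_deferred_work work = {
+	 *       .op_type = FI_OP_CNTR_ADD,
+	 *       .triggering_cntr = trig_cntr,
+	 *       .threshold = 5,
+	 *       .op.cntr = &op,
+	 *   };
+	 *
+	 *   fi_control(&domain->fid, FI_QUEUE_WORK, &work);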
*/ + if (!cntr || !cntr->cntr || comp_cntr) + return -FI_EINVAL; + + if (cntr_wb) { + opcode = C_CMD_CT_TRIG_EVENT; + cmd.eq = C_EQ_NONE; + } else { + opcode = op == FI_OP_CNTR_SET ? + C_CMD_CT_TRIG_SET : C_CMD_CT_TRIG_INC; + } + + op_cntr = container_of(cntr->cntr, struct cxip_cntr, cntr_fid); + + cmd.trig_ct = trig_cntr->ct->ctn; + cmd.threshold = trig_thresh; + cmd.ct = op_cntr->ct->ctn; + cmd.set_ct_success = 1; + cmd.ct_success = cntr->value; + + ofi_genlock_lock(&dom->trig_cmdq_lock); + ret = cxi_cq_emit_ct(dom->trig_cmdq->dev_cmdq, opcode, &cmd); + if (ret) { + /* TODO: Handle this assert. */ + assert(!ret); + } + cxi_cq_ring(dom->trig_cmdq->dev_cmdq); + ofi_genlock_unlock(&dom->trig_cmdq_lock); + + return FI_SUCCESS; +} + +static int cxip_dom_dwq_op_recv(struct cxip_domain *dom, struct fi_op_msg *msg, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(msg->ep, struct cxip_ep, ep); + void *buf; + size_t len; + + /* Non-zero thresholds for triggered receives are not supported. */ + if (!msg || msg->msg.iov_count > 1 || trig_thresh) + return -FI_EINVAL; + + buf = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_base : NULL; + len = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_len : 0; + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, NULL, msg->msg.addr, + 0, 0, msg->msg.context, msg->flags, false, + comp_cntr); +} + +static int cxip_dom_dwq_op_trecv(struct cxip_domain *dom, + struct fi_op_tagged *tagged, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(tagged->ep, struct cxip_ep, ep); + void *buf; + size_t len; + + /* Non-zero thresholds for triggered receives are not supported. */ + if (!tagged || tagged->msg.iov_count > 1 || trig_thresh) + return -FI_EINVAL; + + buf = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_base : NULL; + len = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_len : 0; + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, tagged->msg.desc, + tagged->msg.addr, tagged->msg.tag, + tagged->msg.ignore, tagged->msg.context, + tagged->flags, true, comp_cntr); +} + +/* Must hold domain lock. 
*/ +static void cxip_dom_progress_all_cqs(struct cxip_domain *dom) +{ + struct cxip_cq *cq; + + dlist_foreach_container(&dom->cq_list, struct cxip_cq, cq, + dom_entry) + cxip_util_cq_progress(&cq->util_cq); +} + +static int cxip_dom_trig_op_get_in_use(struct cxip_domain *dom) +{ + struct cxi_rsrc_use in_use; + int ret; + + ret = cxil_get_svc_rsrc_use(dom->iface->dev, dom->auth_key.svc_id, + &in_use); + if (ret) + return ret; + + return in_use.in_use[CXI_RSRC_TYPE_TLE]; +} + +#define DWQ_SEMAPHORE_TIMEOUT 10U + +static int cxip_dom_dwq_queue_work_validate(struct cxip_domain *dom, + struct fi_deferred_work *work) +{ + struct cxip_ep *ep; + + if (!work->triggering_cntr) + return -FI_EINVAL; + + switch (work->op_type) { + case FI_OP_SEND: + case FI_OP_RECV: + ep = container_of(work->op.msg->ep, struct cxip_ep, ep); + break; + + case FI_OP_TSEND: + case FI_OP_TRECV: + ep = container_of(work->op.tagged->ep, struct cxip_ep, ep); + break; + + case FI_OP_READ: + case FI_OP_WRITE: + ep = container_of(work->op.rma->ep, struct cxip_ep, ep); + break; + + case FI_OP_ATOMIC: + ep = container_of(work->op.atomic->ep, struct cxip_ep, ep); + break; + + case FI_OP_FETCH_ATOMIC: + ep = container_of(work->op.fetch_atomic->ep, struct cxip_ep, + ep); + break; + + case FI_OP_COMPARE_ATOMIC: + ep = container_of(work->op.compare_atomic->ep, struct cxip_ep, + ep); + break; + + case FI_OP_CNTR_SET: + case FI_OP_CNTR_ADD: + return FI_SUCCESS; + + default: + return -FI_EINVAL; + } + + /* All EPs that share a Domain must use the same VNI. This is a + * simplification due to Cassini requiring triggered op TXQs to + * use CP 0. + */ + if (ep->ep_obj->auth_key.vni != dom->auth_key.vni) { + CXIP_WARN("Invalid VNI: %u\n", ep->ep_obj->auth_key.vni); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int cxip_dom_dwq_queue_work(struct cxip_domain *dom, + struct fi_deferred_work *work) +{ + struct cxip_cntr *trig_cntr; + struct cxip_cntr *comp_cntr; + bool queue_wb_work; + int ret; + int trig_op_count; + int trig_op_in_use; + struct timespec ts; + bool again; + + ret = cxip_dom_dwq_queue_work_validate(dom, work); + if (ret != FI_SUCCESS) + return ret; + + comp_cntr = work->completion_cntr ? 
+ container_of(work->completion_cntr, struct cxip_cntr, + cntr_fid) : NULL; + trig_cntr = container_of(work->triggering_cntr, struct cxip_cntr, + cntr_fid); + + switch (work->op_type) { + case FI_OP_SEND: + case FI_OP_RECV: + queue_wb_work = !!(work->op.msg->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_TSEND: + case FI_OP_TRECV: + queue_wb_work = !!(work->op.tagged->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_READ: + case FI_OP_WRITE: + queue_wb_work = !!(work->op.rma->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_ATOMIC: + queue_wb_work = !!(work->op.atomic->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_FETCH_ATOMIC: + queue_wb_work = !!(work->op.fetch_atomic->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_COMPARE_ATOMIC: + queue_wb_work = !!(work->op.compare_atomic->flags & FI_CXI_CNTR_WB); + break; + + default: + queue_wb_work = false; + } + + if (cxip_env.enable_trig_op_limit) { + if (queue_wb_work) + trig_op_count = 2; + else + trig_op_count = 1; + + if (clock_gettime(CLOCK_REALTIME, &ts) == -1) { + CXIP_WARN("clock_gettime failed: %d\n", -errno); + return -errno; + } + + ts.tv_sec += DWQ_SEMAPHORE_TIMEOUT; + + again = true; + do { + if (sem_timedwait(dom->trig_op_lock, &ts) == -1) { + if (errno == EINTR) { + CXIP_WARN("sem_timedwait failed: %d\n", + -errno); + return -errno; + } + } else { + again = false; + } + } while (again); + + ret = cxip_dom_trig_op_get_in_use(dom); + if (ret < 0) { + CXIP_WARN("cxip_dom_trig_op_get_in_use: %d\n", ret); + goto unlock; + } + + trig_op_in_use = ret; + + if ((trig_op_in_use + trig_op_count) > dom->max_trig_op_in_use) { + CXIP_WARN("Trig ops exhausted: in-use=%d\n", trig_op_in_use); + ret = -FI_ENOSPC; + goto unlock; + } + } + + switch (work->op_type) { + case FI_OP_SEND: + ret = cxip_dom_dwq_op_send(dom, work->op.msg, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_TSEND: + ret = cxip_dom_dwq_op_tsend(dom, work->op.tagged, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_RECV: + ret = cxip_dom_dwq_op_recv(dom, work->op.msg, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_TRECV: + ret = cxip_dom_dwq_op_trecv(dom, work->op.tagged, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_READ: + case FI_OP_WRITE: + ret = cxip_dom_dwq_op_rma(dom, work->op.rma, work->op_type, + trig_cntr, comp_cntr, + work->threshold); + break; + + case FI_OP_ATOMIC: + ret = cxip_dom_dwq_op_atomic(dom, work->op.atomic, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_FETCH_ATOMIC: + ret = cxip_dom_dwq_op_fetch_atomic(dom, work->op.fetch_atomic, + trig_cntr, comp_cntr, + work->threshold); + break; + + case FI_OP_COMPARE_ATOMIC: + ret = cxip_dom_dwq_op_comp_atomic(dom, work->op.compare_atomic, + trig_cntr, comp_cntr, + work->threshold); + break; + + case FI_OP_CNTR_SET: + case FI_OP_CNTR_ADD: + ret = cxip_dom_dwq_op_cntr(dom, work->op.cntr, work->op_type, + trig_cntr, comp_cntr, + work->threshold, false); + break; + + default: + ret = -FI_EINVAL; + CXIP_WARN("Invalid FI_QUEUE_WORK op %s\n", + fi_tostr(&work->op_type, FI_TYPE_OP_TYPE)); + } + + if (ret) + goto unlock; + + if (queue_wb_work) { + struct fi_op_cntr op_cntr = { + .cntr = &trig_cntr->cntr_fid, + }; + + /* no op_type needed for counter writeback */ + ret = cxip_dom_dwq_op_cntr(dom, &op_cntr, 0, trig_cntr, NULL, + work->threshold + 1, true); + /* TODO: If cxip_dom_dwq_op_cntr fails we need to cancel the + * above work queue. + */ + } + + /* Wait until the command queue is empty. 
This is a sign that hardware + * has processed triggered operation commands. At this point, it is + * safe to release the trigger op pool lock. + */ + if (cxip_env.enable_trig_op_limit) { + ofi_genlock_lock(&dom->trig_cmdq_lock); + while (dom->trig_cmdq->dev_cmdq->status->rd_ptr != + (dom->trig_cmdq->dev_cmdq->hw_wp32 / 2)) {}; + ofi_genlock_unlock(&dom->trig_cmdq_lock); + } + +unlock: + if (cxip_env.enable_trig_op_limit) + sem_post(dom->trig_op_lock); + + return ret; +} + +static int cxip_dom_dwq_flush_work(struct cxip_domain *dom) +{ + struct cxip_cntr *trig_cntr; + struct cxip_txc *txc; + struct cxip_cq *cq; + int ret __attribute__ ((unused)); + + ofi_spin_lock(&dom->lock); + if (!dom->cntr_init) { + ofi_spin_unlock(&dom->lock); + return FI_SUCCESS; + } + + ofi_genlock_lock(&dom->trig_cmdq_lock); + + /* Issue cancels to all allocated counters. */ + dlist_foreach_container(&dom->cntr_list, struct cxip_cntr, + trig_cntr, dom_entry) { + struct c_ct_cmd ct_cmd = {}; + + if (!trig_cntr->ct) + continue; + + ct_cmd.ct = trig_cntr->ct->ctn; + ret = cxi_cq_emit_ct(dom->trig_cmdq->dev_cmdq, C_CMD_CT_CANCEL, + &ct_cmd); + + // TODO: Handle this assert. Multiple triggered CQs may + // be required. + assert(!ret); + cxi_cq_ring(dom->trig_cmdq->dev_cmdq); + }; + + /* Rely on the triggered CQ ack counter to know when there are no more + * pending triggered operations. In-between, progress CQs to cleanup + * internal transaction state. + */ + while (true) { + unsigned int ack_counter; + + ret = cxil_cmdq_ack_counter(dom->trig_cmdq->dev_cmdq, + &ack_counter); + assert(!ret); + + if (!ack_counter) + break; + + cxip_dom_progress_all_cqs(dom); + } + + /* It is possible that the ack counter is zero and there are completion + * events in-flight meaning that the above progression may have missed + * events. Perform a sleep to help ensure events have arrived and + * progress all CQs one more time. + * + * TODO: Investigate better way to resolve this race condition. + */ + sleep(1); + cxip_dom_progress_all_cqs(dom); + + /* At this point, all triggered operations should be cancelled or have + * completed. Due to special handling of message operations, flush any + * remaining message triggered requests from the TX context first. + */ + dlist_foreach_container(&dom->txc_list, struct cxip_txc, txc, + dom_entry) + cxip_txc_flush_msg_trig_reqs(txc); + + /* Flush all the CQs of any remaining non-message triggered operation + * requests. 
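+	 *
+	 * (For context: applications reach this path with
+	 * fi_control(&domain->fid, FI_FLUSH_WORK, NULL); queued operations
+	 * that have not yet triggered are cancelled and their requests freed.)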
+ */ + dlist_foreach_container(&dom->cq_list, struct cxip_cq, cq, dom_entry) + cxip_cq_flush_trig_reqs(cq); + + ofi_genlock_unlock(&dom->trig_cmdq_lock); + ofi_spin_unlock(&dom->lock); + + return FI_SUCCESS; +} + +static int cxip_domain_enable_mr_match_events(struct fid *fid, bool enable) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + dom->mr_match_events = enable; + + return FI_SUCCESS; +} + +static int cxip_domain_enable_optimized_mrs(struct fid *fid, bool enable) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + if (!dom->is_prov_key) { + CXIP_WARN("Requires FI_MR_PROV_KEY\n"); + return -FI_EINVAL; + } + + dom->optimized_mrs = enable; + + return FI_SUCCESS; +} + +static int cxip_domain_enable_prov_key_cache(struct fid *fid, bool enable) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + if (!dom->is_prov_key) { + CXIP_WARN("Requires FI_MR_PROV_KEY\n"); + return -FI_EINVAL; + } + + dom->prov_key_cache = enable; + + return FI_SUCCESS; +} + + +static int cxip_dom_control(struct fid *fid, int command, void *arg) +{ + struct cxip_domain *dom; + + dom = container_of(fid, struct cxip_domain, util_domain.domain_fid.fid); + + if (command != FI_FLUSH_WORK && !arg) { + CXIP_WARN("Required argument missing\n"); + return -FI_EINVAL; + } + + switch (command) { + case FI_QUEUE_WORK: + return cxip_dom_dwq_queue_work(dom, arg); + + case FI_FLUSH_WORK: + return cxip_dom_dwq_flush_work(dom); + + case FI_OPT_CXI_SET_OPTIMIZED_MRS: + return cxip_domain_enable_optimized_mrs(fid, *(bool *)arg); + + case FI_OPT_CXI_GET_OPTIMIZED_MRS: + *(bool *)arg = dom->optimized_mrs; + break; + + case FI_OPT_CXI_SET_MR_MATCH_EVENTS: + return cxip_domain_enable_mr_match_events(fid, *(bool *)arg); + + case FI_OPT_CXI_GET_MR_MATCH_EVENTS: + *(bool *)arg = dom->mr_match_events; + break; + + case FI_OPT_CXI_SET_PROV_KEY_CACHE: + return cxip_domain_enable_prov_key_cache(fid, *(bool *)arg); + + case FI_OPT_CXI_GET_PROV_KEY_CACHE: + *(bool *)arg = dom->prov_key_cache; + break; + + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int cxip_domain_cntr_read(struct fid *fid, unsigned int cntr, + uint64_t *value, struct timespec *ts) +{ + struct cxip_domain *dom; + int ret; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + + if (!dom->enabled) + return -FI_EOPBADSTATE; + + ret = cxil_read_cntr(dom->iface->dev, cntr, value, ts); + + return ret ? 
-FI_EINVAL : FI_SUCCESS; +} + +static int cxip_domain_topology(struct fid *fid, unsigned int *group_id, + unsigned int *switch_id, unsigned int *port_id) +{ + struct cxip_domain *dom; + struct cxip_topo_addr topo; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + topo.addr = dom->nic_addr; + + /* Only a dragonfly topology is supported at this time */ + if (group_id) + *group_id = topo.dragonfly.group_num; + if (switch_id) + *switch_id = topo.dragonfly.switch_num; + if (port_id) + *port_id = topo.dragonfly.port_num; + + return FI_SUCCESS; +} + +static int cxip_domain_enable_hybrid_mr_desc(struct fid *fid, bool enable) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + + dom->hybrid_mr_desc = enable; + + return FI_SUCCESS; +} + +static int cxip_domain_get_dwq_depth(struct fid *fid, size_t *depth) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + + *depth = dom->max_trig_op_in_use; + + return FI_SUCCESS; +} + +static struct fi_cxi_dom_ops cxip_dom_ops_ext = { + .cntr_read = cxip_domain_cntr_read, + .topology = cxip_domain_topology, + .enable_hybrid_mr_desc = cxip_domain_enable_hybrid_mr_desc, + .ep_get_unexp_msgs = cxip_ep_get_unexp_msgs, + .get_dwq_depth = cxip_domain_get_dwq_depth, + .enable_mr_match_events = cxip_domain_enable_mr_match_events, + .enable_optimized_mrs = cxip_domain_enable_optimized_mrs, +}; + +static int cxip_dom_ops_open(struct fid *fid, const char *ops_name, + uint64_t flags, void **ops, void *context) +{ + /* v5 only appended a new function */ + if (!strcmp(ops_name, FI_CXI_DOM_OPS_1) || + !strcmp(ops_name, FI_CXI_DOM_OPS_2) || + !strcmp(ops_name, FI_CXI_DOM_OPS_3) || + !strcmp(ops_name, FI_CXI_DOM_OPS_4) || + !strcmp(ops_name, FI_CXI_DOM_OPS_5) || + !strcmp(ops_name, FI_CXI_DOM_OPS_6)) { + *ops = &cxip_dom_ops_ext; + return FI_SUCCESS; + } + + return -FI_EINVAL; +} + +static int cxip_domain_ops_set(struct fid *fid, const char *name, + uint64_t flags, void *ops, void *context) +{ + struct cxip_domain *domain = + container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + struct fi_hmem_override_ops *hmem_ops; + + if (strcmp(FI_SET_OPS_HMEM_OVERRIDE, name) == 0) { + hmem_ops = ops; + + if (!hmem_ops->copy_from_hmem_iov || + !hmem_ops->copy_to_hmem_iov) + return -FI_EINVAL; + + domain->hmem_ops = *hmem_ops; + + return FI_SUCCESS; + } + + return -FI_ENOSYS; +} + +static int cxip_query_atomic_flags_valid(uint64_t flags) +{ + /* FI_COMPARE_ATOMIC and FI_FETCH_ATOMIC are mutually exclusive. */ + if ((flags & FI_COMPARE_ATOMIC) && (flags & FI_FETCH_ATOMIC)) + return -FI_EINVAL; + + if (flags & FI_CXI_PCIE_AMO) { + /* Only FI_FETCH_ATOMIC is support with FI_CXI_PCIE_AMO. 
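+		 *
+		 * Illustrative application-side probe (assumed usage, not
+		 * provider code) for PCIe fetch-add support on FI_UINT64:
+		 *
+		 *   struct fi_atomic_attr attr;
+		 *   int rc = fi_query_atomic(domain, FI_UINT64, FI_SUM, &attr,
+		 *                            FI_FETCH_ATOMIC | FI_CXI_PCIE_AMO);
+		 *
+		 *   rc == 0 indicates support; attr.size reports the datatype
+		 *   size in bytes.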
*/ + if (!(flags & FI_FETCH_ATOMIC)) + return -FI_EOPNOTSUPP; + } + + return FI_SUCCESS; +} + +static int cxip_query_atomic(struct fid_domain *domain, + enum fi_datatype datatype, enum fi_op op, + struct fi_atomic_attr *attr, uint64_t flags) +{ + enum cxip_amo_req_type req_type; + int ret; + unsigned int datatype_len; + struct cxip_domain *dom; + + dom = container_of(domain, struct cxip_domain, + util_domain.domain_fid.fid); + + if (!attr) + return -FI_EINVAL; + + ret = cxip_query_atomic_flags_valid(flags); + if (ret) + return ret; + + if (flags & FI_COMPARE_ATOMIC) { + req_type = CXIP_RQ_AMO_SWAP; + } else if (flags & FI_FETCH_ATOMIC) { + if (flags & FI_CXI_PCIE_AMO) + req_type = CXIP_RQ_AMO_PCIE_FETCH; + else + req_type = CXIP_RQ_AMO_FETCH; + } else { + req_type = CXIP_RQ_AMO; + } + + ret = _cxip_atomic_opcode(req_type, datatype, op, + dom->amo_remap_to_pcie_fadd, NULL, NULL, NULL, + &datatype_len); + if (ret) + return ret; + + attr->count = 1; + attr->size = datatype_len; + + return FI_SUCCESS; +} + +static int cxip_query_collective(struct fid_domain *domain, + enum fi_collective_op coll, + struct fi_collective_attr *attr, + uint64_t flags) +{ + int ext_op; + + /* BARRIER does not require attr */ + if (coll == FI_BARRIER && !attr) + return FI_SUCCESS; + + /* Anything else requires attr */ + if (!attr) + return -FI_EINVAL; + + /* Flags are not supported */ + if (flags) + return -FI_EOPNOTSUPP; + + /* The limit to collective membership is the size of the multicast tree, + * which is limited by the maximum address space of addressable ports on + * the fabric. + */ + attr->max_members = (1L << C_DFA_NIC_BITS) - 1; + + /* supported collective operations */ + ext_op = (int)attr->op; + switch (coll) { + case FI_BARRIER: + /* ignore attr->op: barrier takes no operator */ + /* ignore attr->datatype: barrier takes no data */ + attr->datatype_attr.count = 0; + attr->datatype_attr.size = 0; + break; + case FI_BROADCAST: + /* ignore attr->op: barrier takes no operator */ + switch (attr->datatype) { + case FI_INT8: + case FI_UINT8: + attr->datatype_attr.count = 32; + attr->datatype_attr.size = 1; + break; + case FI_INT16: + case FI_UINT16: + attr->datatype_attr.count = 16; + attr->datatype_attr.size = 2; + break; + case FI_INT32: + case FI_UINT32: + case FI_FLOAT: + attr->datatype_attr.count = 8; + attr->datatype_attr.size = 4; + break; + case FI_INT64: + case FI_UINT64: + case FI_DOUBLE: + attr->datatype_attr.count = 4; + attr->datatype_attr.size = 8; + break; + default: + return -FI_EOPNOTSUPP; + } + break; + case FI_REDUCE: + case FI_ALLREDUCE: + switch (ext_op) { + case FI_BOR: + case FI_BAND: + case FI_BXOR: + switch (attr->datatype) { + case FI_INT8: + case FI_UINT8: + attr->datatype_attr.count = 32; + attr->datatype_attr.size = 1; + break; + case FI_INT16: + case FI_UINT16: + attr->datatype_attr.count = 16; + attr->datatype_attr.size = 2; + break; + case FI_INT32: + case FI_UINT32: + attr->datatype_attr.count = 8; + attr->datatype_attr.size = 4; + break; + case FI_INT64: + case FI_UINT64: + attr->datatype_attr.count = 4; + attr->datatype_attr.size = 8; + break; + default: + return -FI_EOPNOTSUPP; + } + break; + case FI_MIN: + case FI_MAX: + case FI_SUM: + if (attr->datatype != FI_INT64 && + attr->datatype != FI_DOUBLE) + return -FI_EOPNOTSUPP; + attr->datatype_attr.count = 4; + attr->datatype_attr.size = 8; + break; + case FI_CXI_MINMAXLOC: + attr->datatype_attr.count = 1; + attr->datatype_attr.size = 32; + break; + case FI_CXI_REPSUM: + attr->datatype_attr.count = 1; + 
attr->datatype_attr.size = 8; + break; + default: + return -FI_EOPNOTSUPP; + } + break; + default: + return -FI_EOPNOTSUPP; + } + return FI_SUCCESS; +} + +static struct fi_ops cxip_dom_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_dom_close, + .bind = cxip_dom_bind, + .control = cxip_dom_control, + .ops_open = cxip_dom_ops_open, + .ops_set = cxip_domain_ops_set, +}; + +static struct fi_ops_domain cxip_dom_ops = { + .size = sizeof(struct fi_ops_domain), + .av_open = cxip_av_open, + .cq_open = cxip_cq_open, + .endpoint = cxip_endpoint, + .scalable_ep = fi_no_scalable_ep, + .cntr_open = cxip_cntr_open, + .poll_open = fi_no_poll_open, + .stx_ctx = fi_no_stx_context, + .srx_ctx = fi_no_srx_context, + .query_atomic = cxip_query_atomic, + .query_collective = cxip_query_collective +}; + +/* + * cxip_domain() - Provider fi_domain() implementation. + */ +int cxip_domain(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **dom, void *context) +{ + struct cxip_domain *cxi_domain; + struct cxip_fabric *fab; + struct cxip_addr *src_addr; + uint32_t seed; + int ret; + + /* The OFI check_info function does not verify that rx/tx attribute + * capabilities are a subset of the info capabilities. Currently + * MPI removes the FI_HMEM cap from info->caps but not the rx/tx + * caps. To avoided breaking MPI, the capabilities are removed + * here as a temporary work around. + * TODO: Remove this code when no longer required. + */ + if (info->caps && !(info->caps & FI_HMEM)) { + if (info->tx_attr) + info->tx_attr->caps &= ~FI_HMEM; + if (info->rx_attr) + info->rx_attr->caps &= ~FI_HMEM; + } + + ret = ofi_prov_check_info(&cxip_util_prov, CXIP_FI_VERSION, info); + if (ret != FI_SUCCESS) + return -FI_EINVAL; + + ret = cxip_check_auth_key_info(info); + if (ret) + return ret; + + fab = container_of(fabric, struct cxip_fabric, util_fabric.fabric_fid); + + cxi_domain = calloc(1, sizeof(*cxi_domain)); + if (!cxi_domain) + return -FI_ENOMEM; + + ret = ofi_domain_init(&fab->util_fabric.fabric_fid, info, + &cxi_domain->util_domain, context, + OFI_LOCK_SPINLOCK); + if (ret) + goto free_dom; + + if (!info || !info->src_addr) { + CXIP_WARN("Invalid fi_info\n"); + goto close_util_dom; + } + src_addr = (struct cxip_addr *)info->src_addr; + cxi_domain->nic_addr = src_addr->nic; + + if (info->domain_attr->auth_key) { + /* Auth key size is verified in ofi_prov_check_info(). */ + assert(info->domain_attr->auth_key_size == + sizeof(struct cxi_auth_key)); + + memcpy(&cxi_domain->auth_key, info->domain_attr->auth_key, + sizeof(struct cxi_auth_key)); + } else { + ret = cxip_gen_auth_key(info, &cxi_domain->auth_key); + if (ret) { + CXIP_WARN("cxip_gen_auth_key failed: %d:%s", ret, + fi_strerror(-ret)); + return ret; + } + + /* If FI_AV_AUTH_KEY is used, the auth_key.vni value will never + * be used. Thus, set it to zero which is invalid. 
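+		 * (Applications opt into this mode by requesting
+		 * domain_attr->auth_key_size == FI_AV_AUTH_KEY via fi_getinfo;
+		 * per-peer VNIs are then supplied with fi_av_insert_auth_key
+		 * instead of through the domain auth_key.)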
+ */ + cxi_domain->av_auth_key = + info->domain_attr->auth_key_size == FI_AV_AUTH_KEY; + if (cxi_domain->av_auth_key) + cxi_domain->auth_key.vni = 0; + } + + if (info->domain_attr->tclass != FI_TC_UNSPEC) { + if (info->domain_attr->tclass >= FI_TC_LABEL && + info->domain_attr->tclass <= FI_TC_SCAVENGER) { + cxi_domain->tclass = info->domain_attr->tclass; + } else { + CXIP_WARN("Invalid tclass\n"); + goto close_util_dom; + } + } else { + /* Use default tclass */ + cxi_domain->tclass = FI_TC_BEST_EFFORT; + } + + cxi_domain->av_user_id = + !!(cxi_domain->util_domain.info_domain_caps & FI_AV_USER_ID); + cxi_domain->auth_key_entry_max = info->domain_attr->max_ep_auth_key; + cxi_domain->util_domain.domain_fid.fid.ops = &cxip_dom_fi_ops; + cxi_domain->util_domain.domain_fid.ops = &cxip_dom_ops; + cxi_domain->util_domain.domain_fid.mr = &cxip_dom_mr_ops; + + dlist_init(&cxi_domain->txc_list); + dlist_init(&cxi_domain->cntr_list); + dlist_init(&cxi_domain->cq_list); + ofi_spin_init(&cxi_domain->lock); + ofi_spin_init(&cxi_domain->ctrl_id_lock); + memset(&cxi_domain->req_ids, 0, sizeof(cxi_domain->req_ids)); + memset(&cxi_domain->mr_ids, 0, sizeof(cxi_domain->mr_ids)); + + ofi_atomic_initialize32(&cxi_domain->ref, 0); + cxi_domain->fab = fab; + + cxi_domain->hmem_ops.copy_from_hmem_iov = ofi_copy_from_hmem_iov; + cxi_domain->hmem_ops.copy_to_hmem_iov = ofi_copy_to_hmem_iov; + + /* Allocate/initialize domain hardware resources */ + ret = cxip_domain_enable(cxi_domain); + if (ret) { + CXIP_WARN("Resource allocation failed: %d: %s\n", + ret, fi_strerror(-ret)); + goto cleanup_dom; + } + + /* Handle client vs provider MR RKEY differences */ + if (cxi_domain->util_domain.mr_mode & FI_MR_PROV_KEY) { + cxi_domain->is_prov_key = true; + + seed = (uint32_t)ofi_gettime_ns(); + cxi_domain->prov_key_seqnum = ofi_xorshift_random(seed); + } + + cxi_domain->mr_match_events = cxip_env.mr_match_events; + cxi_domain->optimized_mrs = cxip_env.optimized_mrs; + cxi_domain->prov_key_cache = cxip_env.prov_key_cache; + *dom = &cxi_domain->util_domain.domain_fid; + + return 0; + +cleanup_dom: + ofi_spin_destroy(&cxi_domain->lock); +close_util_dom: + ofi_domain_close(&cxi_domain->util_domain); +free_dom: + free(cxi_domain); + return -FI_EINVAL; +} + +int cxip_domain_valid_vni(struct cxip_domain *dom, unsigned int vni) +{ + /* Currently the auth_key.svc_id field contains the resource group ID. 
+	 */
+	return cxip_if_valid_rgroup_vni(dom->iface, dom->auth_key.svc_id, vni);
+}
+
+#define SUPPORTED_DWQ_FLAGS (FI_MORE | FI_COMPLETION | FI_DELIVERY_COMPLETE | \
+			     FI_MATCH_COMPLETE | FI_TRANSMIT_COMPLETE | FI_CXI_CNTR_WB)
+
+static int cxip_domain_dwq_emit_validate(struct cxip_domain *dom, uint16_t vni,
+					 enum cxi_traffic_class tc,
+					 enum cxi_traffic_class_type tc_type,
+					 uint64_t flags)
+{
+	uint64_t unsupported_flags = flags & ~SUPPORTED_DWQ_FLAGS;
+
+	if (unsupported_flags) {
+		CXIP_WARN("Unsupported flags: %lx\n", unsupported_flags);
+		return -FI_EINVAL;
+	}
+
+	if (tc != dom->trig_cmdq->cur_cp->tc) {
+		CXIP_WARN("Invalid tc: %d\n", tc);
+		return -FI_EINVAL;
+	}
+
+	if (tc_type != dom->trig_cmdq->cur_cp->tc_type) {
+		CXIP_WARN("Invalid tc_type: %d\n", tc_type);
+		return -FI_EINVAL;
+	}
+
+	if (vni != dom->trig_cmdq->cur_cp->vni) {
+		CXIP_WARN("Invalid vni: %d\n", vni);
+		return -FI_EINVAL;
+	}
+
+	return FI_SUCCESS;
+}
+
+int cxip_domain_dwq_emit_dma(struct cxip_domain *dom, uint16_t vni,
+			     enum cxi_traffic_class tc,
+			     enum cxi_traffic_class_type tc_type,
+			     struct cxip_cntr *trig_cntr, size_t trig_thresh,
+			     struct c_full_dma_cmd *dma, uint64_t flags)
+{
+	struct c_ct_cmd ct_cmd = {
+		.trig_ct = trig_cntr->ct->ctn,
+		.threshold = trig_thresh,
+	};
+	int ret;
+
+	ret = cxip_domain_dwq_emit_validate(dom, vni, tc, tc_type, flags);
+	if (ret)
+		return ret;
+
+	ofi_genlock_lock(&dom->trig_cmdq_lock);
+
+	ret = cxi_cq_emit_trig_full_dma(dom->trig_cmdq->dev_cmdq, &ct_cmd, dma);
+	if (ret) {
+		CXIP_WARN("Failed to emit trigger dma command: %d:%s\n", ret,
+			  fi_strerror(-ret));
+		ret = -FI_EAGAIN;
+	} else {
+		cxip_txq_ring(dom->trig_cmdq, false, 1);
+	}
+
+	ofi_genlock_unlock(&dom->trig_cmdq_lock);
+
+	return ret;
+}
+
+int cxip_domain_dwq_emit_amo(struct cxip_domain *dom, uint16_t vni,
+			     enum cxi_traffic_class tc,
+			     enum cxi_traffic_class_type tc_type,
+			     struct cxip_cntr *trig_cntr, size_t trig_thresh,
+			     struct c_dma_amo_cmd *amo, uint64_t flags,
+			     bool fetching, bool flush)
+{
+	struct c_ct_cmd ct_cmd = {
+		.trig_ct = trig_cntr->ct->ctn,
+		.threshold = trig_thresh,
+	};
+	struct c_full_dma_cmd flush_cmd;
+	bool fetching_flush = fetching && flush;
+	int ret;
+
+	/* TODO: Need to ensure there are at least 2 TLEs free for the following
+	 * triggered commands.
+	 */
+
+	/* TODO: Support triggered operations with different VNIs. */
+
+	if (fetching_flush) {
+		memset(&flush_cmd, 0, sizeof(flush_cmd));
+		flush_cmd.command.opcode = C_CMD_PUT;
+		flush_cmd.index_ext = amo->index_ext;
+		flush_cmd.event_send_disable = 1;
+		flush_cmd.dfa = amo->dfa;
+		flush_cmd.remote_offset = amo->remote_offset;
+		flush_cmd.eq = amo->eq;
+		flush_cmd.user_ptr = amo->user_ptr;
+		flush_cmd.flush = 1;
+	}
+
+	ret = cxip_domain_dwq_emit_validate(dom, vni, tc, tc_type, flags);
+	if (ret)
+		return ret;
+
+	ofi_genlock_lock(&dom->trig_cmdq_lock);
+
+	if (fetching_flush &&
+	    __cxi_cq_free_slots(dom->trig_cmdq->dev_cmdq) < 16) {
+		CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n");
+		ret = -FI_EAGAIN;
+		goto out_unlock;
+	}
+
+	ret = cxi_cq_emit_trig_dma_amo(dom->trig_cmdq->dev_cmdq, &ct_cmd,
+				       amo, fetching);
+	if (ret) {
+		CXIP_WARN("Failed to emit trigger amo command: %d:%s\n", ret,
+			  fi_strerror(-ret));
+		ret = -FI_EAGAIN;
+		goto out_unlock;
+	}
+
+	if (fetching_flush) {
+		/* CQ space check already occurred. Thus, return code can be
+		 * ignored. 
+ */ + ret = cxi_cq_emit_trig_full_dma(dom->trig_cmdq->dev_cmdq, + &ct_cmd, &flush_cmd); + assert(ret == 0); + } + + cxip_txq_ring(dom->trig_cmdq, false, 1); + +out_unlock: + ofi_genlock_unlock(&dom->trig_cmdq_lock); + + return ret; +} diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c new file mode 100644 index 00000000000..4b579662002 --- /dev/null +++ b/prov/cxi/src/cxip_ep.c @@ -0,0 +1,1311 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2013-2014 Intel Corporation. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. + * Copyright (c) 2018,2020-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "ofi_util.h" +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +extern struct fi_ops_rma cxip_ep_rma_ops; +extern struct fi_ops_rma cxip_ep_rma_no_ops; + +extern struct fi_ops_msg cxip_ep_msg_ops; +extern struct fi_ops_msg cxip_ep_msg_no_ops; +extern struct fi_ops_msg cxip_ep_msg_no_tx_ops; +extern struct fi_ops_msg cxip_ep_msg_no_rx_ops; + +extern struct fi_ops_tagged cxip_ep_tagged_ops; +extern struct fi_ops_tagged cxip_ep_tagged_no_ops; +extern struct fi_ops_tagged cxip_ep_tagged_no_tx_ops; +extern struct fi_ops_tagged cxip_ep_tagged_no_rx_ops; + +extern struct fi_ops_atomic cxip_ep_atomic_ops; +extern struct fi_ops_atomic cxip_ep_atomic_no_ops; + +extern struct fi_ops_collective cxip_collective_ops; +extern struct fi_ops_collective cxip_collective_no_ops; + +extern struct fi_ops_cm cxip_ep_cm_ops; +extern struct fi_ops_ep cxip_ep_ops; +extern struct fi_ops cxip_ep_fi_ops; +extern struct fi_ops_ep cxip_ctx_ep_ops; + +/* + * cxip_ep_cmdq() - Open a shareable TX or Target command queue. + * + * Caller must hold ep_obj->lock + */ +int cxip_ep_cmdq(struct cxip_ep_obj *ep_obj, bool transmit, uint32_t tclass, + struct cxi_eq *evtq, struct cxip_cmdq **cmdq) +{ + struct cxi_cq_alloc_opts cq_opts = {}; + struct cxip_cmdq **ep_obj_cmdq; + ofi_atomic32_t *ep_obj_ref; + int ret; + size_t size; + + if (transmit) { + ep_obj_cmdq = &ep_obj->txq; + ep_obj_ref = &ep_obj->txq_ref; + size = ep_obj->txq_size; + } else { + ep_obj_cmdq = &ep_obj->tgq; + ep_obj_ref = &ep_obj->tgq_ref; + size = ep_obj->tgq_size; + } + + if (*ep_obj_cmdq) { + ofi_atomic_inc32(ep_obj_ref); + CXIP_DBG("Reusing %s base CMDQ: %p\n", + transmit ? "TX" : "RX", *ep_obj_cmdq); + *cmdq = *ep_obj_cmdq; + + return FI_SUCCESS; + } + + /* An IDC command can use up to 4 64 byte slots. */ + cq_opts.count = size * 4; + cq_opts.flags = transmit ? CXI_CQ_IS_TX : 0; + cq_opts.policy = cxip_env.cq_policy; + + ret = cxip_cmdq_alloc(ep_obj->domain->lni, evtq, &cq_opts, + ep_obj->auth_key.vni, cxip_ofi_to_cxi_tc(tclass), + CXI_TC_TYPE_DEFAULT, cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate CMDQ, ret: %d\n", ret); + return -FI_ENOSPC; + } + *ep_obj_cmdq = *cmdq; + ofi_atomic_inc32(ep_obj_ref); + + CXIP_DBG("Allocated %s CMDQ: %p CP: %u\n", + transmit ? "TX" : "RX", *cmdq, cq_opts.lcid); + return ret; +} + +/* + * cxip_ep_cmdq_put() - Release reference to shareable TX or Target command + * queue. + * + * Caller must hold ep_obj->lock. 
+ */ +void cxip_ep_cmdq_put(struct cxip_ep_obj *ep_obj, bool transmit) +{ + struct cxip_cmdq **ep_obj_cmdq; + ofi_atomic32_t *ep_obj_ref; + + if (transmit) { + ep_obj_cmdq = &ep_obj->txq; + ep_obj_ref = &ep_obj->txq_ref; + } else { + ep_obj_cmdq = &ep_obj->tgq; + ep_obj_ref = &ep_obj->tgq_ref; + } + + if (!ofi_atomic_dec32(ep_obj_ref)) { + cxip_cmdq_free(*ep_obj_cmdq); + + CXIP_DBG("Freed %s CMDQ: %p\n", + transmit ? "TX" : "RX", *ep_obj_cmdq); + *ep_obj_cmdq = NULL; + } else { + CXIP_DBG("Put %s CMDQ: %p\n", + transmit ? "TX" : "RX", *ep_obj_cmdq); + } +} + +static int cxip_ep_cm_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct cxip_ep *cxip_ep = container_of(fid, struct cxip_ep, ep.fid); + size_t len; + + len = MIN(*addrlen, sizeof(struct cxip_addr)); + + if (!cxip_ep->ep_obj->enabled) + return -FI_EOPBADSTATE; + + CXIP_DBG("NIC: 0x%x PID: %u\n", cxip_ep->ep_obj->src_addr.nic, + cxip_ep->ep_obj->src_addr.pid); + + memcpy(addr, &cxip_ep->ep_obj->src_addr, len); + *addrlen = sizeof(struct cxip_addr); + + return (len == sizeof(struct cxip_addr)) ? FI_SUCCESS : -FI_ETOOSMALL; +} + +static int _join_collective(struct fid_ep *ep, const void *addr, + uint64_t flags, struct fid_mc **mc, void *context) +{ + struct fi_collective_addr *arg = (struct fi_collective_addr *)addr; + + return cxip_join_collective(ep, arg->coll_addr, arg->set, + flags, mc, context); +} + +struct fi_ops_cm cxip_ep_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = fi_no_setname, + .getname = cxip_ep_cm_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, + .join = _join_collective, +}; + +/* + * cxip_ep_progress() - Progress an endpoint. + */ +void cxip_ep_progress(struct fid *fid) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + struct cxip_ep_obj *ep_obj = ep->ep_obj; + + if (ep_obj->enabled) { + + ofi_genlock_lock(&ep_obj->lock); + cxip_evtq_progress(&ep_obj->rxc.rx_evtq); + cxip_evtq_progress(&ep_obj->txc.tx_evtq); + cxip_ep_ctrl_progress_locked(ep_obj); + ofi_genlock_unlock(&ep_obj->lock); + } +} + +/* + * cxip_ep_peek() - Peek at EP event queues + * + * Return whether the associated EP event queues are empty. + */ +int cxip_ep_peek(struct fid *fid) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + struct cxip_ep_obj *ep_obj = ep->ep_obj; + + if (ep_obj->txc.tx_evtq.eq && cxi_eq_peek_event(ep_obj->txc.tx_evtq.eq)) + return -FI_EAGAIN; + if (ep_obj->rxc.rx_evtq.eq && cxi_eq_peek_event(ep_obj->rxc.rx_evtq.eq)) + return -FI_EAGAIN; + + return FI_SUCCESS; +} + +/* + * fi_ep_get_unexpected_msgs() - Get unexpected message information, exposed + * via domain open ops. + */ +size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, size_t count, + fi_addr_t *src_addr, size_t *ux_count) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t ret_count = 0; + + /* Synchronous implementation to return a snapshot of the unexpected + * message queue for the endpoint. + */ + if (!ux_count) + return -FI_EINVAL; + + if (ep->ep_obj->rxc.state == RXC_DISABLED) + return -FI_EOPBADSTATE; + + if (!ofi_recv_allowed(ep->rx_attr.caps)) { + CXIP_WARN("FI_RECV not enabled\n"); + return -FI_EINVAL; + } + + /* If in flow control, let that complete since + * on-loading could be in progress. 
+ */ + if (ep->ep_obj->rxc.state != RXC_ENABLED && + ep->ep_obj->rxc.state != RXC_ENABLED_SOFTWARE) { + cxip_cq_progress(ep->ep_obj->rxc.recv_cq); + return -FI_EAGAIN; + } + + ofi_genlock_lock(&ep->ep_obj->lock); + if (cxip_evtq_saturated(&ep->ep_obj->rxc.rx_evtq)) { + RXC_DBG(&ep->ep_obj->rxc, "Target HW EQ saturated\n"); + ofi_genlock_unlock(&ep->ep_obj->lock); + + return -FI_EAGAIN; + } + + /* Fill in supplied memory with what can fit */ + ret_count = cxip_build_ux_entry_info(ep, entry, count, src_addr, + ux_count); + ofi_genlock_unlock(&ep->ep_obj->lock); + + return ret_count; +} + +/* + * cxip_ep_flush_trig_reqs() - Free triggered request for the EP. + */ +void cxip_ep_flush_trig_reqs(struct cxip_ep_obj *ep_obj) +{ + ofi_genlock_lock(&ep_obj->lock); + cxip_evtq_flush_trig_reqs(&ep_obj->txc.tx_evtq); + ofi_genlock_unlock(&ep_obj->lock); +} + +/* + * cxip_txc_close() - close the TX side of endpoint object. + */ +void cxip_txc_close(struct cxip_ep *ep) +{ + struct cxip_txc *txc = &ep->ep_obj->txc; + + if (txc->send_cq) { + ofi_genlock_lock(&txc->send_cq->ep_list_lock); + fid_list_remove2(&txc->send_cq->util_cq.ep_list, + &txc->send_cq->util_cq.ep_list_lock, + &ep->ep.fid); + ofi_genlock_unlock(&txc->send_cq->ep_list_lock); + + ofi_atomic_dec32(&txc->send_cq->util_cq.ref); + } + + if (txc->send_cntr) { + fid_list_remove(&txc->send_cntr->ctx_list, + &txc->send_cntr->lock, + &ep->ep.fid); + ofi_atomic_dec32(&txc->send_cntr->ref); + } + + if (txc->read_cntr) { + fid_list_remove(&txc->read_cntr->ctx_list, + &txc->read_cntr->lock, + &ep->ep.fid); + ofi_atomic_dec32(&txc->read_cntr->ref); + } + + if (txc->write_cntr) { + fid_list_remove(&txc->write_cntr->ctx_list, + &txc->write_cntr->lock, + &ep->ep.fid); + ofi_atomic_dec32(&txc->write_cntr->ref); + } + + cxip_domain_remove_txc(txc->domain, txc); + + cxip_txc_disable(txc); +} + +/* + * cxip_rxc_close() - close the RX side of the endpoint object. + */ +void cxip_rxc_close(struct cxip_ep *ep) +{ + struct cxip_rxc *rxc = &ep->ep_obj->rxc; + + if (rxc->recv_cq) { + /* EP FID may not be found in the list if recv_cq == send_cq, + * but we still need to decrement reference. + */ + ofi_genlock_lock(&rxc->recv_cq->ep_list_lock); + fid_list_remove2(&rxc->recv_cq->util_cq.ep_list, + &rxc->recv_cq->util_cq.ep_list_lock, + &ep->ep.fid); + ofi_genlock_unlock(&rxc->recv_cq->ep_list_lock); + + ofi_atomic_dec32(&rxc->recv_cq->util_cq.ref); + } + + if (rxc->recv_cntr) { + fid_list_remove(&rxc->recv_cntr->ctx_list, + &rxc->recv_cntr->lock, + &ep->ep.fid); + ofi_atomic_dec32(&rxc->recv_cntr->ref); + } + + cxip_rxc_disable(rxc); +} + +/** + * Get TX/RX option flags. + * + * Support TX/RX context control(FI_GETOPSFLAG). + * + * @param tx_attr : TX attributes, or NULL + * @param rx_attr : RX attributes, or NULL + * @param flags : storage for returned flags + * + * @return int : 0 on success, -errno on failure + */ +int cxip_getopflags(struct fi_tx_attr *tx_attr, struct fi_rx_attr *rx_attr, + uint64_t *flags) +{ + if ((*flags & FI_TRANSMIT) && (*flags & FI_RECV)) { + CXIP_WARN("Both Tx/Rx flags cannot be specified\n"); + return -FI_EINVAL; + } else if (tx_attr && (*flags & FI_TRANSMIT)) { + *flags = tx_attr->op_flags; + } else if (rx_attr && (*flags & FI_RECV)) { + *flags = rx_attr->op_flags; + } else { + CXIP_WARN("Tx/Rx flags not specified\n"); + return -FI_EINVAL; + } + + return 0; +} + +/** + * Set TX/RX option flags. + * + * Support TX/RX control(FI_SETOPSFLAG). 
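+ *
+ * Illustrative caller-side sketch (assumed application usage): selecting
+ * delivery-complete semantics for the transmit side of an endpoint:
+ *
+ *   uint64_t flags = FI_TRANSMIT | FI_DELIVERY_COMPLETE;
+ *
+ *   fi_control(&ep->fid, FI_SETOPSFLAG, &flags);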
+ * + * @param tx_attr : TX attributes, or NULL + * @param rx_attr : RX attributes, or NULL + * @param flags : flags to set + * + * @return int : 0 on success, -errno on failure + */ +int cxip_setopflags(struct fi_tx_attr *tx_attr, struct fi_rx_attr *rx_attr, + uint64_t flags) +{ + if ((flags & FI_TRANSMIT) && (flags & FI_RECV)) { + CXIP_WARN("Both Tx/Rx flags cannot be specified\n"); + return -FI_EINVAL; + } else if (tx_attr && (flags & FI_TRANSMIT)) { + tx_attr->op_flags = flags; + tx_attr->op_flags &= ~FI_TRANSMIT; + if (!(flags & (FI_INJECT_COMPLETE | FI_TRANSMIT_COMPLETE | + FI_DELIVERY_COMPLETE))) + tx_attr->op_flags |= FI_TRANSMIT_COMPLETE; + } else if (rx_attr && (flags & FI_RECV)) { + rx_attr->op_flags = flags; + rx_attr->op_flags &= ~FI_RECV; + } else { + CXIP_WARN("Tx/Rx flags not specified\n"); + return -FI_EINVAL; + } + + return 0; +} + +/** + * Cancel RX operation + * + * Support TX/RX context cancel(). + * + * Searches the RX queue for a pending async operation with the specified + * 'context', and cancels it if still pending. + * + * @param rxc : RX context to search + * @param context : user context pointer to search for + * + * @return ssize_t : 0 on success, -errno on failure + */ +ssize_t cxip_rxc_cancel(struct cxip_rxc *rxc, void *context) +{ + if (rxc->state == RXC_DISABLED) + return -FI_EOPBADSTATE; + + return cxip_evtq_req_cancel(&rxc->rx_evtq, rxc, context, true); +} + +/* + * cxip_ep_cancel() - Cancel TX/RX operation for EP. + */ +ssize_t cxip_ep_cancel(fid_t fid, void *context) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + + /* TODO: Remove this since it requires malicious programming to + * create this condition. + */ + if (fid->fclass != FI_CLASS_EP) + return -FI_EINVAL; + + if (!ofi_recv_allowed(ep->ep_obj->caps)) + return -FI_ENOENT; + + return cxip_rxc_cancel(&ep->ep_obj->rxc, context); +} + +/* + * cxip_ep_enable() - Enable standard EP. + */ +static int cxip_ep_enable(struct fid_ep *fid_ep) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_ep_obj *ep_obj = ep->ep_obj; + int ret = FI_SUCCESS; + + ofi_genlock_lock(&ep_obj->lock); + if (ep_obj->enabled) + goto unlock; + + if (!ep_obj->av) { + CXIP_WARN("Endpoint must be bound to an AV\n"); + ret = -FI_ENOAV; + goto unlock; + } + + assert(ep_obj->domain->enabled); + + /* src_addr.pid may be C_PID_ANY at this point. */ + if (ep_obj->av_auth_key) { + ret = cxip_av_auth_key_get_vnis(ep_obj->av, &ep_obj->vnis, + &ep_obj->vni_count); + if (ret) + goto unlock; + + ret = cxip_portals_table_alloc(ep_obj->domain->lni, + ep_obj->vnis, ep_obj->vni_count, + ep_obj->src_addr.pid, + &ep_obj->ptable); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate auth key ring portals table: %d\n", + ret); + goto free_vnis; + } + + /* This is unfortunately needed to allocate a command queue. + * But, this can be changed later. 
+ */ + ep_obj->auth_key.vni = ep_obj->vnis[0]; + } else { + ret = cxip_portals_table_alloc(ep_obj->domain->lni, + &ep_obj->auth_key.vni, 1, + ep_obj->src_addr.pid, + &ep_obj->ptable); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate portals table: %d\n", + ret); + goto unlock; + } + } + + ep_obj->src_addr.pid = ep_obj->ptable->pid; + + ret = cxip_ep_ctrl_init(ep_obj); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_ep_ctrl_init returned: %d\n", ret); + goto free_portals_table; + } + + ret = cxip_zbcoll_init(ep_obj); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_zbcoll_init returned: %d\n", ret); + goto free_ep_ctrl; + } + + CXIP_DBG("EP assigned NIC: %#x VNI: %u PID: %u\n", + ep_obj->src_addr.nic, + ep_obj->auth_key.vni, + ep_obj->src_addr.pid); + + ret = cxip_txc_enable(&ep_obj->txc); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_txc_enable returned: %d\n", ret); + goto unlock; + } + + ret = cxip_rxc_enable(&ep_obj->rxc); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_rxc_enable returned: %d\n", ret); + goto unlock; + } + + ret = cxip_coll_enable(ep); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_coll_enable returned: %d\n", ret); + /* collectives will not function, but EP will */ + } + + /* Enable only appropriate API functions based on primary/secondary + * capabilities. Send/Receive requires FI_MSG or FI_TAGGED. + */ + if (ofi_send_allowed(ep->tx_attr.caps & ~FI_MSG) && + ofi_recv_allowed(ep->rx_attr.caps & ~FI_MSG)) + ep->ep.tagged = &cxip_ep_tagged_ops; + else if (ofi_send_allowed(ep->tx_attr.caps & ~FI_MSG)) + ep->ep.tagged = &cxip_ep_tagged_no_rx_ops; + else if (ofi_recv_allowed(ep->rx_attr.caps & ~FI_MSG)) + ep->ep.tagged = &cxip_ep_tagged_no_tx_ops; + + if (ofi_send_allowed(ep->tx_attr.caps & ~FI_TAGGED) && + ofi_recv_allowed(ep->rx_attr.caps & ~FI_TAGGED)) + ep->ep.msg = &cxip_ep_msg_ops; + else if (ofi_send_allowed(ep->tx_attr.caps & ~FI_TAGGED)) + ep->ep.msg = &cxip_ep_msg_no_rx_ops; + else if (ofi_recv_allowed(ep->rx_attr.caps & ~FI_TAGGED)) + ep->ep.msg = &cxip_ep_msg_no_tx_ops; + + /* Initiate requires FI_RMA or FI_ATOMIC */ + if (ofi_rma_initiate_allowed(ep->tx_attr.caps & ~FI_RMA)) + ep->ep.atomic = &cxip_ep_atomic_ops; + + if (ofi_rma_initiate_allowed(ep->tx_attr.caps & ~FI_ATOMIC)) + ep->ep.rma = &cxip_ep_rma_ops; + + ep_obj->enabled = true; + ofi_genlock_unlock(&ep_obj->lock); + + return FI_SUCCESS; + +free_ep_ctrl: + cxip_ep_ctrl_fini(ep_obj); + +free_portals_table: + cxip_portals_table_free(ep_obj->ptable); + ep_obj->ptable = NULL; +free_vnis: + if (ep_obj->vnis) { + cxip_av_auth_key_put_vnis(ep_obj->av, ep_obj->vnis, + ep_obj->vni_count); + ep_obj->vnis = NULL; + } +unlock: + ofi_genlock_unlock(&ep_obj->lock); + + return ret; +} + +/* + * cxip_ep_disable() - Disable the base EP if enabled. + */ +static void cxip_ep_disable(struct cxip_ep_obj *ep_obj) +{ + if (ep_obj->enabled) { + cxip_coll_disable(ep_obj); + cxip_zbcoll_fini(ep_obj); + cxip_ep_ctrl_fini(ep_obj); + cxip_portals_table_free(ep_obj->ptable); + if (ep_obj->vnis) { + cxip_av_auth_key_put_vnis(ep_obj->av, ep_obj->vnis, + ep_obj->vni_count); + ep_obj->vnis = NULL; + } + ep_obj->ptable = NULL; + ep_obj->enabled = false; + } +} + +/* + * cxip_free_endpoint() - Release base EP object resources and free object. + */ +int cxip_free_endpoint(struct cxip_ep *ep) +{ + struct cxip_ep_obj *ep_obj = ep->ep_obj; + int count; + + /* Each bound MR increments ref, so MRs must be removed. 
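+	 * (i.e. every MR or alias endpoint bound to this EP must be closed
+	 * first; otherwise the non-zero reference count below fails the close
+	 * with -FI_EBUSY.)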
+ */ + count = ofi_atomic_get32(&ep_obj->ref); + if (count) { + CXIP_WARN("EP refcount non-zero: %d\n", count); + return -FI_EBUSY; + } + + count = ofi_atomic_get32(&ep_obj->coll.num_mc); + if (count) { + CXIP_WARN("EP num_mc non-zero: %d\n", count); + return -FI_EBUSY; + } + + if (ep_obj->av) + cxip_av_unbind_ep(ep_obj->av, ep); + + if (ep->ep_obj->eq) { + ofi_mutex_lock(&ep_obj->eq->list_lock); + dlist_remove(&ep_obj->eq_link); + ofi_mutex_unlock(&ep_obj->eq->list_lock); + ofi_atomic_dec32(&ep_obj->eq->util_eq.ref); + } + + ofi_genlock_lock(&ep_obj->lock); + cxip_coll_close(ep_obj); + cxip_txc_close(ep); + cxip_rxc_close(ep); + cxip_ep_disable(ep_obj); + ofi_genlock_unlock(&ep_obj->lock); + + ofi_atomic_dec32(&ep_obj->domain->ref); + ofi_genlock_destroy(&ep_obj->lock); + free(ep_obj); + ep->ep_obj = NULL; + + return FI_SUCCESS; +} + +/* + * cxip_ep_close() - Close (destroy) the base EP. + */ +static int cxip_ep_close(struct fid *fid) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + int ret; + int count; + + if (ep->is_alias) { + count = ofi_atomic_get32(&ep->ep_obj->ref); + + if (count > 0) { + ofi_atomic_dec32(&ep->ep_obj->ref); + free(ep); + return FI_SUCCESS; + } + + CXIP_WARN("EP alias %p, invalid EP object refcnt %d\n", + ep, count); + return -FI_EINVAL; + } + + ret = cxip_free_endpoint(ep); + if (ret) { + CXIP_WARN("Unable to free EP object %d : %s\n", + ret, fi_strerror(-ret)); + return ret; + } + free(ep); + + return FI_SUCCESS; +} + +/* + * cxip_ep_bind_cq() - Bind the EP to the CQ resource. + */ +static int cxip_ep_bind_cq(struct cxip_ep *ep, struct cxip_cq *cq, + uint64_t flags) +{ + struct cxip_txc *txc; + struct cxip_rxc *rxc; + int ret; + + if (ep->ep_obj->domain != cq->domain) { + CXIP_WARN("Invalid CQ domain for EP\n"); + return -FI_EINVAL; + } + + if ((flags | CXIP_EP_CQ_FLAGS) != CXIP_EP_CQ_FLAGS) { + CXIP_WARN("Invalid CQ flags\n"); + return -FI_EINVAL; + } + + if (flags & FI_TRANSMIT) { + txc = &ep->ep_obj->txc; + if (txc->send_cq) { + CXIP_WARN("SEND CQ previously bound\n"); + return -FI_EINVAL; + } + + ofi_atomic_inc32(&cq->util_cq.ref); + txc->send_cq = cq; + + if (flags & FI_SELECTIVE_COMPLETION) + txc->selective_completion = 1; + if (!txc->selective_completion) + txc->attr.op_flags |= FI_COMPLETION; + + ep->tx_attr.op_flags = txc->attr.op_flags; + + /* Use CXI ep_list_lock that can be selectively optimized */ + ofi_genlock_lock(&cq->ep_list_lock); + ret = fid_list_insert2(&cq->util_cq.ep_list, + &cq->util_cq.ep_list_lock, + &ep->ep.fid); + ofi_genlock_unlock(&cq->ep_list_lock); + + if (ret) { + CXIP_WARN("EP CQ fid insert failed %d\n", ret); + ofi_atomic_dec32(&cq->util_cq.ref); + txc->send_cq = NULL; + } + } + + if (flags & FI_RECV) { + rxc = &ep->ep_obj->rxc; + if (rxc->recv_cq) { + CXIP_WARN("RECV CQ previously bound\n"); + return -FI_EINVAL; + } + + ofi_atomic_inc32(&cq->util_cq.ref); + rxc->recv_cq = cq; + + if (flags & FI_SELECTIVE_COMPLETION) + rxc->selective_completion = 1; + if (!rxc->selective_completion) + rxc->attr.op_flags |= FI_COMPLETION; + + ep->rx_attr.op_flags = rxc->attr.op_flags; + + /* Use CXI ep_list_lock that can be selectively optimized */ + ofi_genlock_lock(&cq->ep_list_lock); + ret = fid_list_insert2(&cq->util_cq.ep_list, + &cq->util_cq.ep_list_lock, + &ep->ep.fid); + ofi_genlock_unlock(&cq->ep_list_lock); + + if (ret) { + CXIP_WARN("EP CQ fid insert failed %d\n", ret); + ofi_atomic_dec32(&cq->util_cq.ref); + rxc->recv_cq = NULL; + } + } + return FI_SUCCESS; +} + +/* + * cxip_ep_bind_cntr() - Bind EP to counter 
resource + */ +static int cxip_ep_bind_cntr(struct cxip_ep *ep, struct cxip_cntr *cntr, + uint64_t flags) +{ + int ret; + + if (ep->ep_obj->domain != cntr->domain) { + CXIP_WARN("Counter domain invalid for EP\n"); + return -FI_EINVAL; + } + + if (!(flags & CXIP_EP_CNTR_FLAGS)) + return FI_SUCCESS; + + if ((flags & FI_SEND && ep->ep_obj->txc.send_cntr) || + (flags & FI_READ && ep->ep_obj->txc.read_cntr) || + (flags & FI_WRITE && ep->ep_obj->txc.write_cntr) || + (flags & FI_RECV && ep->ep_obj->rxc.recv_cntr)) { + CXIP_WARN("EP previously bound to counter\n"); + return -FI_EINVAL; + } + + ret = fid_list_insert(&cntr->ctx_list, &cntr->lock, &ep->ep.fid); + if (ret) { + CXIP_WARN("Add of EP to cntr EP list failed: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + if (flags & FI_SEND) { + ep->ep_obj->txc.send_cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + } + if (flags & FI_READ) { + ep->ep_obj->txc.read_cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + } + if (flags & FI_WRITE) { + ep->ep_obj->txc.write_cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + } + if (flags & FI_RECV) { + ep->ep_obj->rxc.recv_cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + } + + return FI_SUCCESS; +} + +/* + * cxip_ep_bind() - Bind EP resources. + */ +int cxip_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int ret; + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + struct cxip_eq *eq; + struct cxip_cq *cq; + struct cxip_av *av; + struct cxip_cntr *cntr; + + /* TODO: Remove this since it requires malicious programming to + * create this condition. + */ + if (fid->fclass != FI_CLASS_EP) + return -FI_EINVAL; + + ret = ofi_ep_bind_valid(&cxip_prov, bfid, flags); + if (ret) + return ret; + + switch (bfid->fclass) { + case FI_CLASS_EQ: + eq = container_of(bfid, struct cxip_eq, util_eq.eq_fid.fid); + ofi_atomic_inc32(&eq->util_eq.ref); + ofi_mutex_lock(&eq->list_lock); + dlist_insert_tail(&ep->ep_obj->eq_link, &eq->ep_list); + ofi_mutex_unlock(&eq->list_lock); + ep->ep_obj->eq = eq; + break; + + case FI_CLASS_CQ: + cq = container_of(bfid, struct cxip_cq, util_cq.cq_fid.fid); + ret = cxip_ep_bind_cq(ep, cq, flags); + if (ret) + return ret; + break; + + case FI_CLASS_CNTR: + cntr = container_of(bfid, struct cxip_cntr, cntr_fid.fid); + ret = cxip_ep_bind_cntr(ep, cntr, flags); + if (ret) + return ret; + break; + + case FI_CLASS_AV: + av = container_of(bfid, struct cxip_av, av_fid.fid); + ret = cxip_av_bind_ep(av, ep); + if (ret) + return ret; + ep->ep_obj->av = av; + + break; + + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +/* + * cxip_set_tclass() + */ +int cxip_set_tclass(uint32_t desired_tc, uint32_t default_tc, uint32_t *new_tc) +{ + assert(new_tc != NULL); + + if (desired_tc != FI_TC_UNSPEC) { + if (desired_tc >= FI_TC_LABEL && + desired_tc <= FI_TC_SCAVENGER) { + *new_tc = desired_tc; + } else { + CXIP_WARN("Invalid tclass\n"); + return -FI_EINVAL; + } + } else { + *new_tc = default_tc; + } + + CXIP_DBG("Set tclass to %d\n", *new_tc); + return FI_SUCCESS; +} + +/** + * provider fi_set_val()/FI_SET_VAL implementation for EP + * + * @param fid : EP fid + * @param val : parameter structure for set value operations. 
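+ *
+ * Illustrative usage sketch (assumption, not part of the original
+ * comment): applications reach this through fi_control() on the EP
+ * fid, for example:
+ *
+ *   uint32_t tc = FI_TC_LOW_LATENCY;
+ *   struct fi_fid_var var = {
+ *       .name = FI_OPT_CXI_SET_TCLASS,
+ *       .val  = &tc,
+ *   };
+ *   fi_control(&ep->fid, FI_SET_VAL, &var);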
+ * + * @return int : 0 on success, -errno on failure + */ +static inline int cxip_ep_set_val(struct cxip_ep *cxi_ep, + struct fi_fid_var *val) +{ + uint32_t *req_tclass; + uint64_t *req_order; + uint32_t new_tclass; + + if (!val->val) + return -FI_EINVAL; + + switch (val->name) { + case FI_OPT_CXI_SET_TCLASS: + req_tclass = (uint32_t *) val->val; + + if (cxip_set_tclass(*req_tclass, cxi_ep->tx_attr.tclass, + &new_tclass)) + return -FI_EINVAL; + + cxi_ep->tx_attr.tclass = new_tclass; + break; + case FI_OPT_CXI_SET_MSG_ORDER: + req_order = (uint64_t *) val->val; + + if (*req_order & ~CXIP_MSG_ORDER) { + CXIP_WARN("Invalid message order 0x%" PRIx64 "\n", + *req_order); + return -FI_EINVAL; + } + + cxi_ep->tx_attr.msg_order = *req_order; + break; + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +/* + * cxip_ep_control() - Provider EP control implementation. + */ +static int cxip_ep_control(struct fid *fid, int command, void *arg) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + struct cxip_ep *new_ep; + struct fi_alias *alias; + int ret; + + /* TODO: Remove this since it requires malicious programming to + * create this condition. + */ + if (fid->fclass != FI_CLASS_EP) + return -FI_EINVAL; + + switch (command) { + case FI_ALIAS: + if (!arg) + return -FI_EINVAL; + alias = (struct fi_alias *)arg; + if (!alias->fid) + return -FI_EINVAL; + new_ep = calloc(1, sizeof(*new_ep)); + if (!new_ep) + return -FI_ENOMEM; + + memcpy(&new_ep->tx_attr, &ep->tx_attr, + sizeof(struct fi_tx_attr)); + memcpy(&new_ep->rx_attr, &ep->rx_attr, + sizeof(struct fi_rx_attr)); + ret = cxip_setopflags(&new_ep->tx_attr, &new_ep->rx_attr, + alias->flags); + if (ret) { + free(new_ep); + return -FI_EINVAL; + } + new_ep->ep_obj = ep->ep_obj; + new_ep->is_alias = 1; + memcpy(&new_ep->ep, &ep->ep, sizeof(struct fid_ep)); + *alias->fid = &new_ep->ep.fid; + ofi_atomic_inc32(&new_ep->ep_obj->ref); + break; + case FI_GETOPSFLAG: + if (!arg) + return -FI_EINVAL; + ret = cxip_getopflags(&ep->tx_attr, &ep->rx_attr, + (uint64_t *)arg); + if (ret) + return -FI_EINVAL; + break; + case FI_SETOPSFLAG: + if (!arg) + return -FI_EINVAL; + ret = cxip_setopflags(&ep->tx_attr, &ep->rx_attr, + *(uint64_t *)arg); + if (ret) + return -FI_EINVAL; + break; + case FI_ENABLE: + return cxip_ep_enable(&ep->ep); + case FI_SET_VAL: + if (!arg) + return -FI_EINVAL; + return cxip_ep_set_val(ep, (struct fi_fid_var *) arg); + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +struct fi_ops cxip_ep_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_ep_close, + .bind = cxip_ep_bind, + .control = cxip_ep_control, + .ops_open = fi_no_ops_open, +}; + +int cxip_ep_getopt_priv(struct cxip_ep *ep, int level, int optname, + void *optval, size_t *optlen) +{ + if (level != FI_OPT_ENDPOINT) + return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + if (!optval || !optlen) + return -FI_EINVAL; + if (*optlen < sizeof(size_t)) + return -FI_ETOOSMALL; + + *(size_t *)optval = ep->ep_obj->rxc.min_multi_recv; + *optlen = sizeof(size_t); + break; + + default: + return -FI_ENOPROTOOPT; + } + + return FI_SUCCESS; +} + +/* + * cxip_ep_getopt() - Return endpoint option value if supported. 
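+ *
+ * Illustrative usage sketch (assumption, not part of the original
+ * comment):
+ *
+ *   size_t min = 0;
+ *   size_t len = sizeof(min);
+ *   fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV,
+ *             &min, &len);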
+ */ +static int cxip_ep_getopt(fid_t fid, int level, int optname, void *optval, + size_t *optlen) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + + return cxip_ep_getopt_priv(ep, level, optname, optval, optlen); +} + +int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname, + const void *optval, size_t optlen) +{ + size_t min_multi_recv; + + if (level != FI_OPT_ENDPOINT) + return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + if (!optval) + return -FI_EINVAL; + + min_multi_recv = *(size_t *)optval; + + if (min_multi_recv > CXIP_EP_MAX_MULTI_RECV) { + CXIP_WARN("Maximum min_multi_recv value is: %u\n", + CXIP_EP_MAX_MULTI_RECV); + return -FI_EINVAL; + } + ep->ep_obj->rxc.min_multi_recv = min_multi_recv; + break; + + default: + return -FI_ENOPROTOOPT; + } + + return FI_SUCCESS; +} + +/* + * cxip_ep_setopt() - Set endpoint option value if supported. + */ +static int cxip_ep_setopt(fid_t fid, int level, int optname, const void *optval, + size_t optlen) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + + return cxip_ep_setopt_priv(ep, level, optname, optval, optlen); +} + +struct fi_ops_ep cxip_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = cxip_ep_cancel, + .getopt = cxip_ep_getopt, + .setopt = cxip_ep_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +/* + * cxip_alloc_endpoint() - Allocate and initialize base EP object. + */ +int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, + struct cxip_ep_obj **ep_base_obj, void *context) +{ + int ret; + struct cxip_ep_obj *ep_obj; + struct cxip_txc *txc; + struct cxip_rxc *rxc; + uint32_t nic; + uint32_t pid; + int i; + + if (!hints || !hints->ep_attr || !hints->tx_attr || !hints->rx_attr) + return -FI_EINVAL; + + ret = ofi_prov_check_info(&cxip_util_prov, CXIP_FI_VERSION, hints); + if (ret != FI_SUCCESS) + return -FI_EINVAL; + + if (cxip_dom->auth_key_entry_max > 1 && + ((hints->caps & FI_DIRECTED_RECV) || + (hints->rx_attr->caps & FI_DIRECTED_RECV))) { + CXIP_WARN("FI_DIRECTED_RECV not supported with multiple auth key per EP\n"); + return -FI_EINVAL; + } + + ret = cxip_check_auth_key_info(hints); + if (ret) + return ret; + + nic = cxip_dom->nic_addr; + if (hints->src_addr) { + struct cxip_addr *src = hints->src_addr; + if (src->nic != nic) { + CXIP_WARN("bad src_addr NIC value\n"); + return -FI_EINVAL; + } + pid = src->pid; + } else { + pid = C_PID_ANY; + } + + ep_obj = calloc(1, sizeof(struct cxip_ep_obj)); + if (!ep_obj) + return -FI_ENOMEM; + + txc = &ep_obj->txc; + rxc = &ep_obj->rxc; + + /* For faster access */ + ep_obj->asic_ver = cxip_dom->iface->info->cassini_version; + + /* Save EP attributes from hints */ + ep_obj->caps = hints->caps; + ep_obj->ep_attr = *hints->ep_attr; + ep_obj->txq_size = hints->tx_attr->size; + ep_obj->tgq_size = hints->rx_attr->size; + ep_obj->tx_attr = *hints->tx_attr; + ep_obj->rx_attr = *hints->rx_attr; + + if (hints->ep_attr->auth_key) { + /* Auth key size is verified in ofi_prov_check_info(). */ + assert(hints->ep_attr->auth_key_size == + sizeof(struct cxi_auth_key)); + + memcpy(&ep_obj->auth_key, hints->ep_attr->auth_key, + sizeof(struct cxi_auth_key)); + + /* All EPs that share a Domain must use the same Service ID. 
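+		 *
+		 * Illustrative sketch (assumption, not part of the original
+		 * comment): the key is supplied through the fi_getinfo()
+		 * hints, roughly
+		 *
+		 *   struct cxi_auth_key key = {
+		 *       .svc_id = my_svc_id,  // hypothetical values
+		 *       .vni    = my_vni,
+		 *   };
+		 *
+		 * with hints->ep_attr->auth_key pointing at a copy of the
+		 * key and auth_key_size set to sizeof(key).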
*/ + if (ep_obj->auth_key.svc_id != cxip_dom->auth_key.svc_id) { + CXIP_WARN("Invalid svc_id: %u\n", + ep_obj->auth_key.svc_id); + ret = -FI_EINVAL; + goto err; + } + } else { + if (cxip_dom->av_auth_key) { + ep_obj->av_auth_key = true; + } else { + /* Inherit auth_key from Domain. */ + ep_obj->auth_key = cxip_dom->auth_key; + CXIP_DBG("Inherited domain auth_key\n"); + } + } + + if (cxip_set_tclass(ep_obj->tx_attr.tclass, + cxip_dom->tclass, &ep_obj->txc.tclass)) { + CXIP_WARN("Invalid tclass\n"); + ret = -FI_EINVAL; + goto err; + } + ep_obj->tx_attr.tclass = ep_obj->txc.tclass; + + /* Initialize object */ + ofi_atomic_initialize32(&ep_obj->ref, 0); + + /* Allow FI_THREAD_DOMAIN optimizaiton */ + if (cxip_dom->util_domain.threading == FI_THREAD_DOMAIN || + cxip_dom->util_domain.threading == FI_THREAD_COMPLETION) + ofi_genlock_init(&ep_obj->lock, OFI_LOCK_NONE); + else + ofi_genlock_init(&ep_obj->lock, OFI_LOCK_SPINLOCK); + + ep_obj->domain = cxip_dom; + ep_obj->src_addr.nic = nic; + ep_obj->src_addr.pid = pid; + ep_obj->fi_addr = FI_ADDR_NOTAVAIL; + + ofi_atomic_initialize32(&ep_obj->txq_ref, 0); + ofi_atomic_initialize32(&ep_obj->tgq_ref, 0); + + for (i = 0; i < CXIP_NUM_CACHED_KEY_LE; i++) { + ofi_atomic_initialize32(&ep_obj->std_mr_cache[i].ref, 0); + ofi_atomic_initialize32(&ep_obj->opt_mr_cache[i].ref, 0); + } + + dlist_init(&ep_obj->mr_list); + ep_obj->ep_attr.tx_ctx_cnt = 1; + ep_obj->ep_attr.rx_ctx_cnt = 1; + txc->ep_obj = ep_obj; + rxc->ep_obj = ep_obj; + + cxip_txc_struct_init(txc, &ep_obj->tx_attr, context); + cxip_rxc_struct_init(rxc, &ep_obj->rx_attr, context); + + txc->domain = cxip_dom; + txc->hrp_war_req = txc->ep_obj->asic_ver < CASSINI_2_0; + + rxc->domain = cxip_dom; + rxc->min_multi_recv = CXIP_EP_MIN_MULTI_RECV; + ofi_atomic_inc32(&cxip_dom->ref); + + *ep_base_obj = ep_obj; + + return FI_SUCCESS; + +err: + free(ep_obj); + + return ret; +} + +/* + * cxip_endpoint() - Provider fi_endpoint() implementation. + */ +int cxip_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **fid_ep, void *context) +{ + int ret; + struct cxip_ep *ep; + struct cxip_domain *cxip_dom = container_of(domain, struct cxip_domain, + util_domain.domain_fid); + if (!fid_ep) + return -FI_EINVAL; + + ep = calloc(1, sizeof(*ep)); + if (!ep) + return -FI_ENOMEM; + + /* Allocate and initialize the base endpoint */ + ret = cxip_alloc_endpoint(cxip_dom, info, &ep->ep_obj, context); + if (ret) { + free(ep); + return ret; + } + + /* Store EP attribures with the wrapper since values can be + * overridden by alias EP that share the same EP object. 
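+	 *
+	 * Illustrative sketch (assumption, not part of the original
+	 * comment): given the application's struct fid_ep *uep, an
+	 * alias created with
+	 *
+	 *   struct fid *alias_fid;
+	 *   fi_alias(&uep->fid, &alias_fid,
+	 *            FI_TRANSMIT | FI_TRANSMIT_COMPLETE);
+	 *
+	 * shares this ep_obj but carries its own copy of the op flags.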
+ */ + ep->tx_attr = ep->ep_obj->tx_attr; + ep->rx_attr = ep->ep_obj->rx_attr; + + ep->ep.fid.fclass = FI_CLASS_EP; + ep->ep.fid.context = context; + ep->ep.fid.ops = &cxip_ep_fi_ops; + ep->ep.ops = &cxip_ep_ops; + ep->ep.cm = &cxip_ep_cm_ops; + + /* Initialize API to not supported until EP is enabled */ + ep->ep.msg = &cxip_ep_msg_no_ops; + ep->ep.tagged = &cxip_ep_tagged_no_ops; + ep->ep.rma = &cxip_ep_rma_no_ops; + ep->ep.atomic = &cxip_ep_atomic_no_ops; + ep->ep.collective = &cxip_collective_no_ops; + + *fid_ep = &ep->ep; + + cxip_coll_init(ep->ep_obj); + cxip_domain_add_txc(ep->ep_obj->domain, &ep->ep_obj->txc); + + return FI_SUCCESS; +} diff --git a/prov/cxi/src/cxip_eq.c b/prov/cxi/src/cxip_eq.c new file mode 100644 index 00000000000..61aad506663 --- /dev/null +++ b/prov/cxi/src/cxip_eq.c @@ -0,0 +1,134 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + */ + + /* + * Notes: + * + * Implemented as an extension of util_eq. + * + * At present, the cxip_wait objects are not implemented as extensions of the + * util_wait object, so we cannot currently fully implement the EQ with wait + * states. However, the non-blocking read() and peek() functions work. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +static int cxip_eq_close(struct fid *fid) +{ + struct cxip_eq *cxi_eq; + + cxi_eq = container_of(fid, struct cxip_eq, util_eq.eq_fid.fid); + + /* May not close until all bound EPs closed */ + if (ofi_atomic_get32(&cxi_eq->util_eq.ref)) + return -FI_EBUSY; + + ofi_mutex_destroy(&cxi_eq->list_lock); + ofi_eq_cleanup(&cxi_eq->util_eq.eq_fid.fid); + free(cxi_eq); + + return FI_SUCCESS; +} + +static void cxip_eq_progress(struct cxip_eq *eq) +{ + struct cxip_ep_obj *ep_obj; + + ofi_mutex_lock(&eq->list_lock); + dlist_foreach_container(&eq->ep_list, struct cxip_ep_obj, + ep_obj, eq_link) { + cxip_coll_progress_join(ep_obj); + } + ofi_mutex_unlock(&eq->list_lock); +} + +ssize_t cxip_eq_read(struct fid_eq *eq_fid, uint32_t *event, + void *buf, size_t len, uint64_t flags) +{ + struct cxip_eq *eq; + int ret; + + eq = container_of(eq_fid, struct cxip_eq, util_eq.eq_fid.fid); + + ret = ofi_eq_read(eq_fid, event, buf, len, flags); + if (ret == -FI_EAGAIN) + cxip_eq_progress(eq); + return ret; +} + +static struct fi_ops_eq cxi_eq_ops = { + .size = sizeof(struct fi_ops_eq), + .read = cxip_eq_read, // customized + .readerr = ofi_eq_readerr, + .sread = ofi_eq_sread, + .write = ofi_eq_write, + .strerror = ofi_eq_strerror, +}; + +static struct fi_ops cxi_eq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_eq_close, // customized + .bind = fi_no_bind, + .control = ofi_eq_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_eq_attr cxip_eq_def_attr = { + .size = CXIP_EQ_DEF_SZ, + .flags = 0, + .wait_obj = FI_WAIT_FD, + .signaling_vector = 0, + .wait_set = NULL +}; + +int cxip_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, + struct fid_eq **eq, void *context) +{ + struct cxip_eq *cxi_eq; + int ret; + + cxi_eq = calloc(1, sizeof(*cxi_eq)); + if (!cxi_eq) + return -FI_ENOMEM; + + if (!attr) + cxi_eq->attr = cxip_eq_def_attr; + else + cxi_eq->attr = *attr; + + ret = ofi_eq_init(fabric, &cxi_eq->attr, &cxi_eq->util_eq.eq_fid, + context); + if (ret != FI_SUCCESS) + goto err0; + + ofi_mutex_init(&cxi_eq->list_lock); + dlist_init(&cxi_eq->ep_list); + ofi_atomic_initialize32(&cxi_eq->util_eq.ref, 0); + + /* 
custom operations */ + cxi_eq->util_eq.eq_fid.fid.ops = &cxi_eq_fi_ops; + cxi_eq->util_eq.eq_fid.ops = &cxi_eq_ops; + + *eq = &cxi_eq->util_eq.eq_fid; + + return FI_SUCCESS; +err0: + free(cxi_eq); + return ret; +} diff --git a/prov/cxi/src/cxip_evtq.c b/prov/cxi/src/cxip_evtq.c new file mode 100644 index 00000000000..68b3a99d165 --- /dev/null +++ b/prov/cxi/src/cxip_evtq.c @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include "config.h" + +#include +#include +#include + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_CQ, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_CQ, __VA_ARGS__) + +bool cxip_evtq_saturated(struct cxip_evtq *evtq) +{ + if (evtq->eq_saturated) + return true; + + /* Hardware will automatically update the EQ status writeback area, + * which includes a timestamp, once the EQ reaches a certain fill + * percentage. The EQ status timestamp is compared against cached + * versions of the previous EQ status timestamp to determine if new + * writebacks have occurred. Each time a new writeback occurs, the EQ + * is treated as saturated. + * + * Note that the previous EQ status is always updated when the + * corresponding OFI completion queue is progressed. + */ + if (evtq->eq->status->timestamp_sec > + evtq->prev_eq_status.timestamp_sec || + evtq->eq->status->timestamp_ns > + evtq->prev_eq_status.timestamp_ns) { + evtq->eq_saturated = true; + return true; + } + + return false; +} + +int cxip_evtq_adjust_reserved_fc_event_slots(struct cxip_evtq *evtq, int value) +{ + int ret; + + ret = cxil_evtq_adjust_reserved_fc(evtq->eq, value); + if (ret >= 0) + ret = 0; + + return ret; +} + +/* + * cxip_evtq_req_cancel() - Cancel one request. + * + * Cancel one Receive request. If match is true, cancel the request with + * matching op_ctx. Only Receive requests should be in the request list. + * + * Caller must hold ep_obj->lock. + */ +int cxip_evtq_req_cancel(struct cxip_evtq *evtq, void *req_ctx, + void *op_ctx, bool match) +{ + int ret = -FI_ENOENT; + struct cxip_req *req; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(&evtq->req_list, struct cxip_req, req, + evtq_entry, tmp) { + if (req->req_ctx == req_ctx && + req->type == CXIP_REQ_RECV && + !req->recv.canceled && + !req->recv.parent && + (!match || (void *)req->context == op_ctx)) { + ret = cxip_recv_cancel(req); + break; + } + } + + return ret; +} + +static void cxip_evtq_req_free_no_lock(struct cxip_req *req) +{ + struct cxip_req *table_req; + + CXIP_DBG("Freeing req: %p (ID: %d)\n", req, req->req_id); + + dlist_remove(&req->evtq_entry); + + if (req->req_id >= 0) { + table_req = (struct cxip_req *)ofi_idx_remove( + &req->evtq->req_table, req->req_id); + if (table_req != req) + CXIP_WARN("Failed to unmap request: %p\n", req); + } + + ofi_buf_free(req); +} + +/* + * cxip_evtq_flush_trig_reqs() - Flush triggered TX requests + */ +void cxip_evtq_flush_trig_reqs(struct cxip_evtq *evtq) +{ + struct cxip_req *req; + struct dlist_entry *tmp; + struct cxip_txc *txc; + + dlist_foreach_container_safe(&evtq->req_list, struct cxip_req, req, + evtq_entry, tmp) { + + if (cxip_is_trig_req(req)) { + /* If a request is triggered, the context will only be + * a TX context (never a RX context). + */ + txc = req->req_ctx; + + /* Since an event will not arrive to progress the + * request, MDs must be cleaned up now. 
+ */ + switch (req->type) { + case CXIP_REQ_RMA: + if (req->rma.local_md) + cxip_unmap(req->rma.local_md); + if (req->rma.ibuf) + cxip_txc_ibuf_free(txc, + req->rma.ibuf); + break; + + case CXIP_REQ_AMO: + if (req->amo.oper1_md) + cxip_unmap(req->amo.oper1_md); + if (req->amo.result_md) + cxip_unmap(req->amo.result_md); + if (req->amo.ibuf) + cxip_txc_ibuf_free(txc, + req->amo.ibuf); + break; + + case CXIP_REQ_SEND: + if (req->send.send_md) + cxip_unmap(req->send.send_md); + if (req->send.ibuf) + cxip_txc_ibuf_free(txc, + req->send.ibuf); + break; + + default: + CXIP_WARN("Invalid trig req type: %d\n", + req->type); + } + + ofi_atomic_dec32(&txc->otx_reqs); + cxip_evtq_req_free_no_lock(req); + } + + } +} + +/* + * cxip_evtq_req_discard() - Discard all matching requests. + * + * Mark all requests on the Completion Queue to be discarded. When a marked + * request completes, it's completion event will be dropped. This is the + * behavior defined for requests belonging to a closed Endpoint. + * + * Caller must hold ep_obj->lock. + */ +void cxip_evtq_req_discard(struct cxip_evtq *evtq, void *req_ctx) +{ + struct cxip_req *req; + int discards = 0; + + dlist_foreach_container(&evtq->req_list, struct cxip_req, req, + evtq_entry) { + if (req->req_ctx == req_ctx) { + req->discard = true; + discards++; + } + } + + if (discards) + CXIP_DBG("Marked %d requests\n", discards); +} + +/* + * cxip_evtq_req_find() - Look up a request by ID (from an event). + */ +static struct cxip_req *cxip_evtq_req_find(struct cxip_evtq *evtq, int id) +{ + return ofi_idx_at(&evtq->req_table, id); +} + +/* + * cxip_evtq_req_alloc() - Allocate a request. + * + * If remap is set, allocate a 16-bit request ID and map it to the new + * request. + * + * Caller must hold ep_obj->lock of associated EP. + */ +struct cxip_req *cxip_evtq_req_alloc(struct cxip_evtq *evtq, int remap, + void *req_ctx) +{ + struct cxip_req *req; + + req = (struct cxip_req *)ofi_buf_alloc(evtq->req_pool); + if (!req) { + CXIP_DBG("Failed to allocate request\n"); + goto out; + } + memset(req, 0, sizeof(*req)); + + if (remap) { + req->req_id = ofi_idx_insert(&evtq->req_table, req); + + /* Target command buffer IDs are 16 bits wide. */ + if (req->req_id < 0 || req->req_id >= CXIP_BUFFER_ID_MAX) { + CXIP_WARN("Failed to map request: %d\n", + req->req_id); + if (req->req_id > 0) + ofi_idx_remove(&evtq->req_table, req->req_id); + ofi_buf_free(req); + req = NULL; + goto out; + } + } else { + req->req_id = -1; + } + + CXIP_DBG("Allocated req: %p (ID: %d)\n", req, req->req_id); + req->cq = evtq->cq; + req->evtq = evtq; + req->req_ctx = req_ctx; + req->discard = false; + dlist_init(&req->evtq_entry); + dlist_insert_tail(&req->evtq_entry, &evtq->req_list); + +out: + return req; +} + +/* + * cxip_evtq_req_free() - Free a request. + * + * Caller must hold ep_obj->lock. + */ +void cxip_evtq_req_free(struct cxip_req *req) +{ + cxip_evtq_req_free_no_lock(req); +} + +/* + * cxip_evtq_event_req() - Locate a request corresponding to the Cassini event. + */ +static struct cxip_req *cxip_evtq_event_req(struct cxip_evtq *evtq, + const union c_event *event) +{ + struct cxip_req *req; + int return_code; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + req = (struct cxip_req *)event->init_short.user_ptr; + break; + case C_EVENT_UNLINK: + switch (cxi_tgt_event_rc(event)) { + /* User issued unlink events can race with put events. Assume + * C_RC_ENTRY_NOT_FOUND is this case. 
+ */ + case C_RC_ENTRY_NOT_FOUND: + return NULL; + case C_RC_OK: + break; + default: + CXIP_FATAL("Unhandled unlink return code: %d\n", + cxi_tgt_event_rc(event)); + } + + /* Fall through. */ + case C_EVENT_LINK: + case C_EVENT_GET: + case C_EVENT_PUT: + case C_EVENT_PUT_OVERFLOW: + case C_EVENT_RENDEZVOUS: + case C_EVENT_SEARCH: + req = cxip_evtq_req_find(evtq, event->tgt_long.buffer_id); + if (req) + break; + /* HW error can return zero buffer_id */ + CXIP_WARN("Invalid buffer_id: %d (%s)\n", + event->tgt_long.buffer_id, cxi_event_to_str(event)); + return_code = cxi_tgt_event_rc(event); + if (return_code != C_RC_OK) + CXIP_WARN("Hardware return code: %s (%s)\n", + cxi_rc_to_str(return_code), + cxi_event_to_str(event)); + break; + case C_EVENT_REPLY: + case C_EVENT_SEND: + if (!event->init_short.rendezvous) { + req = (struct cxip_req *)event->init_short.user_ptr; + } else { + struct cxi_rdzv_user_ptr *up = + (struct cxi_rdzv_user_ptr *) + &event->init_short.user_ptr; + req = cxip_evtq_req_find(evtq, up->buffer_id); + if (req) + break; + /* HW error can return zero buffer_id */ + CXIP_WARN("Invalid buffer_id: %d (%s)\n", + event->tgt_long.buffer_id, + cxi_event_to_str(event)); + return_code = cxi_tgt_event_rc(event); + if (return_code != C_RC_OK) + CXIP_WARN("Hardware return code: %s (%s)\n", + cxi_rc_to_str(return_code), + cxi_event_to_str(event)); + } + break; + + + req = NULL; + break; + case C_EVENT_COMMAND_FAILURE: + CXIP_FATAL("Command failure: cq=%u target=%u fail_loc=%u cmd_type=%u cmd_size=%u opcode=%u\n", + event->cmd_fail.cq_id, event->cmd_fail.is_target, + event->cmd_fail.fail_loc, + event->cmd_fail.fail_command.cmd_type, + event->cmd_fail.fail_command.cmd_size, + event->cmd_fail.fail_command.opcode); + default: + CXIP_FATAL("Invalid event type: %d\n", event->hdr.event_type); + } + + CXIP_DBG("got event: %s rc: %s (req: %p)\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event)), + req); + + return req; +} + +/* + * cxip_evtq_progress() - Progress the CXI hardware EQ specified + * + * Caller must hold ep_obj->lock. + */ +void cxip_evtq_progress(struct cxip_evtq *evtq) +{ + const union c_event *event; + struct cxip_req *req; + int ret = FI_SUCCESS; + + if (!evtq->eq || !evtq->cq) + return; + + /* The EQ status needs to be cached on each poll to be able to properly + * determine if the OFI completion queue is saturated. + */ + evtq->prev_eq_status = *evtq->eq->status; + + while ((event = cxi_eq_peek_event(evtq->eq))) { + + /* State change events can be caused due to unacked events. When + * a state change event occurs, always ack EQ. 
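+		 *
+		 * Acking here also resets unacked_events, so a state
+		 * change is never left waiting on a partially filled
+		 * ack batch (see the batch logic below).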
+ */ + if (event->hdr.event_type == C_EVENT_STATE_CHANGE) { + cxi_eq_ack_events(evtq->eq); + evtq->unacked_events = 0; + cxip_pte_state_change(evtq->cq->domain->iface, event); + } else { + + req = cxip_evtq_event_req(evtq, event); + if (req) { + ret = req->cb(req, event); + if (ret != FI_SUCCESS) + break; + } + } + + cxi_eq_next_event(evtq->eq); + + evtq->unacked_events++; + if (evtq->unacked_events >= evtq->ack_batch_size) { + cxi_eq_ack_events(evtq->eq); + evtq->unacked_events = 0; + } + } + + if (cxi_eq_get_drops(evtq->eq)) { + CXIP_WARN("EQ %d dropped event, rsvd slots %u, free slots %u\n", + evtq->eq->eqn, + evtq->eq->status->event_slots_rsrvd, + evtq->eq->status->event_slots_free); + CXIP_FATAL("H/W Event Queue overflow detected.\n"); + } + + if (ret == FI_SUCCESS) + evtq->eq_saturated = false; +} + +void cxip_evtq_fini(struct cxip_evtq *evtq) +{ + if (!evtq->eq) + return; + + cxil_destroy_evtq(evtq->eq); + + if (evtq->md) + cxil_unmap(evtq->md); + else + madvise(evtq->buf, evtq->len, MADV_DOFORK); + + if (evtq->mmap) + munmap(evtq->buf, evtq->len); + else + free(evtq->buf); + + ofi_idx_reset(&evtq->req_table); + ofi_bufpool_destroy(evtq->req_pool); + evtq->eq = NULL; +} + +static size_t cxip_evtq_get_queue_size(struct cxip_cq *cq, size_t num_events) +{ + size_t num_slots = num_events + cq->ack_batch_size; + + /* One additional event slot is needed for full queue. */ + num_slots += 1; + + /* One additional event slot is needed for EQ status. */ + num_slots += 1; + + /* Users current expect libfabric CQ size to control sizing of HW EQs. + * Honor this by using CQ size to override CXI provider requested EQ + * slots. + */ + num_slots = MAX(num_slots, cq->attr.size); + + return num_slots * C_EE_CFG_ECB_SIZE; +} + +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, + size_t num_events, size_t num_fc_events) +{ + struct cxi_eq_attr eq_attr = { + .reserved_slots = num_fc_events, + }; + struct ofi_bufpool_attr bp_attr = { + .size = sizeof(struct cxip_req), + .alignment = 8, + .chunk_cnt = 64, + .flags = OFI_BUFPOOL_NO_TRACK, + }; + size_t len; + size_t eq_len; + bool eq_passthrough = false; + int ret; + int page_size; + + assert(cq->domain->enabled); + + len = cxip_evtq_get_queue_size(cq, num_events + num_fc_events); + + /* Note max_cnt == 0 is unlimited */ + ret = ofi_bufpool_create_attr(&bp_attr, &evtq->req_pool); + if (ret) { + CXIP_WARN("Failed to create req pool: %d, %s\n", + ret, fi_strerror(-ret)); + return ret; + } + memset(&evtq->req_table, 0, sizeof(evtq->req_table)); + dlist_init(&evtq->req_list); + + /* Attempt to use 2 MiB hugepages. */ + if (!cxip_env.disable_eq_hugetlb) { + eq_len = ofi_get_aligned_size(len, 1U << 21); + evtq->buf = mmap(NULL, eq_len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | + MAP_HUGE_2MB, -1, 0); + if (evtq->buf != MAP_FAILED) { + evtq->mmap = true; + + /* If a single hugepage is used, CXI_EQ_PASSTHROUGH can + * be used. + */ + if (eq_len <= (1U << 21)) + eq_passthrough = true; + goto mmap_success; + } + + CXIP_DBG("Unable to map hugepage for EQ\n"); + } + + page_size = ofi_get_page_size(); + if (page_size < 0) + return -ofi_syserr(); + + evtq->mmap = false; + eq_len = ofi_get_aligned_size(len, page_size); + evtq->buf = aligned_alloc(page_size, eq_len); + if (!evtq->buf) { + CXIP_WARN("Unable to allocate EQ buffer\n"); + ret = -FI_ENOMEM; + goto err_free_bp; + } + +mmap_success: + /* Buffer has been allocated. Only map if needed. 
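+	 *
+	 * Only the single 2 MiB hugepage case can use CXI_EQ_PASSTHROUGH
+	 * (no cxil_map(), just MADV_DONTFORK); every other allocation is
+	 * registered with cxil_map() below.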
*/ + evtq->len = eq_len; + if (eq_passthrough) { + evtq->md = NULL; + eq_attr.flags |= CXI_EQ_PASSTHROUGH; + + ret = madvise(evtq->buf, evtq->len, MADV_DONTFORK); + if (ret) { + ret = -errno; + CXIP_WARN("madvise failed: %d\n", ret); + goto err_free_eq_buf; + } + } else { + ret = cxil_map(cq->domain->lni->lni, evtq->buf, evtq->len, + CXIP_EQ_MAP_FLAGS, NULL, &evtq->md); + if (ret) { + CXIP_WARN("Unable to map EQ buffer: %d\n", ret); + goto err_free_eq_buf; + } + } + + /* Once the EQ is at CQ fill percentage full, a status event is + * generated. When a status event occurs, the CXIP CQ is considered + * saturated until the CXI EQ is drained. + */ + eq_attr.status_thresh_base = cxip_env.cq_fill_percent; + eq_attr.status_thresh_delta = 0; + eq_attr.status_thresh_count = 1; + + eq_attr.queue = evtq->buf; + eq_attr.queue_len = evtq->len; + eq_attr.flags |= CXI_EQ_TGT_LONG | CXI_EQ_EC_DISABLE; + + /* CPU number will be ignored if invalid */ + if (cq->attr.flags & FI_AFFINITY && cq->attr.signaling_vector > 0) + eq_attr.cpu_affinity = cq->attr.signaling_vector; + + /* cq->priv_wait is NULL if not backed by wait object */ + ret = cxil_alloc_evtq(cq->domain->lni->lni, evtq->md, &eq_attr, + cq->priv_wait, NULL, &evtq->eq); + if (ret) { + CXIP_WARN("Failed to allocated EQ: %d\n", ret); + goto err_unmap_eq_buf; + } + + /* Point back to the CQ bound to the TX or RX context */ + evtq->cq = cq; + evtq->ack_batch_size = cq->ack_batch_size; + + return FI_SUCCESS; + +err_unmap_eq_buf: + if (evtq->md) + cxil_unmap(evtq->md); + else + madvise(evtq->buf, evtq->len, MADV_DOFORK); +err_free_eq_buf: + if (evtq->mmap) + munmap(evtq->buf, evtq->len); + else + free(evtq->buf); + +err_free_bp: + ofi_idx_reset(&evtq->req_table); + ofi_bufpool_destroy(evtq->req_pool); + + return ret; +} diff --git a/prov/cxi/src/cxip_fabric.c b/prov/cxi/src/cxip_fabric.c new file mode 100644 index 00000000000..c8528cf829c --- /dev/null +++ b/prov/cxi/src/cxip_fabric.c @@ -0,0 +1,93 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. 
+ * Copyright (c) 2018,2020 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "ofi_prov.h" +#include "ofi_osd.h" + +#include "cxip.h" + +int cxip_cq_def_sz = CXIP_CQ_DEF_SZ; +int cxip_eq_def_sz = CXIP_EQ_DEF_SZ; + +static int read_default_params; + +static struct fi_ops_fabric cxip_fab_ops = { + .size = sizeof(struct fi_ops_fabric), + .domain = cxip_domain, + .passive_ep = fi_no_passive_ep, + .eq_open = cxip_eq_open, + .wait_open = ofi_wait_fd_open, + .trywait = ofi_trywait, +}; + +static int cxip_fabric_close(fid_t fid) +{ + struct cxip_fabric *fab; + + fab = container_of(fid, struct cxip_fabric, util_fabric.fabric_fid); + if (ofi_atomic_get32(&fab->ref)) + return -FI_EBUSY; + + ofi_fabric_close(&fab->util_fabric); + free(fab); + + return 0; +} + +static struct fi_ops cxip_fab_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_fabric_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static void cxip_read_default_params(void) +{ + if (!read_default_params) + read_default_params = 1; +} + +int cxip_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context) +{ + struct cxip_fabric *fab; + int ret; + + fab = calloc(1, sizeof(*fab)); + if (!fab) + return -FI_ENOMEM; + + ret = ofi_fabric_init(&cxip_prov, &cxip_fabric_attr, attr, + &fab->util_fabric, context); + if (ret != FI_SUCCESS) + goto free_fab; + + cxip_read_default_params(); + + ofi_atomic_initialize32(&fab->ref, 0); + + fab->util_fabric.fabric_fid.fid.ops = &cxip_fab_fi_ops; + fab->util_fabric.fabric_fid.ops = &cxip_fab_ops; + + *fabric = &fab->util_fabric.fabric_fid; + + return 0; + +free_fab: + free(fab); + return ret; +} diff --git a/prov/cxi/src/cxip_faults.c b/prov/cxi/src/cxip_faults.c new file mode 100644 index 00000000000..04564b1bd04 --- /dev/null +++ b/prov/cxi/src/cxip_faults.c @@ -0,0 +1,70 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + */ + +/* Fault injection. */ + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) 
_CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +#if ENABLE_DEBUG + +struct cxip_fault dma_fault = { .env = "DMA_FAULT_RATE" }; +struct cxip_fault malloc_fault = { .env = "MALLOC_FAULT_RATE" }; + +static void fault_init(struct cxip_fault *fault) +{ + char *var; + float rate; + int ret; + + var = getenv(fault->env); + if (var) { + ret = sscanf(var, "%f", &rate); + if (ret == 1) { + if (rate < 0) + rate = 0; + if (rate > 1) + rate = 1; + + fault->prop = rate * RAND_MAX; + CXIP_DBG("%s: %f\n", fault->env, rate); + } + } +} + +static void fault_fini(struct cxip_fault *fault) +{ + if (fault->prop) + CXIP_WARN("%s: %ld faults injected\n", + fault->env, fault->count); +} + +void cxip_fault_inject_init(void) +{ + time_t t = time(NULL); + + CXIP_DBG("Rand seed: %lu\n", t); + srand(t); + + fault_init(&dma_fault); + fault_init(&malloc_fault); +} + +void cxip_fault_inject_fini(void) +{ + fault_fini(&dma_fault); + fault_fini(&malloc_fault); +} + +#else +void cxip_fault_inject_init(void) {} +void cxip_fault_inject_fini(void) {} +#endif diff --git a/prov/cxi/src/cxip_if.c b/prov/cxi/src/cxip_if.c new file mode 100644 index 00000000000..d037bda13a5 --- /dev/null +++ b/prov/cxi/src/cxip_if.c @@ -0,0 +1,628 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018,2020-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include "ofi_prov.h" +#include "ofi_osd.h" + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +struct slist cxip_if_list; +static struct cxil_device_list *cxi_dev_list; + +/* + * cxip_if_lookup_addr() - Return a provider NIC interface descriptor + * associated with a specified NIC address, if available. + */ +struct cxip_if *cxip_if_lookup_addr(uint32_t nic_addr) +{ + struct slist_entry *entry, *prev __attribute__ ((unused)); + struct cxip_if *if_entry; + + slist_foreach(&cxip_if_list, entry, prev) { + if_entry = container_of(entry, struct cxip_if, if_entry); + if (if_entry->info->nic_addr == nic_addr) + return if_entry; + } + + return NULL; +} + +/* + * cxip_if_lookup() - Return a provider NIC interface descriptor associated + * with a specified NIC device name, if available. + */ +struct cxip_if *cxip_if_lookup_name(const char *name) +{ + struct slist_entry *entry, *prev __attribute__ ((unused)); + struct cxip_if *if_entry; + + slist_foreach(&cxip_if_list, entry, prev) { + if_entry = container_of(entry, struct cxip_if, if_entry); + if (!strcmp(if_entry->info->device_name, name)) + return if_entry; + } + + return NULL; +} + +/* + * cxip_lni_res_count() - Return usage information for LNI resource. + */ +int cxip_lni_res_cnt(struct cxip_lni *lni, char *res_str) +{ + struct dirent *de; + char path[100]; + uint32_t c = 0; + DIR *dr; + + sprintf(path, "/sys/kernel/debug/cxi/cxi%u/lni/%u/%s", + lni->iface->info->dev_id, lni->lni->id, res_str); + + dr = opendir(path); + if (!dr) + return 0; + + while ((de = readdir(dr))) { + if (strncmp(de->d_name, ".", 1)) + c++; + } + + closedir(dr); + + return c; +} + +/* + * cxip_lni_res_dump() - Dump resource usage information for an LNI. + */ +void cxip_lni_res_dump(struct cxip_lni *lni) +{ + DIR *dr; + uint32_t pt_count = 0; + uint32_t cq_count = 0; + uint32_t eq_count = 0; + uint32_t ct_count = 0; + uint32_t ac_count = 0; + + /* Check if debugfs is available. 
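+	 *
+	 * The per-resource counts below are read from debugfs entries of
+	 * the form /sys/kernel/debug/cxi/cxi<dev>/lni/<rgid>/<res>, where
+	 * <res> is one of cq, pt, eq, ct or ac (see cxip_lni_res_cnt()).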
*/ + dr = opendir("/sys/kernel/debug/cxi"); + if (!dr) { + CXIP_INFO("Resource usage info unavailable: %s RGID: %u.\n", + lni->iface->info->device_name, lni->lni->id); + return; + } + + closedir(dr); + + cq_count = cxip_lni_res_cnt(lni, "cq"); + pt_count = cxip_lni_res_cnt(lni, "pt"); + eq_count = cxip_lni_res_cnt(lni, "eq"); + ct_count = cxip_lni_res_cnt(lni, "ct"); + ac_count = cxip_lni_res_cnt(lni, "ac"); + + CXIP_INFO("Resource usage: %s RGID: %u CQ: %u PTE: %u EQ: %u CT: %u AC: %u\n", + lni->iface->info->device_name, lni->lni->id, cq_count, + pt_count, eq_count, ct_count, ac_count); +} + +/* + * cxip_get_if() - Get a reference to the device interface associated with a + * provided NIC address. A process can open each interface once to support many + * FI Domains. An IF is used to allocate the various device resources including + * CMDQs, EVTQs, and PtlTEs. + */ +int cxip_get_if(uint32_t nic_addr, struct cxip_if **iface) +{ + int ret; + struct cxip_if *if_entry; + + /* The IF list device info is static, no need to lock */ + if_entry = cxip_if_lookup_addr(nic_addr); + if (!if_entry) { + CXIP_DBG("interface not found\n"); + return -FI_ENODEV; + } + + if (!if_entry->link) { + CXIP_INFO("Interface %s link down.\n", + if_entry->info->device_name); + return -FI_ENODEV; + } + + /* Lock the IF to serialize opening the device */ + ofi_spin_lock(&if_entry->lock); + + if (!if_entry->dev) { + ret = cxil_open_device(if_entry->info->dev_id, &if_entry->dev); + if (ret) { + CXIP_WARN("Failed to open CXI Device, ret: %d\n", ret); + ret = -FI_ENODEV; + goto unlock; + } + + CXIP_DBG("Opened %s\n", if_entry->info->device_name); + } + + ofi_atomic_inc32(&if_entry->ref); + *iface = if_entry; + + ofi_spin_unlock(&if_entry->lock); + + return FI_SUCCESS; + +unlock: + ofi_spin_unlock(&if_entry->lock); + + return ret; +} + +/* + * cxip_put_if() - Drop a reference to the device interface. 
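+ *
+ * Pairs with cxip_get_if(); the underlying CXI device is closed once
+ * the last reference is dropped.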
+ */ +void cxip_put_if(struct cxip_if *iface) +{ + ofi_spin_lock(&iface->lock); + + if (!ofi_atomic_dec32(&iface->ref)) { + cxil_close_device(iface->dev); + iface->dev = NULL; + + CXIP_DBG("Closed %s\n", iface->info->device_name); + } + + ofi_spin_unlock(&iface->lock); +} + +int cxip_if_valid_rgroup_vni(struct cxip_if *iface, unsigned int rgroup_id, + unsigned int vni) +{ + struct cxi_svc_desc svc_desc; + bool vni_found = false; + int ret; + int i; + + ret = cxil_get_svc(iface->dev, rgroup_id, &svc_desc); + if (ret) { + CXIP_WARN("cxil_get_svc with %s and rgroup_id %d failed: %d:%s\n", + iface->dev->info.device_name, rgroup_id, ret, + strerror(-ret)); + return -FI_EINVAL; + } + + if (svc_desc.restricted_vnis) { + for (i = 0; i < svc_desc.num_vld_vnis; i++) { + if (vni == svc_desc.vnis[i]) { + vni_found = true; + break; + } + } + + if (!vni_found) { + CXIP_WARN("Invalid VNI %d for %s and svc_id %d\n", + vni, iface->dev->info.device_name, + rgroup_id); + return -FI_EINVAL; + } + } + + return FI_SUCCESS; +} + +/* + * cxip_alloc_lni() - Allocate an LNI + */ +int cxip_alloc_lni(struct cxip_if *iface, uint32_t svc_id, + struct cxip_lni **if_lni) +{ + struct cxip_lni *lni; + int ret; + + lni = calloc(1, sizeof(*lni)); + if (!lni) { + CXIP_WARN("Unable to allocate LNI\n"); + return -FI_ENOMEM; + } + + ret = cxil_alloc_lni(iface->dev, &lni->lni, svc_id); + if (ret) { + CXIP_WARN("Failed to allocate LNI, ret: %d\n", ret); + ret = -FI_ENOSPC; + goto free_lni; + } + + lni->iface = iface; + ofi_spin_init(&lni->lock); + dlist_init(&lni->remap_cps); + + CXIP_DBG("Allocated LNI, %s RGID: %u\n", + lni->iface->info->device_name, lni->lni->id); + + *if_lni = lni; + + return FI_SUCCESS; + +free_lni: + free(lni); + + return ret; +} + +/* + * cxip_free_lni() - Free an LNI + */ +void cxip_free_lni(struct cxip_lni *lni) +{ + int ret; + int i; + struct dlist_entry *tmp; + struct cxip_remap_cp *sw_cp; + + cxip_lni_res_dump(lni); + + CXIP_DBG("Freeing LNI, %s RGID: %u\n", + lni->iface->info->device_name, lni->lni->id); + + dlist_foreach_container_safe(&lni->remap_cps, struct cxip_remap_cp, + sw_cp, remap_entry, tmp) + free(sw_cp); + + for (i = 0; i < lni->n_cps; i++) { + ret = cxil_destroy_cp(lni->hw_cps[i]); + if (ret) + CXIP_WARN("Failed to destroy CP: %d\n", ret); + } + + ret = cxil_destroy_lni(lni->lni); + if (ret) + CXIP_WARN("Failed to destroy LNI: %d\n", ret); + + free(lni); +} + +/* + * netdev_ama_check - Return true if the netdev has an AMA installed. + */ +static bool netdev_ama_check(char *netdev) +{ + int rc; + char addr_path[FI_PATH_MAX]; + FILE *f; + int val; + + rc = snprintf(addr_path, FI_PATH_MAX, + "/sys/class/net/%s/addr_assign_type", + netdev); + if (rc < 0) + return false; + + f = fopen(addr_path, "r"); + if (!f) + return false; + + rc = fscanf(f, "%d", &val); + + fclose(f); + + if (rc != 1) + return false; + + /* Check for temporary address */ + if (val != 3) + return false; + + rc = snprintf(addr_path, FI_PATH_MAX, "/sys/class/net/%s/address", + netdev); + if (rc < 0) + return false; + + f = fopen(addr_path, "r"); + if (!f) + return false; + + rc = fscanf(f, "%x:%*x:%*x:%*x:%*x", &val); + + fclose(f); + + if (rc != 1) + return false; + + /* Check for locally administered unicast address */ + if ((val & 0x3) != 0x2) + return false; + + return true; +} + +/* + * netdev_link - Return netdev link state. 
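+ *
+ * Reads /sys/class/net/<netdev>/operstate and, when the state is
+ * reported as "unknown", falls back to /sys/class/net/<netdev>/carrier.
+ * Returns 0 with *link set to 1 (up) or 0 (down), or -1 on error.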
+ */ +static int netdev_link(char *netdev, int *link) +{ + int rc; + char path[FI_PATH_MAX]; + FILE *f; + char state[20]; + int carrier; + + rc = snprintf(path, FI_PATH_MAX, "/sys/class/net/%s/operstate", + netdev); + if (rc < 0) + return -1; + + f = fopen(path, "r"); + if (!f) + return -1; + + rc = fscanf(f, "%20s", state); + + fclose(f); + + if (!strncmp(state, "up", strlen("up"))) { + *link = 1; + return 0; + } + + if (strncmp(state, "unknown", strlen("unknown"))) { + /* State is not not up or unknown, link is down. */ + *link = 0; + return 0; + } + + /* operstate is unknown, must check carrier. */ + rc = snprintf(path, FI_PATH_MAX, "/sys/class/net/%s/carrier", + netdev); + if (rc < 0) + return -1; + + f = fopen(path, "r"); + if (!f) + return -1; + + rc = fscanf(f, "%d", &carrier); + + fclose(f); + + if (carrier) + *link = 1; + else + *link = 0; + + return 0; +} + +/* + * netdev_speed - Return netdev interface speed. + */ +static int netdev_speed(char *netdev, int *speed) +{ + int rc; + char path[FI_PATH_MAX]; + FILE *f; + int val; + + rc = snprintf(path, FI_PATH_MAX, "/sys/class/net/%s/speed", + netdev); + if (rc < 0) + return -1; + + f = fopen(path, "r"); + if (!f) + return -1; + + rc = fscanf(f, "%u", &val); + + fclose(f); + + if (rc != 1) + return -1; + + *speed = val; + + return 0; +} + +/* + * netdev_netdev - Look up the netdev associated with an RDMA device file. + */ +static int netdev_lookup(struct cxil_devinfo *info, char **netdev) +{ + glob_t globbuf; + int rc; + int count; + int i; + char if_path[FI_PATH_MAX]; + char addr_path[FI_PATH_MAX]; + char *addr; + unsigned int dom; + unsigned int bus; + unsigned int dev; + unsigned int func; + + rc = glob("/sys/class/net/*", 0, NULL, &globbuf); + if (rc) + return -1; + + count = globbuf.gl_pathc; + + for (i = 0; i < count; i++) { + rc = snprintf(if_path, FI_PATH_MAX, "%s/device", + globbuf.gl_pathv[i]); + if (rc < 0) + goto free_glob; + + rc = readlink(if_path, addr_path, FI_PATH_MAX-1); + if (rc < 0) { + /* A virtual device, like a bridge, doesn't have a + * device link. + */ + if (errno == ENOENT || errno == ENOTDIR) + continue; + + goto free_glob; + } + addr_path[rc] = '\0'; + + addr = basename(addr_path); + + rc = sscanf(addr, "%x:%x:%x.%x", &dom, &bus, &dev, &func); + if (rc != 4) + continue; + + if (info->pci_domain == dom && + info->pci_bus == bus && + info->pci_device == dev && + info->pci_function == func) { + *netdev = strdup(basename(globbuf.gl_pathv[i])); + if (!*netdev) + goto free_glob; + + globfree(&globbuf); + return 0; + } + } + +free_glob: + globfree(&globbuf); + + return -1; +} + +/* + * cxip_query_if_list() - Populate static IF data during initialization. + */ +static void cxip_query_if_list(struct slist *if_list) +{ + struct cxip_if *if_entry; + int ret; + int i; + char *netdev; + int speed = 0; + int link = 0; + + slist_init(if_list); + + /* The cxi_dev_list is freed in the provider IF destructor */ + ret = cxil_get_device_list(&cxi_dev_list); + if (ret) { + CXIP_WARN("cxil_get_device_list failed\n"); + return; + } + + if (cxi_dev_list->count == 0) { + CXIP_DBG("No IFs found\n"); + return; + } + + if (cxi_dev_list->info[0].min_free_shift) { + CXIP_WARN("Non-zero min_free_shift not supported\n"); + return; + } + + for (i = 0; i < cxi_dev_list->count; i++) { + /* Ignore cxi devices not included in device name string. 
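+		 *
+		 * cxip_env.device_name is assumed here to be populated from
+		 * the FI_CXI_DEVICE_NAME environment variable, e.g. a list
+		 * such as "cxi0,cxi1"; devices not named in it are skipped.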
*/ + if (cxip_env.device_name && + (strstr(cxip_env.device_name, + cxi_dev_list->info[i].device_name) == NULL)) + continue; + + if (!getenv("CXIP_SKIP_RH_CHECK") && + cxi_dev_list->info[i].device_platform == C_PLATFORM_ASIC && + !cxil_rh_running(&cxi_dev_list->info[i])) { + CXIP_LOG("CXI retry handler not running for device: %s\n", + cxi_dev_list->info[i].device_name); + continue; + } + + ret = netdev_lookup(&cxi_dev_list->info[i], &netdev); + if (ret) { + CXIP_LOG("CXI netdev not found for device: %s\n", + cxi_dev_list->info[i].device_name); + netdev = strdup("DNE"); + } else { + ret = netdev_link(netdev, &link); + if (ret) + CXIP_WARN("Failed to read netdev link: %s\n", + netdev); + + ret = netdev_speed(netdev, &speed); + if (ret) + CXIP_WARN("Failed to read netdev speed: %s\n", + netdev); + + CXIP_DBG("Device %s has netdev %s (link: %u speed: %u)\n", + cxi_dev_list->info[i].device_name, + netdev, link, speed); + } + + if (!getenv("CXIP_SKIP_AMA_CHECK") && + !netdev_ama_check(netdev)) { + CXIP_LOG("CXI device %s, netdev %s AMA not recognized\n", + cxi_dev_list->info[i].device_name, + netdev); + free(netdev); + continue; + } + + free(netdev); + + if_entry = calloc(1, sizeof(struct cxip_if)); + if_entry->info = &cxi_dev_list->info[i]; + if_entry->link = link; + if_entry->speed = speed; + + ofi_atomic_initialize32(&if_entry->ref, 0); + dlist_init(&if_entry->ptes); + ofi_spin_init(&if_entry->lock); + slist_insert_tail(&if_entry->if_entry, if_list); + } +} + +/* + * cxip_free_if_list() - Tears down static IF data. + */ +static void cxip_free_if_list(struct slist *if_list) +{ + struct slist_entry *entry; + struct cxip_if *if_entry; + + while (!slist_empty(if_list)) { + entry = slist_remove_head(if_list); + if_entry = container_of(entry, struct cxip_if, if_entry); + ofi_spin_destroy(&if_entry->lock); + free(if_entry); + } + + cxil_free_device_list(cxi_dev_list); +} + +/* + * cxip_if_init() - The provider IF constructor. Initializes static IF data. + */ +void cxip_if_init(void) +{ + cxip_query_if_list(&cxip_if_list); +} + +/* + * cxip_if_fini() - The provider IF destructor. Tears down IF data. + */ +void cxip_if_fini(void) +{ + cxip_free_if_list(&cxip_if_list); +} diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c new file mode 100644 index 00000000000..25c392fa6b2 --- /dev/null +++ b/prov/cxi/src/cxip_info.c @@ -0,0 +1,1898 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2019,2022 Hewlett Packard Enterprise Development LP + */ + +/* CXI fabric discovery implementation. */ + +#include "ofi_prov.h" +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_INFO(...) 
_CXIP_INFO(FI_LOG_FABRIC, __VA_ARGS__) + +char cxip_prov_name[] = "cxi"; + +struct fi_fabric_attr cxip_fabric_attr = { + .prov_version = CXIP_PROV_VERSION, + .name = cxip_prov_name, +}; + +/* No ODP, provider specified MR keys */ +struct fi_domain_attr cxip_prov_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_PROV_KEY | FI_MR_ALLOCATED | FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_PROV_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + .max_ep_auth_key = 1, +}; + +/* ODP, provider specified MR keys */ +struct fi_domain_attr cxip_odp_prov_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_PROV_KEY | FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_PROV_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + .max_ep_auth_key = 1, +}; + +/* No ODP, client specified MR keys */ +struct fi_domain_attr cxip_client_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED, + .mr_key_size = CXIP_MR_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + .max_ep_auth_key = 1, +}; + +/* ODP, client specified MR keys */ +struct fi_domain_attr cxip_odp_client_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + .max_ep_auth_key = 1, +}; + +/* No ODP, provider specified MR keys */ +struct fi_domain_attr cxip_prov_key_multi_auth_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + 
.control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_PROV_KEY | FI_MR_ALLOCATED | FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_PROV_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + + /* Set to the number of VNIs supported by a single CXI service. */ + .max_ep_auth_key = 4, +}; + +/* ODP, provider specified MR keys */ +struct fi_domain_attr cxip_odp_prov_key_multi_auth_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_PROV_KEY | FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_PROV_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + + /* Set to the number of VNIs supported by a single CXI service. */ + .max_ep_auth_key = 4, +}; + +/* No ODP, client specified MR keys */ +struct fi_domain_attr cxip_client_key_multi_auth_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED, + .mr_key_size = CXIP_MR_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + + /* Set to the number of VNIs supported by a single CXI service. */ + .max_ep_auth_key = 4, +}; + +/* ODP, client specified MR keys */ +struct fi_domain_attr cxip_odp_client_key_multi_auth_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + + /* Set to the number of VNIs supported by a single CXI service. 
*/ + .max_ep_auth_key = 4, +}; + +struct fi_ep_attr cxip_ep_attr = { + .type = FI_EP_RDM, + .protocol = FI_PROTO_CXI, + .protocol_version = CXIP_WIRE_PROTO_VERSION, + .max_msg_size = CXIP_EP_MAX_MSG_SZ, + .max_order_raw_size = -1, + .max_order_war_size = -1, + .max_order_waw_size = -1, + .mem_tag_format = FI_TAG_GENERIC >> (64 - CXIP_TAG_WIDTH), + .auth_key_size = sizeof(struct cxi_auth_key), +}; + +struct fi_tx_attr cxip_tx_attr = { + .caps = CXIP_EP_CAPS & ~OFI_IGNORED_TX_CAPS, + .op_flags = CXIP_TX_OP_FLAGS, + .msg_order = CXIP_MSG_ORDER, + .inject_size = CXIP_INJECT_SIZE, + .size = CXIP_MAX_TX_SIZE, + .iov_limit = 1, + .rma_iov_limit = 1, +}; + +struct fi_rx_attr cxip_rx_attr = { + .caps = CXIP_EP_CAPS & ~OFI_IGNORED_RX_CAPS, + .op_flags = CXIP_RX_OP_FLAGS, + .msg_order = CXIP_MSG_ORDER, + .comp_order = FI_ORDER_NONE, + .total_buffered_recv = CXIP_UX_BUFFER_SIZE, + .size = CXIP_MAX_RX_SIZE, + .iov_limit = 1, +}; + +struct fi_tx_attr cxip_multi_auth_key_tx_attr = { + .caps = CXIP_EP_CAPS & ~OFI_IGNORED_TX_CAPS & ~FI_DIRECTED_RECV, + .op_flags = CXIP_TX_OP_FLAGS, + .msg_order = CXIP_MSG_ORDER, + .inject_size = CXIP_INJECT_SIZE, + .size = CXIP_MAX_TX_SIZE, + .iov_limit = 1, + .rma_iov_limit = 1, +}; + +struct fi_rx_attr cxip_multi_auth_key_rx_attr = { + .caps = CXIP_EP_CAPS & ~OFI_IGNORED_RX_CAPS & ~FI_DIRECTED_RECV, + .op_flags = CXIP_RX_OP_FLAGS, + .msg_order = CXIP_MSG_ORDER, + .comp_order = FI_ORDER_NONE, + .total_buffered_recv = CXIP_UX_BUFFER_SIZE, + .size = CXIP_MAX_RX_SIZE, + .iov_limit = 1, +}; + +/* The CXI provider supports multiple operating modes by exporting + * several fi_info structures. The application can filter the fi_info + * with hints, or choose the fi_info based on desired application + * behavior. Matched fi_info are returned in the order of highest + * to lowest provider performance.: + * + * 1. Pinned memory with provider MR Keys + * 2. Pinned memory with application provided MR Keys + * 3. On-Demand paging with provider MR Keys + * 4. 
On-Demand paging with application provided MR Keys + */ +struct fi_info cxip_infos[] = { + { + .caps = CXIP_CAPS, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_tx_attr, + .rx_attr = &cxip_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_prov_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_tx_attr, + .rx_attr = &cxip_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_client_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_tx_attr, + .rx_attr = &cxip_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_odp_prov_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_tx_attr, + .rx_attr = &cxip_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_odp_client_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS & ~FI_DIRECTED_RECV, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_multi_auth_key_tx_attr, + .rx_attr = &cxip_multi_auth_key_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_prov_key_multi_auth_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS & ~FI_DIRECTED_RECV, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_multi_auth_key_tx_attr, + .rx_attr = &cxip_multi_auth_key_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_client_key_multi_auth_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS & ~FI_DIRECTED_RECV, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_multi_auth_key_tx_attr, + .rx_attr = &cxip_multi_auth_key_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_odp_prov_key_multi_auth_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS & ~FI_DIRECTED_RECV, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_multi_auth_key_tx_attr, + .rx_attr = &cxip_multi_auth_key_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_odp_client_key_multi_auth_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, +}; + +struct fi_provider cxip_prov; + +struct util_prov cxip_util_prov = { + .prov = &cxip_prov, + .info = NULL, + .flags = 0, +}; + +int s_page_size; + +/* Get _SC_PAGESIZE */ +static void set_system_page_size(void) +{ + if (!s_page_size) + s_page_size = sysconf(_SC_PAGESIZE); +} + +/* + * cxip_info_alloc() - Create a fabric info structure for the CXI interface. + */ +static int cxip_info_alloc(struct cxip_if *nic_if, int info_index, + struct fi_info **info) +{ + int ret; + struct fi_info *fi; + struct cxip_addr addr = {}; + + /* If the forcing of ODP mode was requested, remove any info that + * supports FI_MR_ALLOCATED. + */ + if (cxip_env.force_odp && + cxip_infos[info_index].domain_attr->mr_mode & FI_MR_ALLOCATED) + return -FI_ENODATA; + + /* For now only expose ODP fi_info if ODP selection is enabled. + * TODO: When ODP is always available remove this filter. 
+ */ + if (!(cxip_infos[info_index].domain_attr->mr_mode & FI_MR_ALLOCATED) && + !cxip_env.odp) + return -FI_ENODATA; + + fi = fi_dupinfo(&cxip_infos[info_index]); + if (!fi) + return -FI_ENOMEM; + + fi->domain_attr->name = strdup(nic_if->info->device_name); + if (!fi->domain_attr->name) + return -ENOMEM; + + addr.nic = nic_if->info->nic_addr; + addr.pid = C_PID_ANY; + fi->src_addr = mem_dup(&addr, sizeof(addr)); + if (!fi->src_addr) { + ret = -ENOMEM; + goto err; + } + fi->src_addrlen = sizeof(addr); + + ret = cxip_nic_alloc(nic_if, &fi->nic); + if (ret != FI_SUCCESS) + goto err; + + *info = fi; + return FI_SUCCESS; + +err: + fi_freeinfo((void *)fi); + return ret; +} + +/* + * cxip_info_init() - Initialize fabric info for each CXI interface. + */ +static int cxip_info_init(void) +{ + struct slist_entry *entry, *prev __attribute__ ((unused)); + struct cxip_if *tmp; + struct cxip_if *nic_if; + struct fi_info **fi_list = (void *)&cxip_util_prov.info; + struct fi_info *fi; + int ndx; + int ret; + + slist_foreach(&cxip_if_list, entry, prev) { + /* Bit hacky... but use cxip_if list entry as input to + * cxip_get_if(). cxip_get_if() will init a cxil_dev which is + * used to build a NIC info. + */ + tmp = container_of(entry, struct cxip_if, if_entry); + ret = cxip_get_if(tmp->info->nic_addr, &nic_if); + if (ret != FI_SUCCESS) + continue; + + for (ndx = 0; ndx < ARRAY_SIZE(cxip_infos); ndx++) { + ret = cxip_info_alloc(nic_if, ndx, &fi); + if (ret == -FI_ENODATA) + continue;; + if (ret != FI_SUCCESS) { + cxip_put_if(nic_if); + goto free_info; + } + + CXIP_DBG("%s info created\n", + nic_if->info->device_name); + *fi_list = fi; + fi_list = &(fi->next); + } + } + + return FI_SUCCESS; + +free_info: + fi_freeinfo((void *)cxip_util_prov.info); + return ret; +} + +static bool cxip_env_validate_device_token(const char *device_token) +{ + unsigned int device_index; + unsigned int device_strlen; + + /* Only allow for device tokens of cxi0 - cxi99. */ + device_strlen = strlen(device_token); + if (device_strlen != 4 && device_strlen != 5) + return false; + + /* Ensure device token is of cxi## format. */ + if (sscanf(device_token, "cxi%u", &device_index) != 1) + return false; + + /* Ensure that a device string length of 5 chars is only true if the + * device index is greater than 9. 
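+ * For example, "cxi7" and "cxi42" are accepted, while "cxi07" is rejected.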
+ */ + if (device_strlen == 5 && device_index < 10) + return false; + + return true; +} + +static int cxip_env_validate_device_name(const char *device_name) +{ + const char *device_token; + char *device_name_copy; + int ret = FI_SUCCESS; + + device_name_copy = malloc(strlen(device_name) + 1); + if (!device_name_copy) + return -FI_ENOMEM; + + strcpy(device_name_copy, device_name); + + device_token = strtok(device_name_copy, ","); + while (device_token != NULL) { + if (!cxip_env_validate_device_token(device_token)) { + ret = -FI_EINVAL; + break; + } + + device_token = strtok(NULL, ","); + } + + free(device_name_copy); + + return ret; +} + +static int cxip_env_validate_url(const char *url) +{ + /* Trying to validate further is likely to generate false failures */ + if (url && strlen(url) > 7 && !strncasecmp(url, "http://", 7)) + return FI_SUCCESS; + if (url && strlen(url) > 8 && !strncasecmp(url, "https://", 8)) + return FI_SUCCESS; + return -FI_EINVAL; +} + +static const char * const cxip_rdzv_proto_strs[] = { + [CXIP_RDZV_PROTO_DEFAULT] = "default", + [CXIP_RDZV_PROTO_ALT_READ] = "alt_read", + [CXIP_RDZV_PROTO_ALT_WRITE] = "alt_write", +}; + +const char *cxip_rdzv_proto_to_str(enum cxip_rdzv_proto proto) +{ + if (proto > CXIP_RDZV_PROTO_ALT_WRITE) + return NULL; + + return cxip_rdzv_proto_strs[proto]; +} + +/* Provider environment variables are FI_CXI_{NAME} in all-caps */ +struct cxip_environment cxip_env = { + .odp = false, + .force_odp = false, + .ats = false, + .iotlb = true, + .ats_mlock_mode = CXIP_ATS_MLOCK_ALL, + .fork_safe_requested = false, + .rx_match_mode = CXIP_PTLTE_DEFAULT_MODE, + .rdzv_threshold = CXIP_RDZV_THRESHOLD, + .rdzv_get_min = 2049, /* Avoid single packet Gets */ + .rdzv_eager_size = CXIP_RDZV_THRESHOLD, + .rdzv_aligned_sw_rget = 1, + .disable_non_inject_msg_idc = 0, + .disable_host_register = 0, + .oflow_buf_size = CXIP_OFLOW_BUF_SIZE, + .oflow_buf_min_posted = CXIP_OFLOW_BUF_MIN_POSTED, + .oflow_buf_max_cached = CXIP_OFLOW_BUF_MAX_CACHED, + .safe_devmem_copy_threshold = CXIP_SAFE_DEVMEM_COPY_THRESH, + .optimized_mrs = true, + .mr_match_events = false, + .prov_key_cache = true, + .llring_mode = CXIP_LLRING_IDLE, + .cq_policy = CXI_CQ_UPDATE_LOW_FREQ_EMPTY, + .default_vni = 10, + .eq_ack_batch_size = 32, + .req_buf_size = CXIP_REQ_BUF_SIZE, + .req_buf_min_posted = CXIP_REQ_BUF_MIN_POSTED, + .req_buf_max_cached = CXIP_REQ_BUF_MAX_CACHED, + .msg_offload = 1, + .msg_lossless = 0, + .sw_rx_tx_init_max = CXIP_SW_RX_TX_INIT_MAX_DEFAULT, + .hybrid_preemptive = 0, + .hybrid_recv_preemptive = 0, + .hybrid_posted_recv_preemptive = 0, + .hybrid_unexpected_msg_preemptive = 0, + .fc_retry_usec_delay = 1000, + .ctrl_rx_eq_max_size = 67108864, + .default_cq_size = CXIP_CQ_DEF_SZ, + .default_tx_size = CXIP_DEFAULT_TX_SIZE, + .default_rx_size = CXIP_DEFAULT_RX_SIZE, + .disable_eq_hugetlb = false, + .zbcoll_radix = 2, + .cq_fill_percent = 50, + .enable_unrestricted_end_ro = true, + .rget_tc = FI_TC_UNSPEC, + .cacheline_size = CXIP_DEFAULT_CACHE_LINE_SIZE, + .coll_job_id = NULL, + .coll_job_step_id = NULL, + .coll_mcast_token = NULL, + .hwcoll_addrs_per_job = 0, + .hwcoll_min_nodes = -1, + .coll_fabric_mgr_url = NULL, + .coll_retry_usec = CXIP_COLL_MAX_RETRY_USEC, + .coll_timeout_usec = CXIP_COLL_MAX_TIMEOUT_USEC, + .coll_use_dma_put = false, + .telemetry_rgid = -1, + .disable_hmem_dev_register = 0, + .ze_hmem_supported = 0, + .rdzv_proto = CXIP_RDZV_PROTO_DEFAULT, + .enable_trig_op_limit = false, +}; + +static void cxip_env_init(void) +{ + char *param_str = NULL; + size_t min_free; + 
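+ + /* Usage note (illustrative): any of the parameters defined below can be + * set in the launch environment using the FI_CXI_ prefix described above, + * for example: + * + * FI_CXI_RX_MATCH_MODE=software FI_CXI_RDZV_THRESHOLD=16384 ./app + */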
int ret; + + gethostname(cxip_env.hostname, sizeof(cxip_env.hostname)); + + fi_param_define(&cxip_prov, "rget_tc", FI_PARAM_STRING, + "Traffic class used for software initiated rendezvous gets."); + fi_param_get_str(&cxip_prov, "rget_tc", &param_str); + + if (param_str) { + if (!strcmp(param_str, "BEST_EFFORT")) + cxip_env.rget_tc = FI_TC_BEST_EFFORT; + else if (!strcmp(param_str, "LOW_LATENCY")) + cxip_env.rget_tc = FI_TC_LOW_LATENCY; + else if (!strcmp(param_str, "DEDICATED_ACCESS")) + cxip_env.rget_tc = FI_TC_DEDICATED_ACCESS; + else if (!strcmp(param_str, "BULK_DATA")) + cxip_env.rget_tc = FI_TC_BULK_DATA; + else + CXIP_WARN("Unrecognized rget_tc: %s\n", param_str); + param_str = NULL; + } + + cxip_env.cacheline_size = cxip_cacheline_size(); + CXIP_DBG("Provider using cacheline size of %d\n", + cxip_env.cacheline_size); + + fi_param_define(&cxip_prov, "rdzv_aligned_sw_rget", FI_PARAM_BOOL, + "Enables SW RGet address alignment (default: %d).", + cxip_env.rdzv_aligned_sw_rget); + fi_param_get_bool(&cxip_prov, "rdzv_aligned_sw_rget", + &cxip_env.rdzv_aligned_sw_rget); + + fi_param_define(&cxip_prov, "enable_trig_op_limit", FI_PARAM_BOOL, + "Enable enforcement of the triggered operation limit. " + "Enabling this can degrade " + "fi_control(FI_QUEUE_WORK) performance but avoids " + "potential deadlock. If disabled, applications " + "must prevent deadlock by ensuring the triggered op " + "limit is not exceeded. Default: %d.", + cxip_env.enable_trig_op_limit); + fi_param_get_bool(&cxip_prov, "enable_trig_op_limit", + &cxip_env.enable_trig_op_limit); + + fi_param_define(&cxip_prov, "disable_non_inject_msg_idc", FI_PARAM_BOOL, + "Disables IDC for non-inject messages (default: %d).", + cxip_env.disable_non_inject_msg_idc); + fi_param_get_bool(&cxip_prov, "disable_non_inject_msg_idc", + &cxip_env.disable_non_inject_msg_idc); + + fi_param_define(&cxip_prov, "disable_host_register", FI_PARAM_BOOL, + "Disables host buffer GPU registration (default: %d).", + cxip_env.disable_host_register); + fi_param_get_bool(&cxip_prov, "disable_host_register", + &cxip_env.disable_host_register); + + fi_param_define(&cxip_prov, "enable_unrestricted_end_ro", FI_PARAM_BOOL, + "Default: %d", cxip_env.enable_unrestricted_end_ro); + fi_param_get_bool(&cxip_prov, "enable_unrestricted_end_ro", + &cxip_env.enable_unrestricted_end_ro); + + fi_param_define(&cxip_prov, "odp", FI_PARAM_BOOL, + "Enables on-demand paging (default %d).", cxip_env.odp); + fi_param_get_bool(&cxip_prov, "odp", &cxip_env.odp); + + fi_param_define(&cxip_prov, "force_odp", FI_PARAM_BOOL, + "Force use of on-demand paging (default %d).", + cxip_env.force_odp); + fi_param_get_bool(&cxip_prov, "force_odp", &cxip_env.force_odp); + if (cxip_env.force_odp && !cxip_env.odp) { + cxip_env.odp = true; + CXIP_INFO("Forcing ODP usage also enables ODP mode\n"); + } + + fi_param_define(&cxip_prov, "ats", FI_PARAM_BOOL, + "Enables PCIe ATS."); + fi_param_get_bool(&cxip_prov, "ats", &cxip_env.ats); + + fi_param_define(&cxip_prov, "iotlb", FI_PARAM_BOOL, + "Enables the NIC IOTLB (default %d).", cxip_env.iotlb); + fi_param_get_bool(&cxip_prov, "iotlb", &cxip_env.iotlb); + + fi_param_define(&cxip_prov, "ats_mlock_mode", FI_PARAM_STRING, + "Sets ATS mlock mode (off | all)."); + fi_param_get_str(&cxip_prov, "ats_mlock_mode", &param_str); + + if (param_str) { + if (!strcmp(param_str, "off")) + cxip_env.ats_mlock_mode = CXIP_ATS_MLOCK_OFF; + else if (!strcmp(param_str, "all")) + cxip_env.ats_mlock_mode = CXIP_ATS_MLOCK_ALL; + else + CXIP_WARN("Unrecognized ats_mlock_mode: %s\n", +
param_str); + param_str = NULL; + } + + fi_param_define(&cxip_prov, "device_name", FI_PARAM_STRING, + "Restrict CXI provider to specific CXI devices. Format is a comma separated list of CXI devices (e.g. cxi0,cxi1)."); + fi_param_get_str(&cxip_prov, "device_name", &cxip_env.device_name); + + if (cxip_env.device_name) { + ret = cxip_env_validate_device_name(cxip_env.device_name); + if (ret) { + CXIP_WARN("Failed to validate device name: name=%s rc=%d. Ignoring device name.\n", + cxip_env.device_name, ret); + cxip_env.device_name = NULL; + } + } + + /* Keep track of whether CXI_FORK_SAFE/CXI_FORK_SAFE_HP was requested. This + * is used to avoid mapping memory in some cases. + */ + if (getenv("CXI_FORK_SAFE") || getenv("CXI_FORK_SAFE_HP")) + cxip_env.fork_safe_requested = true; + + /* The counters env string is validated when the cxip_env.telemetry string + * is used. + */ + fi_param_define(&cxip_prov, "telemetry", FI_PARAM_STRING, + "Perform a telemetry delta capture between fi_domain open and close. " + "Format is a comma separated list of telemetry files as defined in /sys/class/cxi/cxi*/device/telemetry/. " + "Default is counter delta capture disabled."); + fi_param_get_str(&cxip_prov, "telemetry", &cxip_env.telemetry); + + fi_param_define(&cxip_prov, "telemetry_rgid", FI_PARAM_INT, + "Resource group ID (RGID) to restrict the telemetry collection to. " + "Value less than 0 is no restrictions. " + "Default is no restrictions."); + fi_param_get_int(&cxip_prov, "telemetry_rgid", + &cxip_env.telemetry_rgid); + + fi_param_define(&cxip_prov, "rx_match_mode", FI_PARAM_STRING, + "Sets RX message match mode (hardware | software | hybrid)."); + fi_param_get_str(&cxip_prov, "rx_match_mode", &param_str); + + if (param_str) { + if (!strcasecmp(param_str, "hardware")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } else if (!strcmp(param_str, "software")) { + cxip_env.rx_match_mode = CXIP_PTLTE_SOFTWARE_MODE; + cxip_env.msg_offload = false; + } else if (!strcmp(param_str, "hybrid")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HYBRID_MODE; + cxip_env.msg_offload = true; + } else { + CXIP_WARN("Unrecognized rx_match_mode: %s\n", + param_str); + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } + + param_str = NULL; + } + + fi_param_define(&cxip_prov, "rdzv_threshold", FI_PARAM_SIZE_T, + "Message size threshold for rendezvous protocol."); + fi_param_get_size_t(&cxip_prov, "rdzv_threshold", + &cxip_env.rdzv_threshold); + + /* The rendezvous protocol does not support FI_INJECT, so make sure + * an eager send message is selected for FI_INJECT sizes. + */ + if (cxip_env.rdzv_threshold < CXIP_INJECT_SIZE) { + cxip_env.rdzv_threshold = CXIP_INJECT_SIZE; + CXIP_WARN("Increased rdzv_threshold size to: %lu\n", + cxip_env.rdzv_threshold); + } + + /* If aligned SW Rget is enabled, rendezvous eager data must + * be greater than cache-line size.
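+ * (Illustrative: on a system with 128-byte cache lines, a requested + * 64-byte threshold would be raised to 128 bytes.)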
+ */ + if (cxip_env.rdzv_aligned_sw_rget && + cxip_env.rdzv_threshold < cxip_env.cacheline_size) { + cxip_env.rdzv_threshold = cxip_env.cacheline_size; + CXIP_WARN("Increased rdzv_threshold size to: %lu\n", + cxip_env.rdzv_threshold); + } + + fi_param_define(&cxip_prov, "rdzv_get_min", FI_PARAM_SIZE_T, + "Minimum rendezvous Get payload size."); + fi_param_get_size_t(&cxip_prov, "rdzv_get_min", + &cxip_env.rdzv_get_min); + + fi_param_define(&cxip_prov, "rdzv_eager_size", FI_PARAM_SIZE_T, + "Eager data size for rendezvous protocol."); + fi_param_get_size_t(&cxip_prov, "rdzv_eager_size", + &cxip_env.rdzv_eager_size); + + if (cxip_env.rdzv_eager_size > cxip_env.rdzv_threshold) { + cxip_env.rdzv_eager_size = cxip_env.rdzv_threshold; + CXIP_WARN("Invalid rdzv_eager_size, new size: %lu\n", + cxip_env.rdzv_eager_size); + } + + fi_param_define(&cxip_prov, "oflow_buf_size", FI_PARAM_SIZE_T, + "Overflow buffer size."); + fi_param_get_size_t(&cxip_prov, "oflow_buf_size", + &cxip_env.oflow_buf_size); + + if (cxip_env.rdzv_threshold > cxip_env.oflow_buf_size) { + CXIP_WARN("Invalid rdzv_threshold: %lu\n", + cxip_env.rdzv_threshold); + cxip_env.rdzv_threshold = CXIP_RDZV_THRESHOLD; + } + + if (cxip_env.rdzv_get_min > + (cxip_env.oflow_buf_size - cxip_env.rdzv_threshold)) { + CXIP_WARN("Invalid rdzv_get_min: %lu\n", + cxip_env.rdzv_get_min); + cxip_env.rdzv_get_min = 0; + } + + /* Allow either FI_CXI_OFLOW_BUF_COUNT or FI_CXI_FLOW_BUF_MIN_POSTED */ + fi_param_define(&cxip_prov, "oflow_buf_count", FI_PARAM_SIZE_T, + "Overflow buffer count/min posted."); + fi_param_get_size_t(&cxip_prov, "oflow_buf_count", + &cxip_env.oflow_buf_min_posted); + fi_param_define(&cxip_prov, "oflow_buf_min_posted", FI_PARAM_SIZE_T, + "Overflow buffer count/min posted."); + fi_param_get_size_t(&cxip_prov, "oflow_buf_min_posted", + &cxip_env.oflow_buf_min_posted); + cxip_env.oflow_buf_max_cached = cxip_env.oflow_buf_min_posted * 3; + + fi_param_define(&cxip_prov, "oflow_buf_max_cached", FI_PARAM_SIZE_T, + "Maximum number of overflow buffers cached."); + fi_param_get_size_t(&cxip_prov, "oflow_buf_max_cached", + &cxip_env.oflow_buf_max_cached); + if (cxip_env.oflow_buf_max_cached && cxip_env.oflow_buf_max_cached < + cxip_env.oflow_buf_min_posted) { + cxip_env.oflow_buf_max_cached = cxip_env.oflow_buf_min_posted; + CXIP_WARN("Adjusted oflow buffer max cached to %lu\n", + cxip_env.oflow_buf_max_cached); + } + + fi_param_define(&cxip_prov, "safe_devmem_copy_threshold", + FI_PARAM_SIZE_T, + "Max memcpy for load/store HMEM access (default %lu).", + cxip_env.safe_devmem_copy_threshold); + fi_param_get_size_t(&cxip_prov, "safe_devmem_copy_threshold", + &cxip_env.safe_devmem_copy_threshold); + + fi_param_define(&cxip_prov, "optimized_mrs", FI_PARAM_BOOL, + "Enables optimized memory regions."); + fi_param_get_bool(&cxip_prov, "optimized_mrs", + &cxip_env.optimized_mrs); + + fi_param_define(&cxip_prov, "mr_match_events", FI_PARAM_BOOL, + "Enable MR match counting (default %lu).", + &cxip_env.mr_match_events); + fi_param_get_bool(&cxip_prov, "mr_match_events", + &cxip_env.mr_match_events); + + fi_param_define(&cxip_prov, "prov_key_cache", FI_PARAM_BOOL, + "Disable caching of FI_MR_PROV_KEY (default %lu).", + &cxip_env.prov_key_cache); + fi_param_get_bool(&cxip_prov, "prov_key_cache", + &cxip_env.prov_key_cache); + + fi_param_define(&cxip_prov, "llring_mode", FI_PARAM_STRING, + "Set low-latency command queue ring mode."); + fi_param_get_str(&cxip_prov, "llring_mode", ¶m_str); + + if (param_str) { + if (!strcmp(param_str, "always")) + 
cxip_env.llring_mode = CXIP_LLRING_ALWAYS; + else if (!strcmp(param_str, "idle")) + cxip_env.llring_mode = CXIP_LLRING_IDLE; + else if (!strcmp(param_str, "never")) + cxip_env.llring_mode = CXIP_LLRING_NEVER; + else + CXIP_WARN("Unrecognized llring_mode: %s\n", + param_str); + + param_str = NULL; + } + + fi_param_define(&cxip_prov, "zbcoll_radix", FI_PARAM_INT, + "Set radix of the zero-byte barrier tree."); + fi_param_get_int(&cxip_prov, "zbcoll_radix", &cxip_env.zbcoll_radix); + if (cxip_env.zbcoll_radix < 2) { + CXIP_WARN("Invalid zbcoll_radix=%d, reset to 2\n", + cxip_env.zbcoll_radix); + cxip_env.zbcoll_radix = 2; + } + + fi_param_define(&cxip_prov, "cq_policy", FI_PARAM_STRING, + "Set Command Queue write-back policy."); + fi_param_get_str(&cxip_prov, "cq_policy", ¶m_str); + + if (param_str) { + if (!strcmp(param_str, "always")) + cxip_env.cq_policy = CXI_CQ_UPDATE_ALWAYS; + else if (!strcmp(param_str, "high_empty")) + cxip_env.cq_policy = CXI_CQ_UPDATE_HIGH_FREQ_EMPTY; + else if (!strcmp(param_str, "low_empty")) + cxip_env.cq_policy = CXI_CQ_UPDATE_LOW_FREQ_EMPTY; + else if (!strcmp(param_str, "low")) + cxip_env.cq_policy = CXI_CQ_UPDATE_LOW_FREQ; + else + CXIP_WARN("Unrecognized cq_policy: %s\n", + param_str); + + param_str = NULL; + } + + fi_param_define(&cxip_prov, "default_vni", FI_PARAM_SIZE_T, + "Default VNI value used only for service IDs where the VNI is not restricted."); + fi_param_get_size_t(&cxip_prov, "default_vni", &cxip_env.default_vni); + + fi_param_define(&cxip_prov, "eq_ack_batch_size", FI_PARAM_SIZE_T, + "Number of EQ events to process before acknowledgement"); + fi_param_get_size_t(&cxip_prov, "eq_ack_batch_size", + &cxip_env.eq_ack_batch_size); + + if (!cxip_env.eq_ack_batch_size) + cxip_env.eq_ack_batch_size = 1; + + fi_param_define(&cxip_prov, "msg_lossless", FI_PARAM_BOOL, + "Enable/Disable lossless message matching."); + fi_param_get_bool(&cxip_prov, "msg_lossless", &cxip_env.msg_lossless); + + fi_param_define(&cxip_prov, "req_buf_size", FI_PARAM_SIZE_T, + "Size of request buffer."); + fi_param_get_size_t(&cxip_prov, "req_buf_size", &cxip_env.req_buf_size); + + fi_param_define(&cxip_prov, "req_buf_min_posted", FI_PARAM_SIZE_T, + "Minimum number of request buffer posted."); + fi_param_get_size_t(&cxip_prov, "req_buf_min_posted", + &cxip_env.req_buf_min_posted); + + /* Allow either FI_CXI_REQ_BUF_MAX_CACHED or FI_CXI_REQ_BUF_MAX_COUNT */ + fi_param_define(&cxip_prov, "req_buf_max_count", FI_PARAM_SIZE_T, + "Maximum number of request buffer cached."); + fi_param_get_size_t(&cxip_prov, "req_buf_max_count", + &cxip_env.req_buf_max_cached); + fi_param_define(&cxip_prov, "req_buf_max_cached", FI_PARAM_SIZE_T, + "Maximum number of request buffer cached."); + fi_param_get_size_t(&cxip_prov, "req_buf_max_cached", + &cxip_env.req_buf_max_cached); + + /* Parameters to tailor hybrid hardware to software transitions + * that are initiated by software. 
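+ * These settings only apply when FI_CXI_RX_MATCH_MODE=hybrid; in any other + * match mode they are reset to 0 with a warning below.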
+ */ + fi_param_define(&cxip_prov, "hybrid_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive UX transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_preemptive", + &cxip_env.hybrid_preemptive); + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_preemptive) { + cxip_env.hybrid_preemptive = false; + CXIP_WARN("Not in hybrid mode, ignoring preemptive\n"); + } + + fi_param_define(&cxip_prov, "hybrid_recv_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive recv transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_recv_preemptive", + &cxip_env.hybrid_recv_preemptive); + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_recv_preemptive) { + CXIP_WARN("Not in hybrid mode, ignore LE recv preemptive\n"); + cxip_env.hybrid_recv_preemptive = 0; + } + + fi_param_define(&cxip_prov, "hybrid_posted_recv_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of posted receives exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_posted_recv_preemptive", + &cxip_env.hybrid_posted_recv_preemptive); + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_posted_recv_preemptive) { + CXIP_WARN("Not in hybrid mode, ignore hybrid_posted_recv_preemptive\n"); + cxip_env.hybrid_posted_recv_preemptive = 0; + } + + fi_param_define(&cxip_prov, "hybrid_unexpected_msg_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of hardware unexpected messages exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_unexpected_msg_preemptive", + &cxip_env.hybrid_unexpected_msg_preemptive); + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_unexpected_msg_preemptive) { + CXIP_WARN("Not in hybrid mode, ignore hybrid_unexpected_msg_preemptive\n"); + cxip_env.hybrid_unexpected_msg_preemptive = 0; + } + + if (cxip_software_pte_allowed()) { + min_free = CXIP_REQ_BUF_HEADER_MAX_SIZE + + cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; + + if (cxip_env.req_buf_size < min_free) { + cxip_env.req_buf_size = min_free; + CXIP_WARN("Requested request buffer size to small. Setting to %lu bytes\n", + cxip_env.req_buf_size); + } + + if (cxip_env.req_buf_min_posted < 2) { + cxip_env.req_buf_min_posted = 2; + CXIP_WARN("Adjusted request buffer min posted to %lu\n", + cxip_env.req_buf_min_posted); + } + + /* Zero max count is unlimited */ + if (cxip_env.req_buf_max_cached && + cxip_env.req_buf_max_cached < cxip_env.req_buf_min_posted) { + cxip_env.req_buf_max_cached = + cxip_env.req_buf_min_posted; + CXIP_WARN("Adjusted request buffer max cached to %lu\n", + cxip_env.req_buf_max_cached); + } + } + + fi_param_define(&cxip_prov, "fc_retry_usec_delay", FI_PARAM_INT, + "Micro-second delay before retrying failed flow-control messages. Default: %d usecs", + cxip_env.fc_retry_usec_delay); + fi_param_get_int(&cxip_prov, "fc_retry_usec_delay", + &cxip_env.fc_retry_usec_delay); + if (cxip_env.fc_retry_usec_delay < 0) { + cxip_env.fc_retry_usec_delay = 0; + CXIP_WARN("FC retry delay invalid. Setting to %d usecs\n", + cxip_env.fc_retry_usec_delay); + } + + fi_param_define(&cxip_prov, "sw_rx_tx_init_max", FI_PARAM_INT, + "Max TX S/W RX processing will initiate. 
Default: %d", + cxip_env.sw_rx_tx_init_max); + fi_param_get_int(&cxip_prov, "sw_rx_tx_init_max", + &cxip_env.sw_rx_tx_init_max); + if (cxip_env.sw_rx_tx_init_max < CXIP_SW_RX_TX_INIT_MIN) { + cxip_env.sw_rx_tx_init_max = CXIP_SW_RX_TX_INIT_MIN; + CXIP_WARN("Max TX S/W RX processing initiates adjusted to: %d", + cxip_env.sw_rx_tx_init_max); + } + + fi_param_define(&cxip_prov, "ctrl_rx_eq_max_size", FI_PARAM_SIZE_T, + "Control receive event queue max size. Values are aligned up to 4KiB. Default: %lu bytes", + cxip_env.ctrl_rx_eq_max_size); + fi_param_get_size_t(&cxip_prov, "ctrl_rx_eq_max_size", + &cxip_env.ctrl_rx_eq_max_size); + + fi_param_define(&cxip_prov, "default_cq_size", FI_PARAM_SIZE_T, + "Default provider CQ size (default: %lu).", + cxip_env.default_cq_size); + fi_param_get_size_t(&cxip_prov, "default_cq_size", + &cxip_env.default_cq_size); + if (cxip_env.default_cq_size == 0) { + cxip_env.default_cq_size = CXIP_CQ_DEF_SZ; + CXIP_WARN("Default CQ size invalid. Setting to %lu\n", + cxip_env.default_cq_size); + } + + /* FI_CXI_DISABLE_EQ_HUGETLB will deprecate use of + * FI_CXI_DISABLE_CQ_HUGETLB, both are allowed for now. + */ + fi_param_define(&cxip_prov, "disable_cq_hugetlb", FI_PARAM_BOOL, + "Disable 2MiB hugetlb allocates for HW event queues (default: %u).", + cxip_env.disable_eq_hugetlb); + fi_param_get_bool(&cxip_prov, "disable_cq_hugetlb", + &cxip_env.disable_eq_hugetlb); + fi_param_define(&cxip_prov, "disable_eq_hugetlb", FI_PARAM_BOOL, + "Disable 2MiB hugetlb allocates for HW event queues (default: %u).", + cxip_env.disable_eq_hugetlb); + fi_param_get_bool(&cxip_prov, "disable_eq_hugetlb", + &cxip_env.disable_eq_hugetlb); + + fi_param_define(&cxip_prov, "cq_fill_percent", FI_PARAM_SIZE_T, + "Fill percent of underlying hardware event queue used to determine when completion queue is saturated (default: %lu).", + cxip_env.cq_fill_percent); + fi_param_get_size_t(&cxip_prov, "cq_fill_percent", + &cxip_env.cq_fill_percent); + + if (cxip_env.cq_fill_percent < 1 || + cxip_env.cq_fill_percent > 100) { + cxip_env.cq_fill_percent = 50; + CXIP_WARN("CQ fill percent invalid. Setting to %lu.\n", + cxip_env.cq_fill_percent); + } + + fi_param_define(&cxip_prov, "coll_job_id", FI_PARAM_STRING, + "Collective job identifier (default %s).", + cxip_env.coll_job_id); + fi_param_get_str(&cxip_prov, "coll_job_id", + &cxip_env.coll_job_id); + + fi_param_define(&cxip_prov, "coll_job_step_id", FI_PARAM_STRING, + "Collective job-step identifier (default %s).", + cxip_env.coll_job_step_id); + fi_param_get_str(&cxip_prov, "coll_job_step_id", + &cxip_env.coll_job_step_id); + + fi_param_define(&cxip_prov, "coll_fabric_mgr_url", FI_PARAM_STRING, + "Fabric multicast REST API URL (default %s).", + cxip_env.coll_fabric_mgr_url); + fi_param_get_str(&cxip_prov, "coll_fabric_mgr_url", + &cxip_env.coll_fabric_mgr_url); + if (cxip_env.coll_fabric_mgr_url) { + ret = cxip_env_validate_url(cxip_env.coll_fabric_mgr_url); + if (ret) { + CXIP_WARN("Failed to validate fabric multicast URL: name=%s rc=%d. 
Ignoring URL.\n", + cxip_env.coll_fabric_mgr_url, ret); + cxip_env.coll_fabric_mgr_url = NULL; + } + } + + fi_param_define(&cxip_prov, "coll_mcast_token", FI_PARAM_STRING, + "Fabric multicast REST API TOKEN (default none).", + cxip_env.coll_mcast_token); + fi_param_get_str(&cxip_prov, "coll_mcast_token", + &cxip_env.coll_mcast_token); + + fi_param_define(&cxip_prov, "coll_use_dma_put", FI_PARAM_BOOL, + "Use DMA Put for collectives (default: %d).", + cxip_env.coll_use_dma_put); + fi_param_get_bool(&cxip_prov, "coll_use_dma_put", + &cxip_env.coll_use_dma_put); + + fi_param_define(&cxip_prov, "hwcoll_addrs_per_job", FI_PARAM_SIZE_T, + "Maximum hardware collective addresses allowed."); + fi_param_get_size_t(&cxip_prov, "hwcoll_addrs_per_job", + &cxip_env.hwcoll_addrs_per_job); + + fi_param_define(&cxip_prov, "hwcoll_min_nodes", FI_PARAM_SIZE_T, + "Minimum number of nodes required for hwcoll."); + fi_param_get_size_t(&cxip_prov, "hwcoll_min_nodes", + &cxip_env.hwcoll_min_nodes); + + fi_param_define(&cxip_prov, "coll_retry_usec", FI_PARAM_SIZE_T, + "Retry period (usec) (default %d, min %d, max %d).", + cxip_env.coll_retry_usec, CXIP_COLL_MIN_RETRY_USEC, + CXIP_COLL_MAX_RETRY_USEC); + fi_param_get_size_t(&cxip_prov, "coll_retry_usec", + &cxip_env.coll_retry_usec); + if (cxip_env.coll_retry_usec < CXIP_COLL_MIN_RETRY_USEC) + cxip_env.coll_retry_usec = CXIP_COLL_MIN_RETRY_USEC; + if (cxip_env.coll_retry_usec > CXIP_COLL_MAX_RETRY_USEC) + cxip_env.coll_retry_usec = CXIP_COLL_MAX_RETRY_USEC; + + fi_param_define(&cxip_prov, "coll_timeout_usec", FI_PARAM_SIZE_T, + "Reduction tree timeout (usec) (default %d, min %d, max %d).", + cxip_env.coll_timeout_usec, CXIP_COLL_MIN_TIMEOUT_USEC, + CXIP_COLL_MAX_TIMEOUT_USEC); + fi_param_get_size_t(&cxip_prov, "coll_timeout_usec", + &cxip_env.coll_timeout_usec); + if (cxip_env.coll_timeout_usec < CXIP_COLL_MIN_TIMEOUT_USEC) + cxip_env.coll_timeout_usec = CXIP_COLL_MIN_TIMEOUT_USEC; + if (cxip_env.coll_timeout_usec > CXIP_COLL_MAX_TIMEOUT_USEC) + cxip_env.coll_timeout_usec = CXIP_COLL_MAX_TIMEOUT_USEC; + + fi_param_define(&cxip_prov, "default_tx_size", FI_PARAM_SIZE_T, + "Default provider tx_attr.size (default: %lu).", + cxip_env.default_tx_size); + fi_param_get_size_t(&cxip_prov, "default_tx_size", + &cxip_env.default_tx_size); + if (cxip_env.default_tx_size < 16 || + cxip_env.default_tx_size > CXIP_MAX_TX_SIZE) { + cxip_env.default_tx_size = CXIP_DEFAULT_TX_SIZE; + CXIP_WARN("Default TX size invalid. Setting to %lu\n", + cxip_env.default_tx_size); + } + + fi_param_define(&cxip_prov, "default_rx_size", FI_PARAM_SIZE_T, + "Default provider rx_attr.size (default: %lu).", + cxip_env.default_rx_size); + fi_param_get_size_t(&cxip_prov, "default_rx_size", + &cxip_env.default_rx_size); + if (cxip_env.default_rx_size < 16 || + cxip_env.default_rx_size > CXIP_MAX_RX_SIZE) { + cxip_env.default_rx_size = CXIP_DEFAULT_RX_SIZE; + CXIP_WARN("Default RX size invalid. Setting to %lu\n", + cxip_env.default_rx_size); + } + + fi_param_define(&cxip_prov, "disable_hmem_dev_register", FI_PARAM_BOOL, + "Disable registering HMEM device buffer for load/store access (default: %u).", + cxip_env.disable_hmem_dev_register); + fi_param_get_bool(&cxip_prov, "disable_hmem_dev_register", + &cxip_env.disable_hmem_dev_register); + + /* Check if ZE device memory can be supported. Provide env var to + * override just in case these checks become invalid. 
+ */ + fi_param_define(&cxip_prov, "force_ze_hmem_support", FI_PARAM_BOOL, + "Disable ZE implicit scaling and KDM checks and force ZE HMEM support."); + fi_param_get_bool(&cxip_prov, "force_ze_hmem_support", + &cxip_env.ze_hmem_supported); + + if (!cxip_env.ze_hmem_supported) { + param_str = getenv("EnableImplicitScaling"); + if (param_str && atoi(param_str) == 0) { + param_str = getenv("NEOReadDebugKeys"); + if (param_str && atoi(param_str) == 1) + cxip_env.ze_hmem_supported = 1; + } + param_str = NULL; + } + + fi_param_define(&cxip_prov, "rdzv_proto", FI_PARAM_STRING, + "Sets preferred rendezvous protocol [default | alt_read] (default %s).", + cxip_rdzv_proto_to_str(cxip_env.rdzv_proto)); + fi_param_get_str(&cxip_prov, "rdzv_proto", ¶m_str); + + if (param_str) { + char *ch = param_str; + int chars = 8; + + while (ch && chars) { + if (*ch == '-') + *ch = '_'; + ch++; + chars--; + } + + if (!strcmp(param_str, "default")) + cxip_env.rdzv_proto = CXIP_RDZV_PROTO_DEFAULT; + else if (!strcmp(param_str, "alt_read")) + cxip_env.rdzv_proto = CXIP_RDZV_PROTO_ALT_READ; + else { + CXIP_WARN("Unrecognized rendezvous protocol: %s\n", + param_str); + cxip_env.rdzv_proto = CXIP_RDZV_PROTO_DEFAULT; + } + + param_str = NULL; + } + + set_system_page_size(); +} + +/* + * CXI_INI - Provider constructor. + */ +CXI_INI +{ + cxip_env_init(); + + cxip_curl_init(); + + cxip_if_init(); + + cxip_info_init(); + + cxip_fault_inject_init(); + + return &cxip_prov; +} + +/* + * cxip_fini() - Provider destructor. + */ +static void cxip_fini(void) +{ + cxip_fault_inject_fini(); + + fi_freeinfo((void *)cxip_util_prov.info); + + cxip_if_fini(); + + cxip_curl_fini(); +} + +static void cxip_alter_caps(struct fi_info *info, const struct fi_info *hints) +{ + /* If FI_COLLECTIVE explicitly requested then must enable + * FI_MSG for send and receive if not already enabled. + */ + if (hints && hints->caps && (hints->caps & FI_COLLECTIVE)) { + if (!(info->caps & (FI_MSG | FI_TAGGED))) { + info->caps |= FI_MSG | FI_SEND | FI_RECV; + info->tx_attr->caps |= FI_MSG | FI_SEND; + info->rx_attr->caps |= FI_MSG | FI_RECV; + } + } +} + +static void cxip_alter_tx_attr(struct fi_tx_attr *attr, + const struct fi_tx_attr *hints, + uint64_t info_caps) +{ + if (!hints || hints->size == 0) + attr->size = cxip_env.default_tx_size; +} + +static void cxip_alter_rx_attr(struct fi_rx_attr *attr, + const struct fi_rx_attr *hints, + uint64_t info_caps) +{ + if (!hints || hints->size == 0) + attr->size = cxip_env.default_rx_size; +} + +static void cxip_alter_info(struct fi_info *info, const struct fi_info *hints, + uint32_t api_version) +{ + for (; info; info = info->next) { + fi_control(&info->nic->fid, FI_OPT_CXI_NIC_REFRESH_ATTR, NULL); + + cxip_alter_caps(info, hints); + cxip_alter_tx_attr(info->tx_attr, hints ? hints->tx_attr : NULL, + info->caps); + cxip_alter_rx_attr(info->rx_attr, hints ? hints->rx_attr : NULL, + info->caps); + + /* Remove secondary capabilities that impact performance if + * hints are not specified. They must be explicitly requested. + */ + if (!hints) { + info->caps &= ~(FI_SOURCE | FI_SOURCE_ERR); + info->rx_attr->caps &= ~(FI_SOURCE | FI_SOURCE_ERR); + } + } +} + +static int cxip_alter_auth_key_align_domain_ep(struct fi_info **info) +{ + struct fi_info *fi_ptr; + + /* CXI provider requires the endpoint to have the same service ID as the + * domain. Account for edge case where users only set endpoint auth_key + * and leave domain auth_key as NULL by duplicating the endpoint + * auth_key to the domain. 
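+ * The duplicated key is validated later by cxip_alter_auth_key_validate() + * via cxip_check_auth_key_info().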
+ */ + for (fi_ptr = *info; fi_ptr; fi_ptr = fi_ptr->next) { + if (!fi_ptr->domain_attr->auth_key && + fi_ptr->ep_attr->auth_key) { + fi_ptr->domain_attr->auth_key = + mem_dup(fi_ptr->ep_attr->auth_key, + fi_ptr->ep_attr->auth_key_size); + if (!fi_ptr->domain_attr->auth_key) + return -FI_ENOMEM; + + fi_ptr->domain_attr->auth_key_size = + fi_ptr->ep_attr->auth_key_size; + } + } + + return FI_SUCCESS; +} + +static void cxip_alter_auth_key_scrub_auth_key_size(const struct fi_info *hints, + struct fi_info **info) +{ + struct fi_info *fi_ptr; + bool av_auth_key = false; + + if (hints && hints->domain_attr) + av_auth_key = + hints->domain_attr->auth_key_size == FI_AV_AUTH_KEY; + + /* Zero the auth_key_size for any NULL auth_key. */ + for (fi_ptr = *info; fi_ptr; fi_ptr = fi_ptr->next) { + if (!fi_ptr->domain_attr->auth_key && !av_auth_key) + fi_ptr->domain_attr->auth_key_size = 0; + + if (!fi_ptr->ep_attr->auth_key) + fi_ptr->ep_attr->auth_key_size = 0; + } +} + +static int cxip_alter_auth_key_validate(struct fi_info **info) +{ + struct fi_info *fi_ptr; + struct fi_info *fi_ptr_tmp; + struct fi_info *fi_prev_ptr; + int ret; + + /* Core auth_key checks only verify auth_key_size. This check verifies + * that the user provided auth_key is valid. + */ + fi_ptr = *info; + *info = NULL; + fi_prev_ptr = NULL; + + while (fi_ptr) { + ret = cxip_check_auth_key_info(fi_ptr); + if (ret) { + /* discard entry */ + if (fi_prev_ptr) + fi_prev_ptr->next = fi_ptr->next; + + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + if (*info == NULL) + *info = fi_ptr; + + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + + return FI_SUCCESS; +} + +int cxip_gen_auth_key(struct fi_info *info, struct cxi_auth_key *key) +{ + struct cxip_nic_attr *nic_attr; + + memset(key, 0, sizeof(*key)); + + if (info->domain_attr->auth_key) { + CXIP_WARN("Domain auth_key not NULL\n"); + return -FI_EINVAL; + } + + if (!info->nic || !info->nic->prov_attr) { + CXIP_WARN("Missing NIC provider attributes\n"); + return -FI_EINVAL; + } + + nic_attr = (struct cxip_nic_attr *)info->nic->prov_attr; + if (nic_attr->default_rgroup_id == 0) + return -FI_ENOSYS; + + key->svc_id = nic_attr->default_rgroup_id; + key->vni = nic_attr->default_vni; + + return FI_SUCCESS; +} + +static int cxip_alter_auth_key(const struct fi_info *hints, + struct fi_info **info) +{ + int ret; + + ret = cxip_alter_auth_key_align_domain_ep(info); + if (ret) + return ret; + + cxip_alter_auth_key_scrub_auth_key_size(hints, info); + + return cxip_alter_auth_key_validate(info); +} + +static int cxip_validate_iface_auth_key(struct cxip_if *iface, + struct cxi_auth_key *auth_key) +{ + if (!auth_key) + return FI_SUCCESS; + + return cxip_if_valid_rgroup_vni(iface, auth_key->svc_id, auth_key->vni); +} + +int cxip_check_auth_key_info(struct fi_info *info) +{ + struct cxip_addr *src_addr; + struct cxip_if *iface; + int ret; + + src_addr = (struct cxip_addr *)info->src_addr; + if (!src_addr) { + CXIP_WARN("NULL src_addr in fi_info\n"); + return -FI_EINVAL; + } + + ret = cxip_get_if(src_addr->nic, &iface); + if (ret) { + CXIP_WARN("cxip_get_if with NIC %#x failed: %d:%s\n", + src_addr->nic, ret, fi_strerror(-ret)); + return ret; + } + + if (info->domain_attr) { + ret = cxip_validate_iface_auth_key(iface, + (struct cxi_auth_key *)info->domain_attr->auth_key); + if (ret) { + CXIP_WARN("Invalid domain auth_key\n"); + goto err_put_if; + } + } + + if (info->ep_attr) { + ret = cxip_validate_iface_auth_key(iface, + (struct 
cxi_auth_key *)info->ep_attr->auth_key); + if (ret) { + CXIP_WARN("Invalid endpoint auth_key\n"); + goto err_put_if; + } + } + + cxip_put_if(iface); + + return FI_SUCCESS; + +err_put_if: + cxip_put_if(iface); + + return ret; +} + +/* + * cxip_getinfo() - Provider fi_getinfo() implementation. + */ +static int +cxip_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info) +{ + int ret; + struct fi_info *fi_ptr; + struct fi_info *fi_ptr_tmp; + struct fi_info *fi_prev_ptr; + struct ether_addr *mac; + uint32_t scan_nic = 0; + uint32_t scan_pid = 0; + struct cxip_addr *addr; + struct cxip_if *iface; + bool copy_dest = false; + struct fi_info *temp_hints = NULL; + + if (flags & FI_SOURCE) { + if (!node && !service) { + CXIP_WARN("FI_SOURCE set, but no node or service\n"); + return -FI_EINVAL; + } + } + + if (node) { + iface = cxip_if_lookup_name(node); + if (iface) { + scan_nic = iface->info->nic_addr; + } else if ((mac = ether_aton(node))) { + scan_nic = cxip_mac_to_nic(mac); + } else if (sscanf(node, "%i", &scan_nic) != 1) { + CXIP_WARN("Invalid node: %s\n", node); + return -FI_EINVAL; + } + + CXIP_DBG("Node NIC: %#x\n", scan_nic); + } + + if (service) { + if (sscanf(service, "%i", &scan_pid) != 1) { + CXIP_WARN("Invalid service: %s\n", service); + return -FI_EINVAL; + } + + if (scan_pid >= C_PID_ANY) { + CXIP_WARN("Service out of range [0-%d): %u\n", + C_PID_ANY, scan_pid); + return -FI_EINVAL; + } + + CXIP_DBG("Service PID: %u\n", scan_pid); + } + + /* Previously, when remote access ODP was not enabled, the provider + * did not indicate that it required FI_MR_ALLOCATED. To correct this + * while not breaking applications, when ODP is NOT enabled add + * FI_MR_ALLOCATED to the hints. Note that if the client sets + * FI_MR_UNSPEC in hints, the provider required mode bits that the + * application must support will be returned. + * + * TODO: When ODP is enabled by default, this should be removed + * and applications should use hints to pick the desired mode. + */ + if (!cxip_env.odp && hints && hints->domain_attr && + hints->domain_attr->mr_mode == FI_MR_ENDPOINT) { + temp_hints = fi_dupinfo(hints); + if (!temp_hints) + return -FI_ENOMEM; + + temp_hints->domain_attr->mr_mode |= FI_MR_ALLOCATED; + + CXIP_INFO("FI_MR_ALLOCATED added to hints MR mode\n"); + } + + /* Find all matching domains, ignoring addresses. */ + ret = util_getinfo(&cxip_util_prov, version, NULL, NULL, 0, + temp_hints ? temp_hints : hints, + info); + if (temp_hints) + fi_freeinfo(temp_hints); + + if (ret) + return ret; + + /* Remove any info that did not match the mr_mode requirements. + * Note that mr_mode FI_MR_ENDPOINT is only required if target + * RMA/ATOMIC access is required. + */ + if (hints) { + fi_ptr = *info; + *info = NULL; + fi_prev_ptr = NULL; + + while (fi_ptr) { + if (fi_ptr->caps & (FI_ATOMIC | FI_RMA) && + !fi_ptr->domain_attr->mr_mode) { + /* discard entry */ + if (fi_prev_ptr) + fi_prev_ptr->next = fi_ptr->next; + + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + /* Keep the matching info */ + if (*info == NULL) + *info = fi_ptr; + + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + } + + /* Search for a specific OFI Domain by node string. */ + if (flags & FI_SOURCE && node) { + iface = cxip_if_lookup_addr(scan_nic); + if (!iface) { + /* This shouldn't fail.
*/ + ret = -FI_EINVAL; + goto freeinfo; + } + + fi_ptr = *info; + *info = NULL; + fi_prev_ptr = NULL; + + while (fi_ptr) { + if (strcmp(fi_ptr->domain_attr->name, + iface->info->device_name)) { + /* discard entry */ + if (fi_prev_ptr) + fi_prev_ptr->next = fi_ptr->next; + + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + /* Keep the matching info */ + if (*info == NULL) + *info = fi_ptr; + + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + } + + /* Search for a specific OFI Domain by name. The CXI Domain name + * matches the NIC device file name (cxi[0-9]). + */ + if (hints && hints->domain_attr && hints->domain_attr->name) { + fi_ptr = *info; + *info = NULL; + fi_prev_ptr = NULL; + + while (fi_ptr) { + if (strcmp(fi_ptr->domain_attr->name, + hints->domain_attr->name)) { + /* discard entry */ + if (fi_prev_ptr) + fi_prev_ptr->next = fi_ptr->next; + + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + /* Keep the matching info */ + if (*info == NULL) + *info = fi_ptr; + + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + } + + cxip_alter_info(*info, hints, version); + + /* Check if any infos remain. */ + if (!*info) + return FI_SUCCESS; + + for (fi_ptr = *info; fi_ptr; fi_ptr = fi_ptr->next) { + if (flags & FI_SOURCE) { + /* Set client-assigned PID value in source address. */ + if (service) { + addr = (struct cxip_addr *)fi_ptr->src_addr; + addr->pid = scan_pid; + } + + copy_dest = (hints && hints->dest_addr); + } else { + if (node) { + struct cxip_addr addr = {}; + + addr.nic = scan_nic; + addr.pid = scan_pid; + + fi_ptr->dest_addr = mem_dup(&addr, + sizeof(addr)); + if (!fi_ptr->dest_addr) { + ret = -FI_ENOMEM; + goto freeinfo; + } + fi_ptr->dest_addrlen = sizeof(addr); + } else { + copy_dest = (hints && hints->dest_addr); + } + + if (hints && hints->src_addr) { + fi_ptr->src_addr = mem_dup(hints->src_addr, + hints->src_addrlen); + if (!fi_ptr->src_addr) { + ret = -FI_ENOMEM; + goto freeinfo; + } + fi_ptr->src_addrlen = hints->src_addrlen; + fi_ptr->addr_format = hints->addr_format; + } + } + + if (copy_dest) { + fi_ptr->dest_addr = mem_dup(hints->dest_addr, + hints->dest_addrlen); + if (!fi_ptr->dest_addr) { + ret = -FI_ENOMEM; + goto freeinfo; + } + fi_ptr->dest_addrlen = hints->dest_addrlen; + fi_ptr->addr_format = hints->addr_format; + } + } + + ret = cxip_alter_auth_key(hints, info); + if (ret) + goto freeinfo; + + /* Nothing left to do if hints weren't provided. */ + if (!hints) + return FI_SUCCESS; + + /* util_getinfo() returns a list of fi_info that match the MR mode + * for each nic. They are listed in provider preference order. + * Since hints were provided, keep only the most preferred fi_info for + * any given domain/interface using the same address format. We + * always keep the first one. + */ + fi_ptr = *info; + fi_prev_ptr = NULL; + + while (fi_ptr) { + if (fi_prev_ptr && + !strcmp(fi_ptr->domain_attr->name, + fi_prev_ptr->domain_attr->name) && + fi_ptr->addr_format == fi_prev_ptr->addr_format) { + /* discard entry */ + fi_prev_ptr->next = fi_ptr->next; + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + /* Keep the preferred info for this domain */ + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + + /* util_getinfo() returns a list of fi_info for each matching OFI + * Domain (physical CXI interface). 
+ * + * Perform fixups: + * -Use input ordering requirements. + * -Remove unrequested secondary caps that impact performance. + */ + for (fi_ptr = *info; fi_ptr; fi_ptr = fi_ptr->next) { + /* Ordering requirements prevent the use of restricted packets. + * If hints exist, copy msg_order settings directly. + */ + fi_ptr->tx_attr->msg_order = hints->tx_attr->msg_order; + + /* Requesting FI_RMA_EVENT prevents the use of restricted + * packets. Do not set FI_RMA_EVENT unless explicitly + * requested. + */ + if (hints->caps && !(hints->caps & FI_RMA_EVENT)) { + fi_ptr->caps &= ~FI_RMA_EVENT; + fi_ptr->rx_attr->caps &= ~FI_RMA_EVENT; + } + + /* FI_SOURCE_ERR requires that FI_SOURCE be set, it is + * an error if requested but can not be honored. + */ + if (hints->caps & FI_SOURCE_ERR && !(hints->caps & FI_SOURCE)) { + ret = -FI_ENODATA; + goto freeinfo; + } + + /* Requesting FI_SOURCE adds overhead to a receive operation. + * Do not set FI_SOURCE unless explicitly requested. + */ + if (!(hints->caps & FI_SOURCE)) { + fi_ptr->caps &= ~FI_SOURCE; + fi_ptr->rx_attr->caps &= ~FI_SOURCE; + } + + /* Requesting FI_SOURCE_ERR adds additional overhead to receive + * operations beyond FI_SOURCE, do not set if not explicitly + * asked. + */ + if (!(hints->caps & FI_SOURCE_ERR)) { + fi_ptr->caps &= ~FI_SOURCE_ERR; + fi_ptr->rx_attr->caps &= ~FI_SOURCE_ERR; + } + + /* Requesting FI_FENCE prevents the use PCIe RO for RMA. Do not + * set FI_FENCE unless explicitly requested. + */ + if (hints->caps && !(hints->caps & FI_FENCE)) { + fi_ptr->caps &= ~FI_FENCE; + fi_ptr->tx_attr->caps &= ~FI_FENCE; + } + + /* Requesting FI_HMEM requires use of device memory safe + * copy routines. Do not set FI_HMEM unless requested or + * all supported provider capabilities are requested. + */ + if (hints->caps && !(hints->caps & FI_HMEM)) { + fi_ptr->caps &= ~FI_HMEM; + fi_ptr->tx_attr->caps &= ~FI_HMEM; + fi_ptr->rx_attr->caps &= ~FI_HMEM; + } + } + + return FI_SUCCESS; + +freeinfo: + fi_freeinfo(*info); + + return ret; +} + +struct fi_provider cxip_prov = { + .name = cxip_prov_name, + .version = CXIP_PROV_VERSION, + .fi_version = CXIP_FI_VERSION, + .getinfo = cxip_getinfo, + .fabric = cxip_fabric, + .cleanup = cxip_fini, +}; diff --git a/prov/cxi/src/cxip_iomm.c b/prov/cxi/src/cxip_iomm.c new file mode 100644 index 00000000000..f6116f39b68 --- /dev/null +++ b/prov/cxi/src/cxip_iomm.c @@ -0,0 +1,618 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + */ + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_MR, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_MR, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_MR, __VA_ARGS__) + +#define MAP_FAIL_MSG "cxil_map lni: %d base: 0x%p len: %ld " \ + "map_flags: 0x%0X failure: %d, %s\n" + +/** + * cxip_do_map() - IO map a buffer. + */ +static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) +{ + int ret; + struct cxip_md *md = (struct cxip_md *)entry->data; + struct cxip_domain *dom; + uint32_t map_flags = CXI_MAP_READ | CXI_MAP_WRITE; + struct cxi_md_hints hints; + void *ze_handle; + void *ze_base_addr; + size_t ze_base_size; + uint64_t hmem_flags = entry->info.flags; + + dom = container_of(cache, struct cxip_domain, iomm); + + /* Prefer the ATS (scalable MD) whenever possible + * + * TODO: ATS (scalable MD) can only support CPU page sizes and should be + * avoided for non-standard page sizes. 
+ */ + if (dom->scalable_iomm && entry->info.iface == FI_HMEM_SYSTEM) { + md->md = dom->scalable_md.md; + md->dom = dom; + md->info = entry->info; + + return FI_SUCCESS; + } + + memset(&hints, 0, sizeof(hints)); + + if (entry->info.iface == FI_HMEM_SYSTEM) { + if (dom->ats) + map_flags |= CXI_MAP_ATS; + + if (!dom->odp) + map_flags |= CXI_MAP_PIN; + } else { + /* TODO: Remove PIN when DMA buf move_notify is supported. */ + map_flags |= CXI_MAP_DEVICE | CXI_MAP_PIN; + + /* ZE support requires the use of the DMA buf FD and offset + * hints fields. + */ + if (entry->info.iface == FI_HMEM_ZE) { + if (!cxip_env.ze_hmem_supported) { + CXIP_WARN("ZE device memory not supported. Try disabling implicit scaling (EnableImplicitScaling=0 NEOReadDebugKeys=1).\n"); + return -FI_ENOSYS; + } + + ret = ze_hmem_get_handle(entry->info.iov.iov_base, + entry->info.iov.iov_len, + &ze_handle); + if (ret) { + CXIP_WARN("ze_hmem_get_handle failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err; + } + + ret = ze_hmem_get_base_addr(entry->info.iov.iov_base, + entry->info.iov.iov_len, + &ze_base_addr, + &ze_base_size); + if (ret) { + CXIP_WARN("ze_hmem_get_base_addr failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err; + } + + hints.dmabuf_fd = (int)(uintptr_t)ze_handle; + hints.dmabuf_offset = + (uintptr_t)entry->info.iov.iov_base - + (uintptr_t)ze_base_addr; + hints.dmabuf_valid = true; + } + } + + if (!cxip_env.iotlb) + map_flags |= CXI_MAP_NOCACHE; + + ret = cxil_map(dom->lni->lni, entry->info.iov.iov_base, + entry->info.iov.iov_len, map_flags, &hints, &md->md); + if (ret) { + CXIP_WARN(MAP_FAIL_MSG, dom->lni->lni->id, + entry->info.iov.iov_base, entry->info.iov.iov_len, + map_flags, ret, fi_strerror(-ret)); + goto err; + } + + /* zeHostMalloc() returns FI_HMEM_ZE but this cannot currently be + * registered with ofi_hmem_dev_register(). Thus skip it. + */ + if (cxip_env.disable_hmem_dev_register || + ((entry->info.iface == FI_HMEM_ZE) && + (hmem_flags & FI_HMEM_HOST_ALLOC))) + ret = -FI_ENOSYS; + else + ret = ofi_hmem_dev_register(entry->info.iface, + entry->info.iov.iov_base, + entry->info.iov.iov_len, + &md->handle); + switch (ret) { + case FI_SUCCESS: + md->handle_valid = true; + break; + + case -FI_ENOSYS: + md->handle_valid = false; + break; + + default: + CXIP_WARN("ofi_hmem_dev_register %s failed: %d:%s\n", + fi_tostr(&entry->info.iface, FI_TYPE_HMEM_IFACE), ret, + fi_strerror(-ret)); + goto err_unmap; + } + + md->dom = dom; + md->info = entry->info; + md->cached = true; + CXIP_DBG("addr:%p end:%p len:0x%lx iova:%llx lac:%d device:%d\n", + entry->info.iov.iov_base, + (char *)entry->info.iov.iov_base + entry->info.iov.iov_len, + entry->info.iov.iov_len, md->md->iova, md->md->lac, + !!(map_flags & CXI_MAP_DEVICE)); + + return FI_SUCCESS; + +err_unmap: + cxil_unmap(md->md); +err: + md->dom = NULL; + return ret; +} + +/** + * cxip_do_unmap() - IO unmap a buffer. 
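+ * Mappings that reference the shared scalable MD are skipped here; only + * per-buffer MDs created by cxip_do_map() are unmapped and unregistered.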
+ */ +static void cxip_do_unmap(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + int ret; + struct cxip_md *md = (struct cxip_md *)entry->data; + + if (!md || !md->dom || md->md == md->dom->scalable_md.md) + return; + + if (md->handle_valid) + ofi_hmem_dev_unregister(entry->info.iface, md->handle); + + ret = cxil_unmap(md->md); + if (ret) + CXIP_WARN("cxil_unmap failed: %d\n", ret); + + CXIP_DBG("addr:%p end:%p len:0x%lx iova:%llx lac:%d\n", + entry->info.iov.iov_base, + (char *)entry->info.iov.iov_base + entry->info.iov.iov_len, + entry->info.iov.iov_len, md->md->iova, md->md->lac); +} + +static int cxip_scalable_iomm_init(struct cxip_domain *dom) +{ + int ret; + uint32_t map_flags = (CXI_MAP_READ | CXI_MAP_WRITE | CXI_MAP_ATS); + + if (!cxip_env.iotlb) + map_flags |= CXI_MAP_NOCACHE; + + ret = cxil_map(dom->lni->lni, 0, 0xfffffffffffff000, map_flags, NULL, + &dom->scalable_md.md); + if (!ret) { + dom->scalable_md.dom = dom; + dom->scalable_iomm = true; + + CXIP_DBG("Scalable IOMM enabled.\n"); + + if (cxip_env.ats_mlock_mode == CXIP_ATS_MLOCK_ALL) { + ret = mlockall(MCL_CURRENT | MCL_FUTURE); + if (ret) { + CXIP_WARN("mlockall(MCL_CURRENT | MCL_FUTURE) failed: %d\n", + -errno); + } + } + + ret = FI_SUCCESS; + } else { + ret = -FI_ENOSYS; + } + + return ret; +} + +static void cxip_scalable_iomm_fini(struct cxip_domain *dom) +{ + cxil_unmap(dom->scalable_md.md); +} + +static int cxip_ats_check(struct cxip_domain *dom) +{ + uint32_t map_flags = CXI_MAP_READ | CXI_MAP_WRITE | CXI_MAP_ATS | + CXI_MAP_PIN; + int stack_var; + struct cxi_md *md; + int ret; + + ret = cxil_map(dom->lni->lni, &stack_var, sizeof(stack_var), map_flags, + NULL, &md); + if (!ret) { + cxil_unmap(md); + CXIP_INFO("PCIe ATS supported.\n"); + return 1; + } + + CXIP_INFO("PCIe ATS not supported.\n"); + return 0; +} + +static void cxip_iomm_set_rocr_dev_mem_only(struct cxip_domain *dom) +{ + int dev_hmem_count = 0; + bool rocr_support = false; + int i; + + if (!dom->hmem) { + dom->rocr_dev_mem_only = false; + return; + } + + for (i = 0; i < OFI_HMEM_MAX; i++) { + if (i == FI_HMEM_SYSTEM) + continue; + + if (hmem_ops[i].initialized) { + dev_hmem_count++; + + if (i == FI_HMEM_ROCR) + rocr_support = true; + } + } + + /* If FI_HMEM_ROCR is the ONLY device supported by libfabric and the + * core ROCR memory monitor is used, cxip_map can be optimized to avoid + * pointer queries. + */ + if (dev_hmem_count == 1 && rocr_support && + default_rocr_monitor == rocr_monitor) + dom->rocr_dev_mem_only = true; + else + dom->rocr_dev_mem_only = false; +} + +/* + * cxip_iomm_init() - Initialize domain IO memory map. + */ +int cxip_iomm_init(struct cxip_domain *dom) +{ + struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = { + [FI_HMEM_SYSTEM] = default_monitor, + [FI_HMEM_CUDA] = default_cuda_monitor, + [FI_HMEM_ROCR] = default_rocr_monitor, + [FI_HMEM_ZE] = default_ze_monitor, + }; + enum fi_hmem_iface iface; + int ret; + bool scalable; + + /* Check if ATS is supported */ + if (cxip_env.ats && cxip_ats_check(dom)) + dom->ats = true; + + if (cxip_env.odp && !(dom->util_domain.mr_mode & FI_MR_ALLOCATED)) + dom->odp = true; + + if (dom->util_domain.info_domain_caps & FI_HMEM) + dom->hmem = true; + + scalable = dom->ats && dom->odp; + + CXIP_INFO("Domain ATS: %d ODP: %d HMEM: %d Scalable: %d\n", + dom->ats, dom->odp, dom->hmem, scalable); + + /* Unpinned ATS translation is scalable. A single MD covers all + * memory addresses and a cache isn't necessary. 
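+ * Scalable mode therefore requires both ATS and ODP (scalable = + * dom->ats && dom->odp above).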
+ */ + if (scalable) { + ret = cxip_scalable_iomm_init(dom); + if (ret) { + CXIP_WARN("cxip_scalable_iomm_init() returned: %d\n", + ret); + return ret; + } + } + + if (!scalable || dom->hmem) { + dom->iomm.entry_data_size = sizeof(struct cxip_md); + dom->iomm.add_region = cxip_do_map; + dom->iomm.delete_region = cxip_do_unmap; + ret = ofi_mr_cache_init(&dom->util_domain, memory_monitors, + &dom->iomm); + if (ret) { + CXIP_INFO("MR cache init failed: %s. MR caching disabled.\n", + fi_strerror(-ret)); + } else { + for (iface = 0; iface < OFI_HMEM_MAX; iface++) { + if (dom->iomm.monitors[iface]) + CXIP_INFO("MR cache enabled for %s memory\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); + } + } + } + + cxip_iomm_set_rocr_dev_mem_only(dom); + + return FI_SUCCESS; +} + +/* + * cxip_iomm_fini() - Finalize domain IO memory map. + */ +void cxip_iomm_fini(struct cxip_domain *dom) +{ + if (dom->scalable_iomm) + cxip_scalable_iomm_fini(dom); + + if (!dom->scalable_iomm || dom->hmem) + ofi_mr_cache_cleanup(&dom->iomm); +} + +static int cxip_map_cache(struct cxip_domain *dom, struct ofi_mr_info *info, + struct cxip_md **md) +{ + struct ofi_mr_entry *entry; + int ret; + + ret = ofi_mr_cache_search(&dom->iomm, info, &entry); + if (ret) { + CXIP_WARN("Failed to acquire mapping (%p, %lu): %d\n", + info->iov.iov_base, info->iov.iov_len, ret); + return ret; + } + + *md = (struct cxip_md *)entry->data; + + return FI_SUCCESS; +} + +static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr, + uint64_t hmem_flags, struct cxip_md **md) +{ + struct cxip_md *uncached_md; + uint32_t map_flags; + int ret; + struct cxi_md_hints hints; + void *ze_handle; + void *ze_base_addr; + size_t ze_base_size; + + /* Prefer the ATS (scalable MD) whenever possible + * + * TODO: ATS (scalable MD) can only support CPU page sizes and should be + * avoided for non-standard page sizes. + */ + if (dom->scalable_iomm && attr->iface == FI_HMEM_SYSTEM) { + *md = &dom->scalable_md; + return FI_SUCCESS; + } + + memset(&hints, 0, sizeof(hints)); + + uncached_md = calloc(1, sizeof(*uncached_md)); + if (!uncached_md) + return -FI_ENOMEM; + + map_flags = CXI_MAP_READ | CXI_MAP_WRITE; + if (attr->iface == FI_HMEM_SYSTEM) { + if (dom->ats) + map_flags |= CXI_MAP_ATS; + + if (!dom->odp) + map_flags |= CXI_MAP_PIN; + } else { + /* TODO: Remove PIN when DMA buf move_notify is supported. */ + map_flags |= CXI_MAP_DEVICE | CXI_MAP_PIN; + + /* ZE support requires the use of the DMA buf FD and offset + * hints fields. + */ + if (attr->iface == FI_HMEM_ZE) { + if (!cxip_env.ze_hmem_supported) { + CXIP_WARN("ZE device memory not supported. 
Try disabling implicit scaling (EnableImplicitScaling=0 NEOReadDebugKeys=1).\n"); + ret = -FI_ENOSYS; + goto err_free_uncached_md; + } + + ret = ze_hmem_get_handle(attr->mr_iov->iov_base, + attr->mr_iov->iov_len, + &ze_handle); + if (ret) { + CXIP_WARN("ze_hmem_get_handle failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_uncached_md; + } + + ret = ze_hmem_get_base_addr(attr->mr_iov->iov_base, + attr->mr_iov->iov_len, + &ze_base_addr, + &ze_base_size); + if (ret) { + CXIP_WARN("ze_hmem_get_base_addr failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_uncached_md; + } + + hints.dmabuf_fd = (int)(uintptr_t)ze_handle; + hints.dmabuf_offset = + (uintptr_t)attr->mr_iov->iov_base - + (uintptr_t)ze_base_addr; + hints.dmabuf_valid = true; + } + } + + if (!cxip_env.iotlb) + map_flags |= CXI_MAP_NOCACHE; + + ret = cxil_map(dom->lni->lni, attr->mr_iov->iov_base, + attr->mr_iov->iov_len, map_flags, &hints, + &uncached_md->md); + if (ret) { + CXIP_WARN("cxil_map failed: %d:%s\n", ret, fi_strerror(-ret)); + goto err_free_uncached_md; + } + + /* zeHostMalloc() returns FI_HMEM_ZE but this cannot currently be + * registered with ofi_hmem_dev_register(). Thus skip it. + */ + if (cxip_env.disable_hmem_dev_register || + ((attr->iface == FI_HMEM_ZE) && (hmem_flags & FI_HMEM_HOST_ALLOC))) + ret = -FI_ENOSYS; + else + ret = ofi_hmem_dev_register(attr->iface, + (const void *)uncached_md->md->va, + uncached_md->md->len, + &uncached_md->handle); + + switch (ret) { + case FI_SUCCESS: + uncached_md->handle_valid = true; + break; + + case -FI_ENOSYS: + uncached_md->handle_valid = false; + break; + + default: + CXIP_WARN("ofi_hmem_dev_register %s failed: %d:%s\n", + fi_tostr(&attr->iface, FI_TYPE_HMEM_IFACE), ret, + fi_strerror(-ret)); + goto err_unmap; + } + + uncached_md->dom = dom; + uncached_md->info.iov.iov_base = (void *)uncached_md->md->va; + uncached_md->info.iov.iov_len = uncached_md->md->len; + uncached_md->info.iface = attr->iface; + + *md = uncached_md; + + return FI_SUCCESS; + +err_unmap: + cxil_unmap(uncached_md->md); +err_free_uncached_md: + free(uncached_md); + + return ret; +} + +static void cxip_map_get_mem_region_size(const void *buf, unsigned long len, + enum fi_hmem_iface iface, + void **out_buf, unsigned long *out_len) +{ + int ret; + + ret = ofi_hmem_get_base_addr(iface, buf, len, out_buf, out_len); + if (ret) { + *out_buf = (void *)buf; + *out_len = len; + } + + CXIP_DBG("%s: User addr=%p User len=%lu Region addr=%p Region len=%lu\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE), buf, len, *out_buf, + *out_len); +} + +/* + * cxip_map() - Acquire IO mapping for buf. + * + * The IO memory map is searched for a IO mapping which covers buf. If no + * mapping has been established, create one and cache it. + */ +int cxip_map(struct cxip_domain *dom, const void *buf, unsigned long len, + uint64_t flags, struct cxip_md **md) +{ + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = len, + }; + struct fi_mr_attr attr = { + .iov_count = 1, + .mr_iov = &iov, + }; + struct ofi_mr_info mr_info = {}; + uint64_t hmem_flags = 0; + struct ofi_mr_entry *entry; + bool cache = !(flags & OFI_MR_NOCACHE); + + /* TODO: ATS (scalable MD) can only support CPU page sizes and should be + * avoided for non-standard page sizes. 
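+	 *
+	 * Fast paths: when the domain is scalable (ATS + ODP) and FI_HMEM is
+	 * not in use, the shared scalable MD is returned directly. Otherwise a
+	 * cached entry is looked up by virtual address before resorting to a
+	 * pointer query and a fresh mapping.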
+ */ + if (dom->scalable_iomm && !dom->hmem) { + *md = &dom->scalable_md; + return FI_SUCCESS; + } + + /* Since the MR cache find operates on virtual addresses and all device + * memory must support a unified virtual address space with system + * memory, the buffer pointer query can be avoided completely if the + * corresponding entry is in the cache. + */ + if (cache && cxip_domain_mr_cache_enabled(dom)) { + entry = ofi_mr_cache_find(&dom->iomm, &attr, 0); + if (entry) { + *md = (struct cxip_md *)entry->data; + return FI_SUCCESS; + } + } + + /* Since the MR cache search will allocate a new entry, the MR iface + * attribute must be defined for the proper MR cache memory monitor to + * be selected. + */ + if (dom->hmem) + attr.iface = ofi_get_hmem_iface(buf, NULL, &hmem_flags); + + if (cache && cxip_domain_mr_cache_iface_enabled(dom, attr.iface)) { + cxip_map_get_mem_region_size(iov.iov_base, iov.iov_len, + attr.iface, &iov.iov_base, + &iov.iov_len); + + mr_info.iface = attr.iface; + mr_info.iov = iov; + + /* Overload IPC addr to pass in HMEM flags. */ + mr_info.flags = hmem_flags; + + return cxip_map_cache(dom, &mr_info, md); + } + + return cxip_map_nocache(dom, &attr, flags, md); +} + +static void cxip_unmap_cache(struct cxip_md *md) +{ + struct ofi_mr_entry *entry = + container_of(md, struct ofi_mr_entry, data); + + ofi_mr_cache_delete(&md->dom->iomm, entry); +} + +static void cxip_unmap_nocache(struct cxip_md *md) +{ + int ret; + + if (md->handle_valid) + ofi_hmem_dev_unregister(md->info.iface, md->handle); + + ret = cxil_unmap(md->md); + if (ret) + CXIP_WARN("cxil_unmap failed: %d\n", ret); + + free(md); +} + +/* + * cxip_unmap() - Release an IO mapping. + * + * Drop a refernce to the IO mapping. If this was the last reference, the + * buffer may be unmapped. + */ +void cxip_unmap(struct cxip_md *md) +{ + /* Scalable MD is owned by the CXIP domain and thus will be freed when + * the domain is closed. + */ + if (md == &md->dom->scalable_md) + return; + + if (md->cached) + cxip_unmap_cache(md); + else + cxip_unmap_nocache(md); +} diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c new file mode 100644 index 00000000000..4ff81d5a448 --- /dev/null +++ b/prov/cxi/src/cxip_mr.c @@ -0,0 +1,1467 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2017 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2018,2020-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_MR, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_MR, __VA_ARGS__) + +static int cxip_mr_init(struct cxip_mr *mr, struct cxip_domain *dom, + const struct fi_mr_attr *attr, uint64_t flags); +static void cxip_mr_fini(struct cxip_mr *mr); +static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr); + +void cxip_mr_domain_fini(struct cxip_mr_domain *mr_domain) +{ + int i; + + /* Assumption is this is only called when a domain is freed and only a + * single thread should be freeing a domain. Thus, no lock is taken. 
+ */ + for (i = 0; i < CXIP_MR_DOMAIN_HT_BUCKETS; i++) { + if (!dlist_empty(&mr_domain->buckets[i])) + CXIP_WARN("MR domain bucket %d is not empty\n", i); + } + + ofi_spin_destroy(&mr_domain->lock); +} + +void cxip_mr_domain_init(struct cxip_mr_domain *mr_domain) +{ + int i; + + ofi_spin_init(&mr_domain->lock); + + for (i = 0; i < CXIP_MR_DOMAIN_HT_BUCKETS; i++) + dlist_init(&mr_domain->buckets[i]); +} + +/* + * cxip_ep_mr_insert() - Insert an MR key into the EP key space. + * + * Called during MR enable. The key space is a sparse 64 bits. + */ +static void cxip_ep_mr_insert(struct cxip_ep_obj *ep_obj, struct cxip_mr *mr) +{ + dlist_insert_tail(&mr->ep_entry, &ep_obj->mr_list); +} + +/* + * cxip_ep_mr_remove() - Remove an MR key from the EP key space. + */ +static void cxip_ep_mr_remove(struct cxip_mr *mr) +{ + dlist_remove(&mr->ep_entry); +} + +/* + * cxip_mr_cb() - Process MR LE events. + */ +int cxip_mr_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_mr *mr = req->mr.mr; + int evt_rc = cxi_event_rc(event); + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + if (mr->optimized) + assert(mr->mr_state == CXIP_MR_ENABLED); + else + assert(mr->mr_state == CXIP_MR_DISABLED); + + if (evt_rc == C_RC_OK) { + mr->mr_state = CXIP_MR_LINKED; + CXIP_DBG("MR PTE linked: %p\n", mr); + break; + } + + mr->mr_state = CXIP_MR_LINK_ERR; + CXIP_WARN("MR PTE link: %p failed %d\n", mr, evt_rc); + break; + case C_EVENT_UNLINK: + assert(evt_rc == C_RC_OK); + + assert(mr->mr_state == CXIP_MR_LINKED); + mr->mr_state = CXIP_MR_UNLINKED; + + CXIP_DBG("MR PTE unlinked: %p\n", mr); + break; + case C_EVENT_MATCH: + ofi_atomic_inc32(&mr->match_events); + + if (evt_rc != C_RC_OK) + goto log_err; + break; + case C_EVENT_PUT: + case C_EVENT_GET: + case C_EVENT_ATOMIC: + case C_EVENT_FETCH_ATOMIC: + if (mr->count_events) + ofi_atomic_inc32(&mr->access_events); + + if (evt_rc != C_RC_OK) + goto log_err; + + /* TODO handle fi_writedata/fi_inject_writedata */ + break; + default: +log_err: + CXIP_WARN(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), cxi_rc_to_str(evt_rc)); + } + + return FI_SUCCESS; +} + +static int cxip_mr_wait_append(struct cxip_ep_obj *ep_obj, + struct cxip_mr *mr) +{ + /* Wait for PTE LE append status update */ + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr->mr_state != CXIP_MR_LINKED && + mr->mr_state != CXIP_MR_LINK_ERR); + + if (mr->mr_state == CXIP_MR_LINK_ERR) + return -FI_ENOSPC; + + return FI_SUCCESS; +} + +/* + * cxip_mr_enable_std() - Assign HW resources to the standard MR. + * + * Standard MRs are implemented by linking an LE describing the registered + * buffer to a shared, matching PtlTE. The MR key is encoded in the LE match + * bits. One PtlTE supports many standard MRs. The number of standard MR + * supported is limited by the total number of NIC LEs. Because a matching LE + * is used, unrestricted commands must be used to target standard MRs. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. 
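+ *
+ * Initiator-side view (hedged sketch using only generic libfabric calls;
+ * "ep", "buf", "desc", "peer" and "remote_addr" are illustrative names, not
+ * part of this file): a peer that learned this MR's key, e.g. via
+ * fi_mr_key(), targets the LE appended below with something like
+ *
+ *   fi_write(ep, buf, len, desc, peer, remote_addr, mr_key, NULL);
+ *
+ * which the NIC matches against the key encoded in the LE match bits.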
+ */ +static int cxip_mr_enable_std(struct cxip_mr *mr) +{ + int ret; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + struct cxip_mr_key key = { + .raw = mr->key, + }; + uint32_t le_flags; + + mr->req.cb = cxip_mr_cb; + + le_flags = C_LE_UNRESTRICTED_BODY_RO; + if (mr->attr.access & FI_REMOTE_WRITE) + le_flags |= C_LE_OP_PUT; + if (mr->attr.access & FI_REMOTE_READ) + le_flags |= C_LE_OP_GET; + if (mr->cntr) + le_flags |= C_LE_EVENT_CT_COMM; + + /* TODO: to support fi_writedata(), we will want to leave + * success events enabled for mr->rma_events true too. + */ + if (!mr->count_events) + le_flags |= C_LE_EVENT_SUCCESS_DISABLE; + + ret = cxip_pte_append(ep_obj->ctrl_pte, + mr->len ? CXI_VA_TO_IOVA(mr->md->md, mr->buf) : 0, + mr->len, mr->len ? mr->md->md->lac : 0, + C_PTL_LIST_PRIORITY, mr->req.req_id, + key.key, 0, CXI_MATCH_ID_ANY, + 0, le_flags, mr->cntr, ep_obj->ctrl_tgq, true); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to write Append command: %d\n", ret); + return ret; + } + + ret = cxip_mr_wait_append(ep_obj, mr); + if (ret) + return ret; + + mr->enabled = true; + + CXIP_DBG("Standard MR enabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; +} + +/* + * cxip_mr_disable_std() - Free HW resources from the standard MR. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_disable_std(struct cxip_mr *mr) +{ + int ret; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + + /* TODO: Handle -FI_EAGAIN. */ + ret = cxip_pte_unlink(ep_obj->ctrl_pte, C_PTL_LIST_PRIORITY, + mr->req.req_id, ep_obj->ctrl_tgq); + assert(ret == FI_SUCCESS); + + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr->mr_state != CXIP_MR_UNLINKED); + + /* If MR event counts are recorded then we can check event counts + * to determine if invalidate can be skipped. + */ + if (!mr->count_events || ofi_atomic_get32(&mr->match_events) != + ofi_atomic_get32(&mr->access_events)) { + /* TODO: Temporary debug helper for DAOS to track if + * Match events detect a need to flush. + */ + if (mr->count_events) + CXIP_WARN("Match events required pte LE invalidate\n"); + + ret = cxil_invalidate_pte_le(ep_obj->ctrl_pte->pte, mr->key, + C_PTL_LIST_PRIORITY); + if (ret) + CXIP_WARN("MR %p key 0x%016lX invalidate failed %d\n", + mr, mr->key, ret); + } + + mr->enabled = false; + + CXIP_DBG("Standard MR disabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; +} + +/* + * cxip_mr_opt_pte_cb() - Process optimized MR state change events. + */ +void cxip_mr_opt_pte_cb(struct cxip_pte *pte, const union c_event *event) +{ + struct cxip_mr *mr = (struct cxip_mr *)pte->ctx; + + switch (pte->state) { + case C_PTLTE_ENABLED: + assert(mr->mr_state == CXIP_MR_DISABLED); + mr->mr_state = CXIP_MR_ENABLED; + + CXIP_DBG("MR PTE enabled: %p\n", mr); + break; + default: + CXIP_WARN("Unexpected state received: %u\n", pte->state); + } +} + +/* + * cxip_mr_enable_opt() - Assign HW resources to the optimized MR. + * + * Optimized MRs are implemented by allocating a dedicated, non-matching PtlTE + * and linking an LE describing the registered buffer. The MR key is used to + * derive the PtlTE index. One PtlTE and one LE is required for each optimized + * MR. Because a non-matching interface is used, optimized MRs can be targeted + * with restricted commands. This may result in better performance. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. 
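+ *
+ * Whether a client-managed key qualifies as optimized is a global provider
+ * setting (cxip_env.optimized_mrs, see cxip_mr_key_opt() below) combined
+ * with the key value being below the optimized PtlTE count.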
+ */ +static int cxip_mr_enable_opt(struct cxip_mr *mr) +{ + int ret; + struct cxi_pt_alloc_opts opts = {}; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + uint32_t le_flags; + uint64_t ib = 0; + int pid_idx; + + mr->req.cb = cxip_mr_cb; + + ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl_tgt_evtq, + &opts, cxip_mr_opt_pte_cb, mr, &mr->pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate PTE: %d\n", ret); + return ret; + } + + pid_idx = cxip_generic_mr_key_to_ptl_idx(mr->domain, mr->key, true); + ret = cxip_pte_map(mr->pte, pid_idx, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write pid_idx %d to PTE: %d\n", + pid_idx, ret); + goto err_pte_free; + } + + pid_idx = cxip_generic_mr_key_to_ptl_idx(mr->domain, mr->key, false); + ret = cxip_pte_map(mr->pte, pid_idx, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map read pid_idx %d to PTE: %d\n", + pid_idx, ret); + goto err_pte_free; + } + + ret = cxip_pte_set_state(mr->pte, ep_obj->ctrl_tgq, C_PTLTE_ENABLED, 0); + if (ret != FI_SUCCESS) { + /* This is a bug, we have exclusive access to this CMDQ. */ + CXIP_WARN("Failed to enqueue command: %d\n", ret); + goto err_pte_free; + } + + le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | + C_LE_UNRESTRICTED_BODY_RO; + if (mr->attr.access & FI_REMOTE_WRITE) + le_flags |= C_LE_OP_PUT; + if (mr->attr.access & FI_REMOTE_READ) + le_flags |= C_LE_OP_GET; + if (mr->cntr) + le_flags |= C_LE_EVENT_CT_COMM; + + /* When FI_FENCE is not requested, restricted operations can used PCIe + * relaxed ordering. Unrestricted operations PCIe relaxed ordering is + * controlled by an env for now. + */ + if (!(ep_obj->caps & FI_FENCE)) { + ib = 1; + + if (cxip_env.enable_unrestricted_end_ro) + le_flags |= C_LE_UNRESTRICTED_END_RO; + } + + ret = cxip_pte_append(mr->pte, + mr->len ? CXI_VA_TO_IOVA(mr->md->md, mr->buf) : 0, + mr->len, mr->len ? mr->md->md->lac : 0, + C_PTL_LIST_PRIORITY, mr->req.req_id, + 0, ib, CXI_MATCH_ID_ANY, + 0, le_flags, mr->cntr, ep_obj->ctrl_tgq, true); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to write Append command: %d\n", ret); + goto err_pte_free; + } + + ret = cxip_mr_wait_append(ep_obj, mr); + if (ret) + goto err_pte_free; + + mr->enabled = true; + + CXIP_DBG("Optimized MR enabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; + +err_pte_free: + cxip_pte_free(mr->pte); + + return ret; +} + +/* + * cxip_mr_disable_opt() - Free hardware resources for non-cached + * optimized MR. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_disable_opt(struct cxip_mr *mr) +{ + int ret; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + + ret = cxip_pte_unlink(mr->pte, C_PTL_LIST_PRIORITY, + mr->req.req_id, ep_obj->ctrl_tgq); + if (ret) { + CXIP_WARN("Failed to enqueue Unlink: %d\n", ret); + goto cleanup; + } + + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr->mr_state != CXIP_MR_UNLINKED); + +cleanup: + cxip_pte_free(mr->pte); + + mr->enabled = false; + + CXIP_DBG("Optimized MR disabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; +} + +static void cxip_mr_prov_opt_to_std(struct cxip_mr *mr) +{ + struct cxip_mr_key key = { + .raw = mr->mr_fid.key, + }; + + CXIP_WARN("Optimized MR unavailable, fallback to standard MR\n"); + + key.opt = false; + mr->mr_fid.key = key.raw; + mr->optimized = false; +} + +/* + * cxip_mr_prov_enable_opt() - Enable a provider key optimized + * MR, falling back to a standard MR if resources are not available. 
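+ * When the fallback is taken, the key's "opt" bit is cleared so that the
+ * value later reported through fi_mr_key() already describes a standard MR.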
+ * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_prov_enable_opt(struct cxip_mr *mr) +{ + int ret; + + ret = cxip_mr_enable_opt(mr); + if (!ret) + return ret; + + cxip_mr_prov_opt_to_std(mr); + + return cxip_mr_enable_std(mr); +} + +/* + * cxip_mr_prov_cache_enable_opt() - Enable a provider key optimized + * MR configuring hardware if not already cached. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) +{ + int ret; + int lac = mr->md->md->lac; + struct cxi_pt_alloc_opts opts = {}; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + struct cxip_mr_lac_cache *mr_cache; + struct cxip_mr *_mr; + uint32_t le_flags; + uint64_t ib = 0; + + mr_cache = &ep_obj->opt_mr_cache[lac]; + ofi_atomic_inc32(&mr_cache->ref); + + if (mr_cache->ctrl_req) + goto done; + + mr_cache->ctrl_req = calloc(1, sizeof(struct cxip_ctrl_req)); + if (!mr_cache->ctrl_req) { + ret = -FI_ENOMEM; + goto err; + } + + ret = cxip_domain_ctrl_id_alloc(ep_obj->domain, mr_cache->ctrl_req); + if (ret) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", ret); + goto err_free_req; + } + + mr_cache->ctrl_req->ep_obj = ep_obj; + mr_cache->ctrl_req->cb = cxip_mr_cb; + + /* Allocate a dummy MR used to maintain cache state for this + * LAC/enable RO state PTE. + */ + _mr = calloc(1, sizeof(struct cxip_mr)); + if (!_mr) { + ret = -FI_ENOMEM; + goto err_free_id; + } + + mr_cache->ctrl_req->mr.mr = _mr; + mr_cache->ctrl_req->mr.mr->domain = ep_obj->domain; + mr_cache->ctrl_req->mr.mr->optimized = true; + mr_cache->ctrl_req->mr.mr->mr_state = CXIP_MR_DISABLED; + + ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl_tgt_evtq, + &opts, cxip_mr_opt_pte_cb, + _mr, &_mr->pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate PTE: %d\n", ret); + goto err_free_mr; + } + + ret = cxip_pte_map(_mr->pte, CXIP_PTL_IDX_WRITE_PROV_CACHE_MR_OPT(lac), + false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write PTE: %d\n", ret); + goto err_pte_free; + } + + ret = cxip_pte_map(_mr->pte, CXIP_PTL_IDX_READ_PROV_CACHE_MR_OPT(lac), + false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write PTE: %d\n", ret); + goto err_pte_free; + } + + ret = cxip_pte_set_state(_mr->pte, ep_obj->ctrl_tgq, + C_PTLTE_ENABLED, 0); + if (ret != FI_SUCCESS) { + /* This is a bug, we have exclusive access to this CMDQ. */ + CXIP_WARN("Failed to enqueue command: %d\n", ret); + goto err_pte_free; + } + + le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | + C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_PUT | C_LE_OP_GET; + + /* When FI_FENCE is not requested, restricted operations can used PCIe + * relaxed ordering. Unrestricted operations PCIe relaxed ordering is + * controlled by an env for now. 
+ */ + if (!(ep_obj->caps & FI_FENCE)) { + ib = 1; + + if (cxip_env.enable_unrestricted_end_ro) + le_flags |= C_LE_UNRESTRICTED_END_RO; + } + + ret = cxip_pte_append(_mr->pte, 0, -1ULL, lac, + C_PTL_LIST_PRIORITY, + mr_cache->ctrl_req->req_id, + 0, ib, CXI_MATCH_ID_ANY, + 0, le_flags, NULL, ep_obj->ctrl_tgq, true); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to write Append command: %d\n", ret); + goto err_pte_free; + } + + ret = cxip_mr_wait_append(ep_obj, _mr); + if (ret) + goto err_pte_free; +done: + mr->enabled = true; + + CXIP_DBG("Optimized MR enabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; + +err_pte_free: + cxip_pte_free(_mr->pte); +err_free_mr: + free(mr_cache->ctrl_req->mr.mr); +err_free_id: + cxip_domain_ctrl_id_free(ep_obj->domain, mr_cache->ctrl_req); +err_free_req: + free(mr_cache->ctrl_req); + mr_cache->ctrl_req = NULL; +err: + cxip_mr_prov_opt_to_std(mr); + + return cxip_mr_prov_cache_enable_std(mr); +} + +/* + * cxip_mr_prov_cache_disable_opt() - Disable a provider key + * optimized MR. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_prov_cache_disable_opt(struct cxip_mr *mr) +{ + struct cxip_mr_key key = { + .raw = mr->key, + }; + int lac = key.lac; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + + assert(key.opt); + + CXIP_DBG("Disable optimized cached MR: %p (key: 0x%016lX)\n", + mr, mr->key); + + if (ofi_atomic_get32(&ep_obj->opt_mr_cache[lac].ref) <= 0) { + CXIP_WARN("Cached optimized MR reference underflow\n"); + return -FI_EINVAL; + } + ofi_atomic_dec32(&ep_obj->opt_mr_cache[lac].ref); + mr->enabled = false; + + return FI_SUCCESS; +} + +/* + * cxip_mr_prov_cache_enable_std() - Enable a provider key standard + * MR configuring hardware if not already cached. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. 
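+ *
+ * Cached standard provider-key MRs share one LE per LAC: the first enable
+ * for a LAC allocates a control request and a dummy cxip_mr to track LE
+ * state, then appends a single all-address LE; later enables only take a
+ * reference. Hardware is torn down by cxip_ctrl_mr_cache_flush() once the
+ * reference count reaches zero.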
+ */ +static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr) +{ + int ret; + int lac = mr->md->md->lac; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + struct cxip_mr_lac_cache *mr_cache; + union cxip_match_bits mb; + union cxip_match_bits ib; + uint32_t le_flags; + + /* TODO: Handle enabling for each bound endpoint */ + mr_cache = &ep_obj->std_mr_cache[lac]; + ofi_atomic_inc32(&mr_cache->ref); + + if (mr_cache->ctrl_req) + goto done; + + mr_cache->ctrl_req = calloc(1, sizeof(struct cxip_ctrl_req)); + if (!mr_cache->ctrl_req) { + ret = -FI_ENOMEM; + goto err; + } + + ret = cxip_domain_ctrl_id_alloc(ep_obj->domain, mr_cache->ctrl_req); + if (ret) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", ret); + goto err_free_req; + } + + mr_cache->ctrl_req->ep_obj = ep_obj; + mr_cache->ctrl_req->cb = cxip_mr_cb; + + /* Allocate a dummy MR used to maintain cache state transitions */ + mr_cache->ctrl_req->mr.mr = calloc(1, sizeof(struct cxip_mr)); + if (!mr_cache->ctrl_req->mr.mr) { + ret = -FI_ENOMEM; + goto err_free_id; + } + + mr_cache->ctrl_req->mr.mr->domain = ep_obj->domain; + mr_cache->ctrl_req->mr.mr->optimized = false; + mr_cache->ctrl_req->mr.mr->mr_state = CXIP_MR_DISABLED; + + mb.raw = 0; + mb.mr_lac = mr->md->md->lac; + mb.mr_cached = 1; + + ib.raw = ~0; + ib.mr_lac = 0; + ib.mr_cached = 0; + + le_flags = C_LE_EVENT_SUCCESS_DISABLE | C_LE_UNRESTRICTED_BODY_RO | + C_LE_OP_PUT | C_LE_OP_GET; + + ret = cxip_pte_append(ep_obj->ctrl_pte, 0, -1ULL, + mb.mr_lac, C_PTL_LIST_PRIORITY, + mr_cache->ctrl_req->req_id, + mb.raw, ib.raw, CXI_MATCH_ID_ANY, + 0, le_flags, NULL, ep_obj->ctrl_tgq, true); + + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to write Append command: %d\n", ret); + goto err_free_mr; + } + + ret = cxip_mr_wait_append(ep_obj, mr_cache->ctrl_req->mr.mr); + if (ret) + goto err_free_mr; + +done: + mr->enabled = true; + + CXIP_DBG("Enable cached standard MR: %p (key: 0x%016lX\n", + mr, mr->key); + + return FI_SUCCESS; + +err_free_mr: + free(mr_cache->ctrl_req->mr.mr); +err_free_id: + cxip_domain_ctrl_id_free(ep_obj->domain, mr_cache->ctrl_req); +err_free_req: + free(mr_cache->ctrl_req); + mr_cache->ctrl_req = NULL; +err: + ofi_atomic_dec32(&mr_cache->ref); + + return ret; +} + +/* + * cxip_mr_prov_cache_disable_std() - Disable a provider standard + * cached MR. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_prov_cache_disable_std(struct cxip_mr *mr) +{ + struct cxip_mr_key key = { + .raw = mr->key, + }; + int lac = key.lac; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + + CXIP_DBG("Disable standard cached MR: %p (key: 0x%016lX)\n", + mr, mr->key); + if (ofi_atomic_get32(&ep_obj->std_mr_cache[lac].ref) <= 0) { + CXIP_WARN("Cached standard MR reference underflow\n"); + return -FI_EINVAL; + } + ofi_atomic_dec32(&ep_obj->std_mr_cache[lac].ref); + mr->enabled = false; + + return FI_SUCCESS; +} + +/* + * cxip_mr_domain_remove() - Remove client key from domain hash. + */ +static void cxip_mr_domain_remove(struct cxip_mr *mr) +{ + if (mr->domain->is_prov_key) + return; + + /* Only remotely accessible MR were assigned an RKEY */ + if (!(mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE))) + return; + + ofi_spin_lock(&mr->domain->mr_domain.lock); + dlist_remove(&mr->mr_domain_entry); + ofi_spin_unlock(&mr->domain->mr_domain.lock); +} + +/* + * cxip_mr_domain_insert() - Validate uniqueness and insert + * client key in the domain hash table. 
+ */ +static int cxip_mr_domain_insert(struct cxip_mr *mr) +{ + struct cxip_mr_domain *mr_domain = &mr->domain->mr_domain; + int bucket; + struct cxip_mr *clash_mr; + + if (mr->domain->is_prov_key) + return FI_SUCCESS; + + /* Only remotely accessible MR are assigned an RKEY */ + if (!(mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE))) + return FI_SUCCESS; + + mr->key = mr->attr.requested_key; + + if (!cxip_generic_is_valid_mr_key(mr->key)) + return -FI_EKEYREJECTED; + + bucket = fasthash64(&mr->key, sizeof(mr->key), 0) % + CXIP_MR_DOMAIN_HT_BUCKETS; + + ofi_spin_lock(&mr_domain->lock); + + dlist_foreach_container(&mr_domain->buckets[bucket], struct cxip_mr, + clash_mr, mr_domain_entry) { + if (clash_mr->key == mr->key) { + ofi_spin_unlock(&mr_domain->lock); + return -FI_ENOKEY; + } + } + + dlist_insert_tail(&mr->mr_domain_entry, &mr_domain->buckets[bucket]); + + ofi_spin_unlock(&mr_domain->lock); + + return FI_SUCCESS; +} + +static int cxip_init_mr_key(struct cxip_mr *mr, uint64_t req_key) +{ + mr->key = req_key; + + return FI_SUCCESS; +} + +/* + * cxip_prov_init_mr_key() - Generate a provider key for + * a non-cached MR. + */ +static int cxip_prov_init_mr_key(struct cxip_mr *mr, uint64_t req_key) +{ + int ret; + + /* Non-cached FI_MR_PROV_KEY MR keys need to be unique. */ + ret = cxip_domain_prov_mr_id_alloc(mr->domain, mr); + if (ret) + return ret; + + CXIP_DBG("Init non-cached provider MR key 0x%016lX\n", mr->key); + + return FI_SUCCESS; +} + +/* + * cxip_prov_cache_init_mr_key() - Generate a provider key for + * a cached MR. + * + * Note cached MR do not support counters or target events. + */ +static int cxip_prov_cache_init_mr_key(struct cxip_mr *mr, + uint64_t req_key) +{ + struct cxip_mr_key key = {}; + struct cxi_md *md = mr->md->md; + + /* If optimized enabled it is preferred for caching */ + key.opt = mr->domain->optimized_mrs; + key.cached = true; + key.is_prov = 1; + key.lac = mr->len ? md->lac : 0; + key.lac_off = mr->len ? 
CXI_VA_TO_IOVA(md, mr->buf) : 0; + mr->key = key.raw; + + CXIP_DBG("Init cached MR key 0x%016lX, lac: %d, off:0x%016lX\n", + key.raw, key.lac, (uint64_t)key.lac_off); + + return FI_SUCCESS; +} + +static bool cxip_is_valid_mr_key(uint64_t key) +{ + if (key & ~CXIP_MR_KEY_MASK) + return false; + + return true; +} + +static bool cxip_is_valid_prov_mr_key(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.cached) + return cxip_key.is_prov == 1; + + if (cxip_key.opt) + return CXIP_MR_UNCACHED_KEY_TO_IDX(cxip_key.key) < + CXIP_PTL_IDX_PROV_MR_OPT_CNT; + + if (cxip_key.key & ~CXIP_MR_PROV_KEY_MASK) + return false; + + return true; +} + +bool cxip_generic_is_valid_mr_key(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.is_prov) + return cxip_is_valid_prov_mr_key(key); + + return cxip_is_valid_mr_key(key); +} + +static bool cxip_mr_key_opt(uint64_t key) +{ + /* Client key optimized MR controlled globally only */ + return cxip_env.optimized_mrs && key < CXIP_PTL_IDX_MR_OPT_CNT; +} + +static bool cxip_prov_mr_key_opt(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.opt) + return true; + + return false; +} + +bool cxip_generic_is_mr_key_opt(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.is_prov) + return cxip_prov_mr_key_opt(key); + + return cxip_mr_key_opt(key); +} + +static bool cxip_prov_mr_key_events(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + /* Cached keys can not be bound to counters or require RMA events, + * the "events" field is not defined. + */ + if (cxip_key.cached) + return false; + + if (cxip_key.events) + return true; + + return false; +} + +/* If CAPs or MR Key indicate events are required at the target */ +bool cxip_generic_is_mr_key_events(uint64_t caps, uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.is_prov) + return cxip_prov_mr_key_events(key); + + /* Client keys cannot indicate if they require events and + * rely on FI_RMA_EVENT being set on source and target. + */ + return !!(caps & FI_RMA_EVENT); +} + +/* + * cxip_mr_key_to_ptl_idx() Maps a client generated key to the + * PtlTE index. + */ +static int cxip_mr_key_to_ptl_idx(struct cxip_domain *dom, + uint64_t key, bool write) +{ + if (cxip_generic_is_mr_key_opt(key)) + return write ? CXIP_PTL_IDX_WRITE_MR_OPT(key) : + CXIP_PTL_IDX_READ_MR_OPT(key); + + return write ? CXIP_PTL_IDX_WRITE_MR_STD : CXIP_PTL_IDX_READ_MR_STD; +} + +/* + * cxip_prov_mr_key_to_ptl_idx() - Maps a provider generated key + * to the PtlTE index. + */ +static int cxip_prov_mr_key_to_ptl_idx(struct cxip_domain *dom, + uint64_t key, bool write) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + int idx; + + if (cxip_generic_is_mr_key_opt(key)) { + idx = write ? CXIP_PTL_IDX_WRITE_MR_OPT_BASE : + CXIP_PTL_IDX_READ_MR_OPT_BASE; + + /* First 8 PTE are used for LAC cache entries */ + if (cxip_key.cached) { + idx += cxip_key.lac; + return idx; + } + + /* Verify within non-cached optimized range */ + assert(CXIP_MR_UNCACHED_KEY_TO_IDX(cxip_key.key) < + CXIP_PTL_IDX_PROV_MR_OPT_CNT); + + idx += CXIP_PTL_IDX_PROV_NUM_CACHE_IDX + + CXIP_MR_UNCACHED_KEY_TO_IDX(cxip_key.key); + return idx; + } + + return write ? CXIP_PTL_IDX_WRITE_MR_STD : CXIP_PTL_IDX_READ_MR_STD; +} + +/* + * cxip_generic_mr_key_to_ptl_idx() - Maps a MR RKEY to the PtlTE index. 
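+ *
+ * Summary of the mapping above: optimized client keys get a per-key
+ * read/write PtlTE index; cached provider keys get a per-LAC optimized
+ * index; non-cached optimized provider keys index past the cached range
+ * using the key's MR index; all remaining keys share the standard
+ * read/write PtlTE indices.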
+ */ +int cxip_generic_mr_key_to_ptl_idx(struct cxip_domain *dom, uint64_t key, + bool write) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.is_prov) + return cxip_prov_mr_key_to_ptl_idx(dom, key, write); + + return cxip_mr_key_to_ptl_idx(dom, key, write); +} + +/* Caller should hold ep_obj->lock */ +void cxip_ctrl_mr_cache_flush(struct cxip_ep_obj *ep_obj) +{ + int lac; + struct cxip_mr_lac_cache *mr_cache; + int ret; + + /* Flush standard MR resources hardware resources not in use */ + for (lac = 0; lac < CXIP_NUM_CACHED_KEY_LE; lac++) { + mr_cache = &ep_obj->std_mr_cache[lac]; + + if (!mr_cache->ctrl_req || + ofi_atomic_get32(&mr_cache->ref)) + continue; + + ret = cxip_pte_unlink(ep_obj->ctrl_pte, C_PTL_LIST_PRIORITY, + mr_cache->ctrl_req->req_id, + ep_obj->ctrl_tgq); + assert(ret == FI_SUCCESS); + + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr_cache->ctrl_req->mr.mr->mr_state != + CXIP_MR_UNLINKED); + + ret = cxil_invalidate_pte_le(ep_obj->ctrl_pte->pte, + mr_cache->ctrl_req->req_id, + C_PTL_LIST_PRIORITY); + if (ret) + CXIP_WARN("Remote MR cache flush invalidate err: %d\n", + ret); + + free(mr_cache->ctrl_req->mr.mr); + cxip_domain_ctrl_id_free(ep_obj->domain, mr_cache->ctrl_req); + free(mr_cache->ctrl_req); + mr_cache->ctrl_req = NULL; + } + + /* Flush optimized MR resources hardware resources not in use */ + for (lac = 0; lac < CXIP_NUM_CACHED_KEY_LE; lac++) { + mr_cache = &ep_obj->opt_mr_cache[lac]; + + if (!mr_cache->ctrl_req || + ofi_atomic_get32(&mr_cache->ref)) + continue; + + ret = cxip_pte_unlink(mr_cache->ctrl_req->mr.mr->pte, + C_PTL_LIST_PRIORITY, + mr_cache->ctrl_req->req_id, + ep_obj->ctrl_tgq); + if (ret) { + CXIP_WARN("Failed to enqueue Unlink: %d\n", ret); + goto cleanup; + } + + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr_cache->ctrl_req->mr.mr->mr_state != + CXIP_MR_UNLINKED); + +cleanup: + cxip_pte_free(mr_cache->ctrl_req->mr.mr->pte); + free(mr_cache->ctrl_req->mr.mr); + cxip_domain_ctrl_id_free(ep_obj->domain, mr_cache->ctrl_req); + free(mr_cache->ctrl_req); + mr_cache->ctrl_req = NULL; + } +} + +struct cxip_mr_util_ops cxip_client_key_mr_util_ops = { + .is_cached = false, + .init_key = cxip_init_mr_key, + .enable_opt = cxip_mr_enable_opt, + .disable_opt = cxip_mr_disable_opt, + .enable_std = cxip_mr_enable_std, + .disable_std = cxip_mr_disable_std, +}; + +struct cxip_mr_util_ops cxip_prov_key_mr_util_ops = { + .is_cached = false, + .init_key = cxip_prov_init_mr_key, + .enable_opt = cxip_mr_prov_enable_opt, + .disable_opt = cxip_mr_disable_opt, + .enable_std = cxip_mr_enable_std, + .disable_std = cxip_mr_disable_std, +}; + +struct cxip_mr_util_ops cxip_prov_key_cache_mr_util_ops = { + .is_cached = true, + .init_key = cxip_prov_cache_init_mr_key, + .enable_opt = cxip_mr_prov_cache_enable_opt, + .disable_opt = cxip_mr_prov_cache_disable_opt, + .enable_std = cxip_mr_prov_cache_enable_std, + .disable_std = cxip_mr_prov_cache_disable_std, +}; + +int cxip_mr_enable(struct cxip_mr *mr) +{ + int ret; + + /* MR which require remote access require additional resources. Locally + * access MRs only do not. Thus, return FI_SUCCESS. + */ + if (mr->enabled || + !(mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE))) + return FI_SUCCESS; + + /* Set MR operations based on key management and whether + * the MR is cache-able. 
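+	 *
+	 * Client-managed keys always use the non-cached client ops. Provider
+	 * keys use the cached ops only when the buffer's MD is cached, the
+	 * domain allows provider key caching, and the MR needs no counter,
+	 * match-count or RMA events; otherwise the non-cached provider ops
+	 * apply.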
+ */ + if (!mr->domain->is_prov_key) + mr->mr_util = &cxip_client_key_mr_util_ops; + else if (mr->md && mr->md->cached && mr->domain->prov_key_cache && + !mr->cntr && !mr->count_events && !mr->rma_events) + mr->mr_util = &cxip_prov_key_cache_mr_util_ops; + else + mr->mr_util = &cxip_prov_key_mr_util_ops; + + /* Officially set MR key */ + if (mr->domain->is_prov_key) { + ret = mr->mr_util->init_key(mr, mr->attr.requested_key); + if (ret) { + CXIP_WARN("Failed to initialize MR key: %d\n", ret); + return ret; + } + mr->mr_fid.key = mr->key; + } + mr->optimized = cxip_generic_is_mr_key_opt(mr->key); + + ofi_genlock_lock(&mr->ep->ep_obj->lock); + cxip_ep_mr_insert(mr->ep->ep_obj, mr); + + if (mr->optimized) + ret = mr->mr_util->enable_opt(mr); + else + ret = mr->mr_util->enable_std(mr); + ofi_genlock_unlock(&mr->ep->ep_obj->lock); + + if (ret != FI_SUCCESS) + goto err_remove_mr; + + return FI_SUCCESS; + +err_remove_mr: + cxip_ep_mr_remove(mr); + + return ret; +} + +int cxip_mr_disable(struct cxip_mr *mr) +{ + int ret; + + if (!mr->enabled || + !(mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE))) + return FI_SUCCESS; + + ofi_genlock_lock(&mr->ep->ep_obj->lock); + if (mr->optimized) + ret = mr->mr_util->disable_opt(mr); + else + ret = mr->mr_util->disable_std(mr); + + cxip_ep_mr_remove(mr); + ofi_genlock_unlock(&mr->ep->ep_obj->lock); + + return ret; +} + +/* + * cxip_mr_close() - fi_close implemented for MRs. + */ +static int cxip_mr_close(struct fid *fid) +{ + struct cxip_mr *mr; + int ret; + + if (!fid) + return -FI_EINVAL; + + mr = container_of(fid, struct cxip_mr, mr_fid.fid); + + ofi_spin_lock(&mr->lock); + + ret = cxip_mr_disable(mr); + if (ret != FI_SUCCESS) + CXIP_WARN("Failed to disable MR: %d\n", ret); + + if (mr->len) + cxip_unmap(mr->md); + + cxip_mr_domain_remove(mr); + + if (mr->ep) + ofi_atomic_dec32(&mr->ep->ep_obj->ref); + + if (mr->cntr) + ofi_atomic_dec32(&mr->cntr->ref); + + cxip_mr_fini(mr); + ofi_atomic_dec32(&mr->domain->ref); + + ofi_spin_unlock(&mr->lock); + + free(mr); + + return FI_SUCCESS; +} + +/* + * cxip_mr_bind() - fi_bind() implementation for MRs. + */ +static int cxip_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct cxip_mr *mr; + struct cxip_cntr *cntr; + struct cxip_ep *ep; + int ret = FI_SUCCESS; + + mr = container_of(fid, struct cxip_mr, mr_fid.fid); + + ofi_spin_lock(&mr->lock); + + switch (bfid->fclass) { + case FI_CLASS_CNTR: + cntr = container_of(bfid, struct cxip_cntr, cntr_fid.fid); + if (mr->domain != cntr->domain || mr->enabled) { + ret = -FI_EINVAL; + break; + } + + if (mr->cntr) { + ret = -FI_EINVAL; + break; + } + + if (!(flags & FI_REMOTE_WRITE)) { + ret = -FI_EINVAL; + break; + } + + mr->cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + break; + + case FI_CLASS_EP: + ep = container_of(bfid, struct cxip_ep, ep.fid); + if (mr->domain != ep->ep_obj->domain || mr->enabled) { + ret = -FI_EINVAL; + break; + } + + if (mr->ep || !ep->ep_obj->enabled) { + ret = -FI_EINVAL; + break; + } + + if (mr->rma_events && !(ep->ep_obj->caps & FI_RMA_EVENT)) { + CXIP_WARN("MR requires FI_RMA_EVENT EP cap\n"); + ret = -FI_EINVAL; + break; + } + + mr->ep = ep; + ofi_atomic_inc32(&ep->ep_obj->ref); + break; + + default: + ret = -FI_EINVAL; + } + + ofi_spin_unlock(&mr->lock); + + return ret; +} + +/* + * cxip_mr_control() - fi_control() implementation for MRs. 
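+ *
+ * Application-side view (hedged sketch using only public libfabric calls;
+ * "domain", "ep", "buf", "len" and "requested_key" are illustrative names):
+ * with FI_MR_ENDPOINT the MR must be bound to an enabled endpoint before it
+ * is enabled, e.g.
+ *
+ *   struct fid_mr *mr;
+ *   fi_mr_reg(domain, buf, len, FI_REMOTE_WRITE | FI_REMOTE_READ,
+ *             0, requested_key, 0, &mr, NULL);
+ *   fi_mr_bind(mr, &ep->fid, 0);
+ *   fi_mr_enable(mr);
+ *
+ * fi_mr_enable() (fi_control() with FI_ENABLE) lands here and calls
+ * cxip_mr_enable(). With provider-generated keys, fi_mr_key() is only
+ * meaningful after this point.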
+ */ +static int cxip_mr_control(struct fid *fid, int command, void *arg) +{ + struct cxip_mr *mr; + int ret; + + mr = container_of(fid, struct cxip_mr, mr_fid.fid); + + ofi_spin_lock(&mr->lock); + + switch (command) { + case FI_ENABLE: + /* An MR must be bound to an EP before being enabled. */ + if (!mr->ep) { + ret = -FI_EINVAL; + break; + } + + ret = cxip_mr_enable(mr); + if (ret != FI_SUCCESS) + CXIP_WARN("Failed to enable MR: %d\n", ret); + + break; + + default: + ret = -FI_EINVAL; + } + + ofi_spin_unlock(&mr->lock); + + return ret; +} + +static struct fi_ops cxip_mr_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_mr_close, + .bind = cxip_mr_bind, + .control = cxip_mr_control, + .ops_open = fi_no_ops_open, +}; + +static void cxip_mr_fini(struct cxip_mr *mr) +{ + cxip_domain_ctrl_id_free(mr->domain, &mr->req); + cxip_domain_prov_mr_id_free(mr->domain, mr); +} + +static int cxip_mr_init(struct cxip_mr *mr, struct cxip_domain *dom, + const struct fi_mr_attr *attr, uint64_t flags) +{ + int ret; + + ofi_spin_init(&mr->lock); + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = attr->context; + mr->mr_fid.fid.ops = &cxip_mr_fi_ops; + + /* Generation of the key for FI_MR_PROV_KEY can not be done + * until the MR has been bound and enabled to at least one + * endpoint. + */ + mr->mr_fid.key = FI_KEY_NOTAVAIL; + + mr->domain = dom; + mr->flags = flags; + mr->attr = *attr; + + mr->count_events = dom->mr_match_events; + ofi_atomic_initialize32(&mr->match_events, 0); + ofi_atomic_initialize32(&mr->access_events, 0); + mr->rma_events = flags & FI_RMA_EVENT; + + /* Support length 1 IOV only for now */ + mr->buf = mr->attr.mr_iov[0].iov_base; + mr->len = mr->attr.mr_iov[0].iov_len; + + /* Allocate unique MR buffer ID if remote access MR */ + if (mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE)) { + ret = cxip_domain_ctrl_id_alloc(dom, &mr->req); + if (ret) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", ret); + ofi_spin_destroy(&mr->lock); + return -FI_ENOSPC; + } + } else { + mr->req.req_id = -1; + } + + mr->mr_id = -1; + mr->req.mr.mr = mr; + mr->mr_fid.mem_desc = (void *)mr; + mr->mr_state = CXIP_MR_DISABLED; + + return FI_SUCCESS; +} + +/* + * Libfabric MR creation APIs + */ + +static int cxip_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr) +{ + struct cxip_domain *dom; + struct cxip_mr *_mr; + int ret; + + if (fid->fclass != FI_CLASS_DOMAIN || !attr || attr->iov_count <= 0) + return -FI_EINVAL; + + if (attr->iov_count != 1) + return -FI_ENOSYS; + + if (flags & FI_DIRECTED_RECV) { + CXIP_WARN("FI_DIRECTED_RECV and MRs not supported\n"); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, util_domain.domain_fid); + + _mr = calloc(1, sizeof(*_mr)); + if (!_mr) + return -FI_ENOMEM; + ret = cxip_mr_init(_mr, dom, attr, flags); + if (ret) + goto err_free_mr; + + ret = cxip_mr_domain_insert(_mr); + if (ret) + goto err_cleanup_mr; + + /* Client key can be set now and will be used to + * detect duplicate errors. Note only remote MR + * are assigned a RKEY. 
+ */ + if (!_mr->domain->is_prov_key && + _mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE)) + _mr->mr_fid.key = _mr->key; + + if (_mr->len) { + ret = cxip_map(_mr->domain, (void *)_mr->buf, _mr->len, 0, + &_mr->md); + if (ret) { + CXIP_WARN("Failed to map MR buffer: %d\n", ret); + goto err_remove_mr; + } + } + + ofi_atomic_inc32(&dom->ref); + *mr = &_mr->mr_fid; + + return FI_SUCCESS; + +err_remove_mr: + cxip_mr_domain_remove(_mr); + +err_cleanup_mr: + cxip_mr_fini(_mr); +err_free_mr: + free(_mr); + + return ret; +} + +static int cxip_regv(struct fid *fid, const struct iovec *iov, size_t count, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context) +{ + struct fi_mr_attr attr; + + attr.mr_iov = iov; + attr.iov_count = count; + attr.access = access; + attr.offset = offset; + attr.requested_key = requested_key; + attr.context = context; + + return cxip_regattr(fid, &attr, flags, mr); +} + +static int cxip_reg(struct fid *fid, const void *buf, size_t len, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context) +{ + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + return cxip_regv(fid, &iov, 1, access, offset, requested_key, flags, mr, + context); +} + +struct fi_ops_mr cxip_dom_mr_ops = { + .size = sizeof(struct fi_ops_mr), + .reg = cxip_reg, + .regv = cxip_regv, + .regattr = cxip_regattr, +}; diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c new file mode 100644 index 00000000000..a9f3c0f6ec9 --- /dev/null +++ b/prov/cxi/src/cxip_msg.c @@ -0,0 +1,6148 @@ +/* + * Copyright (c) 2018,2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cxip.h" + +#define FC_SW_LE_MSG_FATAL "LE exhaustion during flow control, "\ + "FI_CXI_RX_MATCH_MODE=[hybrid|software] is required\n" +#define FC_SW_ONLOAD_MSG_FATAL "LE resources not recovered during "\ + "flow control. FI_CXI_RX_MATCH_MODE=[hybrid|software] is required\n" +#define FC_OFLOW_NO_MATCH_MSG "Flow control overflow no match, increasing "\ + "FI_CXI_OFLOW_BUF_SIZE (current is %ldB) may reduce occurrence\n" +#define FC_REQ_FULL_MSG "Flow control request list full, increasing"\ + " FI_CXI_REQ_BUF_SIZE value (current is %ldB) may reduce occurrence\n" +#define FC_DROP_COUNT_MSG "Re-enable Drop count mismatch, re-enable will "\ + "be retried on notify\n" + +#define WARN_RESTRICTED_DISABLED "Insufficient resources for %s "\ + "protocol, switching to %s protocol\n" + +/* Defines the posted receive interval for checking LE allocation if + * in hybrid RX match mode and preemptive transitions to software + * managed EP are requested. 
+ */ +#define CXIP_HYBRID_RECV_CHECK_INTERVAL (64-1) + +static int cxip_recv_cb(struct cxip_req *req, const union c_event *event); +static void cxip_ux_onload_complete(struct cxip_req *req); +static int cxip_ux_onload(struct cxip_rxc *rxc); +static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq); +static int cxip_recv_req_dropped(struct cxip_req *req); +static ssize_t _cxip_recv_req(struct cxip_req *req, bool restart_seq); + +static int cxip_send_req_dropped(struct cxip_txc *txc, struct cxip_req *req); +static int cxip_send_req_dequeue(struct cxip_txc *txc, struct cxip_req *req); + +static void cxip_fc_progress_ctrl(struct cxip_rxc *rxc); +static void cxip_send_buf_fini(struct cxip_req *req); + +/* + * match_put_event() - Find/add a matching event. + * + * For every Put Overflow event there is a matching Put event. These events can + * be generated in any order. Both events must be received before progress can + * be made. + * + * If the matching event exists in the mapping, matched is set to true and + * the deferred event is returned. If a match was not found, matched is set to + * false and the event is added to the deferred event mapping. + * + * The deferred match event is returned; unless it must be added to the + * deferred mapping and memory is insufficient. + * + * Caller must hold ep_obj->lock. + */ +static struct cxip_deferred_event * +match_put_event(struct cxip_rxc *rxc, struct cxip_req *req, + const union c_event *event, bool *matched) +{ + union cxip_def_event_key key = {}; + struct cxip_deferred_event *def_ev; + union cxip_match_bits mb; + int bucket; + enum c_event_type match_type = + event->tgt_long.event_type == C_EVENT_PUT ? C_EVENT_PUT_OVERFLOW : C_EVENT_PUT; + + if (event->tgt_long.rendezvous) { + key.initiator = event->tgt_long.initiator.initiator.process; + mb.raw = event->tgt_long.match_bits; + key.rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + key.rdzv = 1; + } else { + key.start_addr = event->tgt_long.start; + } + + bucket = fasthash64(&key.raw, sizeof(key.raw), 0) % + CXIP_DEF_EVENT_HT_BUCKETS; + dlist_foreach_container(&rxc->deferred_events.bh[bucket], + struct cxip_deferred_event, def_ev, + rxc_entry) { + if (def_ev->key.raw == key.raw && + def_ev->ev.tgt_long.event_type == match_type && + def_ev->ev.tgt_long.return_code == event->tgt_long.return_code && + def_ev->ev.tgt_long.initiator.initiator.process == event->tgt_long.initiator.initiator.process && + def_ev->ev.tgt_long.match_bits == event->tgt_long.match_bits) { + *matched = true; + return def_ev; + } + } + + /* Not found, add mapping to hash bucket */ + *matched = false; + + def_ev = calloc(1, sizeof(*def_ev)); + if (!def_ev) { + RXC_WARN(rxc, "Failed allocate to memory\n"); + return NULL; + } + + def_ev->key.raw = key.raw; + def_ev->req = req; + def_ev->ev = *event; + + dlist_insert_tail(&def_ev->rxc_entry, &rxc->deferred_events.bh[bucket]); + + return def_ev; +} + +/* + * free_put_event() - Free a deferred put event. + * + * Free an event previously allocated added with match_put_event(). + * + * Caller must hold ep_obj->lock. + */ +static void free_put_event(struct cxip_rxc *rxc, + struct cxip_deferred_event *def_ev) +{ + dlist_remove(&def_ev->rxc_entry); + free(def_ev); +} + +/* + * recv_req_src_addr() - Translate request source address to FI address. 
+ */ +static fi_addr_t recv_req_src_addr(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + + /* If the FI_SOURCE capability is enabled, convert the initiator's + * address to an FI address to be reported in a CQ event. If + * application AVs are symmetric, the match_id in the EQ event is + * logical and translation is not needed. Otherwise, translate the + * physical address in the EQ event to logical FI address. + */ + if (rxc->attr.caps & FI_SOURCE) { + struct cxip_addr addr = {}; + + if (rxc->ep_obj->av->symmetric) + return CXI_MATCH_ID_EP(rxc->pid_bits, + req->recv.initiator); + + addr.nic = CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator); + addr.pid = CXI_MATCH_ID_PID(rxc->pid_bits, req->recv.initiator); + addr.vni = req->recv.vni; + + return cxip_av_lookup_fi_addr(rxc->ep_obj->av, &addr); + } + + return FI_ADDR_NOTAVAIL; +} + +/* + * cxip_recv_req_alloc() - Allocate a request handle for a receive, + * mapping the associated buffer if required. + * + * Caller must hold ep->ep_obj->lock. + */ +static int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, + struct cxip_req **cxip_req) +{ + struct cxip_domain *dom = rxc->domain; + struct cxip_req *req; + struct cxip_md *recv_md = NULL; + int ret; + + /* Software EP only mode receives are not posted to hardware + * and are not constrained by hardware buffer ID limits. + * + * Note: cxip_evtq_req_alloc() zeros the req. + */ + req = cxip_evtq_req_alloc(&rxc->rx_evtq, !rxc->sw_ep_only, rxc); + if (!req) { + RXC_INFO(rxc, "Recv request unavailable: -FI_EAGAIN\n"); + return -FI_EAGAIN; + } + + if (len) { + ret = cxip_map(dom, (void *)buf, len, 0, &recv_md); + if (ret) { + RXC_WARN(rxc, "Map of recv buffer failed: %d, %s\n", + ret, fi_strerror(-ret)); + goto err_free_request; + } + } + + /* Initialize common receive request attributes. */ + req->type = CXIP_REQ_RECV; + req->cb = cxip_recv_cb; + req->recv.rxc = rxc; + req->recv.recv_buf = buf; + req->recv.recv_md = recv_md; + req->recv.ulen = len; + dlist_init(&req->recv.children); + dlist_init(&req->recv.rxc_entry); + + ofi_atomic_inc32(&rxc->orx_reqs); + *cxip_req = req; + + return FI_SUCCESS; + +err_free_request: + cxip_evtq_req_free(req); + + return ret; +} + +static void cxip_recv_req_free(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + + assert(req->type == CXIP_REQ_RECV); + assert(dlist_empty(&req->recv.children)); + assert(dlist_empty(&req->recv.rxc_entry)); + + ofi_atomic_dec32(&rxc->orx_reqs); + + if (req->recv.recv_md) + cxip_unmap(req->recv.recv_md); + + cxip_evtq_req_free(req); +} + +/* + * recv_req_event_success() - Generate successful receive event completions. 
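+ *
+ * With FI_SOURCE, the initiator is translated to an FI address for the CQ
+ * entry. If the lookup fails and FI_SOURCE_ERR is enabled, the completion
+ * is instead reported as an FI_EADDRNOTAVAIL error with the raw source
+ * address placed in err_data. A consumer would typically recover it along
+ * these lines (hedged sketch; "cq", "av", "addr_buf" and "fi_addr" are
+ * illustrative names):
+ *
+ *   struct fi_cq_err_entry e = { .err_data = addr_buf,
+ *                                .err_data_size = sizeof(addr_buf) };
+ *   fi_cq_readerr(cq, &e, 0);
+ *   if (e.err == FI_EADDRNOTAVAIL)
+ *           fi_av_insert(av, e.err_data, 1, &fi_addr, 0, NULL);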
+ */ +static inline int recv_req_event_success(struct cxip_rxc *rxc, + struct cxip_req *req) +{ + int ret; + fi_addr_t src_addr; + struct cxip_addr *addr; + + if (req->recv.rxc->attr.caps & FI_SOURCE) { + src_addr = recv_req_src_addr(req); + if (src_addr != FI_ADDR_NOTAVAIL || + !(rxc->attr.caps & FI_SOURCE_ERR)) + return cxip_cq_req_complete_addr(req, src_addr); + + addr = calloc(1, sizeof(*addr)); + if (!addr) + return -FI_ENOMEM; + + addr->nic = CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator); + addr->pid = CXI_MATCH_ID_PID(rxc->pid_bits, + req->recv.initiator); + + src_addr = cxip_av_lookup_auth_key_fi_addr(rxc->ep_obj->av, + req->recv.vni); + + ret = cxip_cq_req_error(req, 0, FI_EADDRNOTAVAIL, req->recv.rc, + addr, sizeof(*addr), src_addr); + + free(addr); + } else { + ret = cxip_cq_req_complete(req); + } + + return ret; +} + +/* + * recv_req_report() - Report the completion of a receive operation. + */ +static void recv_req_report(struct cxip_req *req) +{ + int ret; + int err; + int success_event = (req->flags & FI_COMPLETION); + struct cxip_rxc *rxc = req->recv.rxc; + ssize_t truncated = req->recv.rlen - req->data_len; + + /* data_len (i.e. mlength) should NEVER be greater than rlength. */ + assert(truncated >= 0); + + req->flags &= (FI_MSG | FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA); + + if (req->recv.parent) { + struct cxip_req *parent = req->recv.parent; + bool unlinked = false; + + parent->recv.mrecv_bytes += req->data_len; + RXC_DBG(rxc, + "Putting %lu mrecv bytes (req: %p consumed: %lu auto_unlinked: %u unlink_bytes: %lu addr: %#lx ulen=%u min_free=%lu hw_offloaded=%u)\n", + req->data_len, parent, parent->recv.mrecv_bytes, + parent->recv.auto_unlinked, parent->recv.mrecv_unlink_bytes, + req->buf, parent->recv.ulen, rxc->min_multi_recv, + parent->recv.hw_offloaded); + + /* Handle mrecv edge case. If all unexpected headers were + * onloaded, the entire mrecv buffer may be matched against the + * sw_ux_list list before being offloaded to HW. Detect this + * case. 
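+		 * If the buffer was offloaded to hardware, the parent is
+		 * freed (and FI_MULTI_RECV reported) once the auto-unlink
+		 * byte count has been consumed; otherwise once the remaining
+		 * space drops below min_multi_recv.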
+ */ + if (parent->recv.hw_offloaded) { + if (parent->recv.auto_unlinked && + parent->recv.mrecv_bytes == parent->recv.mrecv_unlink_bytes) + unlinked = true; + } else { + if ((parent->recv.ulen - parent->recv.mrecv_bytes) < rxc->min_multi_recv) + unlinked = true; + } + + if (unlinked) { + RXC_DBG(rxc, "Freeing parent: %p\n", req->recv.parent); + cxip_recv_req_free(req->recv.parent); + + req->flags |= FI_MULTI_RECV; + } + } + + if (req->recv.rc == C_RC_OK && !truncated) { + RXC_DBG(rxc, "Request success: %p\n", req); + + if (success_event) { + ret = recv_req_event_success(rxc, req); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, + "Failed to report completion: %d\n", + ret); + } + + if (req->recv.cntr) { + ret = cxip_cntr_mod(req->recv.cntr, 1, false, false); + if (ret) + RXC_WARN(rxc, "cxip_cntr_mod returned: %d\n", + ret); + } + } else { + if (req->recv.unlinked) { + err = FI_ECANCELED; + if (req->recv.multi_recv) + req->flags |= FI_MULTI_RECV; + RXC_DBG(rxc, "Request canceled: %p (err: %d)\n", req, + err); + } else if (truncated) { + err = FI_ETRUNC; + RXC_DBG(rxc, "Request truncated: %p (err: %d)\n", req, + err); + } else if (req->recv.flags & FI_PEEK) { + req->data_len = 0; + err = FI_ENOMSG; + RXC_DBG(rxc, "Peek request not found: %p (err: %d)\n", + req, err); + } else { + err = proverr2errno(req->recv.rc); + + RXC_WARN(rxc, "Request error: %p (err: %d, %s)\n", req, + err, cxi_rc_to_str(req->recv.rc)); + } + + ret = cxip_cq_req_error(req, truncated, err, req->recv.rc, + NULL, 0, FI_ADDR_UNSPEC); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Failed to report error: %d\n", ret); + + if (req->recv.cntr) { + ret = cxip_cntr_mod(req->recv.cntr, 1, false, true); + if (ret) + RXC_WARN(rxc, "cxip_cntr_mod returned: %d\n", + ret); + } + } +} + +/* + * recv_req_tgt_event() - Update common receive request fields + * + * Populate a receive request with information found in all receive event + * types. + */ +static void +recv_req_tgt_event(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->recv.rxc; + union cxip_match_bits mb = { + .raw = event->tgt_long.match_bits + }; + uint32_t init = event->tgt_long.initiator.initiator.process; + + assert(event->hdr.event_type == C_EVENT_PUT || + event->hdr.event_type == C_EVENT_PUT_OVERFLOW || + event->hdr.event_type == C_EVENT_RENDEZVOUS || + event->hdr.event_type == C_EVENT_SEARCH); + + /* Rendezvous events contain the wrong match bits and do not provide + * initiator context for symmetric AVs. + */ + if (event->hdr.event_type != C_EVENT_RENDEZVOUS) { + req->tag = mb.tag; + req->recv.initiator = init; + + if (mb.cq_data) + req->flags |= FI_REMOTE_CQ_DATA; + } + + /* remote_offset is not provided in Overflow events. */ + if (event->hdr.event_type != C_EVENT_PUT_OVERFLOW) + req->recv.src_offset = event->tgt_long.remote_offset; + + /* For rendezvous, initiator is the RGet DFA. */ + if (event->hdr.event_type == C_EVENT_RENDEZVOUS) { + init = cxi_dfa_to_init(init, rxc->pid_bits); + req->recv.rget_nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); + req->recv.rget_pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); + } + + /* Only need one event to set remaining fields. */ + if (req->recv.tgt_event) + return; + req->recv.tgt_event = true; + + /* VNI is needed to support FI_AV_AUTH_KEY. */ + req->recv.vni = event->tgt_long.vni; + + /* rlen is used to detect truncation. */ + req->recv.rlen = event->tgt_long.rlength; + + /* RC is used when generating completion events. 
*/ + req->recv.rc = cxi_tgt_event_rc(event); + + /* Header data is provided in all completion events. */ + req->data = event->tgt_long.header_data; + + /* rdzv_id is used to correlate Put and Put Overflow events when using + * offloaded RPut. Otherwise, Overflow buffer start address is used to + * correlate events. + */ + if (event->tgt_long.rendezvous) + req->recv.rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + else + req->recv.oflow_start = event->tgt_long.start; + + req->recv.rdzv_lac = mb.rdzv_lac; + req->recv.rdzv_proto = mb.rdzv_proto; + req->recv.rdzv_mlen = event->tgt_long.mlength; + + /* data_len must be set uniquely for each protocol! */ +} + +/* + * rdzv_mrecv_req_lookup() - Search for a matching rendezvous, multi-receive + * child request. + */ +static int rdzv_mrecv_req_lookup(struct cxip_req *req, + const union c_event *event, + uint32_t *initiator, uint32_t *rdzv_id, + bool perform_event_checks, + struct cxip_req **req_out) +{ + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_req *child_req; + union cxip_match_bits mb; + uint32_t ev_init; + uint32_t ev_rdzv_id; + struct cxip_addr caddr; + int ret; + int i; + + if (event->hdr.event_type == C_EVENT_REPLY) { + struct cxi_rdzv_user_ptr *user_ptr; + + /* Events for software-issued operations will return a + * reference to the correct request. + */ + if (!event->init_short.rendezvous) { + *req_out = req; + return FI_SUCCESS; + } + + user_ptr = (struct cxi_rdzv_user_ptr *) + &event->init_short.user_ptr; + + ev_init = CXI_MATCH_ID(rxc->pid_bits, user_ptr->src_pid, + user_ptr->src_nid); + ev_rdzv_id = user_ptr->rendezvous_id; + } else if (event->hdr.event_type == C_EVENT_RENDEZVOUS) { + struct cxip_rxc *rxc = req->recv.rxc; + uint32_t dfa = event->tgt_long.initiator.initiator.process; + + ev_init = cxi_dfa_to_init(dfa, rxc->pid_bits); + mb.raw = event->tgt_long.match_bits; + + ev_rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + } else { + ev_init = event->tgt_long.initiator.initiator.process; + mb.raw = event->tgt_long.match_bits; + + ev_rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + } + + if ((event->hdr.event_type == C_EVENT_PUT_OVERFLOW || + event->hdr.event_type == C_EVENT_PUT) && + rxc->ep_obj->av->symmetric) { + ret = cxip_av_lookup_addr(rxc->ep_obj->av, + CXI_MATCH_ID_EP(rxc->pid_bits, ev_init), + &caddr); + if (ret != FI_SUCCESS) + RXC_FATAL(rxc, "Lookup of FI addr 0x%x: failed %d\n", + ev_init, ret); + + ev_init = CXI_MATCH_ID(rxc->pid_bits, + CXI_MATCH_ID_PID(rxc->pid_bits, ev_init), + caddr.nic); + } + + *initiator = ev_init; + *rdzv_id = ev_rdzv_id; + + /* Events for hardware-issued operations will return a rendezvous_id + * and initiator data. Use these fields to find a matching child + * request. + */ + dlist_foreach_container(&req->recv.children, + struct cxip_req, child_req, + recv.children) { + if (child_req->recv.rdzv_id == ev_rdzv_id && + child_req->recv.rdzv_initiator == ev_init) { + + if (perform_event_checks) { + /* There is an edge case where source may reuse the + * same rendezvous ID before the target has had time to + * process the C_EVENT_REPLY. If this is the case, an + * incorrect child_req match would occur. To prevent + * this, the events seen are stored with the child_req. + * If a redundant event is seen, this is a sign + * C_EVENT_REPLY needs to be process. Thus, return + * -FI_EAGAIN to process TX EQ. 
+ */ + for (i = 0; i < child_req->recv.rdzv_events; i++) { + if (child_req->recv.rdzv_event_types[i] == event->hdr.event_type) { + assert(event->hdr.event_type != C_EVENT_REPLY); + return -FI_EAGAIN; + } + } + } + + *req_out = child_req; + return FI_SUCCESS; + } + } + + return -FI_ENOMSG; +} + +/* + * mrecv_req_dup() - Create a new request using an event targeting a + * multi-recv buffer. + * + * @mrecv_req: A previously posted multi-recv buffer request. + */ +static struct cxip_req *mrecv_req_dup(struct cxip_req *mrecv_req) +{ + struct cxip_rxc *rxc = mrecv_req->recv.rxc; + struct cxip_req *req; + + req = cxip_evtq_req_alloc(&rxc->rx_evtq, 0, rxc); + if (!req) + return NULL; + + /* Duplicate the parent request. */ + req->cb = mrecv_req->cb; + req->context = mrecv_req->context; + req->flags = mrecv_req->flags; + req->type = mrecv_req->type; + req->recv = mrecv_req->recv; + + /* Update fields specific to this Send */ + req->recv.parent = mrecv_req; + + /* Start pointer and data_len must be set elsewhere! */ + + return req; +} + +/* + * rdzv_mrecv_req_event() - Look up a multi-recieve child request using an + * event and multi-recv request. + * + * Each rendezvous Put transaction targeting a multi-receive buffer is tracked + * using a separate child request. A child request is uniquely identified by + * rendezvous ID and source address. Return a reference to a child request + * which matches the event. Allocate a new child request, if necessary. + */ +static struct cxip_req * +rdzv_mrecv_req_event(struct cxip_req *mrecv_req, const union c_event *event) +{ + uint32_t ev_init; + uint32_t ev_rdzv_id; + struct cxip_req *req; + struct cxip_rxc *rxc __attribute__((unused)) = mrecv_req->recv.rxc; + int ret; + + assert(event->hdr.event_type == C_EVENT_REPLY || + event->hdr.event_type == C_EVENT_PUT || + event->hdr.event_type == C_EVENT_PUT_OVERFLOW || + event->hdr.event_type == C_EVENT_RENDEZVOUS); + + ret = rdzv_mrecv_req_lookup(mrecv_req, event, &ev_init, &ev_rdzv_id, + true, &req); + switch (ret) { + case -FI_EAGAIN: + return NULL; + + case -FI_ENOMSG: + req = mrecv_req_dup(mrecv_req); + if (!req) + return NULL; + + /* Store event initiator and rdzv_id for matching. */ + req->recv.rdzv_id = ev_rdzv_id; + req->recv.rdzv_initiator = ev_init; + + dlist_insert_tail(&req->recv.children, + &mrecv_req->recv.children); + + RXC_DBG(rxc, "New child: %p parent: %p event: %s\n", req, + mrecv_req, cxi_event_to_str(event)); + return req; + + case FI_SUCCESS: + RXC_DBG(rxc, "Found child: %p parent: %p event: %s\n", req, + mrecv_req, cxi_event_to_str(event)); + return req; + + default: + RXC_FATAL(rxc, "Unhandled rdzv_mrecv_req_lookup %d\n", ret); + } +} + +/* + * rdzv_recv_req_event() - Count a rendezvous event. + * + * Call for each target rendezvous event generated on a user receive buffer. + * After three events, a rendezvous receive is complete. The three events could + * be either: + * -Put, Rendezvous, Reply -- or + * -Put Overflow, Rendezvous, Reply + * + * For a restricted Get there is a fourth event, the ACK of the notify. + * + * In either case, the events could be generated in any order. As soon as the + * events expected are processed, the request is complete. + */ +static void rdzv_recv_req_event(struct cxip_req *req, enum c_event_type type) +{ + int total_events = req->recv.done_notify ? 
4 : 3; + + req->recv.rdzv_event_types[req->recv.rdzv_events] = type; + + if (++req->recv.rdzv_events == total_events) { + if (req->recv.multi_recv) { + dlist_remove(&req->recv.children); + recv_req_report(req); + cxip_evtq_req_free(req); + } else { + recv_req_report(req); + cxip_recv_req_free(req); + } + } +} + +/* + * oflow_req_put_bytes() - Consume bytes in the Overflow buffer. + * + * An Overflow buffer is freed when all bytes are consumed by the NIC. + * + * Caller must hold ep_obj->lock. + */ +static void oflow_req_put_bytes(struct cxip_req *req, size_t bytes) +{ + struct cxip_ptelist_buf *oflow_buf = req->req_ctx; + + /* Non-zero length UX messages with 0 eager portion do not + * have a dependency on the oflow buffer. + */ + if (bytes == 0) + return; + + oflow_buf->cur_offset += bytes; + + RXC_DBG(oflow_buf->rxc, "Putting %lu bytes (%lu/%lu): %p\n", bytes, + oflow_buf->cur_offset, oflow_buf->unlink_length, req); + + if (oflow_buf->cur_offset == oflow_buf->unlink_length) + cxip_ptelist_buf_consumed(oflow_buf); +} + +/* + * issue_rdzv_get() - Perform a Get to pull source data from the Initiator of a + * Send operation. + */ +static int issue_rdzv_get(struct cxip_req *req) +{ + struct c_full_dma_cmd cmd = {}; + uint64_t local_addr; + uint64_t rem_offset; + uint32_t align_bytes; + uint32_t mlen; + struct cxip_rxc *rxc = req->recv.rxc; + uint32_t pid_idx = rxc->domain->iface->dev->info.rdzv_get_idx; + uint8_t idx_ext; + union cxip_match_bits mb = {}; + int ret; + union c_fab_addr dfa; + + if (req->recv.rdzv_proto == CXIP_RDZV_PROTO_ALT_WRITE) + RXC_WARN_ONCE(rxc, "Rendezvous protocol: %s not implemented\n", + cxip_rdzv_proto_to_str(req->recv.rdzv_proto)); + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_GET; + cmd.lac = req->recv.recv_md->md->lac; + cmd.event_send_disable = 1; + + /* Must deliver to TX event queue */ + cmd.eq = cxip_evtq_eqn(&rxc->ep_obj->txc.tx_evtq); + + if (req->recv.rdzv_proto == CXIP_RDZV_PROTO_ALT_READ) { + pid_idx = CXIP_PTL_IDX_RDZV_RESTRICTED(req->recv.rdzv_lac); + cmd.restricted = 1; + req->recv.done_notify = true; + } else { + pid_idx = rxc->domain->iface->dev->info.rdzv_get_idx; + mb.rdzv_lac = req->recv.rdzv_lac; + mb.rdzv_id_lo = req->recv.rdzv_id; + mb.rdzv_id_hi = req->recv.rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + } + cmd.match_bits = mb.raw; + + cmd.user_ptr = (uint64_t)req; + cxi_build_dfa(req->recv.rget_nic, req->recv.rget_pid, rxc->pid_bits, + pid_idx, &dfa, &idx_ext); + cmd.dfa = dfa; + cmd.index_ext = idx_ext; + + local_addr = CXI_VA_TO_IOVA(req->recv.recv_md->md, + req->recv.recv_buf); + local_addr += req->recv.rdzv_mlen; + + rem_offset = req->recv.src_offset; + mlen = req->recv.rdzv_mlen; + + RXC_DBG(rxc, "SW RGet addr: 0x%" PRIx64 " len %" PRId64 + " rem_off: %" PRId64 " restricted: %d\n", local_addr, + req->data_len - req->recv.rdzv_mlen, rem_offset, + cmd.restricted); + + /* Align mask will be non-zero if local DMA address cache-line + * alignment is desired. 
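+	 *
+	 * Worked example with hypothetical numbers: assume a 64-byte
+	 * requirement (rget_align_mask == 0x3f), local_addr == 0x10000048 and
+	 * an eager length (mlen) of 2048 bytes:
+	 *
+	 *   align_bytes = 0x10000048 & 0x3f  ->  0x08
+	 *   local_addr  = 0x10000048 - 0x08  ->  0x10000040  (64-byte aligned)
+	 *   rem_offset -= 0x08;   mlen -= 0x08;
+	 *
+	 * Backing the local address and the remote offset up together keeps
+	 * the transfer in step; the few bytes re-fetched at the front land
+	 * over the tail of the eager data that was already delivered.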
+ */ + if (mlen >= rxc->rget_align_mask) { + align_bytes = local_addr & rxc->rget_align_mask; + local_addr -= align_bytes; + rem_offset -= align_bytes; + mlen -= align_bytes; + } + + if (req->data_len < mlen) + cmd.request_len = 0; + else + cmd.request_len = req->data_len - mlen; + + cmd.local_addr = local_addr; + cmd.remote_offset = rem_offset; + + RXC_DBG(rxc, "Aligned addr: 0x%" PRIx64 " len %d rem_off %" PRId64 "\n", + (uint64_t)cmd.local_addr, cmd.request_len, + (uint64_t)cmd.remote_offset); + + /* Issue Rendezvous Get command */ + ret = cxi_cq_emit_dma(rxc->tx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_DBG(rxc, "Failed to queue GET command: %d\n", ret); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->tx_cmdq->dev_cmdq); + + return FI_SUCCESS; +} + +/* + * cxip_notify_match_cb() - Callback function for match complete notifiction + * Ack events. + */ +static int +cxip_notify_match_cb(struct cxip_req *req, const union c_event *event) +{ + RXC_DBG(req->recv.rxc, "Match complete: %p\n", req); + + recv_req_report(req); + + if (req->recv.multi_recv) + cxip_evtq_req_free(req); + else + cxip_recv_req_free(req); + + return FI_SUCCESS; +} + +/* + * cxip_notify_match() - Notify the initiator of a Send that the match is + * complete at the target. + * + * A transaction ID corresponding to the matched Send request is sent back to + * the initiator in the match_bits field of a zero-byte Put. + */ +static int cxip_notify_match(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->recv.rxc; + uint32_t pid_idx = rxc->domain->iface->dev->info.rdzv_get_idx; + uint32_t init = event->tgt_long.initiator.initiator.process; + uint32_t nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); + uint32_t pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); + union c_fab_addr dfa; + uint8_t idx_ext; + union cxip_match_bits mb = { + .le_type = CXIP_LE_TYPE_ZBP, + }; + union cxip_match_bits event_mb; + union c_cmdu cmd = {}; + int ret; + + event_mb.raw = event->tgt_long.match_bits; + mb.tx_id = event_mb.tx_id; + + cxi_build_dfa(nic, pid, rxc->pid_bits, pid_idx, &dfa, &idx_ext); + + cmd.c_state.event_send_disable = 1; + cmd.c_state.index_ext = idx_ext; + cmd.c_state.eq = cxip_evtq_eqn(&rxc->ep_obj->txc.tx_evtq); + + ret = cxip_cmdq_emit_c_state(rxc->tx_cmdq, &cmd.c_state); + if (ret) { + RXC_DBG(rxc, "Failed to issue C_STATE command: %d\n", ret); + return ret; + } + + memset(&cmd.idc_msg, 0, sizeof(cmd.idc_msg)); + cmd.idc_msg.dfa = dfa; + cmd.idc_msg.match_bits = mb.raw; + + cmd.idc_msg.user_ptr = (uint64_t)req; + + ret = cxi_cq_emit_idc_msg(rxc->tx_cmdq->dev_cmdq, &cmd.idc_msg, + NULL, 0); + if (ret) { + RXC_DBG(rxc, "Failed to write IDC: %d\n", ret); + + /* Return error according to Domain Resource Management + */ + return -FI_EAGAIN; + } + + req->cb = cxip_notify_match_cb; + + cxi_cq_ring(rxc->tx_cmdq->dev_cmdq); + + RXC_DBG(rxc, "Queued match completion message: %p\n", req); + + return FI_SUCCESS; +} + +/* + * mrecv_req_oflow_event() - Set start and length uniquely for an unexpected + * mrecv request. + * + * Overflow buffer events contain a start address representing the offset into + * the Overflow buffer where data was written. When a unexpected header is + * later matched to a multi-receive buffer in the priority list, The Put + * Overflow event does not contain the offset into the Priority list buffer + * where data should be copied. Software must track the the Priority list + * buffer offset using ordered Put Overflow events. 
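+ *
+ * Sketch of the bookkeeping below, with hypothetical sizes: a 16 KiB
+ * multi-recv buffer sees three unexpected Puts of 4 KiB, 4 KiB and 10 KiB.
+ * Processing their Put Overflow events in order gives:
+ *
+ *   start_offset:    0 -> 4096 -> 8192 -> 16384
+ *   bytes credited:     4096    4096    8192
+ *
+ * The third message is clamped to the 8 KiB that remain,
+ * MIN(mrecv_bytes_remaining, rlen), before start_offset advances.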
+ */ +static int mrecv_req_put_bytes(struct cxip_req *req, uint32_t rlen) +{ + uintptr_t mrecv_head; + uintptr_t mrecv_tail; + size_t mrecv_bytes_remaining; + + mrecv_head = (uintptr_t)req->recv.recv_buf + req->recv.start_offset; + mrecv_tail = (uintptr_t)req->recv.recv_buf + req->recv.ulen; + mrecv_bytes_remaining = mrecv_tail - mrecv_head; + + rlen = MIN(mrecv_bytes_remaining, rlen); + req->recv.start_offset += rlen; + + return rlen; +} + +/* cxip_recv_req_set_rget_info() - Set RGet NIC and PID fields. Used for + * messages where a rendezvous event will not be generated. Current usages are + * for the eager long protocol and rendezvous operations which have unexpected + * headers onloaded due to flow control. + */ +static void cxip_recv_req_set_rget_info(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + int ret; + + if (rxc->ep_obj->av->symmetric) { + struct cxip_addr caddr; + + RXC_DBG(rxc, "Translating initiator: %x, req: %p\n", + req->recv.initiator, req); + + ret = cxip_av_lookup_addr(rxc->ep_obj->av, + CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator), + &caddr); + if (ret != FI_SUCCESS) + RXC_FATAL(rxc, "Failed to look up FI addr: %d\n", ret); + + req->recv.rget_nic = caddr.nic; + } else { + req->recv.rget_nic = CXI_MATCH_ID_EP(rxc->pid_bits, + req->recv.initiator); + } + + req->recv.rget_pid = CXI_MATCH_ID_PID(rxc->pid_bits, + req->recv.initiator); +} + +/* + * cxip_ux_send() - Progress an unexpected Send after receiving matching Put + * and Put and Put Overflow events. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_ux_send(struct cxip_req *match_req, struct cxip_req *oflow_req, + const union c_event *put_event, uint64_t mrecv_start, + uint32_t mrecv_len, bool remove_recv_entry) +{ + struct cxip_ptelist_buf *buf; + void *oflow_va; + size_t oflow_bytes; + union cxip_match_bits mb; + ssize_t ret; + struct cxip_req *parent_req = match_req; + + assert(match_req->type == CXIP_REQ_RECV); + + if (match_req->recv.multi_recv) { + if (put_event->tgt_long.rendezvous) + match_req = rdzv_mrecv_req_event(match_req, put_event); + else + match_req = mrecv_req_dup(match_req); + if (!match_req) + return -FI_EAGAIN; + + /* Set start and length uniquely for an unexpected + * mrecv request. + */ + match_req->recv.recv_buf = (uint8_t *) + match_req->recv.parent->recv.recv_buf + + mrecv_start; + match_req->buf = (uint64_t)match_req->recv.recv_buf; + match_req->data_len = mrecv_len; + } else { + match_req->data_len = put_event->tgt_long.rlength; + if (match_req->data_len > match_req->recv.ulen) + match_req->data_len = match_req->recv.ulen; + } + + recv_req_tgt_event(match_req, put_event); + buf = oflow_req->req_ctx; + oflow_va = (void *)CXI_IOVA_TO_VA(buf->md->md, + put_event->tgt_long.start); + + /* Copy data out of overflow buffer. */ + oflow_bytes = MIN(put_event->tgt_long.mlength, match_req->data_len); + cxip_copy_to_md(match_req->recv.recv_md, match_req->recv.recv_buf, + oflow_va, oflow_bytes); + + if (oflow_req->type == CXIP_REQ_OFLOW) + oflow_req_put_bytes(oflow_req, put_event->tgt_long.mlength); + + /* Remaining unexpected rendezvous processing is deferred until RGet + * completes. + */ + if (put_event->tgt_long.rendezvous) { + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + rdzv_recv_req_event(match_req, put_event->hdr.event_type); + return FI_SUCCESS; + } + + mb.raw = put_event->tgt_long.match_bits; + + /* Check if the initiator requires match completion guarantees. + * If so, notify the initiator that the match is now complete. 
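+	 * (The request for this is carried in the match_comp bit of the Put's
+	 * match bits.  The notification itself is the zero-byte Put built by
+	 * cxip_notify_match(), which echoes the initiator's tx_id back so that
+	 * cxip_rdzv_pte_zbp_cb() on the sending side can look up and retire
+	 * the original Send.)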
+ * Delay the Receive event until the notification is complete. + */ + if (mb.match_comp) { + ret = cxip_notify_match(match_req, put_event); + if (ret != FI_SUCCESS) { + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + + return -FI_EAGAIN; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + return FI_SUCCESS; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + recv_req_report(match_req); + + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + else + cxip_recv_req_free(match_req); + + return FI_SUCCESS; +} + +/* + * cxip_ux_send_zb() - Progress an unexpected zero-byte Send after receiving + * a Put Overflow event. + * + * Zero-byte Put events for unexpected Sends are discarded. Progress the Send + * using only the Overflow event. There is no Send data to be copied out. + */ +static int cxip_ux_send_zb(struct cxip_req *match_req, + const union c_event *oflow_event, + uint64_t mrecv_start, bool remove_recv_entry) +{ + union cxip_match_bits mb; + int ret; + struct cxip_req *parent_req = match_req; + + assert(!oflow_event->tgt_long.rlength); + + if (match_req->recv.multi_recv) { + match_req = mrecv_req_dup(match_req); + if (!match_req) + return -FI_EAGAIN; + + match_req->buf = (uint64_t) + match_req->recv.parent->recv.recv_buf + + mrecv_start; + } + + recv_req_tgt_event(match_req, oflow_event); + + match_req->data_len = 0; + + mb.raw = oflow_event->tgt_long.match_bits; + + /* Check if the initiator requires match completion guarantees. + * If so, notify the initiator that the match is now complete. + * Delay the Receive event until the notification is complete. + */ + if (mb.match_comp) { + ret = cxip_notify_match(match_req, oflow_event); + if (ret != FI_SUCCESS) { + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + + return -FI_EAGAIN; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + return FI_SUCCESS; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + recv_req_report(match_req); + + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + else + cxip_recv_req_free(match_req); + + return FI_SUCCESS; +} + +static bool cxip_ux_is_onload_complete(struct cxip_req *req) +{ + return !req->search.puts_pending && req->search.complete; +} + +/* + * recv_req_peek_complete - FI_PEEK operation completed + */ +static void recv_req_peek_complete(struct cxip_req *req, + struct cxip_ux_send *ux_send) +{ + /* If no unexpected message match we need to return original + * tag in the completion. + */ + if (req->recv.rc != C_RC_OK) + req->tag = req->recv.tag; + else if (req->recv.flags & FI_CLAIM) + ((struct fi_context *)req->context)->internal[0] = ux_send; + + /* Avoid truncation processing, peek does not receive data */ + req->data_len = req->recv.rlen; + + recv_req_report(req); + + cxip_recv_req_free(req); +} + +/* Caller must hold ep_obj->lock. */ +static int cxip_oflow_process_put_event(struct cxip_rxc *rxc, + struct cxip_req *req, + const union c_event *event) +{ + int ret; + struct cxip_deferred_event *def_ev; + struct cxip_req *save_req; + bool matched; + + def_ev = match_put_event(rxc, req, event, &matched); + if (!matched) + return !def_ev ? 
-FI_EAGAIN : FI_SUCCESS; + + RXC_DBG(rxc, "Overflow beat Put event: %p\n", def_ev->req); + + if (def_ev->ux_send) { + /* UX Send was onloaded for one of these reasons: + * 1) Flow control + * 2) ULE was claimed by a FI_CLAIM + */ + save_req = def_ev->req; + def_ev->ux_send->req = req; + def_ev->ux_send->put_ev = *event; + + if (def_ev->ux_send->claimed) { + recv_req_tgt_event(save_req, &def_ev->ux_send->put_ev); + recv_req_peek_complete(save_req, def_ev->ux_send); + RXC_DBG(rxc, "FI_CLAIM put complete: %p, ux_send %p\n", + save_req, def_ev->ux_send); + goto done; + } else { + def_ev->req->search.puts_pending--; + RXC_DBG(rxc, "put complete: %p\n", def_ev->req); + } + + if (cxip_ux_is_onload_complete(def_ev->req)) + cxip_ux_onload_complete(def_ev->req); + + } else { + ret = cxip_ux_send(def_ev->req, req, event, def_ev->mrecv_start, + def_ev->mrecv_len, false); + if (ret != FI_SUCCESS) + return -FI_EAGAIN; + } + +done: + free_put_event(rxc, def_ev); + + return FI_SUCCESS; +} + +/* Caller must hold ep_obj->lock */ +static int cxip_recv_pending_ptlte_disable(struct cxip_rxc *rxc, + bool check_fc) +{ + int ret; + + assert(rxc->state == RXC_ENABLED || + rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_FLOW_CONTROL || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED || + rxc->state == RXC_PENDING_PTLTE_DISABLE); + + /* Having flow control triggered while in flow control is a sign of LE + * exhaustion. Software endpoint mode is required to scale past hardware + * LE limit. + */ + if (check_fc && rxc->state == RXC_FLOW_CONTROL) + RXC_FATAL(rxc, FC_SW_LE_MSG_FATAL); + + if (rxc->state != RXC_ENABLED) + return FI_SUCCESS; + + RXC_DBG(rxc, "Manual request PTLTE_DISABLED\n"); + + ret = cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, C_PTLTE_DISABLED, + 0); + if (ret == FI_SUCCESS) + rxc->state = RXC_PENDING_PTLTE_DISABLE; + + return ret; +} + +/* cxip_rxp_check_le_usage_hybrid_preempt() - Examines LE Pool usage and forces + * a preemptive hardware to software transition if needed. + * + * In cases where the LE pool entry reservation is insufficient to meet request + * list buffers (due to multiple EP sharing an LE Pool or insufficient LE Pool + * reservation value), then enabling the periodic checking of LE allocations + * can be used to force preemptive transitions to software match mode before + * resources are exhausted or so depleted they are starve software managed + * endpoint. The lpe_stat_2 is set to the number of LE pool entries allocated + * to the LE pool and lpe_stat_1 is the current allocation. Skid is required + * as stats are relative to hardware processing, not software processing of + * the event. + * + * Caller should hold ep_obj->lock. 
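+ *
+ * The test below reduces to "more than half of the pool is in use".  With a
+ * hypothetical pool of 2048 LEs (lpe_stat_2 == 2048), the preemptive
+ * transition is requested once the current allocation (lpe_stat_1) exceeds
+ * 1024:
+ *
+ *   lpe_stat_1 > (lpe_stat_2 >> 1)  ->  cxip_recv_pending_ptlte_disable()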
+ */ +static inline bool +cxip_rxp_check_le_usage_hybrid_preempt(struct cxip_rxc *rxc, + const union c_event *event) +{ + if (event->tgt_long.lpe_stat_1 > (event->tgt_long.lpe_stat_2 >> 1) && + rxc->state == RXC_ENABLED) { + if (cxip_recv_pending_ptlte_disable(rxc, false)) + RXC_WARN(rxc, "Force FC failed\n"); + return true; + } + return false; +} + +static int cxip_rxc_check_ule_hybrid_preempt(struct cxip_rxc *rxc) +{ + int ret; + int count; + + if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_unexpected_msg_preemptive == 1) { + count = ofi_atomic_get32(&rxc->orx_hw_ule_cnt); + + if (rxc->state == RXC_ENABLED && count > rxc->attr.size) { + ret = cxip_recv_pending_ptlte_disable(rxc, false); + if (ret == FI_SUCCESS) { + RXC_WARN(rxc, + "Transitioning to SW EP due to too many unexpected messages: posted_count=%u request_size=%lu\n", + ret, rxc->attr.size); + } else { + assert(ret == -FI_EAGAIN); + RXC_WARN(rxc, + "Failed to transition to SW EP: %d\n", + ret); + } + + return ret; + } + } + + return FI_SUCCESS; +} + +/* + * cxip_oflow_cb() - Process an Overflow buffer event. + * + * Overflow buffers are used to land unexpected Send data. Link, Unlink + * and Put events are expected from Overflow buffers. However, Link + * events will only be requested when running in hybrid RX match mode + * with FI_CXI_HYBRID_PREEMPTIVE=1. + * + * An Unlink event indicates that buffer space was exhausted. Overflow buffers + * are configured to use locally managed LEs. When enough Puts match in an + * Overflow buffer, consuming its space, the NIC automatically unlinks the LE. + * An automatic Unlink event is generated before the final Put which caused + * buffer space to become exhausted. + * + * An Unlink event is generated by an Unlink command. Overflow buffers are + * manually unlinked in this way during teardown. When an LE is manually + * unlinked the auto_unlinked field in the corresponding event is zero. In this + * case, the request is freed immediately. + * + * A Put event is generated for each Put that matches the Overflow buffer LE. + * This event indicates that data is available in the Overflow buffer. This + * event must be correlated to a Put Overflow event from a user receive buffer + * LE. The Put Overflow event may arrive before or after the Put event. + * + * When each Put event arrives, check for the existence of a previously posted + * receive buffer which generated a matching Put Overflow event. If such a + * buffer exists, copy data from the Overflow buffer to the user receive + * buffer. Otherwise, store a record of the Put event for matching once a user + * posts a new buffer that matches the unexpected Put. + * + * If data will remain in the Overflow buffer, take a reference to it to + * prevent it from being freed. If an Unlink-Put event is detected, drop a + * reference to the Overflow buffer so it is automatically freed once all user + * data is copied out. + */ +static int cxip_oflow_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_ptelist_buf *oflow_buf = req->req_ctx; + struct cxip_rxc *rxc = oflow_buf->rxc; + int ret = FI_SUCCESS; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* Success events only used with hybrid preemptive */ + if (cxi_event_rc(event) == C_RC_OK) { + + if (!cxip_env.hybrid_preemptive) + return FI_SUCCESS; + + /* Check for possible hybrid mode preemptive + * transitions to software managed mode. 
+ */ + if (cxip_rxp_check_le_usage_hybrid_preempt(rxc, event)) + RXC_WARN(rxc, + "Force preemptive switch to SW EP\n"); + return FI_SUCCESS; + } + + assert(cxi_event_rc(event) == C_RC_NO_SPACE); + + RXC_DBG(rxc, "Oflow LE append failed\n"); + + ret = cxip_recv_pending_ptlte_disable(rxc, true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Force disable failed %d %s\n", + ret, fi_strerror(-ret)); + cxip_ptelist_buf_link_err(oflow_buf, cxi_event_rc(event)); + return ret; + case C_EVENT_UNLINK: + assert(!event->tgt_long.auto_unlinked); + + cxip_ptelist_buf_unlink(oflow_buf); + return FI_SUCCESS; + case C_EVENT_PUT: + /* Put event handling is complicated. Handle below. */ + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); + + if (event->tgt_long.auto_unlinked) { + + oflow_buf->unlink_length = event->tgt_long.start - + CXI_VA_TO_IOVA(oflow_buf->md->md, oflow_buf->data) + + event->tgt_long.mlength; + + ofi_atomic_dec32(&oflow_buf->pool->bufs_linked); + + RXC_DBG(rxc, "Oflow auto unlink buf %p, linked %u\n", oflow_buf, + ofi_atomic_get32(&oflow_buf->pool->bufs_linked)); + + /* Replace the eager overflow buffer. */ + cxip_ptelist_buf_replenish(rxc->oflow_list_bufpool, false); + } + + ret = cxip_rxc_check_ule_hybrid_preempt(rxc); + if (ret) + goto err_dec_ule; + + /* Drop all unexpected 0-byte Put events. */ + if (!event->tgt_long.rlength) + return FI_SUCCESS; + + /* Handle Put events */ + ret = cxip_oflow_process_put_event(rxc, req, event); + if (ret) + goto err_dec_ule; + + return FI_SUCCESS; + +err_dec_ule: + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + return ret; +} + +static void report_send_completion(struct cxip_req *req, bool sw_cntr); +static void rdzv_send_req_event(struct cxip_req *req); + +/* + * cxip_rdzv_pte_zbp_cb() - Process zero-byte Put events. + * + * Zero-byte Puts (ZBP) are used to transfer small messages without consuming + * buffers outside of the EQ. ZBPs are currently only used for match complete + * messages. 
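+ *
+ * Summary of the match-complete round trip as implemented in this file: the
+ * initiator allocates a tx_id and sets match_comp in the Send's match bits;
+ * the target's cxip_notify_match() echoes that tx_id back in the match bits
+ * of a zero-byte Put; that Put arrives at this callback, where
+ * cxip_tx_id_lookup() recovers the original Send request and its completion
+ * is reported.  A ZBP with rdzv_done set is used the same way by
+ * cxip_rdzv_done_notify() to finish the restricted-get rendezvous protocol.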
+ */ +int cxip_rdzv_pte_zbp_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rdzv_pte *rdzv_pte = req->req_ctx; + struct cxip_txc *txc = rdzv_pte->txc; + struct cxip_req *put_req; + union cxip_match_bits mb; + int event_rc = cxi_event_rc(event); + int rdzv_id; + int ret; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + if (event_rc == C_RC_OK) + ofi_atomic_inc32(&rdzv_pte->le_linked_success_count); + else + ofi_atomic_inc32(&rdzv_pte->le_linked_failure_count); + return FI_SUCCESS; + + case C_EVENT_PUT: + mb.raw = event->tgt_long.match_bits; + + if (mb.rdzv_done) { + rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + mb.rdzv_id_lo; + put_req = cxip_rdzv_id_lookup(txc, rdzv_id); + if (!put_req) { + TXC_WARN(txc, "Failed to find RDZV ID: %d\n", + rdzv_id); + return FI_SUCCESS; + } + + if (event_rc != C_RC_OK) + TXC_WARN(txc, "RDZV Done error: %p rc: %s\n", + put_req, cxi_rc_to_str(event_rc)); + else + TXC_DBG(txc, "RDZV Done ACK: %p rc: %s\n", + put_req, cxi_rc_to_str(event_rc)); + + put_req->send.rc = event_rc; + rdzv_send_req_event(put_req); + + return FI_SUCCESS; + } + + /* Match complete */ + put_req = cxip_tx_id_lookup(txc, mb.tx_id); + if (!put_req) { + TXC_WARN(txc, "Failed to find TX ID: %d\n", mb.tx_id); + return FI_SUCCESS; + } + + event_rc = cxi_tgt_event_rc(event); + if (event_rc != C_RC_OK) + TXC_WARN(txc, "ZBP error: %p rc: %s\n", put_req, + cxi_rc_to_str(event_rc)); + else + TXC_DBG(txc, "ZBP received: %p rc: %s\n", put_req, + cxi_rc_to_str(event_rc)); + + ret = cxip_send_req_dequeue(put_req->send.txc, put_req); + if (ret != FI_SUCCESS) + return ret; + + cxip_tx_id_free(txc, mb.tx_id); + + /* The unexpected message has been matched. Generate a + * completion event. The ZBP event is guaranteed to arrive + * after the eager Send Ack, so the transfer is always done at + * this point. + * + * If MATCH_COMPLETE was requested, software must manage + * counters. + */ + report_send_completion(put_req, true); + + ofi_atomic_dec32(&put_req->send.txc->otx_reqs); + cxip_evtq_req_free(put_req); + + return FI_SUCCESS; + + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_oflow_bufpool_fini() - Finalize overflow buffers used for messaging. + * + * Must be called with the RX PtlTE disabled. + */ +void cxip_oflow_bufpool_fini(struct cxip_rxc *rxc) +{ + struct cxip_deferred_event *def_ev = NULL; + struct cxip_ptelist_buf *oflow_buf; + struct dlist_entry *tmp; + int i; + int def_events = 0; + + /* Clean up unexpected Put records. The PtlTE is disabled, so no more + * events can be expected. + */ + for (i = 0; i < CXIP_DEF_EVENT_HT_BUCKETS; i++) { + dlist_foreach_container_safe(&rxc->deferred_events.bh[i], + struct cxip_deferred_event, + def_ev, rxc_entry, tmp) { + /* Dropping the last reference will cause the + * oflow_buf to be removed from the RXC list and + * freed. 
+ */ + oflow_buf = def_ev->req->req_ctx; + + if (oflow_buf->le_type == CXIP_LE_TYPE_RX) + oflow_req_put_bytes(def_ev->req, + def_ev->ev.tgt_long.mlength); + + free_put_event(rxc, def_ev); + def_events++; + } + } + + if (def_events) + RXC_DBG(rxc, "Freed %d deferred event(s)\n", def_events); + + cxip_ptelist_bufpool_fini(rxc->oflow_list_bufpool); +} + +int cxip_oflow_bufpool_init(struct cxip_rxc *rxc) +{ + struct cxip_ptelist_bufpool_attr attr = { + .list_type = C_PTL_LIST_OVERFLOW, + .ptelist_cb = cxip_oflow_cb, + .buf_size = cxip_env.oflow_buf_size, + .min_posted = cxip_env.oflow_buf_min_posted, + .max_posted = cxip_env.oflow_buf_min_posted, /* min == max */ + .max_cached = cxip_env.oflow_buf_max_cached, + .min_space_avail = rxc->max_eager_size, + }; + + return cxip_ptelist_bufpool_init(rxc, &rxc->oflow_list_bufpool, &attr); +} + +/* + * cxip_rdzv_done_notify() - Sends a rendezvous complete from target to source + * + * Sends a zero byte matching notification to the source of rendezvous + * indicating completion of a rendezvous. This is used when restricted get + * DMA (CXIP_RDZV_PROTO_ALT_READ) is used to transfer non-eager data. + */ +static int cxip_rdzv_done_notify(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + union c_fab_addr dfa; + uint32_t pid_idx = CXIP_PTL_IDX_RDZV_DEST; + uint32_t match_id; + struct c_full_dma_cmd cmd = {}; + union cxip_match_bits mb = {}; + int ret; + uint8_t idx_ext; + + mb.rdzv_id_lo = req->recv.rdzv_id; + mb.rdzv_id_hi = req->recv.rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + mb.rdzv_done = 1; + mb.le_type = CXIP_LE_TYPE_ZBP; + + cxi_build_dfa(req->recv.rget_nic, req->recv.rget_pid, rxc->pid_bits, + pid_idx, &dfa, &idx_ext); + match_id = CXI_MATCH_ID(rxc->pid_bits, rxc->ep_obj->src_addr.pid, + rxc->ep_obj->src_addr.nic); + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_PUT; + cmd.index_ext = idx_ext; + cmd.event_send_disable = 1; + cmd.dfa = dfa; + cmd.eq = cxip_evtq_eqn(&rxc->ep_obj->txc.tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = match_id; + cmd.match_bits = mb.raw; + + ret = cxi_cq_emit_dma(rxc->tx_cmdq->dev_cmdq, &cmd); + if (ret != FI_SUCCESS) { + RXC_DBG(rxc, "Faile to write notify IDC: %d %s\n", + ret, fi_strerror(-ret)); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->tx_cmdq->dev_cmdq); + + RXC_DBG(rxc, "RDZV done notify send RDZV ID: %d\n", + req->recv.rdzv_id); + + return FI_SUCCESS; +} + +static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_deferred_event *def_ev; + int event_rc; + int ret; + bool matched; + + switch (event->hdr.event_type) { + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + case C_EVENT_SEND: + RXC_WARN(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + + case C_EVENT_PUT_OVERFLOW: + /* We matched an unexpected header */ + /* Check for a previously received unexpected Put event, + * if not found defer until it arrives. + */ + def_ev = match_put_event(rxc, req, event, &matched); + if (!def_ev) + return -FI_EAGAIN; + + /* For multi-recv, management of start_offset requires events + * manage_local related events to arrive in order. + * Only C_EVENT_PUT_OVERFLOW events meet this criteria. 
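+			 *
+			 * Worked example with hypothetical sizes: a 64 KiB
+			 * multi-recv buffer has already handed out 60 KiB
+			 * (start_offset == 61440) when this Put Overflow
+			 * arrives with rlength 8192 and auto_unlinked set.
+			 * mrecv_len is clamped to the 4096 bytes that remain,
+			 * so mrecv_unlink_bytes becomes 61440 + 4096 == 65536,
+			 * the full buffer; the parent is then freed once
+			 * recv_req_report() sees the consumed bytes reach that
+			 * total.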
+ */ + def_ev->mrecv_start = req->recv.start_offset; + def_ev->mrecv_len = + mrecv_req_put_bytes(req, event->tgt_long.rlength); + + if (req->recv.multi_recv && event->tgt_long.auto_unlinked) { + /* If a C_EVENT_PUT_OVERFLOW unlinks a multi-recv + * buffer, mrecv_start contains the number of bytes + * consumed before this C_EVENT_PUT_OVERFLOW. Adding in + * mrecv_len gets the total bytes consumed. + */ + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + def_ev->mrecv_start + def_ev->mrecv_len; + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + if (!matched) + return FI_SUCCESS; + + RXC_DBG(rxc, "Matched deferred event: %p\n", def_ev); + + ret = cxip_ux_send(req, def_ev->req, &def_ev->ev, + def_ev->mrecv_start, def_ev->mrecv_len, + false); + if (ret == FI_SUCCESS) { + free_put_event(rxc, def_ev); + } else { + /* undo mrecv_req_put_bytes() and orx_hw_ule_cnt dec */ + req->recv.start_offset -= def_ev->mrecv_len; + ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); + } + + return ret; + case C_EVENT_PUT: + /* Eager data was delivered directly to the user buffer. */ + if (req->recv.multi_recv) { + if (event->tgt_long.auto_unlinked) { + uintptr_t mrecv_head; + uintptr_t mrecv_tail; + size_t mrecv_bytes_remaining; + size_t rlen; + + /* For C_EVENT_PUT, need to calculate how much + * of the multi-recv buffer was consumed while + * factoring in any truncation. + */ + mrecv_head = + CXI_IOVA_TO_VA(req->recv.recv_md->md, + event->tgt_long.start); + mrecv_tail = (uintptr_t)req->recv.recv_buf + + req->recv.ulen; + mrecv_bytes_remaining = mrecv_tail - mrecv_head; + rlen = MIN(mrecv_bytes_remaining, + event->tgt_long.rlength); + + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + mrecv_head - + (uintptr_t)req->recv.recv_buf + rlen; + } + + req = rdzv_mrecv_req_event(req, event); + if (!req) + return -FI_EAGAIN; + + /* Set start pointer and data_len using Rendezvous or + * Put Overflow event (depending on if message was + * unexpected). + */ + } + + recv_req_tgt_event(req, event); + + /* Count the rendezvous event. */ + rdzv_recv_req_event(req, event->hdr.event_type); + return FI_SUCCESS; + case C_EVENT_RENDEZVOUS: + if (req->recv.multi_recv) { + req = rdzv_mrecv_req_event(req, event); + if (!req) + return -FI_EAGAIN; + + /* Use Rendezvous event to set start pointer and + * data_len for expected Sends. + */ + struct cxip_req *parent = req->recv.parent; + size_t mrecv_bytes_remaining; + + req->buf = CXI_IOVA_TO_VA( + parent->recv.recv_md->md, + event->tgt_long.start) - + event->tgt_long.mlength; + req->recv.recv_buf = (void *)req->buf; + + mrecv_bytes_remaining = + (uint64_t)parent->recv.recv_buf + + parent->recv.ulen - + (uint64_t)req->recv.recv_buf; + req->data_len = MIN(mrecv_bytes_remaining, + event->tgt_long.rlength); + } else { + req->data_len = MIN(req->recv.ulen, event->tgt_long.rlength); + } + + recv_req_tgt_event(req, event); + + if (!event->tgt_long.get_issued) { + if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > + rxc->max_tx || issue_rdzv_get(req)) { + + /* Could not issue get */ + ofi_atomic_dec32(&rxc->orx_tx_reqs); + + /* Undo multi-recv event processing. */ + if (req->recv.multi_recv && + !req->recv.rdzv_events) { + dlist_remove(&req->recv.children); + cxip_evtq_req_free(req); + } + return -FI_EAGAIN; + } + + RXC_DBG(rxc, "Software issued Get, req: %p\n", req); + } + + /* Count the rendezvous event. */ + rdzv_recv_req_event(req, event->hdr.event_type); + return FI_SUCCESS; + case C_EVENT_REPLY: + /* If mrecv, look up the correct child request. 
*/ + if (req->recv.multi_recv) { + req = rdzv_mrecv_req_event(req, event); + if (!req) + return -FI_EAGAIN; + } + + /* If a rendezvous operation requires a done notification + * send it. Must wait for the ACK from the notify to be returned + * before completing the target operation. + */ + if (req->recv.done_notify) { + if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > rxc->max_tx || + cxip_rdzv_done_notify(req)) { + + /* Could not issue notify, will be retried */ + ofi_atomic_dec32(&rxc->orx_tx_reqs); + return -FI_EAGAIN; + } + } + + /* Rendezvous Get completed, update event counts and + * complete if using unrestricted get protocol. + */ + req->recv.rc = cxi_init_event_rc(event); + rdzv_recv_req_event(req, event->hdr.event_type); + + /* If RGet initiated by software return the TX credit */ + if (!event->init_short.rendezvous) { + ofi_atomic_dec32(&rxc->orx_tx_reqs); + assert(ofi_atomic_get32(&rxc->orx_tx_reqs) >= 0); + } + + return FI_SUCCESS; + + case C_EVENT_ACK: + event_rc = cxi_init_event_rc(event); + if (event_rc != C_RC_OK) + RXC_WARN(rxc, "%#x:%u Bad RDZV notify ACK status %s\n", + req->recv.rget_nic, req->recv.rget_pid, + cxi_rc_to_str(event_rc)); + + /* Special case of the ZBP destination EQ being full and ZBP + * could not complete. This must be retried, we use the TX + * credit already allocated. + */ + if (event_rc == C_RC_ENTRY_NOT_FOUND) { + usleep(CXIP_DONE_NOTIFY_RETRY_DELAY_US); + + if (cxip_rdzv_done_notify(req)) + return -FI_EAGAIN; + + return FI_SUCCESS; + } + + /* Reflect the completion status of the ACK in the target + * side completion so that a failure will not go undetected. + */ + req->recv.rc = event_rc; + ofi_atomic_dec32(&req->recv.rxc->orx_tx_reqs); + rdzv_recv_req_event(req, event->hdr.event_type); + + return FI_SUCCESS; + + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +static void cxip_rxc_record_req_stat(struct cxip_rxc *rxc, enum c_ptl_list list, + size_t rlength, struct cxip_req *req) +{ + enum fi_hmem_iface iface = rlength ? req->recv.recv_md->info.iface : FI_HMEM_SYSTEM; + + cxip_msg_counters_msg_record(&rxc->cntrs, list, iface, rlength); +} + +/* + * cxip_recv_cb() - Process a user receive buffer event. + * + * A user receive buffer is described by an LE linked to the Priority list. + * Link, Unlink, Put, Put Overflow, and Reply events are expected from a user + * receive buffer. + * + * A Link event indicates that a new user buffer has been linked to the + * priority list. Successful Link events may be suppressed. + * + * An Unlink event indicates that a user buffer has been unlinked. Normally, a + * receive is used once and unlinked when it is matched with a Send. In this + * case, a successful Unlink event may be suppressed. + * + * For expected, eager Sends, a Put will be matched to a user receive buffer by + * the NIC. Send data is copied directly to the user buffer. A Put event is + * generated describing the match. + * + * For unexpected, eager Sends, a Put will first match a buffer in the Overflow + * list. See cxip_oflow_cb() for details on Overflow event handling. Once a + * matching user receive buffer is appended to the Priority list, a Put + * Overflow event is generated. Put and Put Overflow events for an unexpected, + * eager Send must be correlated. These events may arrive in any order. Once + * both events are accounted, data is copied from the Overflow buffer to the + * user receive buffer. 
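+ *
+ * Whichever half of the Put / Put Overflow pair arrives first is parked as a
+ * deferred event by match_put_event(); the second half completes the pair and
+ * cxip_ux_send() copies the payload and reports the receive.  As noted in
+ * recv_req_tgt_event(), offloaded rendezvous transfers correlate the pair by
+ * rendezvous ID, while eager transfers use the start address within the
+ * Overflow buffer.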
+ * + * Unexpected, eager Sends that are longer than the eager threshold have their + * data truncated to zero. This is to avoid long messages consuming too much + * Overflow buffer space at the target. Once a match is made with a user + * receive buffer, data is re-read from the initiator using a Get. + * + * Rendezvous receive events are handled by cxip_recv_rdzv_cb(). + */ +static int cxip_recv_cb(struct cxip_req *req, const union c_event *event) +{ + int ret; + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_deferred_event *def_ev; + bool rdzv = false; + bool matched; + + /* Common processing for rendezvous and non-rendezvous events. + * TODO: Avoid having two switch statements for event_type. + */ + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* In cases where the LE pool entry reservation is insufficient + * to meet priority list buffers (due to multiple EP sharing an + * LE Pool or insufficient LE Pool reservation value), then + * enabling the periodic checking of LE allocations can be + * used to force preemptive transitions to software match mode. + */ + if (cxi_tgt_event_rc(event) == C_RC_OK) { + + if (!cxip_env.hybrid_recv_preemptive) + return FI_SUCCESS; + + /* Check for possible hybrid mode preemptive + * transitions to software managed mode. + */ + if (cxip_rxp_check_le_usage_hybrid_preempt(rxc, event)) + RXC_WARN(rxc, + "Force preemptive switch to SW EP\n"); + + return FI_SUCCESS; + } + + /* If endpoint has been disabled and an append fails, free the + * user request without reporting any event. + */ + if (rxc->state == RXC_DISABLED) { + cxip_recv_req_free(req); + return FI_SUCCESS; + } + + /* Save append to repost, NIC will initiate transition to + * software managed EP. + */ + if (cxi_tgt_event_rc(event) == C_RC_PTLTE_SW_MANAGED) { + RXC_WARN(rxc, "Append err, transitioning to SW\n"); + cxip_recv_req_dropped(req); + + return FI_SUCCESS; + } + + /* Transition into onload and flow control if an append + * fails. + */ + if (cxi_tgt_event_rc(event) != C_RC_NO_SPACE) + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT_STS, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_tgt_event_rc(event))); + + RXC_WARN(rxc, "Append err, priority LE exhaustion\n"); + + /* Manually transition to DISABLED to initiate flow control + * and onload instead of waiting for eventual NIC no match + * transition. + */ + ret = cxip_recv_pending_ptlte_disable(rxc, true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Force disable failed %d %s\n", + ret, fi_strerror(-ret)); + + ret = FI_SUCCESS; + cxip_recv_req_dropped(req); + + return ret; + + case C_EVENT_UNLINK: + assert(!event->tgt_long.auto_unlinked); + + /* TODO: This is broken with multi-recv. The multi-recv request + * may be freed with pending child requests. + */ + req->recv.unlinked = true; + recv_req_report(req); + cxip_recv_req_free(req); + + return FI_SUCCESS; + + case C_EVENT_PUT_OVERFLOW: + cxip_rxc_record_req_stat(rxc, C_PTL_LIST_OVERFLOW, + event->tgt_long.rlength, req); + + /* ULE freed. Update RXC state to signal that the RXC should + * be reenabled. + */ + /* TODO: this is not atomic, there must be a better way */ + if (rxc->state == RXC_ONLOAD_FLOW_CONTROL) + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + break; + + case C_EVENT_PUT: + cxip_rxc_record_req_stat(rxc, C_PTL_LIST_PRIORITY, + event->tgt_long.rlength, req); + break; + default: + break; + } + + /* All events related to an offloaded rendezvous receive will be + * handled by cxip_recv_rdzv_cb(). Those events are identified by the + * event rendezvous field. 
Two exceptions are a Reply event generated + * from a SW-issued Get, and a Ack for a software done notification + * when using restricted eager get. When such an event is generated, + * the request will have already processed a Rendezvous event. If the + * rendezvous field is not set, but the rdzv_events count is elevated, + * this must be a SW-issued Reply or Ack event. + */ + if (event->hdr.event_type == C_EVENT_REPLY || + event->hdr.event_type == C_EVENT_ACK) + rdzv = (event->init_short.rendezvous || req->recv.rdzv_events); + else + rdzv = event->tgt_long.rendezvous; + + if (rdzv) + return cxip_recv_rdzv_cb(req, event); + + switch (event->hdr.event_type) { + case C_EVENT_SEND: + /* TODO Handle Send event errors. */ + assert(cxi_event_rc(event) == C_RC_OK); + return FI_SUCCESS; + case C_EVENT_PUT_OVERFLOW: + /* We matched an unexpected header */ + /* Unexpected 0-byte Put events are dropped. Skip matching. */ + if (!event->tgt_long.rlength) { + ret = cxip_ux_send_zb(req, event, + req->recv.start_offset, false); + if (ret == FI_SUCCESS) + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + return ret; + } + + /* Check for a previously received unexpected Put event, + * if not found defer until it arrives. + */ + def_ev = match_put_event(rxc, req, event, &matched); + if (!def_ev) + return -FI_EAGAIN; + + /* For multi-recv, management of start_offset requires events + * manage_local related events to arrive in order. + * Only C_EVENT_PUT_OVERFLOW events meet this criteria. + */ + def_ev->mrecv_start = req->recv.start_offset; + def_ev->mrecv_len = + mrecv_req_put_bytes(req, event->tgt_long.rlength); + + if (req->recv.multi_recv && event->tgt_long.auto_unlinked) { + /* If a C_EVENT_PUT_OVERFLOW unlinks a multi-recv + * buffer, mrecv_start contains the number of bytes + * consumed before this C_EVENT_PUT_OVERFLOW. Adding in + * mrecv_len gets the total bytes consumed. + */ + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + def_ev->mrecv_start + def_ev->mrecv_len; + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + if (!matched) + return FI_SUCCESS; + + ret = cxip_ux_send(req, def_ev->req, &def_ev->ev, + def_ev->mrecv_start, def_ev->mrecv_len, + false); + if (ret == FI_SUCCESS) { + free_put_event(rxc, def_ev); + } else { + /* undo mrecv_req_put_bytes() and orx_hw_ule_cnt dec */ + req->recv.start_offset -= def_ev->mrecv_len; + ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); + } + + return ret; + case C_EVENT_PUT: + /* Data was delivered directly to the user buffer. Complete the + * request. + */ + if (req->recv.multi_recv) { + if (event->tgt_long.auto_unlinked) { + uintptr_t mrecv_head; + + /* For C_EVENT_PUT, need to calculate how much + * of the multi-recv buffer was consumed while + * factoring in any truncation. + */ + mrecv_head = + CXI_IOVA_TO_VA(req->recv.recv_md->md, + event->tgt_long.start); + + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + mrecv_head - + (uintptr_t)req->recv.recv_buf + + event->tgt_long.mlength; + } + + req = mrecv_req_dup(req); + if (!req) + return -FI_EAGAIN; + recv_req_tgt_event(req, event); + + req->buf = (uint64_t)(CXI_IOVA_TO_VA( + req->recv.recv_md->md, + event->tgt_long.start)); + req->data_len = event->tgt_long.mlength; + + recv_req_report(req); + cxip_evtq_req_free(req); + } else { + req->data_len = event->tgt_long.mlength; + recv_req_tgt_event(req, event); + recv_req_report(req); + cxip_recv_req_free(req); + } + return FI_SUCCESS; + + case C_EVENT_REPLY: + /* Long-send Get completed. Complete the request. 
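+		 * (This is the unexpected long-send path described in the
+		 * function comment above: the eager data was truncated to zero
+		 * at the Overflow buffer and re-read from the initiator with a
+		 * Get, whose Reply lands here.)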
*/ + req->recv.rc = cxi_init_event_rc(event); + + recv_req_report(req); + if (req->recv.multi_recv) + cxip_evtq_req_free(req); + else + cxip_recv_req_free(req); + + return FI_SUCCESS; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_recv_cancel() - Cancel outstanding receive request. + */ +int cxip_recv_cancel(struct cxip_req *req) +{ + int ret = FI_SUCCESS; + struct cxip_rxc *rxc = req->recv.rxc; + + /* In hybrid mode requests could be on priority list + * or software receive list. + */ + if (req->recv.software_list) { + dlist_remove_init(&req->recv.rxc_entry); + req->recv.canceled = true; + req->recv.unlinked = true; + recv_req_report(req); + cxip_recv_req_free(req); + } else { + ret = cxip_pte_unlink(rxc->rx_pte, C_PTL_LIST_PRIORITY, + req->req_id, rxc->rx_cmdq); + if (ret == FI_SUCCESS) + req->recv.canceled = true; + } + return ret; +} + +/* + * cxip_recv_reenable() - Attempt to re-enable the RX queue. + * + * Called by disabled EP ready to re-enable. + * + * Determine if the RX queue can be re-enabled and perform a state change + * command if necessary. The Endpoint must receive dropped Send notifications + * from all peers who experienced drops before re-enabling the RX queue. + * + * Caller must hold ep_obj->lock. + */ +int cxip_recv_reenable(struct cxip_rxc *rxc) +{ + struct cxi_pte_status pte_status = {}; + int ret __attribute__((unused)); + + if (rxc->drop_count == -1) { + RXC_WARN(rxc, "Waiting for pending FC_NOTIFY messages\n"); + return -FI_EAGAIN; + } + + ret = cxil_pte_status(rxc->rx_pte->pte, &pte_status); + assert(!ret); + + if (rxc->drop_count != pte_status.drop_count) { + RXC_DBG(rxc, "Processed %d/%d drops\n", + rxc->drop_count, pte_status.drop_count); + return -FI_EAGAIN; + } + + RXC_WARN(rxc, "Re-enabling PTE, drop_count %d\n", + rxc->drop_count); + + do { + ret = cxip_rxc_msg_enable(rxc, rxc->drop_count); + if (ret == -FI_EAGAIN && + rxc->new_state == RXC_ENABLED_SOFTWARE) { + RXC_WARN(rxc, + "PTE disable->sm drop mismatch, will retry\n"); + break; + } + } while (ret == -FI_EAGAIN); + + if (ret != FI_SUCCESS && ret != -FI_EAGAIN) + RXC_FATAL(rxc, "cxip_rxc_msg_enable failed: %d\n", ret); + + return ret; +} + +/* + * cxip_fc_resume_cb() - Process FC resume completion events. + */ +int cxip_fc_resume_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_fc_drops *fc_drops = container_of(req, + struct cxip_fc_drops, req); + struct cxip_rxc *rxc = fc_drops->rxc; + int ret = FI_SUCCESS; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + switch (cxi_event_rc(event)) { + case C_RC_OK: + RXC_DBG(rxc, + "FC_RESUME to %#x:%u successfully sent: retry_count=%u\n", + fc_drops->nic_addr, fc_drops->pid, + fc_drops->retry_count); + free(fc_drops); + break; + + /* This error occurs when the target's control event queue has + * run out of space. Since the target should be processing the + * event queue, it is safe to replay messages until C_RC_OK is + * returned. 
+ */ + case C_RC_ENTRY_NOT_FOUND: + fc_drops->retry_count++; + RXC_WARN(rxc, + "%#x:%u dropped FC message: retry_delay_usecs=%d retry_count=%u\n", + fc_drops->nic_addr, fc_drops->pid, + cxip_env.fc_retry_usec_delay, + fc_drops->retry_count); + usleep(cxip_env.fc_retry_usec_delay); + ret = cxip_ctrl_msg_send(req); + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT_STS, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + return ret; +} + +/* + * cxip_fc_process_drops() - Process a dropped Send notification from a peer. + * + * Called by disabled EP waiting to re-enable. + * + * When a peer detects dropped Sends it follows up by sending a message to the + * disabled Endpoint indicating the number of drops experienced. The disabled + * Endpoint peer must count all drops before re-enabling its RX queue. + */ +int cxip_fc_process_drops(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, + uint32_t pid, uint16_t drops) +{ + struct cxip_rxc *rxc = &ep_obj->rxc; + struct cxip_fc_drops *fc_drops; + int ret __attribute__((unused)); + + fc_drops = calloc(1, sizeof(*fc_drops)); + if (!fc_drops) { + RXC_WARN(rxc, "Failed to allocate drops\n"); + return -FI_ENOMEM; + } + + /* TODO: Cleanup cxip_fc_drops fields. Many of the fields are redundant + * with the req structure. + */ + fc_drops->rxc = rxc; + fc_drops->nic_addr = nic_addr; + fc_drops->pid = pid; + fc_drops->drops = drops; + + fc_drops->req.send.nic_addr = nic_addr; + fc_drops->req.send.pid = pid; + fc_drops->req.send.mb.drops = drops; + + fc_drops->req.send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; + fc_drops->req.send.mb.ctrl_msg_type = CXIP_CTRL_MSG_FC_RESUME; + fc_drops->req.cb = cxip_fc_resume_cb; + fc_drops->req.ep_obj = rxc->ep_obj; + + dlist_insert_tail(&fc_drops->rxc_entry, &rxc->fc_drops); + + RXC_DBG(rxc, "Processed drops: %d NIC: %#x PID: %d\n", + drops, nic_addr, pid); + + rxc->drop_count += drops; + + /* Wait until search and delete completes before attempting to + * re-enable. + */ + if (rxc->state == RXC_FLOW_CONTROL) { + ret = cxip_recv_reenable(rxc); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + /* Disable to software managed transition is synchronous + * in order to handle drop count mismatches correctly. If + * successful the H/W transition completed, otherwise it + * will be retried when notified and count matches. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE && + ret == FI_SUCCESS) { + cxip_fc_progress_ctrl(rxc); + rxc->state = RXC_ENABLED_SOFTWARE; + RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); + } + } + + return FI_SUCCESS; +} + +/* + * cxip_recv_replay() - Replay dropped Receive requests. + * + * When no LE is available while processing an Append command, the command is + * dropped and future appends are disabled. After all outstanding commands are + * dropped and resources are recovered, replayed all Receive requests in order. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_recv_replay(struct cxip_rxc *rxc) +{ + struct cxip_req *req; + struct dlist_entry *tmp; + bool restart_seq = true; + int ret; + + dlist_foreach_container_safe(&rxc->replay_queue, + struct cxip_req, req, + recv.rxc_entry, tmp) { + dlist_remove_init(&req->recv.rxc_entry); + + /* Since the RXC and PtlTE are in a controlled state and no new + * user receives are being posted, it is safe to ignore the RXC + * state when replaying failed user posted receives. 
+ */ + ret = cxip_recv_req_queue(req, restart_seq); + + /* Match made in software? */ + if (ret == -FI_EALREADY) + continue; + + /* TODO: Low memory or full CQ during SW matching would cause + * -FI_EAGAIN to be seen here. + */ + assert(ret == FI_SUCCESS); + + restart_seq = false; + } + + return FI_SUCCESS; +} + +/* + * cxip_recv_resume() - Send a resume message to all peers who reported dropped + * Sends. + * + * Called by disabled EP after re-enable. + * + * After counting all dropped sends targeting a disabled RX queue and + * re-enabling the queue, notify all peers who experienced dropped Sends so + * they can be replayed. + * + * Caller must hold ep_obj->lock. + */ +int cxip_recv_resume(struct cxip_rxc *rxc) +{ + struct cxip_fc_drops *fc_drops; + struct dlist_entry *tmp; + int ret; + + dlist_foreach_container_safe(&rxc->fc_drops, + struct cxip_fc_drops, fc_drops, + rxc_entry, tmp) { + ret = cxip_ctrl_msg_send(&fc_drops->req); + if (ret) + return ret; + + dlist_remove(&fc_drops->rxc_entry); + } + + return FI_SUCCESS; +} + +/* + * cxip_fc_progress_ctrl() - Progress the control EP until all resume + * control messages can be queued. + * + * Caller must hold ep_obj->lock. + */ +static void cxip_fc_progress_ctrl(struct cxip_rxc *rxc) +{ + int ret __attribute__((unused)); + + assert(rxc->state == RXC_FLOW_CONTROL); + + /* Successful transition from disabled occurred, reset + * drop count. + */ + rxc->drop_count = rxc->ep_obj->asic_ver < CASSINI_2_0 ? -1 : 0; + + while ((ret = cxip_recv_resume(rxc)) == -FI_EAGAIN) + cxip_ep_tx_ctrl_progress_locked(rxc->ep_obj); + + assert(ret == FI_SUCCESS); +} + +/* + * cxip_post_ux_onload_sw() - Nic HW-to-SW EP post UX onload processing. + * + * PTE transitioned from enabled to software managed. Onloading + * was done and appends that failed need to be replayed. + */ +static void cxip_post_ux_onload_sw(struct cxip_rxc *rxc) +{ + int ret; + + assert(cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE); + assert(rxc->prev_state == RXC_ENABLED); + assert(rxc->new_state == RXC_ENABLED_SOFTWARE); + + ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, + true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Request list replenish failed %d %s\n", + ret, fi_strerror(-ret)); + + /* Priority list appends that failed during the transition can + * now be replayed. + */ + ret = cxip_recv_replay(rxc); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + if (rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) { + /* Transition from enabled to software managed is complete. + * Allow posting of receive operations. + */ + RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); + rxc->state = RXC_ENABLED_SOFTWARE; + } +} + +/* + * cxip_post_ux_onload_fc() - Flow control onload complete processing. + * + * PTE transitioned to disabled and UX onload has completed. + */ +static void cxip_post_ux_onload_fc(struct cxip_rxc *rxc) +{ + int ret; + + /* Disable RX matching offload if transitioning to + * software enabled EP. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE) { + RXC_DBG(rxc, "Transitioning to SW EP\n"); + rxc->msg_offload = 0; + } + + if (rxc->fc_reason == C_SC_FC_EQ_FULL) + goto replay; + + if (rxc->new_state == RXC_ENABLED_SOFTWARE) + ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, + true); + else + ret = cxip_ptelist_buf_replenish(rxc->oflow_list_bufpool, + true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "%s buffer replenish failed %d %s\n", + rxc->new_state == RXC_ENABLED_SOFTWARE ? 
+ "Request" : "Overflow", ret, fi_strerror(-ret)); + +replay: + /* Any priority list appends that failed during the transition + * can now be replayed. + */ + if (rxc->new_state == RXC_ENABLED) + rxc->msg_offload = 1; + + ret = cxip_recv_replay(rxc); + RXC_DBG(rxc, "Replay of failed receives ret: %d %s\n", + ret, fi_strerror(-ret)); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + if (rxc->state != RXC_ONLOAD_FLOW_CONTROL_REENABLE && + rxc->new_state != RXC_ENABLED_SOFTWARE) + RXC_FATAL(rxc, FC_SW_ONLOAD_MSG_FATAL); + + rxc->state = RXC_FLOW_CONTROL; + ret = cxip_recv_reenable(rxc); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + RXC_WARN(rxc, "Now in RXC_FLOW_CONTROL\n"); + + /* Disable to software managed transition is synchronous in order to + * handle drop count mismatches correctly. If successful the H/W + * transition completed, otherwise the transition will occur when + * additional drop notifies are received. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE && ret == FI_SUCCESS) { + cxip_fc_progress_ctrl(rxc); + rxc->state = RXC_ENABLED_SOFTWARE; + RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); + } +} + +/* + * cxip_ux_onload_complete() - Unexpected list entry onload complete. + * + * All unexpected message headers have been onloaded from hardware. + */ +static void cxip_ux_onload_complete(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->search.rxc; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + free(rxc->ule_offsets); + rxc->ule_offsets = 0; + + /* During a transition to software managed PtlTE, received + * request list entries resulting from hardware not matching + * the priority list on an incoming packet were added to a + * pending unexpected message list. We merge the two + * expected list here. + */ + RXC_DBG(rxc, "Req pending %d UX entries, SW list %d UX entries\n", + rxc->sw_pending_ux_list_len, rxc->sw_ux_list_len); + + dlist_splice_tail(&rxc->sw_ux_list, &rxc->sw_pending_ux_list); + rxc->sw_ux_list_len += rxc->sw_pending_ux_list_len; + rxc->sw_pending_ux_list_len = 0; + + RXC_WARN(rxc, "Software UX list updated, %d SW UX entries\n", + rxc->sw_ux_list_len); + + if (rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) + cxip_post_ux_onload_sw(rxc); + else + cxip_post_ux_onload_fc(rxc); + + ofi_atomic_dec32(&rxc->orx_reqs); + cxip_evtq_req_free(req); +} + +/* + * cxip_get_ule_offsets() - Initialize an in-order array of ULE offsets + * + * If snapshot is requested, no more than two passes at getting offsets + * will be made. This is intended to be used with FI_CLAIM processing, + * where the PtlTE is enabled. + */ +static int cxip_get_ule_offsets(struct cxip_rxc *rxc, uint64_t **ule_offsets, + unsigned int *num_ule_offsets, bool snapshot) +{ + struct cxi_pte_status pte_status = { + .ule_count = 512 + }; + size_t cur_ule_count = 0; + int ret; + int calls = 0; + + /* Get all the unexpected header remote offsets. 
*/ + *ule_offsets = NULL; + *num_ule_offsets = 0; + + do { + cur_ule_count = pte_status.ule_count; + *ule_offsets = reallocarray(*ule_offsets, cur_ule_count, + sizeof(*ule_offsets)); + if (*ule_offsets == NULL) { + RXC_WARN(rxc, "Failed allocate ule offset memory\n"); + ret = -FI_ENOMEM; + goto err; + } + + pte_status.ule_offsets = (void *)*ule_offsets; + ret = cxil_pte_status(rxc->rx_pte->pte, &pte_status); + assert(!ret); + } while (cur_ule_count < pte_status.ule_count && + !(snapshot && ++calls > 1)); + + *num_ule_offsets = pte_status.ule_count; + + return FI_SUCCESS; +err: + free(*ule_offsets); + + return ret; +} + +/* + * cxip_ux_onload_cb() - Process SEARCH_AND_DELETE command events. + */ +static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->search.rxc; + struct cxip_deferred_event *def_ev; + struct cxip_ux_send *ux_send; + bool matched; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + switch (event->hdr.event_type) { + case C_EVENT_PUT_OVERFLOW: + assert(cxi_event_rc(event) == C_RC_OK); + + ux_send = calloc(1, sizeof(*ux_send)); + if (!ux_send) { + RXC_WARN(rxc, "Failed allocate to memory\n"); + return -FI_EAGAIN; + } + + /* Zero-byte unexpected onloads require special handling since + * no deferred structure would be allocated. + */ + if (event->tgt_long.rlength) { + + def_ev = match_put_event(rxc, req, event, &matched); + if (!matched) { + if (!def_ev) { + free(ux_send); + return -FI_EAGAIN; + } + + /* Gather Put events later */ + def_ev->ux_send = ux_send; + req->search.puts_pending++; + } else { + ux_send->req = def_ev->req; + ux_send->put_ev = def_ev->ev; + + free_put_event(rxc, def_ev); + } + } else { + ux_send->put_ev = *event; + } + + /* For flow control transition if a ULE is freed, then + * set state so that re-enable will be attempted. + */ + if (rxc->state == RXC_ONLOAD_FLOW_CONTROL) + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + + /* Fixup event with the expected remote offset for an RGet. */ + if (event->tgt_long.rlength) { + ux_send->put_ev.tgt_long.remote_offset = + rxc->ule_offsets[rxc->cur_ule_offsets] + + event->tgt_long.mlength; + } + rxc->cur_ule_offsets++; + + dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); + rxc->sw_ux_list_len++; + + RXC_DBG(rxc, "Onloaded Send: %p\n", ux_send); + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + break; + case C_EVENT_SEARCH: + if (rxc->new_state == RXC_ENABLED_SOFTWARE && + rxc->state == RXC_ONLOAD_FLOW_CONTROL) + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + + if (rxc->state == RXC_ONLOAD_FLOW_CONTROL) + RXC_FATAL(rxc, FC_SW_ONLOAD_MSG_FATAL); + + req->search.complete = true; + rxc->rx_evtq.ack_batch_size = rxc->rx_evtq.cq->ack_batch_size; + + RXC_DBG(rxc, "UX Onload Search done\n"); + + if (cxip_ux_is_onload_complete(req)) + cxip_ux_onload_complete(req); + + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + return FI_SUCCESS; +} + +/* + * cxip_ux_onload() - Issue SEARCH_AND_DELETE command to on-load unexpected + * Send headers queued on the RXC message queue. + * + * Caller must hold ep_obj->lock. 
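+ *
+ * The search covers the entire unexpected list (match_id CXI_MATCH_ID_ANY,
+ * all ignore bits set). Each onloaded header is turned into a cxip_ux_send
+ * entry on rxc->sw_ux_list by cxip_ux_onload_cb(), with its remote offset
+ * taken from the cxip_get_ule_offsets() snapshot so a later RGet can use
+ * the correct initiator offset.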
+ */ +static int cxip_ux_onload(struct cxip_rxc *rxc) +{ + struct cxip_req *req; + union c_cmdu cmd = {}; + int ret; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + RXC_DBG(rxc, "Initiate hardware UX list onload\n"); + + /* Get all the unexpected header remote offsets. */ + rxc->ule_offsets = NULL; + rxc->num_ule_offsets = 0; + rxc->cur_ule_offsets = 0; + + ret = cxip_get_ule_offsets(rxc, &rxc->ule_offsets, + &rxc->num_ule_offsets, false); + if (ret) { + RXC_WARN(rxc, "Failed to read UX remote offsets: %d %s\n", + ret, fi_strerror(-ret)); + goto err; + } + + /* Populate request */ + req = cxip_evtq_req_alloc(&rxc->rx_evtq, 1, NULL); + if (!req) { + RXC_DBG(rxc, "Failed to allocate request\n"); + ret = -FI_EAGAIN; + goto err_free_onload_offset; + } + ofi_atomic_inc32(&rxc->orx_reqs); + + req->cb = cxip_ux_onload_cb; + req->type = CXIP_REQ_SEARCH; + req->search.rxc = rxc; + + cmd.command.opcode = C_CMD_TGT_SEARCH_AND_DELETE; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = -1UL; + cmd.target.match_id = CXI_MATCH_ID_ANY; + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + ret = -FI_EAGAIN; + goto err_dec_free_cq_req; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + return FI_SUCCESS; + +err_dec_free_cq_req: + ofi_atomic_dec32(&rxc->orx_reqs); + cxip_evtq_req_free(req); +err_free_onload_offset: + free(rxc->ule_offsets); +err: + RXC_WARN(rxc, "Hardware UX list onload initiation error, ret: %d\n", + ret); + return ret; +} + +static int cxip_flush_appends_cb(struct cxip_req *req, + const union c_event *event) +{ + struct cxip_rxc *rxc = req->req_ctx; + int ret; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + assert(event->hdr.event_type == C_EVENT_SEARCH); + assert(cxi_event_rc(event) == C_RC_NO_MATCH); + + ret = cxip_ux_onload(rxc); + if (ret == FI_SUCCESS) { + ofi_atomic_dec32(&rxc->orx_reqs); + cxip_evtq_req_free(req); + } + + return ret; +} + +/* + * cxip_flush_appends() - Flush all user appends for a RXC. + * + * Before cxip_ux_onload() can be called, all user appends in the command queue + * must be flushed. If not, this can cause cxip_ux_onload() to read incorrect + * remote offsets from cxil_pte_status(). The flush is implemented by issuing + * a search command which will match zero ULEs. When the search event is + * processed, all pending user appends will have been processed. Since the RXC + * is not enabled, new appends cannot occur during this time. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_flush_appends(struct cxip_rxc *rxc) +{ + struct cxip_req *req; + union c_cmdu cmd = {}; + int ret; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + /* Populate request */ + req = cxip_evtq_req_alloc(&rxc->rx_evtq, 1, rxc); + if (!req) { + RXC_DBG(rxc, "Failed to allocate request\n"); + ret = -FI_EAGAIN; + goto err; + } + ofi_atomic_inc32(&rxc->orx_reqs); + + rxc->rx_evtq.ack_batch_size = 1; + + req->cb = cxip_flush_appends_cb; + req->type = CXIP_REQ_SEARCH; + + /* Search command which should match nothing. 
*/ + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.match_bits = -1UL; + cmd.target.length = 0; + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + ret = -FI_EAGAIN; + goto err_dec_free_cq_req; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + return FI_SUCCESS; + +err_dec_free_cq_req: + ofi_atomic_dec32(&rxc->orx_reqs); + cxip_evtq_req_free(req); +err: + return ret; +} + +/* + * cxip_recv_pte_cb() - Process receive PTE state change events. + */ +void cxip_recv_pte_cb(struct cxip_pte *pte, const union c_event *event) +{ + struct cxip_rxc *rxc = (struct cxip_rxc *)pte->ctx; + int fc_reason = cxip_fc_reason(event); + int ret __attribute__((unused)); + + switch (pte->state) { + case C_PTLTE_ENABLED: + assert(rxc->state == RXC_FLOW_CONTROL || + rxc->state == RXC_DISABLED || + rxc->state == RXC_PENDING_PTLTE_HARDWARE); + + /* Queue any flow control resume messages */ + if (rxc->state == RXC_FLOW_CONTROL) { + cxip_fc_progress_ctrl(rxc); + RXC_WARN(rxc, "Now in RXC_ENABLED\n"); + } + + rxc->state = RXC_ENABLED; + break; + + case C_PTLTE_DISABLED: + if (rxc->state == RXC_DISABLED) + break; + + if (fc_reason == C_SC_DIS_UNCOR) + RXC_FATAL(rxc, "Disabled, LE uncorrectable err\n"); + + /* An incorrect drop count was used during PTE enable. + * Another attempt will be made when a peer sends a side-band + * drop message. + */ + if (cxi_event_rc(event) == C_RC_NO_MATCH) { + assert(rxc->state == RXC_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == + RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + RXC_WARN(rxc, FC_DROP_COUNT_MSG); + break; + } + + /* Flow control occurred while transitioning from HW to SW + * managed PTE. Since onloading of all UX entries will have + * been initiated (i.e. no new ones will be added) and the + * PTE state change from RXC_PENDING_PTLTE_SOFTWARE_MANAGED + * to RXC_ENABLED_SOFTWARE following onload complete is + * protected by the ep_obj->lock, it is safe to indicate that + * SW managed EP must be re-enabled on onload complete. + * The request list will have been replenished. + */ + if (rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) { + RXC_WARN(rxc, + "Flow control during HW to SW transition\n"); + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + break; + } + + /* Check for flow control during flow control */ + if (rxc->state != RXC_ENABLED && + rxc->state != RXC_ENABLED_SOFTWARE && + rxc->state != RXC_PENDING_PTLTE_DISABLE) { + + /* There is race between SW disable on priority list + * and HW initiated LE flow control which can be + * ignored; otherwise it is a fatal error. + */ + if (fc_reason == CXIP_FC_SOFTWARE_INITIATED) + break; + RXC_FATAL(rxc, FC_SW_LE_MSG_FATAL); + } + + /* Starting flow control processing. The default is for + * flow control should re-enable in the previous + * hardware/software managed state. + */ + rxc->prev_state = rxc->state; + rxc->new_state = rxc->state; + rxc->state = RXC_ONLOAD_FLOW_CONTROL; + + RXC_DBG(rxc, "Flow control detected, H/W: %d reason: %d\n", + event->tgt_long.initiator.state_change.sc_nic_auto, + fc_reason); + + switch (fc_reason) { + case CXIP_FC_SOFTWARE_INITIATED: + /* Software initiated state change, drop count + * needs to start at zero instead of -1. Add 1 to + * account for this. 
Note this is only initiated + * from an hardware enabled PTE state. + */ + RXC_WARN(rxc, "SW initiated flow control\n"); + if (rxc->ep_obj->asic_ver < CASSINI_2_0) + rxc->drop_count++; + + /* If running in hybrid mode, resume operation as a + * software managed EP to reduce LE resource load. + */ + if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE) + rxc->new_state = RXC_ENABLED_SOFTWARE; + + rxc->num_fc_append_fail++; + break; + + case C_SC_FC_EQ_FULL: + /* EQ full does not require LE resources be recovered + * to re-enable. + */ + RXC_WARN(rxc, "Flow control EQ full\n"); + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + rxc->num_fc_eq_full++; + break; + + case C_SC_FC_NO_MATCH: + /* Overflow list buffers were full/could not be matched + * against. Must replenish buffers, but does not in + * itself require resources be recovered. + */ + RXC_WARN(rxc, FC_OFLOW_NO_MATCH_MSG, + cxip_env.oflow_buf_size); + + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + rxc->num_fc_no_match++; + break; + + case C_SC_FC_UNEXPECTED_FAIL: + /* Hybrid mode is not enabled and overflow matches, but + * LE resources prevent unexpected message allocation. + */ + RXC_WARN(rxc, "Flow control UX LE resources\n"); + rxc->num_fc_unexp++; + break; + + case C_SC_FC_REQUEST_FULL: + /* Running as software managed EP and request list + * buffers were full/could not be matched against. + * Must replenish buffers, but does not require that + * LE resources are recovered. + */ + RXC_WARN(rxc, FC_REQ_FULL_MSG, cxip_env.req_buf_size); + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + rxc->num_fc_req_full++; + break; + + case C_SC_SM_APPEND_FAIL: + case C_SC_SM_UNEXPECTED_FAIL: + default: + RXC_FATAL(rxc, "Invalid disable PTE c_sc_reason: %d\n", + fc_reason); + } + rxc->fc_reason = fc_reason; + + do { + ret = cxip_flush_appends(rxc); + } while (ret == -FI_EAGAIN); + + if (ret != FI_SUCCESS) + RXC_FATAL(rxc, "cxip_flush_appends failed: %d\n", ret); + + break; + + case C_PTLTE_SOFTWARE_MANAGED: + /* There is an inherent race between hardware and software + * in setting the PtlTE state. If software requested to + * disable the PtlTE after hardware started a HW to SW + * transition; just wait for the disable event. + */ + if (rxc->state == RXC_PENDING_PTLTE_DISABLE) + break; + + RXC_DBG(rxc, "SW Managed: nic auto: %d, reason: %d\n", + event->tgt_long.initiator.state_change.sc_nic_auto, + event->tgt_long.initiator.state_change.sc_nic_auto ? + event->tgt_long.initiator.state_change.sc_reason : -1); + + /* We should not get a bad drop count status since the + * transition is synchronous but we will need this in + * the future. + */ + if (cxi_event_rc(event) == C_RC_NO_MATCH) { + RXC_WARN(rxc, "Bad drop count, ignored\n"); + break; + } + + /* Sanity check */ + if (rxc->state == RXC_FLOW_CONTROL) + RXC_FATAL(rxc, "FC to SW EP should be synchronous\n"); + + assert(rxc->state == RXC_DISABLED || + rxc->state == RXC_ENABLED || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + /* Hardware should only generate PTE software managed events + * in two cases: + * 1. Initial start in software mode: disabled->software. + * 2. NIC initiated software transition: enabled->software. + */ + switch (fc_reason) { + case CXIP_FC_SOFTWARE_INITIATED: + /* If messaging was initially offloaded then this + * state transition can only happen if the RXC has + * been disabled; it is safe to ignore this change. 
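+			 * When the provider starts without hardware message
+			 * matching (!cxip_env.msg_offload), this event is
+			 * what moves the newly enabled PtlTE into
+			 * RXC_ENABLED_SOFTWARE below.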
+			 */
+			assert(rxc->state == RXC_DISABLED);
+			if (!cxip_env.msg_offload) {
+				RXC_WARN(rxc, "Software managed EP enabled\n");
+				rxc->state = RXC_ENABLED_SOFTWARE;
+			}
+			break;
+
+		case C_SC_SM_APPEND_FAIL:
+		case C_SC_SM_UNEXPECTED_FAIL:
+			/* The NIC initiated the transition; priority list
+			 * appends that are in flight will fail and be added
+			 * to the receive replay list. Update state so that
+			 * no additional appends will be attempted until
+			 * onload completes and the failed appends are
+			 * replayed.
+			 */
+			RXC_WARN(rxc,
+				 "NIC transition to SW EP, c_sc_reason: %d\n",
+				 fc_reason);
+			rxc->fc_reason = fc_reason;
+			rxc->prev_state = rxc->state;
+			rxc->new_state = RXC_ENABLED_SOFTWARE;
+
+			if (rxc->fc_reason == C_SC_SM_UNEXPECTED_FAIL)
+				rxc->num_sc_nic_hw2sw_unexp++;
+			else if (rxc->fc_reason == C_SC_SM_APPEND_FAIL)
+				rxc->num_sc_nic_hw2sw_append_fail++;
+
+			rxc->msg_offload = 0;
+			rxc->state = RXC_PENDING_PTLTE_SOFTWARE_MANAGED;
+			do {
+				/* Flush and kick-off onloading of UX list */
+				ret = cxip_flush_appends(rxc);
+			} while (ret == -FI_EAGAIN);
+			if (ret != FI_SUCCESS)
+				RXC_WARN(rxc, "Flush/UX onload err: %d\n", ret);
+			break;
+		default:
+			RXC_FATAL(rxc, "Invalid PTE c_sc_reason: %d\n",
+				  fc_reason);
+		}
+
+		break;
+	default:
+		RXC_FATAL(rxc, "Unexpected state received: %u\n", pte->state);
+	}
+}
+
+/*
+ * tag_match() - Compare UX Send tag and Receive tags in SW.
+ */
+static bool tag_match(uint64_t init_mb, uint64_t mb, uint64_t ib)
+{
+	return !((init_mb ^ mb) & ~ib);
+}
+
+/*
+ * init_match() - Compare UX Send initiator and Receive initiator in SW.
+ */
+static bool init_match(struct cxip_rxc *rxc, uint32_t init, uint32_t match_id)
+{
+	if (match_id == CXI_MATCH_ID_ANY)
+		return true;
+
+	if (rxc->ep_obj->av->symmetric) {
+		init = CXI_MATCH_ID_EP(rxc->pid_bits, init);
+		match_id = CXI_MATCH_ID_EP(rxc->pid_bits, match_id);
+	}
+
+	return init == match_id;
+}
+
+/*
+ * cxip_claim_onload_cb() - Process SEARCH and DELETE of claimed UX message.
+ */
+static int cxip_claim_onload_cb(struct cxip_req *req,
+				const union c_event *evt)
+{
+	struct cxip_rxc *rxc = req->req_ctx;
+	struct cxip_deferred_event *def_ev;
+	struct cxip_ux_send *ux_send;
+	bool matched = false;
+
+	if (evt->hdr.event_type != C_EVENT_PUT_OVERFLOW)
+		RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT,
+			  cxi_event_to_str(evt),
+			  cxi_rc_to_str(cxi_event_rc(evt)));
+
+	/* Failed to onload UX message, return ENOMSG */
+	if (cxi_event_rc(evt) != C_RC_OK) {
+		RXC_WARN(rxc, "FI_CLAIM HW onload failed: %d\n",
+			 cxi_event_rc(evt));
+		recv_req_peek_complete(req, NULL);
+
+		return FI_SUCCESS;
+	}
+
+	ofi_atomic_dec32(&rxc->orx_hw_ule_cnt);
+
+	/* FI_CLAIM UX message onloaded from hardware */
+	ux_send = calloc(1, sizeof(*ux_send));
+	if (!ux_send) {
+		RXC_WARN(rxc, "Failed to allocate UX memory\n");
+		return -FI_EAGAIN;
+	}
+	ux_send->claimed = true;
+
+	/* Zero-byte unexpected onloads require special handling
+	 * since no deferred structure would be allocated.
+	 */
+	if (evt->tgt_long.rlength) {
+		def_ev = match_put_event(rxc, req, evt, &matched);
+		if (!matched) {
+			/* The EVENT_PUT to the overflow list has not been
+			 * processed. The FI_CLAIM operation will be completed
+			 * when the matching put is received.
+			 */
+			if (!def_ev) {
+				free(ux_send);
+				return -FI_EAGAIN;
+			}
+			def_ev->ux_send = ux_send;
+		} else {
+			ux_send->req = def_ev->req;
+			ux_send->put_ev = def_ev->ev;
+			free_put_event(rxc, def_ev);
+		}
+
+		/* Fixup event remote offset for an RGet. 
*/ + if (evt->tgt_long.rlength) + ux_send->put_ev.tgt_long.remote_offset = + req->recv.ule_offset + evt->tgt_long.mlength; + + } else { + matched = true; + ux_send->put_ev = *evt; + } + + /* Add to the sw UX list as a claimed entry, it will be ignored in + * recieve matching of UX list entries. Its order no longer matters. + */ + dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); + rxc->sw_ux_list_len++; + + RXC_DBG(rxc, "FI_CLAIM Onload req: %p ux_send %p\n", req, ux_send); + recv_req_tgt_event(req, &ux_send->put_ev); + + /* Put was already received, return FI_CLAIM completion */ + if (matched) { + recv_req_peek_complete(req, ux_send); + RXC_DBG(rxc, "FI_CLAIM onload complete, req %p, ux_send %p\n", + req, ux_send); + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + return FI_SUCCESS; +} + +/* + * cxip_claim_ux_onload() - Initiate SEARCH and DELETE of FI_CLAIM ux entry. + */ +static int cxip_claim_ux_onload(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->req_ctx; + int ret = FI_SUCCESS; + union c_cmdu cmd = {}; + union cxip_match_bits mb = {}; + union cxip_match_bits ib = {}; + + if (rxc->state != RXC_ENABLED) { + RXC_DBG(rxc, "FC inprogress, fail claim req %p\n", req); + goto err; + } + + /* Initiate a search to get the remote offset for the + * unexpected list entry we matched. + */ + req->cb = cxip_claim_onload_cb; + mb.tag = req->recv.tag; + mb.tagged = 1; + ib.tx_id = ~0; + ib.cq_data = ~0; + ib.match_comp = ~0; + ib.rdzv_done = ~0; + ib.le_type = ~0; + ib.tag = req->recv.ignore; + + cmd.command.opcode = C_CMD_TGT_SEARCH_AND_DELETE; + + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = ib.raw; + cmd.target.match_bits = mb.raw; + cmd.target.match_id = req->recv.match_id; + /* Delete first match */ + cmd.target.use_once = 1; + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + /* This condition should clear */ + RXC_WARN(rxc, + "Cannot emit of UX delete cmd, return -FI_EAGAIN\n"); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + /* Hardware handles the race between subsequent priority list + * appends to the search and delete command. Re-enable. + */ + rxc->hw_claim_in_progress = false; + RXC_DBG(rxc, "FI_CLAIM Search and Delete of UX entry initiated\n"); + + return FI_SUCCESS; + +err: + /* Unable to initiate FI_CLAIM, report as ENOMSG */ + rxc->hw_claim_in_progress = false; + recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; +} + +/* + * cxip_hw_claim_offset_cb() - Process SEARCH command events to get remote + * offset of entry to be deleted. 
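+ *
+ * Hardware reports one C_RC_OK search event per ULE, in the same order as
+ * the offsets snapshot taken by cxip_get_ule_offsets(). The first event
+ * whose match bits and initiator satisfy the FI_PEEK criteria supplies
+ * req->recv.ule_offset; the final C_RC_NO_MATCH event ends the walk, after
+ * which cxip_claim_ux_onload() issues the SEARCH_AND_DELETE or the peek
+ * completes with ENOMSG.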
+ */ +static int cxip_hw_claim_offset_cb(struct cxip_req *req, + const union c_event *evt) +{ + struct cxip_rxc *rxc = req->recv.rxc; + union cxip_match_bits ux_mb; + uint32_t ux_init; + int ret; + + switch (evt->hdr.event_type) { + case C_EVENT_SEARCH: + if (cxi_event_rc(evt) == C_RC_OK) { + RXC_DBG(rxc, "Claim UX offset search entry, req: %p\n", + req); + + if (req->recv.offset_found) + break; + + req->recv.cur_ule_offsets++; + + /* Not found in range of the offsets we have */ + if (req->recv.cur_ule_offsets > + req->recv.num_ule_offsets) { + RXC_DBG(rxc, "Claim UX offsets exceeded\n"); + break; + } + + /* Check for a match against the FI_PEEK */ + ux_mb.raw = evt->tgt_long.match_bits; + ux_init = evt->tgt_long.initiator.initiator.process; + + if (req->recv.tagged != ux_mb.tagged) + break; + if (ux_mb.tagged + && !tag_match(ux_mb.tag, req->recv.tag, + req->recv.ignore)) + break; + if (!init_match(rxc, ux_init, req->recv.match_id)) + break; + + /* Matched, update to ignore any future events */ + req->recv.offset_found = true; + req->recv.ule_offset = + req->recv.ule_offsets[req->recv.cur_ule_offsets - 1]; + + RXC_DBG(rxc, "Found offset for claim %p, %d : 0x%lX\n", + req, req->recv.cur_ule_offsets - 1, + req->recv.ule_offset); + break; + } + + assert(cxi_event_rc(evt) == C_RC_NO_MATCH); + + RXC_DBG(rxc, "FI_CLAIM remote offset search done, status %d\n", + cxi_event_rc(evt)); + + if (!req->recv.offset_found) { + RXC_DBG(rxc, "Req %p, FI_CLAIM UX not found\n", req); + goto err_not_found; + } + + ret = cxip_claim_ux_onload(req); + if (ret) { + /* Unable to initiate SEARCH and DELETE, this + * should clear. All other errors return ENOMSG. + */ + if (ret == -FI_EAGAIN) + return ret; + + RXC_WARN(rxc, "claim_ux_onload failed %d\n", ret); + goto err_not_found; + } + + RXC_DBG(rxc, "FI_CLAIM req %p remote offset 0x%lX\n", + req, req->recv.ule_offset); + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(evt), + cxi_rc_to_str(cxi_event_rc(evt))); + } + + return FI_SUCCESS; + +err_not_found: + /* Terminate FI_PEEK with FI_CLAIM with ENOMSG */ + rxc->hw_claim_in_progress = false; + free(req->recv.ule_offsets); + req->recv.ule_offsets = NULL; + recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; +} + +/* + * cxip_initiate_hw_claim() - Onload the specified peek, claiming it. + */ +static int cxip_initiate_hw_claim(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->req_ctx; + union c_cmdu cmd = {}; + int ret = FI_SUCCESS; + + if (rxc->state != RXC_ENABLED) { + RXC_DBG(rxc, "FC inprogress, unable to claim req %p\n", req); + goto err; + } + + /* UX entry exists in hardware, the initial search acts as a flush of + * the event queue for priority list appends. Get remote offset for + * the associated unexpected list entry. + */ + req->recv.cur_ule_offsets = 0; + ret = cxip_get_ule_offsets(rxc, &req->recv.ule_offsets, + &req->recv.num_ule_offsets, true); + if (ret) { + RXC_WARN(rxc, "Unable to get FI_CLAIM UX offsets\n"); + goto err; + } + + RXC_DBG(rxc, "ule_offsets %p, num offsets %d\n", + req->recv.ule_offsets, req->recv.num_ule_offsets); + + /* Initiate a search to get the remote offset for the + * unexpected list entry we matched. This requires going + * through the list. 
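+	 *
+	 * Note the offsets were captured above with snapshot=true, so at
+	 * most two passes are made over the ULE list; an unexpected message
+	 * that is not covered by the snapshot cannot be located and the
+	 * FI_CLAIM then completes with ENOMSG.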
+ */ + req->cb = cxip_hw_claim_offset_cb; + + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = -1UL; + cmd.target.match_id = CXI_MATCH_ID_ANY; + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + goto err_free_offsets; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + RXC_DBG(rxc, "Search for remote offsets initiated, req %p\n", req); + + return FI_SUCCESS; + +err_free_offsets: + free(req->recv.ule_offsets); + req->recv.ule_offsets = NULL; +err: + /* Unable to initiate FI_CLAIM, report as ENOMSG */ + rxc->hw_claim_in_progress = false; + recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; +} + +/* + * cxip_ux_peek_cb() - Process UX list SEARCH command events. + */ +static int cxip_ux_peek_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->req_ctx; + + assert(req->recv.flags & FI_PEEK); + + switch (event->hdr.event_type) { + case C_EVENT_SEARCH: + /* Will receive event for only first match or failure */ + if (cxi_event_rc(event) == C_RC_OK) { + RXC_DBG(rxc, "Peek UX search req: %p matched\n", req); + if (req->recv.flags & FI_CLAIM) { + RXC_DBG(rxc, "req: %p UX must be claimed\n", + req); + return cxip_initiate_hw_claim(req); + } + + /* FI_PEEK only was found */ + recv_req_tgt_event(req, event); + } else { + RXC_DBG(rxc, "Peek UX search req: %p no match\n", req); + } + + recv_req_peek_complete(req, NULL); + break; + + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + return FI_SUCCESS; +} + +/* + * cxip_ux_peek() - Issue a SEARCH command to peek for a matching send + * on the RXC offloaded unexpected message list. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_ux_peek(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->req_ctx; + union c_cmdu cmd = {}; + union cxip_match_bits mb = {}; + union cxip_match_bits ib = {}; + int ret; + + assert(req->recv.flags & FI_PEEK); + + req->cb = cxip_ux_peek_cb; + + mb.tag = req->recv.tag; + mb.tagged = 1; + ib.tx_id = ~0; + ib.cq_data = ~0; + ib.match_comp = ~0; + ib.rdzv_done = ~0; + ib.le_type = ~0; + ib.tag = req->recv.ignore; + + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = ib.raw; + cmd.target.match_bits = mb.raw; + cmd.target.match_id = req->recv.match_id; + /* First match only */ + cmd.target.use_once = 1; + + if (cxip_evtq_saturated(&rxc->rx_evtq)) { + RXC_DBG(rxc, "Target HW EQ saturated\n"); + return -FI_EAGAIN; + } + + RXC_DBG(rxc, "Peek UX search req: %p mb.raw: 0x%" PRIx64 " match_id: 0x%x ignore: 0x%" PRIx64 "\n", + req, mb.raw, req->recv.match_id, req->recv.ignore); + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + /* If FI_CLAIM, we disable priority list appends so the + * search acts as a flush of outstanding appends. + */ + if (req->flags & FI_CLAIM) + rxc->hw_claim_in_progress = true; + + return FI_SUCCESS; +} + +/* cxip_set_ux_dump_entry() - initialize a CQ entry structure + * and/or source address with UX message info. 
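+ *
+ * Entries beyond the caller-provided maximum only bump the running
+ * ux_count, which lets the caller size a larger array and retry the dump.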
+ */
+static void cxip_set_ux_dump_entry(struct cxip_req *req,
+				   const union c_event *evt)
+{
+	struct cxip_ux_dump_state *ux_dump = req->recv.ux_dump;
+	union cxip_match_bits mb;
+	struct fi_cq_tagged_entry *cq_entry = NULL;
+	fi_addr_t *src_addr = NULL;
+
+	ux_dump->ux_count++;
+
+	/* If the caller-provided space is exceeded, updating the total
+	 * available UX message count is all that is required.
+	 */
+	if (ux_dump->ret_count >= ux_dump->max_count)
+		return;
+
+	if (ux_dump->entry)
+		cq_entry = &ux_dump->entry[ux_dump->ret_count];
+	if (ux_dump->src_addr)
+		src_addr = &ux_dump->src_addr[ux_dump->ret_count];
+
+	if (cq_entry || src_addr) {
+		ux_dump->ret_count++;
+
+		req->recv.tgt_event = false;
+		req->flags = 0;
+		recv_req_tgt_event(req, evt);
+
+		if (cq_entry) {
+			/* Need to add FI_TAGGED or FI_MSG directly */
+			mb.raw = evt->tgt_long.match_bits;
+			if (mb.tagged)
+				req->flags |= FI_TAGGED;
+			else
+				req->flags |= FI_MSG;
+			cq_entry->op_context = NULL;
+			cq_entry->flags = req->flags;
+			cq_entry->len = req->recv.rlen;
+			cq_entry->buf = NULL;
+			cq_entry->data = req->data;
+			cq_entry->tag = req->tag;
+		}
+
+		if (src_addr && req->recv.rxc->attr.caps & FI_SOURCE)
+			*src_addr = recv_req_src_addr(req);
+	}
+}
+
+/*
+ * cxip_unexp_msg_dump_cb() - Process search command dumping H/W UX entries.
+ */
+static int cxip_unexp_msg_dump_cb(struct cxip_req *req,
+				  const union c_event *evt)
+{
+	struct cxip_rxc *rxc = req->recv.rxc;
+
+	if (evt->hdr.event_type != C_EVENT_SEARCH)
+		RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT,
+			  cxi_event_to_str(evt),
+			  cxi_rc_to_str(cxi_event_rc(evt)));
+
+	if (cxi_event_rc(evt) == C_RC_NO_MATCH) {
+		req->recv.ux_dump->done = true;
+		return FI_SUCCESS;
+	}
+	assert(cxi_event_rc(evt) == C_RC_OK);
+
+	cxip_set_ux_dump_entry(req, evt);
+
+	return FI_SUCCESS;
+}
+
+/*
+ * cxip_build_ux_entry_info() - Initialize UX info array from ULE.
+ *
+ * It is expected that a debugger is utilizing this interface and is
+ * expecting synchronous behavior.
+ *
+ * Caller should hold ep_obj->lock.
+ */
+int cxip_build_ux_entry_info(struct cxip_ep *ep,
+			     struct fi_cq_tagged_entry *entry, size_t count,
+			     fi_addr_t *src_addr, size_t *ux_count)
+{
+	struct cxip_rxc *rxc = &ep->ep_obj->rxc;
+	struct cxip_ux_dump_state *ux_dump;
+	struct cxip_ux_send *ux_send;
+	struct dlist_entry *tmp;
+	struct cxip_req *req = NULL;
+	union c_cmdu cmd = {};
+	int ret_count;
+	int ret;
+
+	ret = cxip_recv_req_alloc(rxc, NULL, 0, &req);
+	if (ret)
+		return ret;
+
+	ux_dump = calloc(1, sizeof(struct cxip_ux_dump_state));
+	if (!ux_dump) {
+		RXC_WARN(rxc, "ENOMEM on allocation of UX state buffer\n");
+		cxip_recv_req_free(req);
+		return -FI_ENOMEM;
+	}
+
+	ux_dump->max_count = count;
+	ux_dump->entry = entry;
+	ux_dump->src_addr = src_addr;
+	req->recv.ux_dump = ux_dump;
+
+	/* Get entries from software UX list first */
+	dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send,
+				     ux_send, rxc_entry, tmp)
+		cxip_set_ux_dump_entry(req, &ux_send->put_ev);
+
+	if (!rxc->msg_offload)
+		goto done;
+
+	/* Read H/W UX list processing the request events synchronously
+	 * until we set "Done" in the request callback.
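+	 *
+	 * Progress is made inline: the loop below polls cxip_evtq_progress()
+	 * (yielding between polls) while holding ep_obj->lock until
+	 * cxip_unexp_msg_dump_cb() sees the terminating C_RC_NO_MATCH search
+	 * event and sets ux_dump->done.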
+	 */
+	req->cb = cxip_unexp_msg_dump_cb;
+	cmd.command.opcode = C_CMD_TGT_SEARCH;
+	cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED;
+	cmd.target.ptlte_index = rxc->rx_pte->pte->ptn;
+	cmd.target.buffer_id = req->req_id;
+	cmd.target.length = -1U;
+	cmd.target.ignore_bits = -1UL;
+	cmd.target.match_id = CXI_MATCH_ID_ANY;
+
+	ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd);
+	if (ret) {
+		RXC_WARN(rxc, "Failed to write ULE Search command: %d\n", ret);
+		ret_count = ret;
+		goto done;
+	}
+	cxi_cq_ring(rxc->rx_cmdq->dev_cmdq);
+
+	RXC_DBG(rxc, "Search for ULE dump initiated, req %p\n", req);
+	do {
+		cxip_evtq_progress(&rxc->rx_evtq);
+		sched_yield();
+	} while (!ux_dump->done);
+
+	RXC_DBG(rxc, "Search ULE dump done, req %p, count %ld\n",
+		req, ux_dump->ret_count);
+done:
+	ret_count = ux_dump->ret_count;
+	*ux_count = ux_dump->ux_count;
+
+	free(ux_dump);
+	cxip_recv_req_free(req);
+
+	return ret_count;
+}
+
+/*
+ * cxip_recv_sw_matched() - Progress the SW Receive match.
+ *
+ * Progress the operation which matched in SW.
+ */
+static int cxip_recv_sw_matched(struct cxip_req *req,
+				struct cxip_ux_send *ux_send)
+{
+	int ret;
+	uint64_t mrecv_start;
+	uint32_t mrecv_len;
+	bool req_done = true;
+	uint32_t ev_init;
+	uint32_t ev_rdzv_id;
+	struct cxip_req *rdzv_req;
+	struct cxip_rxc *rxc = req->recv.rxc;
+
+	assert(req->type == CXIP_REQ_RECV);
+
+	mrecv_start = req->recv.start_offset;
+	mrecv_len = mrecv_req_put_bytes(req, ux_send->put_ev.tgt_long.rlength);
+
+	if (req->recv.multi_recv &&
+	    (req->recv.ulen - req->recv.start_offset) >=
+	    req->recv.rxc->min_multi_recv)
+		req_done = false;
+
+	if (ux_send->put_ev.tgt_long.rendezvous) {
+
+		/* Make sure we can issue the RGet; if not we stall
+		 * and TX event queue progress will free up credits.
+		 */
+		if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > rxc->max_tx) {
+			ofi_atomic_dec32(&rxc->orx_tx_reqs);
+			return -FI_EAGAIN;
+		}
+
+		ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev,
+				   mrecv_start, mrecv_len, req_done);
+		if (ret != FI_SUCCESS) {
+			req->recv.start_offset -= mrecv_len;
+			ofi_atomic_dec32(&rxc->orx_tx_reqs);
+
+			return ret;
+		}
+
+		/* If multi-recv, a child request was created from
+		 * cxip_ux_send(). Need to look up this request.
+		 *
+		 * NOTE: Since the same event will be used, the event checks
+		 * must NOT be performed. The event checks are only needed
+		 * when hardware is generating put and put overflow events for
+		 * an mrecv buffer. If we have reached here, we know a put
+		 * overflow event will never occur since the mrecv buffer has
+		 * not been offloaded to hardware.
+		 */
+		if (req->recv.multi_recv) {
+			ret = rdzv_mrecv_req_lookup(req, &ux_send->put_ev,
+						    &ev_init, &ev_rdzv_id,
+						    false, &rdzv_req);
+
+			/* If the previous cxip_ux_send() returned FI_SUCCESS,
+			 * a matching rdzv mrecv req will always exist.
+			 */
+			assert(ret == FI_SUCCESS);
+		} else {
+			rdzv_req = req;
+		}
+
+		/* The Rendezvous event will not happen, so ack the rendezvous
+		 * event now.
+		 */
+		rdzv_recv_req_event(rdzv_req, ux_send->put_ev.hdr.event_type);
+
+		cxip_recv_req_set_rget_info(rdzv_req);
+
+		/* A TX credit has been reserved and the user receive request
+		 * may have been removed from the ordered SW queue. If the
+		 * command queue is backed up the condition will clear and the
+		 * RGet must get sent out, so wait for it.
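+		 *
+		 * The loop below therefore retries issue_rdzv_get() on
+		 * -FI_EAGAIN instead of returning the error to the caller.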
+ */ + do { + ret = issue_rdzv_get(rdzv_req); + } while (ret == -FI_EAGAIN); + assert(ret == FI_SUCCESS); + } else { + if (ux_send->put_ev.tgt_long.rlength) + ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, + mrecv_start, mrecv_len, req_done); + else + ret = cxip_ux_send_zb(req, &ux_send->put_ev, + mrecv_start, req_done); + + if (ret != FI_SUCCESS) { + /* undo mrecv_req_put_bytes() */ + req->recv.start_offset -= mrecv_len; + return ret; + } + } + + /* If this is a multi-receive request and there is still space, return + * a special code to indicate SW should keep matching messages to it. + */ + if (ret == FI_SUCCESS && !req_done) + return -FI_EINPROGRESS; + + return ret; +} + +static bool cxip_match_recv_sw(struct cxip_rxc *rxc, struct cxip_req *req, + struct cxip_ux_send *ux, bool claimed) +{ + union cxip_match_bits ux_mb; + uint32_t ux_init; + + if (claimed != ux->claimed) + return false; + + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + + if (req->recv.tagged != ux_mb.tagged) + return false; + + if (ux_mb.tagged && + !tag_match(ux_mb.tag, req->recv.tag, req->recv.ignore)) + return false; + + if (!init_match(rxc, ux_init, req->recv.match_id)) + return false; + + return true; +} + +static int cxip_recv_sw_matcher(struct cxip_rxc *rxc, struct cxip_req *req, + struct cxip_ux_send *ux, bool claimed) +{ + int ret; + + if (!cxip_match_recv_sw(rxc, req, ux, claimed)) + return -FI_ENOMSG; + + ret = cxip_recv_sw_matched(req, ux); + if (ret == -FI_EAGAIN) + return -FI_EAGAIN; + + /* FI_EINPROGRESS is return for a multi-recv match. */ + assert(ret == FI_SUCCESS || ret == -FI_EINPROGRESS); + + /* TODO: Manage freeing of UX entries better. */ + dlist_remove(&ux->rxc_entry); + if (ux->req && ux->req->type == CXIP_REQ_RBUF) { + cxip_req_buf_ux_free(ux); + rxc->sw_ux_list_len--; + } else { + free(ux); + rxc->sw_ux_list_len--; + } + + RXC_DBG(rxc, + "Software match, req: %p ux_send: %p (sw_ux_list_len: %u)\n", + req, ux, req->recv.rxc->sw_ux_list_len); + + return ret; +} + +/* + * cxip_recv_ux_sw_matcher() - Attempt to match an unexpected message to a user + * posted receive. + * + * User must hold the ep_obj->lock. + */ +int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux) +{ + struct cxip_ptelist_buf *rbuf = ux->req->req_ctx; + struct cxip_rxc *rxc = rbuf->rxc; + struct cxip_req *req; + struct dlist_entry *tmp; + int ret; + + if (dlist_empty(&rxc->sw_recv_queue)) + return -FI_ENOMSG; + + dlist_foreach_container_safe(&rxc->sw_recv_queue, struct cxip_req, req, + recv.rxc_entry, tmp) { + /* Only matches against unclaimed UX messages */ + ret = cxip_recv_sw_matcher(rxc, req, ux, false); + + /* Unexpected message found match but unable to progress */ + if (ret == -FI_EAGAIN) + return ret; + + /* Unexpected message found a match. */ + if (ret == FI_SUCCESS || ret == -FI_EINPROGRESS) + return FI_SUCCESS; + } + + return -FI_ENOMSG; +} + +/* + * cxip_recv_req_sw_matcher() - Attempt to match the receive request in SW. + * + * Loop through all onloaded UX Sends looking for a match for the Receive + * request. If a match is found, progress the operation. + * + * Caller must hold ep_obj->lock. 
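+ *
+ * Returns FI_SUCCESS once the request is fully matched and consumed,
+ * -FI_EAGAIN if a match was found but could not be progressed, and
+ * -FI_ENOMSG if the request was not consumed (no match, or a multi-recv
+ * buffer that matched but still has space and must still be queued or
+ * appended).
+ *
+ * Software matching mirrors the hardware match/ignore semantics (see
+ * tag_match()): bits covered by the ignore mask are excluded from the
+ * comparison. For example, with ignore 0xFF a receive tag of 0x1234
+ * matches a send tag of 0x12FF (only ignored bits differ) but not 0x1334.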
+ */ +int cxip_recv_req_sw_matcher(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + int ret; + + if (dlist_empty(&rxc->sw_ux_list)) + return -FI_ENOMSG; + + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + /* Only match against unclaimed UX messages */ + ret = cxip_recv_sw_matcher(rxc, req, ux_send, false); + switch (ret) { + /* On successful multi-recv or no match, keep matching. */ + case -FI_EINPROGRESS: + case -FI_ENOMSG: + break; + + /* Stop matching. */ + default: + return ret; + } + } + + return -FI_ENOMSG; +} + +/* + * cxip_recv_req_dropped() - Mark the Received request dropped. + * + * If HW does not have sufficient LEs to perform an append, the command is + * dropped. Queue the request for replay. When all outstanding append commands + * complete, replay all Receives. + * + * Caller must hold ep_obj->lock + */ +static int cxip_recv_req_dropped(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + int ret __attribute__((unused)); + + assert(dlist_empty(&req->recv.rxc_entry)); + dlist_insert_tail(&req->recv.rxc_entry, &rxc->replay_queue); + + RXC_DBG(rxc, "Receive dropped: %p\n", req); + + return FI_SUCCESS; +} + +/* + * cxip_recv_req_peek() - Peek for matching unexpected message on RXC. + * + * Examine onloaded UX sends, if not found there and HW offload is enabled, + * initiate check of HW UX list. In either case the operation will not + * consume the UX send, but only report the results of the peek to the CQ. + * + * Caller must hold the ep_obj->lock. + */ +static int cxip_recv_req_peek(struct cxip_req *req, bool check_rxc_state) +{ + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + int ret; + + if (check_rxc_state && rxc->state != RXC_ENABLED && + rxc->state != RXC_ENABLED_SOFTWARE) + return -FI_EAGAIN; + + /* Attempt to match the onloaded UX list first */ + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + if (cxip_match_recv_sw(rxc, req, ux_send, false)) { + if (req->recv.flags & FI_CLAIM) + ux_send->claimed = true; + + recv_req_tgt_event(req, &ux_send->put_ev); + recv_req_peek_complete(req, ux_send); + return FI_SUCCESS; + } + } + + if (rxc->msg_offload) { + /* Must serialize H/W FI_CLAIM due to getting remote offsets */ + if (rxc->hw_claim_in_progress) + return -FI_EAGAIN; + + ret = cxip_ux_peek(req); + } else { + req->recv.rc = C_RC_NO_MATCH; + recv_req_peek_complete(req, NULL); + ret = FI_SUCCESS; + } + + return ret; +} + +/* + * cxip_recv_req_queue() - Queue Receive request on RXC. + * + * Before appending a new Receive request to a HW list, attempt to match the + * Receive to any onloaded UX Sends. + * + * Caller must hold the RXC lock and ensure correct RXC state if required. + */ +static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq) +{ + struct cxip_rxc *rxc = req->recv.rxc; + int ret; + + /* Try to match against onloaded Sends first. 
*/
+	ret = cxip_recv_req_sw_matcher(req);
+	if (ret == FI_SUCCESS)
+		return -FI_EALREADY;
+	else if (ret == -FI_EAGAIN)
+		return -FI_EAGAIN;
+	else if (ret != -FI_ENOMSG)
+		RXC_FATAL(rxc, "SW matching failed: %d\n", ret);
+
+	if (rxc->msg_offload) {
+		/* Cannot append to priority list if claiming UX */
+		if (rxc->hw_claim_in_progress)
+			goto err_dequeue_req;
+
+		ret = _cxip_recv_req(req, restart_seq);
+		if (ret)
+			goto err_dequeue_req;
+	} else {
+		req->recv.software_list = true;
+		dlist_insert_tail(&req->recv.rxc_entry, &rxc->sw_recv_queue);
+	}
+
+	return FI_SUCCESS;
+
+err_dequeue_req:
+	dlist_remove_init(&req->recv.rxc_entry);
+
+	return -FI_EAGAIN;
+}
+
+static int cxip_rxc_check_recv_count_hybrid_preempt(struct cxip_rxc *rxc)
+{
+	int ret;
+	int count;
+
+	if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE &&
+	    cxip_env.hybrid_posted_recv_preemptive == 1) {
+		count = ofi_atomic_get32(&rxc->orx_reqs);
+
+		if (count > rxc->attr.size) {
+			assert(rxc->state == RXC_ENABLED);
+
+			/* On success, need to return -FI_EAGAIN which will
+			 * propagate back to the user. In addition, RXC state
+			 * will have transitioned to RXC_PENDING_PTLTE_DISABLE.
+			 */
+			ret = cxip_recv_pending_ptlte_disable(rxc, false);
+			if (ret == FI_SUCCESS) {
+				RXC_WARN(rxc,
+					 "Transitioning to SW EP due to too many posted recvs: posted_count=%u request_size=%lu\n",
+					 count, rxc->attr.size);
+				return -FI_EAGAIN;
+			}
+
+			RXC_WARN(rxc, "Failed to transition to SW EP: %d\n",
+				 ret);
+			return ret;
+		}
+	}
+
+	return FI_SUCCESS;
+}
+
+/*
+ * _cxip_recv_req() - Submit Receive request to hardware.
+ */
+static ssize_t _cxip_recv_req(struct cxip_req *req, bool restart_seq)
+{
+	struct cxip_rxc *rxc = req->recv.rxc;
+	uint32_t le_flags = 0;
+	union cxip_match_bits mb = {};
+	union cxip_match_bits ib = {
+		.tx_id = ~0,
+		.match_comp = 1,
+		.cq_data = 1,
+		.rdzv_done = 1,
+		.le_type = ~0,
+	};
+	int ret;
+	struct cxip_md *recv_md = req->recv.recv_md;
+	uint64_t recv_iova = 0;
+
+	ret = cxip_rxc_check_recv_count_hybrid_preempt(rxc);
+	if (ret != FI_SUCCESS)
+		return ret;
+
+	if (req->recv.tagged) {
+		mb.tagged = 1;
+		mb.tag = req->recv.tag;
+		ib.tag = req->recv.ignore;
+	}
+
+	/* For poorly written applications, a periodic check of LE pool
+	 * resources can be requested to force transitions to software mode.
+	 * For this to occur, the code must be executing in hybrid mode,
+	 * still matching in hardware, and FI_CXI_HYBRID_RECV_PREEMPTIVE
+	 * explicitly set by the application.
+	 */
+	if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE ||
+	    ++rxc->recv_appends & CXIP_HYBRID_RECV_CHECK_INTERVAL)
+		le_flags = C_LE_EVENT_LINK_DISABLE;
+
+	/* Always set manage_local in Receive LEs. This makes Cassini ignore
+	 * initiator remote_offset in all Puts. With this, remote_offset in Put
+	 * events can be used by the initiator for protocol data. The behavior
+	 * of use_once is not impacted by manage_local.
+	 */
+	le_flags |= C_LE_EVENT_UNLINK_DISABLE | C_LE_MANAGE_LOCAL |
+		    C_LE_UNRESTRICTED_BODY_RO | C_LE_UNRESTRICTED_END_RO |
+		    C_LE_OP_PUT;
+
+	if (!req->recv.multi_recv)
+		le_flags |= C_LE_USE_ONCE;
+	if (restart_seq)
+		le_flags |= C_LE_RESTART_SEQ;
+
+	if (recv_md)
+		recv_iova = CXI_VA_TO_IOVA(recv_md->md,
+					   (uint64_t)req->recv.recv_buf +
+					   req->recv.start_offset);
+
+	req->recv.hw_offloaded = true;
+
+	/* Issue Append command */
+	ret = cxip_pte_append(rxc->rx_pte, recv_iova,
+			      req->recv.ulen - req->recv.start_offset,
+			      recv_md ? recv_md->md->lac : 0,
+			      C_PTL_LIST_PRIORITY, req->req_id,
+			      mb.raw, ib.raw, req->recv.match_id,
+			      req->recv.multi_recv ? 
rxc->min_multi_recv : 0, + le_flags, NULL, rxc->rx_cmdq, + !(req->recv.flags & FI_MORE)); + if (ret != FI_SUCCESS) { + RXC_WARN(rxc, "Failed to write Append command: %d\n", ret); + return ret; + } + + return FI_SUCCESS; +} + +/* + * cxip_recv_common() - Common message receive function. Used for tagged and + * untagged sends of all sizes. + */ +ssize_t cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags, + bool tagged, struct cxip_cntr *comp_cntr) +{ + int ret; + struct cxip_req *req; + struct cxip_addr caddr; + struct cxip_ux_send *ux_msg; + uint32_t match_id; + + if (len && !buf) + return -FI_EINVAL; + + if (rxc->state == RXC_DISABLED) + return -FI_EOPBADSTATE; + + /* HW to SW PtlTE transition, ensure progress is made */ + if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { + cxip_cq_progress(rxc->recv_cq); + return -FI_EAGAIN; + } + + if (tagged) { + if (tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { + RXC_WARN(rxc, + "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", + tag, ignore, CXIP_TAG_MASK); + return -FI_EINVAL; + } + flags &= ~FI_MULTI_RECV; + } + + /* If FI_DIRECTED_RECV and a src_addr is specified, encode the address + * in the LE for matching. If application AVs are symmetric, use + * logical FI address for matching. Otherwise, use physical address. + */ + if (rxc->attr.caps & FI_DIRECTED_RECV && + src_addr != FI_ADDR_UNSPEC) { + if (rxc->ep_obj->av->symmetric) { + /* PID is not used for matching */ + match_id = CXI_MATCH_ID(rxc->pid_bits, C_PID_ANY, + src_addr); + } else { + ret = cxip_av_lookup_addr(rxc->ep_obj->av, src_addr, + &caddr); + if (ret != FI_SUCCESS) { + RXC_WARN(rxc, "Failed to look up FI addr: %d\n", + ret); + return -FI_EINVAL; + } + + match_id = CXI_MATCH_ID(rxc->pid_bits, caddr.pid, + caddr.nic); + } + } else { + match_id = CXI_MATCH_ID_ANY; + } + + ofi_genlock_lock(&rxc->ep_obj->lock); + ret = cxip_recv_req_alloc(rxc, buf, len, &req); + if (ret) + goto err; + + /* req->data_len, req->tag, req->data must be set later. req->buf may + * be overwritten later. + */ + req->context = (uint64_t)context; + + req->flags = FI_RECV | (flags & FI_COMPLETION); + if (tagged) + req->flags |= FI_TAGGED; + else + req->flags |= FI_MSG; + + req->recv.cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; + req->recv.match_id = match_id; + req->recv.tag = tag; + req->recv.ignore = ignore; + req->recv.flags = flags; + req->recv.tagged = tagged; + req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); + + if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { + ret = -FI_EAGAIN; + goto err_free_request; + } + + if (!(req->recv.flags & (FI_PEEK | FI_CLAIM))) { + + ret = cxip_recv_req_queue(req, false); + /* Match made in software? */ + if (ret == -FI_EALREADY) { + ofi_genlock_unlock(&rxc->ep_obj->lock); + + return FI_SUCCESS; + } + + /* RXC busy (onloading Sends or full CQ)? */ + if (ret != FI_SUCCESS) + goto err_free_request; + + ofi_genlock_unlock(&rxc->ep_obj->lock); + + RXC_DBG(rxc, + "req: %p buf: %p len: %lu src_addr: %ld tag(%c):" + " 0x%lx ignore: 0x%lx context: %p\n", + req, buf, len, src_addr, tagged ? 
'*' : '-', tag,
+		ignore, context);
+
+		return FI_SUCCESS;
+	}
+
+	/* FI_PEEK with/without FI_CLAIM */
+	if (req->recv.flags & FI_PEEK) {
+		if (req->recv.flags & FI_CLAIM && !req->context) {
+			RXC_WARN(rxc, "FI_CLAIM requires fi_context\n");
+			ret = -FI_EINVAL;
+			goto err_free_request;
+		}
+		ret = cxip_recv_req_peek(req, true);
+		if (ret == FI_SUCCESS) {
+			ofi_genlock_unlock(&rxc->ep_obj->lock);
+
+			return ret;
+		}
+
+		goto err_free_request;
+	}
+
+	/* FI_CLAIM without FI_PEEK */
+	ux_msg = ((struct fi_context *)req->context)->internal[0];
+	if (!ux_msg->claimed) {
+		RXC_WARN(rxc, "Bad fi_context specified with FI_CLAIM\n");
+		ret = -FI_EINVAL;
+		goto err_free_request;
+	}
+
+	RXC_DBG(rxc, "FI_CLAIM invoke sw matcher %p\n", ux_msg);
+	ret = cxip_recv_sw_matcher(rxc, req, ux_msg, true);
+	if (ret == FI_SUCCESS || ret == -FI_EINPROGRESS) {
+		ofi_genlock_unlock(&rxc->ep_obj->lock);
+
+		return FI_SUCCESS;
+	}
+
+err_free_request:
+	cxip_recv_req_free(req);
+err:
+	ofi_genlock_unlock(&rxc->ep_obj->lock);
+
+	return ret;
+}
+
+/*
+ * _txc_fi_addr() - Return the FI address of the TXC.
+ */
+static fi_addr_t _txc_fi_addr(struct cxip_txc *txc)
+{
+	if (txc->ep_obj->fi_addr == FI_ADDR_NOTAVAIL) {
+		txc->ep_obj->fi_addr =
+				cxip_av_lookup_fi_addr(txc->ep_obj->av,
+						       &txc->ep_obj->src_addr);
+		TXC_DBG(txc, "Found EP FI Addr: %lu\n", txc->ep_obj->fi_addr);
+	}
+
+	return txc->ep_obj->fi_addr;
+}
+
+/*
+ * cxip_msg_match_id() - Return the TXC's initiator address used to transmit a
+ * message.
+ *
+ * By default, the physical address of the TXC is returned. This address is
+ * sent along with message data and is used for source address matching at the
+ * target. When the target receives a message, the physical ID is translated to
+ * a logical FI address. Translation adds overhead to the receive path.
+ *
+ * As an optimization, if rendezvous offload is not being used and the process
+ * is part of a job with symmetric AVs, a logical FI address is returned. This
+ * way, there is no source address translation overhead involved in the
+ * receive.
+ */
+static uint32_t cxip_msg_match_id(struct cxip_txc *txc)
+{
+	/* PID is not used for logical matching, but is used for rendezvous. */
+	if (txc->ep_obj->av->symmetric)
+		return CXI_MATCH_ID(txc->pid_bits, txc->ep_obj->src_addr.pid,
+				    _txc_fi_addr(txc));
+
+	return CXI_MATCH_ID(txc->pid_bits, txc->ep_obj->src_addr.pid,
+			    txc->ep_obj->src_addr.nic);
+}
+
+/*
+ * report_send_completion() - Report the completion of a send operation.
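+ *
+ * A successful Send generates a CQ entry only if FI_COMPLETION was
+ * requested; failures are always reported through cxip_cq_req_error() with
+ * the Cassini return code translated by proverr2errno(). When sw_cntr is
+ * set, the send counter is updated here in software; this is used when the
+ * hardware counter update was not armed (e.g. the FI_MATCH_COMPLETE and
+ * rendezvous paths).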
+ */ +static void report_send_completion(struct cxip_req *req, bool sw_cntr) +{ + int ret; + int ret_err; + int success_event = (req->flags & FI_COMPLETION); + struct cxip_txc *txc = req->send.txc; + + req->flags &= (FI_MSG | FI_TAGGED | FI_SEND); + + if (req->send.rc == C_RC_OK) { + TXC_DBG(txc, "Request success: %p\n", req); + + if (success_event) { + ret = cxip_cq_req_complete(req); + if (ret != FI_SUCCESS) + TXC_WARN(txc, + "Failed to report completion: %d\n", + ret); + } + + if (sw_cntr && req->send.cntr) { + ret = cxip_cntr_mod(req->send.cntr, 1, false, false); + if (ret) + TXC_WARN(txc, "cxip_cntr_mod returned: %d\n", + ret); + } + } else { + ret_err = proverr2errno(req->send.rc); + TXC_WARN(txc, "Request dest_addr: %ld caddr.nic: %#X caddr.pid: %u error: %p (err: %d, %s)\n", + req->send.dest_addr, req->send.caddr.nic, + req->send.caddr.pid, req, ret_err, + cxi_rc_to_str(req->send.rc)); + + ret = cxip_cq_req_error(req, 0, ret_err, + req->send.rc, NULL, 0, + FI_ADDR_UNSPEC); + if (ret != FI_SUCCESS) + TXC_WARN(txc, "Failed to report error: %d\n", ret); + + if (sw_cntr && req->send.cntr) { + ret = cxip_cntr_mod(req->send.cntr, 1, false, true); + if (ret) + TXC_WARN(txc, "cxip_cntr_mod returned: %d\n", + ret); + } + } +} + +/* + * rdzv_send_req_complete() - Complete long send request. + */ +static void rdzv_send_req_complete(struct cxip_req *req) +{ + cxip_rdzv_id_free(req->send.txc, req->send.rdzv_id); + + cxip_send_buf_fini(req); + + report_send_completion(req, true); + + ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_evtq_req_free(req); +} + +/* + * rdzv_send_req_event() - Count a rendezvous send event. + * + * Call for each initiator event. The events could be generated in any order. + * Once all expected events are received, complete the request. + * + * A successful rendezvous Send generates two events: Ack and Get. + */ +static void rdzv_send_req_event(struct cxip_req *req) +{ + if (++req->send.rdzv_send_events == 2) + rdzv_send_req_complete(req); +} + +/* + * cxip_send_rdzv_put_cb() - Long send callback. + * + * Progress a long send operation to completion. + */ +static int cxip_send_rdzv_put_cb(struct cxip_req *req, + const union c_event *event) +{ + int event_rc; + int ret; + struct cxip_txc *txc = req->send.txc; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + /* The source Put completed. */ + event_rc = cxi_init_event_rc(event); + + TXC_DBG(txc, "Acked: %p (rc: %s list: %s)\n", req, + cxi_rc_to_str(event_rc), + cxi_ptl_list_to_str(event->init_short.ptl_list)); + + /* If the message was dropped, mark the peer as disabled. Do + * not generate a completion. Free associated resources. Do not + * free the request (it will be used to replay the Send). + */ + if (event_rc == C_RC_PT_DISABLED) { + ret = cxip_send_req_dropped(req->send.txc, req); + if (ret == FI_SUCCESS) + cxip_rdzv_id_free(req->send.txc, + req->send.rdzv_id); + else + ret = -FI_EAGAIN; + + return ret; + } + + /* Message was accepted by the peer. Match order is preserved. + * The request can be dequeued from the SW message queue. This + * allows flow-control recovery to be performed before + * outstanding long Send operations have completed. + */ + ret = cxip_send_req_dequeue(req->send.txc, req); + if (ret != FI_SUCCESS) + return ret; + + /* The transaction is complete if the put failed */ + if (event_rc != C_RC_OK) { + req->send.rc = event_rc; + rdzv_send_req_complete(req); + } else { + /* Count the event, another may be expected. 
*/ + rdzv_send_req_event(req); + } + return FI_SUCCESS; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + case C_EVENT_SEND: + { + struct cxi_md *md = req->send.send_md->md; + + TXC_WARN(txc, "Unexpected %s event: rc:%s buf:%p len:0x%lx iova:0x%llx md.va:0x%llx lac:%d\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event)), req->send.buf, + req->send.len, CXI_VA_TO_IOVA(md, req->send.buf), + md->iova, md->lac); + } + return FI_SUCCESS; + + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_rdzv_pte_src_cb() - Process rendezvous source buffer events. + * + * A Get event is generated for each rendezvous Send indicating Send completion. + */ +int cxip_rdzv_pte_src_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rdzv_pte *rdzv_pte = req->req_ctx; + struct cxip_txc *txc = rdzv_pte->txc; + struct cxip_req *get_req; + union cxip_match_bits mb; + int event_rc = cxi_event_rc(event); + int rdzv_id; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + if (event_rc == C_RC_OK) + ofi_atomic_inc32(&rdzv_pte->le_linked_success_count); + else + ofi_atomic_inc32(&rdzv_pte->le_linked_failure_count); + return FI_SUCCESS; + + case C_EVENT_GET: + mb.raw = event->tgt_long.match_bits; + rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + mb.rdzv_id_lo; + get_req = cxip_rdzv_id_lookup(txc, rdzv_id); + if (!get_req) { + TXC_WARN(txc, "Failed to find RDZV ID: %d\n", + mb.rdzv_id_lo); + return FI_SUCCESS; + } + + if (event_rc != C_RC_OK) + TXC_WARN(txc, "Get error: %p rc: %s\n", get_req, + cxi_rc_to_str(event_rc)); + else + TXC_DBG(txc, "Get received: %p rc: %s\n", get_req, + cxi_rc_to_str(event_rc)); + + get_req->send.rc = event_rc; + + /* Count the event, another may be expected. */ + rdzv_send_req_event(get_req); + + return FI_SUCCESS; + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +static inline int cxip_send_prep_cmdq(struct cxip_cmdq *cmdq, + struct cxip_req *req, + uint32_t tclass) +{ + struct cxip_txc *txc = req->send.txc; + int ret; + uint16_t vni; + + if (!req->triggered) { + if (txc->ep_obj->av_auth_key) + vni = req->send.caddr.vni; + else + vni = txc->ep_obj->auth_key.vni; + + ret = cxip_txq_cp_set(cmdq, vni, + cxip_ofi_to_cxi_tc(txc->tclass), + CXI_TC_TYPE_DEFAULT); + if (ret != FI_SUCCESS) + return ret; + } + + if (req->send.flags & FI_FENCE) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + TXC_DBG(txc, "Failed to issue CQ_FENCE command: %d\n", + ret); + return -FI_EAGAIN; + } + } + + return FI_SUCCESS; +} + +/* + * _cxip_send_rdzv_put() - Initiate a send rendezvous put operation. + * + * The rendezvous protocol works as follows: + * + * 1. The Initiator performs a Rendezvous Put command which includes a portion + * of the source buffer data. + * 2. Once the Put is matched to a user receive buffer (in the Priority list), + * a Get of the remaining source data is performed. + */ +static ssize_t _cxip_send_rdzv_put(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + union c_fab_addr dfa; + uint8_t idx_ext; + struct c_full_dma_cmd cmd = {}; + union cxip_match_bits put_mb = {}; + int rdzv_id; + int lac = req->send.send_md->md->lac; + int ret; + struct cxip_cmdq *cmdq = + req->triggered ? txc->domain->trig_cmdq : txc->tx_cmdq; + + /* Zero length rendezvous not supported. 
*/ + assert(req->send.send_md); + assert(req->send.len); + + /* Allocate rendezvous ID */ + rdzv_id = cxip_rdzv_id_alloc(txc, req); + if (rdzv_id < 0) + return -FI_EAGAIN; + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, + CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); + + /* Allocate a source request for the given LAC. This makes the source + * memory accessible for rendezvous. + */ + ret = cxip_rdzv_pte_src_req_alloc(txc->rdzv_pte, lac); + if (ret) { + TXC_WARN(txc, "Failed to prepare source window: %d\n", ret); + goto err_free_rdzv_id; + } + + + /* Allocate restricted source window. If resources can not be allocated + * discontinue use of the restricted protocol, falling back + * to unrestricted. TODO: keep track and only switch for LAC that + * failed. + */ + if (txc->rdzv_proto == CXIP_RDZV_PROTO_ALT_READ && + !txc->rdzv_nomatch_pte[lac]) { + TXC_DBG(txc, "allocate restricted PTE lac %d\n", lac); + + ret = cxip_rdzv_nomatch_pte_alloc(txc, lac, + &txc->rdzv_nomatch_pte[lac]); + if (ret) { + TXC_WARN(txc, WARN_RESTRICTED_DISABLED, + cxip_rdzv_proto_to_str(txc->rdzv_proto), + cxip_rdzv_proto_to_str(CXIP_RDZV_PROTO_DEFAULT)); + txc->rdzv_proto = CXIP_RDZV_PROTO_DEFAULT; + } + } + + /* Build match bits */ + if (req->send.tagged) { + put_mb.tagged = 1; + put_mb.tag = req->send.tag; + } + + if (req->send.flags & FI_REMOTE_CQ_DATA) + put_mb.cq_data = 1; + + put_mb.rdzv_proto = txc->rdzv_proto; + + req->send.rdzv_id = rdzv_id; + req->cb = cxip_send_rdzv_put_cb; + req->send.rdzv_send_events = 0; + + /* Build Put command descriptor */ + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.index_ext = idx_ext; + cmd.lac = req->send.send_md->md->lac; + cmd.event_send_disable = 1; + cmd.restricted = 0; + cmd.dfa = dfa; + cmd.local_addr = CXI_VA_TO_IOVA(req->send.send_md->md, req->send.buf); + cmd.request_len = req->send.len; + cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = cxip_msg_match_id(txc); + cmd.header_data = req->send.data; + cmd.remote_offset = + CXI_VA_TO_IOVA(req->send.send_md->md, req->send.buf); + cmd.command.opcode = C_CMD_RENDEZVOUS_PUT; + cmd.eager_length = txc->rdzv_eager_size; + cmd.use_offset_for_get = 1; + + put_mb.rdzv_id_hi = rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + put_mb.rdzv_lac = req->send.send_md->md->lac; + put_mb.le_type = CXIP_LE_TYPE_RX; + cmd.match_bits = put_mb.raw; + cmd.rendezvous_id = rdzv_id; + + if (req->triggered) { + const struct c_ct_cmd ct_cmd = { + .trig_ct = req->trig_cntr->ct->ctn, + .threshold = req->trig_thresh, + }; + + /* Triggered command queue is domain resource, lock. */ + ofi_genlock_lock(&txc->domain->trig_cmdq_lock); + + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) { + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + goto err_free_rdzv_id; + } + + /* Clear the triggered flag to prevent retrying of operation, + * due to flow control, from using the triggered path. 
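+		 *
+		 * If the peer drops the Put (C_RC_PT_DISABLED seen in
+		 * cxip_send_rdzv_put_cb()), the same request is replayed;
+		 * with req->triggered cleared the replay no longer goes
+		 * through the triggered command path.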
+ */ + req->triggered = false; + + ret = cxi_cq_emit_trig_full_dma(cmdq->dev_cmdq, &ct_cmd, + &cmd); + if (ret) { + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + goto err_enqueue; + } + + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + } else { + + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) + goto err_free_rdzv_id; + + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &cmd); + if (ret) + goto err_enqueue; + + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + } + + return FI_SUCCESS; + +err_enqueue: + TXC_DBG(txc, "Failed to enqueue Put: %d, return -FI_EAGAIN\n", ret); +err_free_rdzv_id: + cxip_rdzv_id_free(txc, rdzv_id); + + return -FI_EAGAIN; +} + +/* + * cxip_send_eager_cb() - Eager send callback. Used for both tagged and + * untagged messages. + */ +static int cxip_send_eager_cb(struct cxip_req *req, + const union c_event *event) +{ + int match_complete = req->flags & FI_MATCH_COMPLETE; + int ret; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + if (event->hdr.event_type == C_EVENT_SEND) { + TXC_WARN(req->send.txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + assert(event->hdr.event_type == C_EVENT_ACK); + + req->send.rc = cxi_init_event_rc(event); + + /* If the message was dropped, mark the peer as disabled. Do not + * generate a completion. Free associated resources. Do not free the + * request (it will be used to replay the Send). + */ + if (req->send.rc == C_RC_PT_DISABLED) { + + ret = cxip_send_req_dropped(req->send.txc, req); + if (ret != FI_SUCCESS) + return -FI_EAGAIN; + + if (match_complete) + cxip_tx_id_free(req->send.txc, req->send.tx_id); + + return FI_SUCCESS; + } + + ret = cxip_send_req_dequeue(req->send.txc, req); + if (ret != FI_SUCCESS) + return ret; + + cxip_send_buf_fini(req); + + /* If MATCH_COMPLETE was requested and the the Put did not match a user + * buffer, do not generate a completion event until the target notifies + * the initiator that the match is complete. + */ + if (match_complete) { + if (req->send.rc == C_RC_OK && + event->init_short.ptl_list != C_PTL_LIST_PRIORITY) { + TXC_DBG(req->send.txc, + "Waiting for match complete: %p\n", req); + return FI_SUCCESS; + } + + TXC_DBG(req->send.txc, "Match complete with Ack: %p\n", req); + cxip_tx_id_free(req->send.txc, req->send.tx_id); + } + + /* If MATCH_COMPLETE was requested, software must manage counters. 
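+ *
+ * Hardware ct_ack counting is disabled whenever match_comp is set (see
+ * cxip_set_eager_mb() and the command setup), so the send counter is
+ * updated in software by report_send_completion() below once the final
+ * completion is known.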
*/ + report_send_completion(req, match_complete); + + ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_evtq_req_free(req); + + return FI_SUCCESS; +} + +static inline int cxip_set_eager_mb(struct cxip_req *req, + union cxip_match_bits *mb) +{ + int tx_id; + + mb->raw = 0; + mb->le_type = CXIP_LE_TYPE_RX; + mb->tagged = req->send.tagged; + mb->tag = req->send.tag; + mb->cq_data = !!(req->send.flags & FI_REMOTE_CQ_DATA); + + /* Allocate a TX ID if match completion guarantees are required */ + if (req->send.flags & FI_MATCH_COMPLETE) { + + tx_id = cxip_tx_id_alloc(req->send.txc, req); + if (tx_id < 0) { + TXC_DBG(req->send.txc, + "Failed to allocate TX ID: %d\n", tx_id); + return -FI_EAGAIN; + } + + req->send.tx_id = tx_id; + mb->match_comp = 1; + mb->tx_id = tx_id; + } + + return FI_SUCCESS; +} + +/* + * _cxip_send_eager_idc() - Enqueue eager IDC message + */ +static ssize_t _cxip_send_eager_idc(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + union c_fab_addr dfa; + uint8_t idx_ext; + union cxip_match_bits mb; + ssize_t ret; + struct cxip_cmdq *cmdq = txc->tx_cmdq; + const void *buf; + struct c_cstate_cmd cstate_cmd = {}; + struct c_idc_msg_hdr idc_cmd; + + assert(req->send.len > 0); + +#if ENABLE_DEBUG + if (req->send.flags & FI_INJECT) + assert(req->send.ibuf); + + /* ibuf and send_md are mutually exclusive. */ + if (req->send.ibuf) { + assert(req->send.send_md == NULL); + } else if (req->send.send_md) { + assert(req->send.ibuf == NULL); + + /* All non FI_HMEM_SYSTEM buffers require an ibuf. */ + assert(req->send.send_md->info.iface == FI_HMEM_SYSTEM); + } +#endif + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, + CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); + + /* Favor bounce buffer if allocated. */ + if (req->send.ibuf) + buf = req->send.ibuf; + else + buf = req->send.buf; + + ret = cxip_set_eager_mb(req, &mb); + if (ret) + goto err; + + req->cb = cxip_send_eager_cb; + + /* Build commands before taking lock */ + cstate_cmd.event_send_disable = 1; + cstate_cmd.index_ext = idx_ext; + cstate_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cstate_cmd.initiator = cxip_msg_match_id(txc); + + /* If MATCH_COMPLETE was requested, software must manage + * counters. + */ + if (req->send.cntr && !mb.match_comp) { + cstate_cmd.event_ct_ack = 1; + cstate_cmd.ct = req->send.cntr->ct->ctn; + } + + /* Note: IDC command completely filled in */ + idc_cmd.unused_0 = 0; + idc_cmd.dfa = dfa; + idc_cmd.match_bits = mb.raw; + idc_cmd.header_data = req->send.data; + idc_cmd.user_ptr = (uint64_t)req; + + /* Submit command */ + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) + goto err_cleanup; + + ret = cxip_cmdq_emit_c_state(cmdq, &cstate_cmd); + if (ret) { + TXC_DBG(txc, "Failed to issue C_STATE command: %ld\n", ret); + goto err_cleanup; + } + + ret = cxi_cq_emit_idc_msg(cmdq->dev_cmdq, &idc_cmd, buf, req->send.len); + if (ret) { + TXC_DBG(txc, "Failed to write IDC: %ld\n", ret); + + /* Return error according to Domain Resource Management */ + ret = -FI_EAGAIN; + goto err_cleanup; + } + + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + + return FI_SUCCESS; + +err_cleanup: + if (mb.match_comp) + cxip_tx_id_free(txc, req->send.tx_id); +err: + return ret; +} + +/* + * _cxip_send_eager() - Enqueue eager send command. 
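+ *
+ * Used for payloads up to txc->max_eager_size that cannot take the IDC
+ * path (for example triggered operations or zero-byte sends); a full DMA
+ * C_CMD_PUT is issued instead of an IDC message.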
+ */ +static ssize_t _cxip_send_eager(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + union c_fab_addr dfa; + uint8_t idx_ext; + union cxip_match_bits mb; + ssize_t ret; + struct cxip_cmdq *cmdq = + req->triggered ? txc->domain->trig_cmdq : txc->tx_cmdq; + bool trig = req->triggered; + struct c_full_dma_cmd cmd = {}; + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, + CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); + + ret = cxip_set_eager_mb(req, &mb); + if (ret) + goto err; + + req->cb = cxip_send_eager_cb; + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_PUT; + cmd.index_ext = idx_ext; + cmd.event_send_disable = 1; + cmd.dfa = dfa; + cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = cxip_msg_match_id(txc); + cmd.match_bits = mb.raw; + cmd.header_data = req->send.data; + + /* Triggered ops could result in 0 length DMA */ + if (req->send.send_md) { + cmd.lac = req->send.send_md->md->lac; + cmd.local_addr = CXI_VA_TO_IOVA(req->send.send_md->md, + req->send.buf); + cmd.request_len = req->send.len; + } + + /* If MATCH_COMPLETE was requested, software must manage + * counters. + */ + if (req->send.cntr && !mb.match_comp) { + cmd.event_ct_ack = 1; + cmd.ct = req->send.cntr->ct->ctn; + } + + /* Issue Eager Put command */ + if (trig) { + const struct c_ct_cmd ct_cmd = { + .trig_ct = req->trig_cntr->ct->ctn, + .threshold = req->trig_thresh, + }; + + /* Triggered command queue is domain resource, lock. */ + ofi_genlock_lock(&txc->domain->trig_cmdq_lock); + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) { + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + goto err; + } + + /* Clear the triggered flag to prevent retrying of + * operation, due to flow control, from using the + * triggered path. + */ + req->triggered = false; + + ret = cxi_cq_emit_trig_full_dma(cmdq->dev_cmdq, &ct_cmd, + &cmd); + if (ret) { + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + goto err_enqueue; + } + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + + } else { + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) + goto err; + + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &cmd); + if (ret) + goto err_enqueue; + + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + } + + return FI_SUCCESS; + +err_enqueue: + TXC_DBG(txc, "Failed to write DMA command: %ld\n", ret); + ret = -FI_EAGAIN; + + if (mb.match_comp) + cxip_tx_id_free(txc, req->send.tx_id); +err: + return ret; +} + +static bool cxip_send_eager_idc(struct cxip_req *req) +{ + return (req->send.len <= CXIP_INJECT_SIZE) && + !cxip_env.disable_non_inject_msg_idc; +} + +static ssize_t _cxip_send_req(struct cxip_req *req) +{ + /* Force all zero-byte operations to use the eager path. This utilizes + * a smaller command format. + */ + if (req->send.len == 0) + return _cxip_send_eager(req); + + /* IDC commands are not supported with triggered operations. */ + if (!req->triggered && + ((req->send.flags & FI_INJECT) || cxip_send_eager_idc(req))) + return _cxip_send_eager_idc(req); + + if (req->send.len <= req->send.txc->max_eager_size) + return _cxip_send_eager(req); + + return _cxip_send_rdzv_put(req); +} + +/* + * cxip_fc_peer_lookup() - Check if a peer is disabled. + * + * Look up disabled peer state and return it, if available. + * + * Caller must hold ep_obj->lock. 
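+ *
+ * Peers are keyed by destination CXI address (NIC address plus PID) and
+ * kept on the txc->fc_peers list; the list only holds peers that are
+ * currently flow controlled, so the linear scan is expected to stay short.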
+ */ +static struct cxip_fc_peer *cxip_fc_peer_lookup(struct cxip_txc *txc, + struct cxip_addr caddr) +{ + struct cxip_fc_peer *peer; + + dlist_foreach_container(&txc->fc_peers, struct cxip_fc_peer, + peer, txc_entry) { + if (CXIP_ADDR_EQUAL(peer->caddr, caddr)) + return peer; + } + + return NULL; +} + +/* + * cxip_fc_peer_put() - Account for completion of an outstanding Send targeting + * a disabled peer. + * + * Drop a reference to a disabled peer. When the last reference is dropped, + * attempt flow-control recovery. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_fc_peer_put(struct cxip_fc_peer *peer) +{ + int ret; + + assert(peer->pending > 0); + + /* Account for the completed Send */ + if (!--peer->pending) { + peer->req.send.mb.drops = peer->dropped; + + ret = cxip_ctrl_msg_send(&peer->req); + if (ret != FI_SUCCESS) { + peer->pending++; + return ret; + } + + peer->pending_acks++; + + TXC_DBG(peer->txc, + "Notified disabled peer NIC: %#x PID: %u dropped: %u\n", + peer->caddr.nic, peer->caddr.pid, peer->dropped); + } + + return FI_SUCCESS; +} + +/* + * cxip_fc_peer_fini() - Remove disabled peer state. + * + * Caller must hold ep_obj->lock. + */ +static void cxip_fc_peer_fini(struct cxip_fc_peer *peer) +{ + assert(dlist_empty(&peer->msg_queue)); + dlist_remove(&peer->txc_entry); + free(peer); +} + +/* + * cxip_fc_notify_cb() - Process FC notify completion events. + */ +int cxip_fc_notify_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_fc_peer *peer = container_of(req, struct cxip_fc_peer, req); + struct cxip_txc *txc = peer->txc; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + switch (cxi_event_rc(event)) { + case C_RC_OK: + TXC_DBG(txc, + "FC_NOTIFY to %#x:%u successfully sent: retry_count=%u\n", + peer->caddr.nic, peer->caddr.pid, + peer->retry_count); + + /* Peer flow control structure can only be freed if + * replay is complete and all acks accounted for. + */ + peer->pending_acks--; + if (!peer->pending_acks && peer->replayed) + cxip_fc_peer_fini(peer); + + return FI_SUCCESS; + + /* This error occurs when the target's control event queue has + * run out of space. Since the target should be processing the + * event queue, it is safe to replay messages until C_RC_OK is + * returned. + */ + case C_RC_ENTRY_NOT_FOUND: + peer->retry_count++; + TXC_WARN(txc, + "%#x:%u dropped FC message: retry_delay_usecs=%d retry_count=%u\n", + peer->caddr.nic, peer->caddr.pid, + cxip_env.fc_retry_usec_delay, + peer->retry_count); + usleep(cxip_env.fc_retry_usec_delay); + return cxip_ctrl_msg_send(req); + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT_STS, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_fc_peer_init() - Mark a peer as disabled. + * + * Called by sending EP after experiencing first dropped Send to a peer. + * + * Allocate state to track the disabled peer. Locate all outstanding Sends + * targeting the peer. + * + * Caller must hold ep_obj->lock. 
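+ *
+ * Recovery sequence: once every outstanding Send to the peer has
+ * completed, cxip_fc_peer_put() sends an FC_NOTIFY control message
+ * carrying the drop count; when the peer side has recovered,
+ * cxip_fc_resume() replays the queued Sends in order and finally frees
+ * this state.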
+ */ +static int cxip_fc_peer_init(struct cxip_txc *txc, struct cxip_addr caddr, + struct cxip_fc_peer **peer) +{ + struct cxip_fc_peer *p; + struct cxip_req *req; + struct dlist_entry *tmp; + + p = calloc(1, sizeof(*p)); + if (!p) { + TXC_WARN(txc, "Failed to allocate FC Peer\n"); + return -FI_ENOMEM; + } + + p->caddr = caddr; + p->txc = txc; + dlist_init(&p->msg_queue); + dlist_insert_tail(&p->txc_entry, &txc->fc_peers); + + p->req.send.nic_addr = caddr.nic; + p->req.send.pid = caddr.pid; + /* TODO: remove */ + p->req.send.mb.txc_id = 0; + p->req.send.mb.rxc_id = 0; + + p->req.send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; + p->req.send.mb.ctrl_msg_type = CXIP_CTRL_MSG_FC_NOTIFY; + p->req.cb = cxip_fc_notify_cb; + p->req.ep_obj = txc->ep_obj; + + /* Queue all Sends to the FC'ed peer */ + dlist_foreach_container_safe(&txc->msg_queue, struct cxip_req, + req, send.txc_entry, tmp) { + if (CXIP_ADDR_EQUAL(req->send.caddr, caddr)) { + dlist_remove(&req->send.txc_entry); + dlist_insert_tail(&req->send.txc_entry, &p->msg_queue); + p->pending++; + req->send.fc_peer = p; + } + } + + *peer = p; + + return FI_SUCCESS; +} + +/* + * cxip_fc_resume() - Replay dropped Sends. + * + * Called by sending EP after being notified disabled peer was re-enabled. + * + * Replay all dropped Sends in order. + */ +int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid) +{ + struct cxip_txc *txc = &ep_obj->txc; + struct cxip_fc_peer *peer; + struct cxip_addr caddr = { + .nic = nic_addr, + .pid = pid, + }; + struct cxip_req *req; + struct dlist_entry *tmp; + int ret __attribute__((unused)); + + peer = cxip_fc_peer_lookup(txc, caddr); + if (!peer) + TXC_FATAL(txc, "Fatal, FC peer not found: NIC: %#x PID: %d\n", + nic_addr, pid); + + TXC_DBG(txc, "Replaying dropped sends, NIC: %#x PID: %d\n", + nic_addr, pid); + + dlist_foreach_container_safe(&peer->msg_queue, struct cxip_req, + req, send.txc_entry, tmp) { + /* -FI_EAGAIN can be return if the command queue is full. Loop + * until this goes through. + */ + do { + ret = _cxip_send_req(req); + } while (ret == -FI_EAGAIN); + assert(ret == FI_SUCCESS); + + /* Move request back to the message queue. */ + dlist_remove(&req->send.txc_entry); + req->send.fc_peer = NULL; + dlist_insert_tail(&req->send.txc_entry, &txc->msg_queue); + + TXC_DBG(txc, "Replayed %p\n", req); + } + + /* Peer flow control structure can only be freed if replay is complete + * and all acks accounted for. + */ + if (!peer->pending_acks) + cxip_fc_peer_fini(peer); + else + peer->replayed = true; + + return FI_SUCCESS; +} + +/* + * cxip_send_req_dropped() - Mark the Send request dropped. + * + * Mark the Send request dropped. Mark the target peer as disabled. Track all + * outstanding Sends targeting the disabled peer. When all outstanding Sends + * are completed, recovery will be performed. + */ +static int cxip_send_req_dropped(struct cxip_txc *txc, struct cxip_req *req) +{ + struct cxip_fc_peer *peer; + int ret; + + /* Check if peer is already disabled */ + peer = cxip_fc_peer_lookup(txc, req->send.caddr); + if (!peer) { + ret = cxip_fc_peer_init(txc, req->send.caddr, &peer); + if (ret != FI_SUCCESS) + return ret; + + TXC_DBG(txc, + "Disabled peer detected, NIC: %#x PID: %u pending: %u\n", + peer->caddr.nic, peer->caddr.pid, peer->pending); + } + + /* Account for the dropped message. 
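+ *
+ * The dropped count accumulated here is advertised to the peer in the
+ * FC_NOTIFY control message (peer->req.send.mb.drops) once all pending
+ * Sends targeting the peer have completed.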
*/ + peer->dropped++; + ret = cxip_fc_peer_put(peer); + if (ret) + peer->dropped--; + else + TXC_DBG(txc, + "Send dropped, req: %p NIC: %#x PID: %u pending: %u dropped: %u\n", + req, peer->caddr.nic, peer->caddr.pid, peer->pending, + peer->dropped); + + return ret; +} + +/* + * cxip_send_req_queue() - Queue Send request on TXC. + * + * Place the Send request in an ordered SW queue. Return error if the target + * peer is disabled. + */ +static int cxip_send_req_queue(struct cxip_txc *txc, struct cxip_req *req) +{ + struct cxip_fc_peer *peer; + + if (!dlist_empty(&txc->fc_peers)) { + peer = cxip_fc_peer_lookup(txc, req->send.caddr); + if (peer) { + /* Peer is disabled. Progress control EQs so future + * cxip_send_req_queue() may succeed. + */ + cxip_ep_ctrl_progress_locked(txc->ep_obj); + + return -FI_EAGAIN; + } + } + + dlist_insert_tail(&req->send.txc_entry, &txc->msg_queue); + + return FI_SUCCESS; +} + +/* + * cxip_send_req_dequeue() - Dequeue Send request from TXC. + * + * Remove the Send requst from the ordered message queue. Update peer + * flow-control state, if necessary. + */ +static int cxip_send_req_dequeue(struct cxip_txc *txc, struct cxip_req *req) +{ + int ret; + + if (req->send.fc_peer) { + /* The peer was disabled after this message arrived. */ + TXC_DBG(txc, + "Send not dropped, req: %p NIC: %#x PID: %u pending: %u dropped: %u\n", + req, req->send.fc_peer->caddr.nic, + req->send.fc_peer->caddr.pid, + req->send.fc_peer->pending, req->send.fc_peer->dropped); + + ret = cxip_fc_peer_put(req->send.fc_peer); + if (ret != FI_SUCCESS) + return ret; + + req->send.fc_peer = NULL; + } + + dlist_remove(&req->send.txc_entry); + + return FI_SUCCESS; +} + +static void cxip_send_buf_fini(struct cxip_req *req) +{ + if (req->send.send_md) + cxip_unmap(req->send.send_md); + if (req->send.ibuf) + cxip_txc_ibuf_free(req->send.txc, req->send.ibuf); +} + +static int cxip_send_buf_init(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + int ret; + + /* Nothing to do for zero byte sends. */ + if (!req->send.len) + return FI_SUCCESS; + + /* Triggered operation always requires memory registration. */ + if (req->triggered) + return cxip_map(txc->domain, req->send.buf, req->send.len, 0, + &req->send.send_md); + + /* FI_INJECT operations always require an internal bounce buffer. This + * is needed to replay FI_INJECT operations which may experience flow + * control. + */ + if (req->send.flags & FI_INJECT) { + + req->send.ibuf = cxip_txc_ibuf_alloc(txc); + if (!req->send.ibuf) + return -FI_EAGAIN; + + if (txc->hmem) { + ret = cxip_txc_copy_from_hmem(txc, NULL, req->send.ibuf, + req->send.buf, + req->send.len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_buf_fini; + } + + return FI_SUCCESS; + } + + memcpy(req->send.ibuf, req->send.buf, req->send.len); + return FI_SUCCESS; + } + + /* If message is going to be sent as an IDC, a bounce buffer is needed + * if FI_HMEM is being used. This is due to the buffer type being + * unknown. + */ + if (cxip_send_eager_idc(req)) { + if (txc->hmem) { + + req->send.ibuf = cxip_txc_ibuf_alloc(txc); + if (!req->send.ibuf) { + ret = -FI_EAGAIN; + goto err_buf_fini; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, req->send.ibuf, + req->send.buf, + req->send.len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_buf_fini; + } + } + + return FI_SUCCESS; + } + + /* Everything else requires memory registeration. 
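+ *
+ * cxip_map() below registers the source buffer with the NIC for the DMA
+ * path. The FI_INJECT bounce buffer above is what preserves fi_inject()
+ * semantics -- the caller may reuse its buffer as soon as the call
+ * returns, even if the send is later replayed after flow control. A
+ * hedged illustration; "payload" is a placeholder:
+ *
+ *   ret = fi_inject(ep, payload, len, dest_addr);
+ *   // payload may be modified immediately after fi_inject() returns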
*/ + return cxip_map(txc->domain, req->send.buf, req->send.len, 0, + &req->send.send_md); + +err_buf_fini: + cxip_send_buf_fini(req); + + return ret; +} + +/* + * cxip_send_common() - Common message send function. Used for tagged and + * untagged sends of all sizes. This includes triggered operations. + */ +ssize_t cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, + size_t len, void *desc, uint64_t data, + fi_addr_t dest_addr, uint64_t tag, void *context, + uint64_t flags, bool tagged, bool triggered, + uint64_t trig_thresh, struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr) +{ + struct cxip_req *req; + struct cxip_addr caddr; + int ret; + + if (len && !buf) + return -FI_EINVAL; + + if (len > CXIP_EP_MAX_MSG_SZ) + return -FI_EMSGSIZE; + + if (tagged && tag & ~CXIP_TAG_MASK) { + TXC_WARN(txc, "Invalid tag: %#018lx (%#018lx)\n", + tag, CXIP_TAG_MASK); + return -FI_EINVAL; + } + + if (flags & FI_INJECT && len > CXIP_INJECT_SIZE) { + TXC_WARN(txc, "Invalid inject length: %lu\n", len); + return -FI_EMSGSIZE; + } + + ofi_genlock_lock(&txc->ep_obj->lock); + + req = cxip_evtq_req_alloc(&txc->tx_evtq, false, txc); + if (!req) { + TXC_DBG(txc, "Failed to allocate request, return -FI_EAGAIN\n"); + ret = -FI_EAGAIN; + goto unlock; + } + + /* Restrict outstanding success event requests to queue size */ + if (ofi_atomic_inc32(&txc->otx_reqs) > txc->attr.size) { + ret = -FI_EAGAIN; + goto err_req_free; + } + + req->triggered = triggered; + req->trig_thresh = trig_thresh; + req->trig_cntr = trig_cntr; + + /* Save Send parameters to replay */ + req->type = CXIP_REQ_SEND; + req->send.txc = txc; + req->send.tclass = tclass; + + req->send.cntr = triggered ? comp_cntr : txc->send_cntr; + req->send.buf = buf; + req->send.len = len; + req->send.data = data; + req->send.flags = flags; + + /* Set completion parameters */ + req->context = (uint64_t)context; + req->flags = FI_SEND | (flags & (FI_COMPLETION | FI_MATCH_COMPLETE)); + if (tagged) { + req->send.tagged = tagged; + req->send.tag = tag; + req->flags |= FI_TAGGED; + } else { + req->flags |= FI_MSG; + } + + ret = cxip_send_buf_init(req); + if (ret) { + TXC_WARN(txc, "cxip_send_buf_init failed: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_req_free; + } + + /* Look up target CXI address */ + ret = cxip_av_lookup_addr(txc->ep_obj->av, dest_addr, &caddr); + if (ret != FI_SUCCESS) { + TXC_WARN(txc, "Failed to look up FI addr: %d\n", ret); + goto err_req_buf_fini; + } + + req->send.caddr = caddr; + req->send.dest_addr = dest_addr; + + if (cxip_evtq_saturated(&txc->tx_evtq)) { + TXC_DBG(txc, "TX HW EQ saturated\n"); + ret = -FI_EAGAIN; + goto err_req_buf_fini; + } + + /* Check if target peer is disabled */ + ret = cxip_send_req_queue(req->send.txc, req); + if (ret != FI_SUCCESS) { + TXC_DBG(txc, "Target peer disabled\n"); + goto err_req_buf_fini; + } + + /* Try Send */ + ret = _cxip_send_req(req); + if (ret != FI_SUCCESS) + goto err_req_dequeue; + + ofi_genlock_unlock(&txc->ep_obj->lock); + + TXC_DBG(txc, + "req: %p buf: %p len: %lu dest_addr: 0x%lX nic: %d pid: %d tag(%c): 0x%lx context %#lx\n", + req, req->send.buf, req->send.len, dest_addr, caddr.nic, + caddr.pid, req->send.tagged ? 
'*' : '-', req->send.tag, + req->context); + + return FI_SUCCESS; + +err_req_dequeue: + cxip_send_req_dequeue(req->send.txc, req); +err_req_buf_fini: + cxip_send_buf_fini(req); +err_req_free: + ofi_atomic_dec32(&txc->otx_reqs); + cxip_evtq_req_free(req); +unlock: + ofi_genlock_unlock(&txc->ep_obj->lock); + + return ret; +} + +/* + * Libfabric APIs + */ +static ssize_t cxip_trecv(struct fid_ep *fid_ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t tag, + uint64_t ignore, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, desc, src_addr, + tag, ignore, context, ep->rx_attr.op_flags, + true, NULL); +} + +static ssize_t cxip_trecvv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? desc[0] : NULL; + } else { + RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, mr_desc, src_addr, + tag, ignore, context, ep->rx_attr.op_flags, + true, NULL); +} + +static ssize_t cxip_trecvmsg(struct fid_ep *fid_ep, + const struct fi_msg_tagged *msg, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + void *buf; + void *mr_desc; + + if (flags & ~(CXIP_RX_OP_FLAGS | CXIP_RX_IGNORE_OP_FLAGS | + FI_PEEK | FI_CLAIM)) + return -FI_EBADFLAGS; + + if (!msg) { + RXC_WARN(&ep->ep_obj->rxc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + /* If selective completion is not requested, always generate + * completions. + */ + if (!ep->ep_obj->rxc.selective_completion) + flags |= FI_COMPLETION; + + if (!(flags & FI_PEEK)) { + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, mr_desc, + msg->addr, msg->tag, msg->ignore, + msg->context, flags, true, NULL); + } + + /* FI_PEEK does not post a recv or return message payload */ + return cxip_recv_common(&ep->ep_obj->rxc, NULL, 0UL, NULL, msg->addr, + msg->tag, msg->ignore, msg->context, flags, + true, NULL); +} + +static ssize_t cxip_tsend(struct fid_ep *fid_ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + desc, 0, dest_addr, tag, context, + ep->tx_attr.op_flags, true, false, 0, + NULL, NULL); +} + +static ssize_t cxip_tsendv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + const void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? 
desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + mr_desc, 0, dest_addr, tag, context, + ep->tx_attr.op_flags, true, false, 0, NULL, + NULL); +} + +static ssize_t cxip_tsendmsg(struct fid_ep *fid_ep, + const struct fi_msg_tagged *msg, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + size_t len; + const void *buf; + void *mr_desc; + + if (!msg) { + TXC_WARN(&ep->ep_obj->txc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + if (flags & ~CXIP_TX_OP_FLAGS) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. + */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, + msg->data, msg->addr, msg->tag, msg->context, + flags, true, false, 0, NULL, NULL); +} + +static ssize_t cxip_tinject(struct fid_ep *fid_ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + NULL, 0, dest_addr, tag, NULL, FI_INJECT, + true, false, 0, NULL, NULL); +} + +static ssize_t cxip_tsenddata(struct fid_ep *fid_ep, const void *buf, + size_t len, void *desc, uint64_t data, + fi_addr_t dest_addr, uint64_t tag, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + desc, data, dest_addr, tag, context, + ep->tx_attr.op_flags | FI_REMOTE_CQ_DATA, + true, false, 0, NULL, NULL); +} + +static ssize_t cxip_tinjectdata(struct fid_ep *fid_ep, const void *buf, + size_t len, uint64_t data, fi_addr_t dest_addr, + uint64_t tag) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, + len, NULL, data, dest_addr, tag, NULL, + FI_INJECT | FI_REMOTE_CQ_DATA, + true, false, 0, NULL, NULL); +} + +struct fi_ops_tagged cxip_ep_tagged_no_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = fi_no_tagged_send, + .sendv = fi_no_tagged_sendv, + .sendmsg = fi_no_tagged_sendmsg, + .inject = fi_no_tagged_inject, + .senddata = fi_no_tagged_senddata, + .injectdata = fi_no_tagged_injectdata, +}; + +struct fi_ops_tagged cxip_ep_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = cxip_trecv, + .recvv = cxip_trecvv, + .recvmsg = cxip_trecvmsg, + .send = cxip_tsend, + .sendv = cxip_tsendv, + .sendmsg = cxip_tsendmsg, + .inject = cxip_tinject, + .senddata = cxip_tsenddata, + .injectdata = cxip_tinjectdata, +}; + +struct fi_ops_tagged cxip_ep_tagged_no_tx_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = cxip_trecv, + .recvv = cxip_trecvv, + .recvmsg = cxip_trecvmsg, + .send = fi_no_tagged_send, + .sendv = fi_no_tagged_sendv, + .sendmsg = fi_no_tagged_sendmsg, + 
.inject = fi_no_tagged_inject, + .senddata = fi_no_tagged_senddata, + .injectdata = fi_no_tagged_injectdata, +}; + +struct fi_ops_tagged cxip_ep_tagged_no_rx_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = cxip_tsend, + .sendv = cxip_tsendv, + .sendmsg = cxip_tsendmsg, + .inject = cxip_tinject, + .senddata = cxip_tsenddata, + .injectdata = cxip_tinjectdata, +}; + +static ssize_t cxip_recv(struct fid_ep *fid_ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, desc, src_addr, 0, + 0, context, ep->rx_attr.op_flags, false, NULL); +} + +static ssize_t cxip_recvv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? desc[0] : NULL; + } else { + RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, mr_desc, src_addr, + 0, 0, context, ep->rx_attr.op_flags, false, + NULL); +} + +static ssize_t cxip_recvmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, + uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_rxc *rxc = &ep->ep_obj->rxc; + size_t len; + void *buf; + void *mr_desc; + + if (flags & ~(CXIP_RX_OP_FLAGS | CXIP_RX_IGNORE_OP_FLAGS)) + return -FI_EBADFLAGS; + + if (!msg) { + RXC_WARN(rxc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + /* If selective completion is not requested, always generate + * completions. + */ + if (!rxc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_recv_common(rxc, buf, len, mr_desc, msg->addr, 0, 0, + msg->context, flags, false, NULL); +} + +static ssize_t cxip_send(struct fid_ep *fid_ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, + buf, len, desc, 0, dest_addr, 0, context, + ep->tx_attr.op_flags, false, false, 0, + NULL, NULL); +} + +static ssize_t cxip_sendv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + const void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? 
desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, + buf, len, mr_desc, 0, dest_addr, 0, context, + ep->tx_attr.op_flags, false, false, 0, + NULL, NULL); +} + +static ssize_t cxip_sendmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, + uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + size_t len; + const void *buf; + void *mr_desc; + + if (!msg) { + TXC_WARN(&ep->ep_obj->txc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + if (flags & ~CXIP_TX_OP_FLAGS) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. + */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, + msg->data, msg->addr, 0, msg->context, flags, + false, false, 0, NULL, NULL); +} + +static ssize_t cxip_inject(struct fid_ep *fid_ep, const void *buf, size_t len, + fi_addr_t dest_addr) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, + len, NULL, 0, dest_addr, 0, NULL, FI_INJECT, + false, false, 0, NULL, NULL); +} + +static ssize_t cxip_senddata(struct fid_ep *fid_ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, + len, desc, data, dest_addr, 0, context, + ep->tx_attr.op_flags | FI_REMOTE_CQ_DATA, + false, false, 0, NULL, NULL); +} + +static ssize_t cxip_injectdata(struct fid_ep *fid_ep, const void *buf, + size_t len, uint64_t data, fi_addr_t dest_addr) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, + len, NULL, data, dest_addr, 0, NULL, + FI_INJECT | FI_REMOTE_CQ_DATA, + false, false, 0, NULL, NULL); +} + +struct fi_ops_msg cxip_ep_msg_no_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +struct fi_ops_msg cxip_ep_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = cxip_recv, + .recvv = cxip_recvv, + .recvmsg = cxip_recvmsg, + .send = cxip_send, + .sendv = cxip_sendv, + .sendmsg = cxip_sendmsg, + .inject = cxip_inject, + .senddata = cxip_senddata, + .injectdata = cxip_injectdata, +}; + +struct fi_ops_msg cxip_ep_msg_no_tx_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = cxip_recv, + .recvv = cxip_recvv, + .recvmsg = cxip_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +struct fi_ops_msg 
cxip_ep_msg_no_rx_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = cxip_send, + .sendv = cxip_sendv, + .sendmsg = cxip_sendmsg, + .inject = cxip_inject, + .senddata = cxip_senddata, + .injectdata = cxip_injectdata, +}; diff --git a/prov/cxi/src/cxip_nic.c b/prov/cxi/src/cxip_nic.c new file mode 100644 index 00000000000..90df4cf73d8 --- /dev/null +++ b/prov/cxi/src/cxip_nic.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "config.h" +#include "cxip.h" +#include "ofi.h" +#include "ofi_str.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_FABRIC, __VA_ARGS__) + +static int cxip_nic_get_ss_env_get_vni(void) +{ + char *vni_str; + char *vni_str_dup; + char *token; + int vni = -FI_EINVAL; + + vni_str = getenv("SLINGSHOT_VNIS"); + if (!vni_str) { + CXIP_INFO("SLINGSHOT_VNIS not found\n"); + return -FI_ENOSYS; + } + + vni_str_dup = strdup(vni_str); + if (!vni_str_dup) + return -FI_ENOMEM; + + /* Index/token zero is the per job-step VNI. Only use this value. Index + * one is the inter-job-step VNI. Ignore this one. + */ + token = strtok(vni_str_dup, ","); + if (token) + vni = (uint16_t)atoi(token); + else + CXIP_WARN("VNI not found in SLINGSHOT_VNIS: %s\n", vni_str); + + free(vni_str_dup); + + return vni; +} + +static int cxip_gen_auth_key_ss_env_get_svc_id(struct cxip_if *nic_if) +{ + char *svc_id_str; + char *dev_str; + char *svc_id_str_dup; + char *dev_str_dup; + int device_index; + char *token; + bool found; + int svc_id; + + svc_id_str = getenv("SLINGSHOT_SVC_IDS"); + if (!svc_id_str) { + CXIP_INFO("SLINGSHOT_SVC_IDS not found\n"); + return -FI_ENOSYS; + } + + dev_str = getenv("SLINGSHOT_DEVICES"); + if (!dev_str) { + CXIP_INFO("SLINGSHOT_DEVICES not found\n"); + return -FI_ENOSYS; + } + + dev_str_dup = strdup(dev_str); + if (!dev_str_dup) + return -FI_ENOMEM; + + found = false; + device_index = 0; + token = strtok(dev_str_dup, ","); + while (token != NULL) { + if (strcmp(token, nic_if->info->device_name) == 0) { + found = true; + break; + } + + device_index++; + token = strtok(NULL, ","); + } + + free(dev_str_dup); + + if (!found) { + CXIP_WARN("Failed to find %s in SLINGSHOT_DEVICES: %s\n", + nic_if->info->device_name, dev_str); + return -FI_ENOSYS; + } + + svc_id_str_dup = strdup(svc_id_str); + if (!svc_id_str_dup) + return -FI_ENOMEM; + + found = false; + token = strtok(svc_id_str_dup, ","); + while (token != NULL) { + if (device_index == 0) { + svc_id = atoi(token); + found = true; + break; + } + + device_index--; + token = strtok(NULL, ","); + } + + free(svc_id_str_dup); + + if (!found) { + CXIP_WARN("Failed to find service ID in SLINGSHOT_SVC_IDS: %s\n", + svc_id_str); + return -FI_EINVAL; + } + + return svc_id; +} + +static int cxip_nic_get_rgroup_vni_ss_env(struct cxip_if *nic_if, + unsigned int *rgroup, + unsigned int *vni) +{ + int ret; + + ret = cxip_nic_get_ss_env_get_vni(); + if (ret < 0) + return ret; + + *vni = ret; + + ret = cxip_gen_auth_key_ss_env_get_svc_id(nic_if); + if (ret < 0) + return ret; + + *rgroup = ret; + + CXIP_INFO("Generated (%u:%u) for %s\n", *rgroup, *vni, + nic_if->info->device_name); + + return FI_SUCCESS; +} + +static int cxip_nic_get_best_rgroup_vni(struct cxip_if *nic_if, + unsigned int *rgroup, + unsigned int *vni) +{ + int ret; + 
struct cxil_svc_list *svc_list; + uid_t uid; + gid_t gid; + int i; + int j; + struct cxi_svc_desc *desc; + int found_uid; + int found_gid; + int found_unrestricted; + + uid = geteuid(); + gid = getegid(); + + ret = cxil_get_svc_list(nic_if->dev, &svc_list); + if (ret) { + CXIP_WARN("cxil_get_svc_list failed: %d:%s\n", ret, + strerror(-ret)); + return ret; + } + + /* Find the service indexes which can be used by this process. These are + * services which are unrestricted, have a matching UID, or have a + * matching GID. If there are multiple service IDs which could match + * unrestricted, UID, and GID, only the first one found is selected. + */ + found_uid = -1; + found_gid = -1; + found_unrestricted = -1; + + for (i = svc_list->count - 1; i >= 0; i--) { + desc = svc_list->descs + i; + + if (!desc->enable || desc->is_system_svc) + continue; + + if (!desc->restricted_members) { + if (found_unrestricted == -1) + found_unrestricted = i; + continue; + } + + for (j = 0; j < CXI_SVC_MAX_MEMBERS; j++) { + if (desc->members[j].type == CXI_SVC_MEMBER_UID && + desc->members[j].svc_member.uid == uid && + found_uid == -1) + found_uid = i; + else if (desc->members[j].type == CXI_SVC_MEMBER_GID && + desc->members[j].svc_member.gid == gid && + found_gid == -1) + found_gid = i; + } + } + + /* Prioritized list for matching service ID. */ + if (found_uid != -1) + i = found_uid; + else if (found_gid != -1) { + i = found_gid; + } else if (found_unrestricted != -1) { + i = found_unrestricted; + } else { + cxil_free_svc_list(svc_list); + return -FI_ENOSYS; + } + + /* Generate auth_key using matched service ID. */ + desc = svc_list->descs + i; + + if (desc->restricted_vnis) { + if (desc->num_vld_vnis == 0) { + CXIP_WARN("No valid VNIs for %s service ID %u\n", + nic_if->info->device_name, i); + + cxil_free_svc_list(svc_list); + + return -FI_EINVAL; + } + + *vni = (uint16_t)desc->vnis[0]; + } else { + *vni = (uint16_t)cxip_env.default_vni; + } + + *rgroup = desc->svc_id; + + CXIP_INFO("Found (%u:%u) for %s\n", *rgroup, *vni, + nic_if->info->device_name); + + return FI_SUCCESS; +} + +static int cxip_nic_get_rgroup_vni(struct cxip_if *nic_if, + unsigned int *rgroup, unsigned int *vni) +{ + int ret; + + ret = cxip_nic_get_rgroup_vni_ss_env(nic_if, rgroup, vni); + if (ret == FI_SUCCESS) + return FI_SUCCESS; + + ret = cxip_nic_get_best_rgroup_vni(nic_if, rgroup, vni); + if (ret == -FI_ENOSYS) { + CXIP_WARN("Failed to find valid default rgroup and vni for %s\n", + nic_if->info->device_name); + *rgroup = 0; + *vni = 0; + ret = FI_SUCCESS; + } + + return ret; +} + +static int cxip_nic_close(struct fid *fid) +{ + struct fid_nic *nic = (struct fid_nic *) fid; + + free(nic->prov_attr); + return ofi_nic_close(fid); +} + +static int cxip_nic_control(struct fid *fid, int command, void *arg) +{ + int ret; + struct fid_nic *nic = container_of(fid, struct fid_nic, fid); + struct cxip_nic_attr *nic_attr = nic->prov_attr; + struct fid_nic **dup = (struct fid_nic **) arg; + struct cxip_if *nic_if; + + if (command == FI_OPT_CXI_NIC_REFRESH_ATTR) { + ret = cxip_get_if(nic_attr->addr, &nic_if); + if (ret != FI_SUCCESS) + return ret; + + ret = cxip_nic_get_rgroup_vni(nic_if, + (void *)&nic_attr->default_rgroup_id, + (void *)&nic_attr->default_vni); + + cxip_put_if(nic_if); + + return ret; + } + + ret = ofi_nic_control(fid, command, arg); + if (ret != FI_SUCCESS) + return ret; + + if (command == FI_DUP) { + (*dup)->prov_attr = mem_dup(nic->prov_attr, sizeof(struct cxip_nic_attr)); + if (!(*dup)->prov_attr) { + cxip_nic_close(&(*dup)->fid); 
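+ /* mem_dup() of the CXI NIC attributes failed; the duplicate
+ * fid_nic was released above, so fail the FI_DUP request. */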
+ return -FI_ENOMEM; + } + } + + return FI_SUCCESS; +} + +static int cxip_nic_tostr(const struct fid *fid_nic, char *buf, size_t len) +{ + return ofi_nic_tostr(fid_nic, buf, len); +} + +static struct fi_ops cxip_nic_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_nic_close, + .control = cxip_nic_control, + .tostr = cxip_nic_tostr, +}; + +int cxip_nic_alloc(struct cxip_if *nic_if, struct fid_nic **fid_nic) +{ + struct fid_nic *nic; + struct cxip_nic_attr *nic_attr; + int ret; + + /* Reuse the common fid_nic as must as possible. */ + nic = ofi_nic_dup(NULL); + if (!nic) + return -FI_ENOMEM; + + nic_attr = calloc(1, sizeof(*nic_attr)); + if (!nic_attr) { + ret = -FI_ENOMEM; + goto err_free_nic; + } + + nic->prov_attr = nic_attr; + + ret = cxip_nic_get_rgroup_vni(nic_if, + (void *)&nic_attr->default_rgroup_id, + (void *)&nic_attr->default_vni); + if (ret != FI_SUCCESS) + goto err_free_nic; + + memcpy((void *)&nic_attr->addr, &nic_if->info->nic_addr, + sizeof(nic_attr->addr)); + nic_attr->version = FI_CXI_NIC_ATTR_VER; + + /* Update the fid_nic to point to our operations. */ + nic->fid.ops = &cxip_nic_ops; + + nic->device_attr->name = strdup(nic_if->info->device_name); + if (!nic->device_attr->name) { + ret = -FI_ENOMEM; + goto err_free_nic; + } + + ret = asprintf(&nic->device_attr->device_id, "0x%x", + nic_if->info->device_id); + if (ret < 0) + goto err_free_nic; + + ret = asprintf(&nic->device_attr->device_version, "%u", + nic_if->info->device_rev); + if (ret < 0) + goto err_free_nic; + + ret = asprintf(&nic->device_attr->vendor_id, "0x%x", + nic_if->info->vendor_id); + if (ret < 0) + goto err_free_nic; + + nic->device_attr->driver = strdup(nic_if->info->driver_name); + + nic->bus_attr->bus_type = FI_BUS_PCI; + nic->bus_attr->attr.pci.domain_id = nic_if->info->pci_domain; + nic->bus_attr->attr.pci.bus_id = nic_if->info->pci_bus; + nic->bus_attr->attr.pci.device_id = nic_if->info->pci_device; + nic->bus_attr->attr.pci.function_id = nic_if->info->pci_function; + + ret = asprintf(&nic->link_attr->address, "0x%x", + nic_if->info->nic_addr); + if (ret < 0) + goto err_free_nic; + + nic->link_attr->mtu = nic_if->info->link_mtu; + /* Convert Mb/s to libfabric reported b/s */ + nic->link_attr->speed = (size_t)nic_if->speed * 1000000; + nic->link_attr->state = nic_if->link ? FI_LINK_UP : FI_LINK_DOWN; + nic->link_attr->network_type = strdup("HPC Ethernet"); + + *fid_nic = nic; + + return FI_SUCCESS; + +err_free_nic: + cxip_nic_close(&nic->fid); + + return ret; +} diff --git a/prov/cxi/src/cxip_portals_table.c b/prov/cxi/src/cxip_portals_table.c new file mode 100644 index 00000000000..625b349fe1d --- /dev/null +++ b/prov/cxi/src/cxip_portals_table.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) 
_CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +int cxip_portals_table_alloc(struct cxip_lni *lni, uint16_t *vni, + size_t vni_count, uint32_t pid, + struct cxip_portals_table **ptable) +{ + struct cxip_portals_table *table; + int ret; + int i; + + + if (!vni_count) { + CXIP_WARN("Invalid VNI count\n"); + return -FI_EINVAL; + } + + table = calloc(1, sizeof(*table)); + if (!table) { + CXIP_WARN("Failed to allocate IF domain\n"); + return -FI_ENOMEM; + } + + table->doms = calloc(vni_count, sizeof(*table->doms)); + if (!table->doms) { + CXIP_WARN("Failed to allocate domain array\n"); + ret = -FI_ENOMEM; + goto err_free_table; + } + + for (i = 0; i < vni_count; i++) { + ret = cxil_alloc_domain(lni->lni, vni[i], pid, &table->doms[i]); + if (ret) { + CXIP_WARN("Failed to allocate CXI Domain, ret: %d\n", + ret); + ret = -FI_ENOSPC; + goto err_free_doms; + } + + /* To handle C_PID_ANY correctly, the same PID needs to be used + * for each domain. Thus, update PID after the first domain + * is allocated to a valid value. + */ + pid = table->doms[i]->pid; + } + + table->pid = pid; + table->doms_count = vni_count; + table->lni = lni; + + CXIP_DBG("Allocated portals table, %s PID: %u\n", + lni->iface->info->device_name, table->pid); + + *ptable = table; + + return FI_SUCCESS; + +err_free_doms: + for (i--; i >= 0; i--) { + ret = cxil_destroy_domain(table->doms[i]); + if (ret) + CXIP_WARN("Failed to destroy domain: %d\n", ret); + } + + free(table->doms); +err_free_table: + free(table); + + return ret; +} + +/* + * cxip_free_if_domain() - Free an IF Domain. + */ +void cxip_portals_table_free(struct cxip_portals_table *ptable) +{ + int ret; + int i; + + CXIP_DBG("Freeing portals table, %s PID: %u\n", + ptable->lni->iface->info->device_name, ptable->pid); + + for (i = 0; i < ptable->doms_count; i++) { + ret = cxil_destroy_domain(ptable->doms[i]); + if (ret) + CXIP_WARN("Failed to destroy domain: %d\n", ret); + } + + free(ptable->doms); + free(ptable); +} diff --git a/prov/cxi/src/cxip_pte.c b/prov/cxi/src/cxip_pte.c new file mode 100644 index 00000000000..bdbcda67e3b --- /dev/null +++ b/prov/cxi/src/cxip_pte.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +/* Caller musthold ep_obj->lock. */ +int cxip_pte_set_state(struct cxip_pte *pte, struct cxip_cmdq *cmdq, + enum c_ptlte_state new_state, uint32_t drop_count) +{ + int ret; + struct c_set_state_cmd set_state = { + .command.opcode = C_CMD_TGT_SETSTATE, + .ptlte_index = pte->pte->ptn, + .ptlte_state = new_state, + .drop_count = drop_count, + }; + + ret = cxi_cq_emit_target(cmdq->dev_cmdq, &set_state); + if (ret) { + CXIP_WARN("Failed to enqueue command: %d\n", ret); + return -FI_EAGAIN; + } + + cxi_cq_ring(cmdq->dev_cmdq); + + return FI_SUCCESS; +} + +/* + * cxip_pte_set_wait() - Set a new PTE state synchronously. + * + * TODO: EP lock associated with the EP must be taken. 
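+ *
+ * The wait is implemented by spinning on cxip_evtq_progress() (with
+ * sched_yield()) until the PtlTE state-change event updates pte->state
+ * to the requested value.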
+ */ +int cxip_pte_set_state_wait(struct cxip_pte *pte, struct cxip_cmdq *cmdq, + struct cxip_evtq *evtq, + enum c_ptlte_state new_state, uint32_t drop_count) +{ + int ret; + + ret = cxip_pte_set_state(pte, cmdq, new_state, drop_count); + if (ret == FI_SUCCESS) { + do { + sched_yield(); + cxip_evtq_progress(evtq); + } while (pte->state != new_state); + } + + return ret; +} + +/* + * cxip_pte_append() - Append a buffer to a PtlTE. + * + * Caller must hold ep_obj->lock. + */ +int cxip_pte_append(struct cxip_pte *pte, uint64_t iova, size_t len, + unsigned int lac, enum c_ptl_list list, + uint32_t buffer_id, uint64_t match_bits, + uint64_t ignore_bits, uint32_t match_id, + uint64_t min_free, uint32_t flags, + struct cxip_cntr *cntr, struct cxip_cmdq *cmdq, + bool ring) +{ + union c_cmdu cmd = {}; + int rc; + + cmd.command.opcode = C_CMD_TGT_APPEND; + cmd.target.ptl_list = list; + cmd.target.ptlte_index = pte->pte->ptn; + cmd.target.buffer_id = buffer_id; + cmd.target.lac = lac; + cmd.target.start = iova; + cmd.target.length = len; + cmd.target.ct = cntr ? cntr->ct->ctn : 0; + cmd.target.match_bits = match_bits; + cmd.target.ignore_bits = ignore_bits; + cmd.target.match_id = match_id; + cmd.target.min_free = min_free; + + cxi_target_cmd_setopts(&cmd.target, flags); + + rc = cxi_cq_emit_target(cmdq->dev_cmdq, &cmd); + if (rc) { + CXIP_DBG("Failed to write Append command: %d\n", rc); + /* Return error according to Domain Resource Management */ + return -FI_EAGAIN; + } + + if (ring) + cxi_cq_ring(cmdq->dev_cmdq); + + return FI_SUCCESS; +} + +/* + * cxip_pte_unlink() - Unlink a buffer from a PtlTE. + * + * Caller must hold ep_obj->lock. + */ +int cxip_pte_unlink(struct cxip_pte *pte, enum c_ptl_list list, + int buffer_id, struct cxip_cmdq *cmdq) +{ + union c_cmdu cmd = {}; + int rc; + + cmd.command.opcode = C_CMD_TGT_UNLINK; + cmd.target.ptl_list = list; + cmd.target.ptlte_index = pte->pte->ptn; + cmd.target.buffer_id = buffer_id; + + rc = cxi_cq_emit_target(cmdq->dev_cmdq, &cmd); + if (rc) { + CXIP_DBG("Failed to write Append command: %d\n", rc); + /* Return error according to Domain Resource Management */ + return -FI_EAGAIN; + } + + cxi_cq_ring(cmdq->dev_cmdq); + + return FI_SUCCESS; +} + +static void cxip_pte_unmap_list(struct dlist_entry *map_list) +{ + struct cxip_pte_map_entry *entry; + int ret; + + while ((entry = + dlist_first_entry_or_null(map_list, struct cxip_pte_map_entry, + entry))) { + dlist_remove(&entry->entry); + + ret = cxil_unmap_pte(entry->map); + if (ret) + CXIP_WARN("Failed to unmap PTE: %d\n", ret); + + free(entry); + } +} + +/* + * cxip_pte_map() - Map a PtlTE to a specific PID index. A single PtlTE can be + * mapped into MAX_PTE_MAP_COUNT different PID indices. 
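+ *
+ * The PtlTE is mapped once for every domain in the portals table (one per
+ * configured VNI), so the same pid_idx is reachable on each VNI. If any
+ * mapping fails, the partially built map list is unwound and
+ * -FI_EADDRINUSE is returned.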
+ */ +int cxip_pte_map(struct cxip_pte *pte, uint64_t pid_idx, bool is_multicast) +{ + DEFINE_LIST(map_list); + struct cxip_pte_map_entry *entry; + int ret; + int i; + + for (i = 0; i < pte->ptable->doms_count; i++) { + + entry = calloc(1, sizeof(*entry)); + if (!entry) { + CXIP_WARN("Failed to allocated map entry memory"); + goto err_unmap; + } + + ret = cxil_map_pte(pte->pte, pte->ptable->doms[i], pid_idx, + is_multicast, &entry->map); + if (ret) { + CXIP_WARN("Failed to map PTE: %d\n", ret); + free(entry); + ret = -FI_EADDRINUSE; + goto err_unmap; + } + + dlist_insert_tail(&entry->entry, &map_list); + } + + dlist_splice_tail(&pte->map_list, &map_list); + + return FI_SUCCESS; + +err_unmap: + cxip_pte_unmap_list(&map_list); + + return ret; +} + +/* + * cxip_pte_alloc_nomap() - Allocate a PtlTE without performing any mapping + * during allocation. + */ +int cxip_pte_alloc_nomap(struct cxip_portals_table *ptable, struct cxi_eq *evtq, + struct cxi_pt_alloc_opts *opts, + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event), + void *ctx, struct cxip_pte **pte) +{ + struct cxip_pte *new_pte; + int ret; + + new_pte = calloc(1, sizeof(*new_pte)); + if (!new_pte) { + CXIP_WARN("Failed to allocate PTE structure\n"); + return -FI_ENOMEM; + } + + /* Allocate a PTE */ + ret = cxil_alloc_pte(ptable->lni->lni, evtq, opts, + &new_pte->pte); + if (ret) { + CXIP_WARN("Failed to allocate PTE: %d\n", ret); + ret = -FI_ENOSPC; + goto free_mem; + } + + ofi_spin_lock(&ptable->lni->iface->lock); + dlist_insert_tail(&new_pte->pte_entry, &ptable->lni->iface->ptes); + ofi_spin_unlock(&ptable->lni->iface->lock); + + new_pte->ptable = ptable; + new_pte->state_change_cb = state_change_cb; + new_pte->ctx = ctx; + new_pte->state = C_PTLTE_DISABLED; + dlist_init(&new_pte->map_list); + + *pte = new_pte; + + return FI_SUCCESS; + +free_mem: + free(new_pte); + + return ret; +} + +/* + * cxip_pte_alloc() - Allocate and map a PTE for use. + */ +int cxip_pte_alloc(struct cxip_portals_table *ptable, struct cxi_eq *evtq, + uint64_t pid_idx, bool is_multicast, + struct cxi_pt_alloc_opts *opts, + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event), + void *ctx, struct cxip_pte **pte) +{ + int ret; + + ret = cxip_pte_alloc_nomap(ptable, evtq, opts, state_change_cb, + ctx, pte); + if (ret) + return ret; + + ret = cxip_pte_map(*pte, pid_idx, is_multicast); + if (ret) + goto free_pte; + + return FI_SUCCESS; + +free_pte: + cxip_pte_free(*pte); + + return ret; +} + +/* + * cxip_pte_free() - Free a PTE. + */ +void cxip_pte_free(struct cxip_pte *pte) +{ + int ret; + + ofi_spin_lock(&pte->ptable->lni->iface->lock); + dlist_remove(&pte->pte_entry); + ofi_spin_unlock(&pte->ptable->lni->iface->lock); + + cxip_pte_unmap_list(&pte->map_list); + + assert(dlist_empty(&pte->map_list)); + + ret = cxil_destroy_pte(pte->pte); + if (ret) + CXIP_WARN("Failed to free PTE: %d\n", ret); + + free(pte); +} + +/* + * cxip_pte_state_change() - Atomically update PTE state. Used during + * STATE_CHANGE event processing. 
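+ *
+ * The interface's PTE list is walked under iface->lock; the entry whose
+ * ptn matches the event's ptlte_index has its cached state updated and
+ * its state_change_cb, if any, invoked.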
+ */ +int cxip_pte_state_change(struct cxip_if *dev_if, const union c_event *event) +{ + struct cxip_pte *pte; + + ofi_spin_lock(&dev_if->lock); + + dlist_foreach_container(&dev_if->ptes, + struct cxip_pte, pte, pte_entry) { + if (pte->pte->ptn == event->tgt_long.ptlte_index) { + pte->state = event->tgt_long.initiator.state_change.ptlte_state; + if (pte->state_change_cb) + pte->state_change_cb(pte, event); + + ofi_spin_unlock(&dev_if->lock); + return FI_SUCCESS; + } + } + + ofi_spin_unlock(&dev_if->lock); + + return -FI_EINVAL; +} diff --git a/prov/cxi/src/cxip_ptelist_buf.c b/prov/cxi/src/cxip_ptelist_buf.c new file mode 100644 index 00000000000..bfeaddb058c --- /dev/null +++ b/prov/cxi/src/cxip_ptelist_buf.c @@ -0,0 +1,553 @@ +/* + * Copyright (c) 2018,2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "cxip.h" + +static const char* +cxip_ptelist_to_str(struct cxip_ptelist_bufpool *pool) +{ + return cxi_ptl_list_to_str(pool->attr.list_type); +} + +static int cxip_ptelist_unlink_buf(struct cxip_ptelist_buf *buf) +{ + struct cxip_rxc *rxc = buf->rxc; + int ret; + + ret = cxip_pte_unlink(rxc->rx_pte, buf->pool->attr.list_type, + buf->req->req_id, rxc->rx_cmdq); + if (ret) + RXC_DBG(rxc, "Failed to write command %d %s\n", + ret, fi_strerror(-ret)); + + return ret; +} + +static int cxip_ptelist_link_buf(struct cxip_ptelist_buf *buf, + bool seq_restart) +{ + struct cxip_rxc *rxc = buf->rxc; + uint32_t le_flags = C_LE_MANAGE_LOCAL | C_LE_NO_TRUNCATE | + C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_PUT | + C_LE_UNRESTRICTED_END_RO | C_LE_EVENT_UNLINK_DISABLE; + int ret; + + /* Match all eager, long sends */ + union cxip_match_bits mb = { + .le_type = CXIP_LE_TYPE_RX + }; + union cxip_match_bits ib = { + .tag = ~0, + .tx_id = ~0, + .cq_data = 1, + .tagged = 1, + .match_comp = 1, + .rdzv_done = 1, + }; + + if (!(buf->pool->attr.list_type == C_PTL_LIST_OVERFLOW && + cxip_env.hybrid_preemptive)) + le_flags |= C_LE_EVENT_LINK_DISABLE; + + if (seq_restart) + le_flags |= C_LE_RESTART_SEQ; + + RXC_DBG(rxc, "%s link buf %p num linked %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&buf->pool->bufs_linked)); + + /* Reset request buffer stats used to know when the buffer is consumed. + */ + assert(dlist_empty(&buf->request.pending_ux_list)); + buf->unlink_length = -1; + buf->cur_offset = 0; + + /* Take a request buffer reference for the link. */ + ret = cxip_pte_append(rxc->rx_pte, + CXI_VA_TO_IOVA(buf->md->md, buf->data), + buf->pool->attr.buf_size, buf->md->md->lac, + buf->pool->attr.list_type, + buf->req->req_id, mb.raw, + ib.raw, CXI_MATCH_ID_ANY, + buf->pool->attr.min_space_avail, + le_flags, NULL, rxc->rx_cmdq, true); + if (ret) { + RXC_WARN(rxc, "Failed to write %s append %d %s\n", + cxip_ptelist_to_str(buf->pool), + ret, fi_strerror(-ret)); + return ret; + } + + dlist_remove(&buf->buf_entry); + dlist_insert_tail(&buf->buf_entry, + &buf->pool->active_bufs); + ofi_atomic_inc32(&buf->pool->bufs_linked); + + /* Reference taken until buffer is consumed or manually + * unlinked. + */ + cxip_ptelist_buf_get(buf); + + RXC_DBG(rxc, "APPEND %s buf %p num linked %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&buf->pool->bufs_linked)); + + return ret; +} + +/* + * cxip_ptelist_buf_alloc() - Allocate a buffer for the Ptl buffer pool. 
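+ *
+ * Each buffer gets a page-aligned data region, optional HMEM host
+ * registration, a NIC mapping via cxip_map(), and an event-queue request
+ * that is used to process the hardware events delivered for the buffer.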
+ */ +static struct cxip_ptelist_buf* +cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool) +{ + struct cxip_rxc *rxc = pool->rxc; + struct cxip_ptelist_buf *buf; + int ret; + + buf = calloc(1, sizeof(*buf)); + if (!buf) + goto err; + + buf->data = aligned_alloc(pool->buf_alignment, pool->attr.buf_size); + if (!buf->data) + goto err_free_buf; + + if (rxc->hmem && !cxip_env.disable_host_register) { + ret = ofi_hmem_host_register(buf->data, pool->attr.buf_size); + if (ret) { + RXC_WARN(rxc, + "Failed to register buffer with HMEM: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_data_buf; + } + } + + ret = cxip_map(rxc->domain, buf->data, pool->attr.buf_size, + OFI_MR_NOCACHE, &buf->md); + if (ret) + goto err_unreg_buf; + + buf->req = cxip_evtq_req_alloc(&rxc->rx_evtq, true, buf); + if (!buf->req) + goto err_unmap_buf; + + buf->pool = pool; + buf->req->cb = pool->attr.ptelist_cb; + buf->rxc = rxc; + buf->le_type = CXIP_LE_TYPE_RX; + + if (pool->attr.list_type == C_PTL_LIST_REQUEST) + buf->req->type = CXIP_REQ_RBUF; + else + buf->req->type = CXIP_REQ_OFLOW; + + ofi_atomic_initialize32(&buf->refcount, 0); + dlist_init(&buf->request.pending_ux_list); + dlist_init(&buf->buf_entry); + ofi_atomic_inc32(&pool->bufs_allocated); + + RXC_DBG(rxc, "Allocated %s buf %p num alloc %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&pool->bufs_allocated)); + + return buf; + +err_unmap_buf: + cxip_unmap(buf->md); +err_unreg_buf: + if (rxc->hmem && !cxip_env.disable_host_register) + ofi_hmem_host_unregister(buf); +err_free_data_buf: + free(buf->data); +err_free_buf: + free(buf); +err: + return NULL; +} + +static void cxip_ptelist_buf_free(struct cxip_ptelist_buf *buf) +{ + struct cxip_ux_send *ux; + struct dlist_entry *tmp; + struct cxip_rxc *rxc = buf->rxc; + + /* Sanity check making sure the buffer was properly removed before + * freeing. + */ + assert(dlist_empty(&buf->buf_entry)); + + if (buf->pool->attr.list_type == C_PTL_LIST_REQUEST) { + dlist_foreach_container_safe(&buf->request.pending_ux_list, + struct cxip_ux_send, + ux, rxc_entry, tmp) { + dlist_remove(&ux->rxc_entry); + _cxip_req_buf_ux_free(ux, false); + } + } + + if (ofi_atomic_get32(&buf->refcount) != 0) + RXC_FATAL(rxc, "%s buf %p non-zero refcount %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&buf->refcount)); + cxip_evtq_req_free(buf->req); + cxip_unmap(buf->md); + if (rxc->hmem && !cxip_env.disable_host_register) + ofi_hmem_host_unregister(buf->data); + + ofi_atomic_dec32(&buf->pool->bufs_allocated); + + RXC_DBG(rxc, "Freeing %s buf %p num_alloc %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&buf->pool->bufs_allocated)); + free(buf->data); + free(buf); +} + +static void cxip_ptelist_buf_dlist_free(struct dlist_entry *head) +{ + struct cxip_ptelist_buf *buf; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(head, struct cxip_ptelist_buf, buf, + buf_entry, tmp) { + dlist_remove_init(&buf->buf_entry); + cxip_ptelist_buf_free(buf); + } +} + +void cxip_ptelist_buf_link_err(struct cxip_ptelist_buf *buf, + int rc_link_error) +{ + struct cxip_rxc *rxc = buf->pool->rxc; + + RXC_WARN(rxc, "%s buffer %p link error %s\n", + cxip_ptelist_to_str(buf->pool), + buf, cxi_rc_to_str(rc_link_error)); + + assert(rc_link_error == C_RC_NO_SPACE); + + cxip_ptelist_buf_put(buf, false); + ofi_atomic_dec32(&buf->pool->bufs_linked); + + /* We are running out of LE resources, do not repost + * immediately. 
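+ * Instead, park the buffer on the pool's free list below; it will be retried
+ * by cxip_ptelist_buf_replenish() once LE resources become available again.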
+ */ + assert(ofi_atomic_get32(&buf->refcount) == 0); + dlist_remove(&buf->buf_entry); + dlist_insert_tail(&buf->buf_entry, &buf->pool->free_bufs); + ofi_atomic_inc32(&buf->pool->bufs_free); +} + +void cxip_ptelist_buf_unlink(struct cxip_ptelist_buf *buf) +{ + struct cxip_ptelist_bufpool *pool = buf->pool; + + cxip_ptelist_buf_put(buf, false); + ofi_atomic_dec32(&pool->bufs_linked); + + RXC_DBG(pool->rxc, "%s buffer unlink\n", cxip_ptelist_to_str(pool)); +} + +int cxip_ptelist_bufpool_init(struct cxip_rxc *rxc, + struct cxip_ptelist_bufpool **pool, + struct cxip_ptelist_bufpool_attr *attr) +{ + int i; + struct cxip_ptelist_buf *buf; + struct dlist_entry tmp_buf_list; + struct dlist_entry *tmp; + struct cxip_ptelist_bufpool *_pool; + int ret; + size_t buf_size; + + + if (attr->list_type != C_PTL_LIST_REQUEST && + attr->list_type != C_PTL_LIST_OVERFLOW) + return -FI_EINVAL; + + _pool = calloc(1, sizeof(*_pool)); + if (!_pool) + return -FI_ENOMEM; + + _pool->buf_alignment = ofi_get_page_size(); + + buf_size = roundup(attr->buf_size, _pool->buf_alignment); + if (attr->buf_size != buf_size) + RXC_INFO(rxc, + "Aligning buf size to %lu: prev_size=%lu new_size=%lu\n", + _pool->buf_alignment, attr->buf_size, buf_size); + attr->buf_size = buf_size; + + _pool->attr = *attr; + _pool->rxc = rxc; + dlist_init(&_pool->active_bufs); + dlist_init(&_pool->consumed_bufs); + dlist_init(&_pool->free_bufs); + ofi_atomic_initialize32(&_pool->bufs_linked, 0); + ofi_atomic_initialize32(&_pool->bufs_allocated, 0); + ofi_atomic_initialize32(&_pool->bufs_free, 0); + + dlist_init(&tmp_buf_list); + + for (i = 0; i < _pool->attr.min_posted; i++) { + buf = cxip_ptelist_buf_alloc(_pool); + if (!buf) { + ret = -FI_ENOMEM; + goto err_free_bufs; + } + + dlist_insert_tail(&buf->buf_entry, &tmp_buf_list); + } + + /* Since this is called during RXC initialization, RXQ CMDQ should be + * empty. Thus, linking a request buffer should not fail. + */ + dlist_foreach_container_safe(&tmp_buf_list, struct cxip_ptelist_buf, + buf, buf_entry, tmp) { + ret = cxip_ptelist_link_buf(buf, false); + if (ret != FI_SUCCESS) + CXIP_FATAL("Failed to link request buffer: %d %s\n", + ret, fi_strerror(-ret)); + } + + *pool = _pool; + return FI_SUCCESS; + +err_free_bufs: + cxip_ptelist_buf_dlist_free(&tmp_buf_list); + + return ret; +} + +void cxip_ptelist_bufpool_fini(struct cxip_ptelist_bufpool *pool) +{ + struct cxip_rxc *rxc = pool->rxc; + struct cxip_ptelist_buf *buf; + int ret; + + assert(rxc->rx_pte->state == C_PTLTE_DISABLED); + + RXC_INFO(rxc, "Number of %s buffers allocated %d\n", + cxip_ptelist_to_str(pool), + ofi_atomic_get32(&pool->bufs_allocated)); + + /* All request buffers are split between the active and consumed list. + * Only active buffers need to be unlinked. + */ + dlist_foreach_container(&pool->active_bufs, struct cxip_ptelist_buf, + buf, buf_entry) { + ret = cxip_ptelist_unlink_buf(buf); + if (ret != FI_SUCCESS) + CXIP_FATAL("PtlTE %d failed to unlink %s buf %d %s\n", + rxc->rx_pte->pte->ptn, + cxip_ptelist_to_str(pool), ret, + fi_strerror(-ret)); + } + + do { + cxip_evtq_progress(&rxc->rx_evtq); + } while (ofi_atomic_get32(&pool->bufs_linked)); + + cxip_ptelist_buf_dlist_free(&pool->active_bufs); + cxip_ptelist_buf_dlist_free(&pool->consumed_bufs); + cxip_ptelist_buf_dlist_free(&pool->free_bufs); + + assert(ofi_atomic_get32(&pool->bufs_allocated) == 0); + + assert(pool); + free(pool); +} + +/* + * cxip_ptelist_buf_replenish() - Replenish PtlTE overflow or request list + * buffers. + * + * Caller must hold ep_obj->lock. 
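+ *
+ * Buffers already parked on the pool free list are preferred; new buffers are
+ * allocated only when that list is empty, and linking continues until
+ * attr.min_posted buffers are posted.  Illustrative call site (a sketch, with
+ * the required locking shown explicitly):
+ *
+ *   ofi_genlock_lock(&rxc->ep_obj->lock);
+ *   ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, false);
+ *   ofi_genlock_unlock(&rxc->ep_obj->lock);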
+ */ +int cxip_ptelist_buf_replenish(struct cxip_ptelist_bufpool *pool, + bool seq_restart) +{ + struct cxip_rxc *rxc = pool->rxc; + struct cxip_ptelist_buf *buf; + int bufs_added = 0; + int ret = FI_SUCCESS; + + if (rxc->msg_offload && pool->attr.list_type == C_PTL_LIST_REQUEST) + return FI_SUCCESS; + + while (ofi_atomic_get32(&pool->bufs_linked) < pool->attr.min_posted) { + + /* Always prefer to use a free buffer for which + * reposting was deferred or an append failed. + */ + if (!dlist_empty(&pool->free_bufs)) { + dlist_pop_front(&pool->free_bufs, + struct cxip_ptelist_buf, buf, + buf_entry); + ofi_atomic_dec32(&buf->pool->bufs_free); + + RXC_DBG(rxc, "%s LINK REPOST buf %p\n", + cxip_ptelist_to_str(pool), buf); + } else { + buf = cxip_ptelist_buf_alloc(pool); + + RXC_DBG(rxc, "%s LINK NEW buf %p\n", + cxip_ptelist_to_str(pool), buf); + + } + + if (!buf) { + RXC_WARN(rxc, "%s buffer allocation err\n", + cxip_ptelist_to_str(pool)); + break; + } + + RXC_DBG(rxc, "Link %s buf entry %p\n", + cxip_ptelist_to_str(pool), buf); + + ret = cxip_ptelist_link_buf(buf, !bufs_added); + if (ret) { + RXC_WARN(rxc, "%s append failure %d %s\n", + cxip_ptelist_to_str(pool), ret, + fi_strerror(-ret)); + + dlist_insert_tail(&buf->buf_entry, + &pool->free_bufs); + ofi_atomic_inc32(&pool->bufs_free); + break; + } + bufs_added++; + } + + /* If no buffer appended, check for fatal conditions. */ + if (!bufs_added) { + if (ofi_atomic_get32(&pool->bufs_linked) < 1) + RXC_FATAL(rxc, "%s buffer list exhausted\n", + cxip_ptelist_to_str(pool)); + } + + RXC_DBG(rxc, "%s current bufs alloc %u, num linked %u\n", + cxip_ptelist_to_str(pool), + ofi_atomic_get32(&pool->bufs_allocated), + ofi_atomic_get32(&pool->bufs_linked)); + + return ret; +} + +void cxip_ptelist_buf_get(struct cxip_ptelist_buf *buf) +{ + ofi_atomic_inc32(&buf->refcount); + + RXC_DBG(buf->rxc, "%s GET buf %p refcnt %u\n", + cxip_ptelist_to_str(buf->pool), + buf, ofi_atomic_get32(&buf->refcount)); +} + +void cxip_ptelist_buf_put(struct cxip_ptelist_buf *buf, bool repost) +{ + int ret; + int refcount = ofi_atomic_dec32(&buf->refcount); + + RXC_DBG(buf->rxc, "%s PUT buf %p refcnt %u repost %d\n", + cxip_ptelist_to_str(buf->pool), buf, refcount, repost); + + if (refcount < 0) { + RXC_FATAL(buf->rxc, "%s buffer refcount underflow %d\n", + cxip_ptelist_to_str(buf->pool), refcount); + /* not needed */ + return; + } + + if (refcount == 0 && repost) { + + /* Overflow buffers should just be freed if no longer + * in hardware RX match mode. + */ + if (buf->pool->attr.list_type == C_PTL_LIST_OVERFLOW && + (!buf->rxc->msg_offload || buf->rxc->state != RXC_ENABLED)) + goto free_buf; + + if (buf->pool->attr.list_type == C_PTL_LIST_REQUEST && + buf->rxc->state != RXC_ENABLED_SOFTWARE) + goto skip_repost; + + /* Limit immediate repost if already sufficient */ + if (ofi_atomic_get32(&buf->pool->bufs_linked) < + buf->pool->attr.max_posted) { + + do { + ret = cxip_ptelist_link_buf(buf, false); + } while (ret == -FI_EAGAIN); + + if (ret != FI_SUCCESS) + RXC_FATAL(buf->rxc, + "Fatal %s buf link err %d %s", + cxip_ptelist_to_str(buf->pool), + ret, fi_strerror(-ret)); + + return; + } + +skip_repost: + /* To avoid thrashing on buffer allocation, cache + * free buffers until a sufficient number are kept + * for reuse. This will help bursty traffic from + * holding on to unnecessary buffers. 
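+ *
+ * For example (illustrative numbers): with attr.max_cached = 16, a buffer
+ * released while 12 are linked and 3 are free is cached (12 + 3 < 16); with
+ * 4 already free it would be freed instead.  attr.max_cached == 0 disables
+ * the cap and the buffer is always cached.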
+ */ + if (!buf->pool->attr.max_cached || + (ofi_atomic_get32(&buf->pool->bufs_linked) + + ofi_atomic_get32(&buf->pool->bufs_free) < + buf->pool->attr.max_cached)) { + + dlist_remove(&buf->buf_entry); + dlist_insert_tail(&buf->buf_entry, + &buf->pool->free_bufs); + ofi_atomic_inc32(&buf->pool->bufs_free); + + return; + } + +free_buf: + dlist_remove_init(&buf->buf_entry); + cxip_ptelist_buf_free(buf); + } +} + +void cxip_ptelist_buf_consumed(struct cxip_ptelist_buf *buf) +{ + RXC_DBG(buf->rxc, "%s CONSUMED off %ld len %ld buf %p\n", + cxip_ptelist_to_str(buf->pool), buf->cur_offset, + buf->unlink_length, buf); + + dlist_remove(&buf->buf_entry); + dlist_insert_tail(&buf->buf_entry, + &buf->pool->consumed_bufs); + + /* Since buffer is consumed, return reference + * taken during the initial linking. + */ + cxip_ptelist_buf_put(buf, true); +} + +void _cxip_req_buf_ux_free(struct cxip_ux_send *ux, bool repost) +{ + struct cxip_ptelist_buf *buf = ux->req->req_ctx; + + assert(ux->req->type == CXIP_REQ_RBUF); + + cxip_ptelist_buf_put(buf, repost); + free(ux); + + RXC_DBG(buf->rxc, "%s buf %p ux %p\n", + cxip_ptelist_to_str(buf->pool), buf, ux); +} diff --git a/prov/cxi/src/cxip_rdzv_pte.c b/prov/cxi/src/cxip_rdzv_pte.c new file mode 100644 index 00000000000..a0af4ea10ce --- /dev/null +++ b/prov/cxi/src/cxip_rdzv_pte.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "config.h" +#include "cxip.h" + +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +static void cxip_rdzv_pte_cb(struct cxip_pte *pte, const union c_event *event) +{ + switch (pte->state) { + case C_PTLTE_ENABLED: + break; + default: + CXIP_FATAL("Unexpected state received: %u\n", pte->state); + } +} + +static bool cxip_rdzv_pte_append_done(struct cxip_rdzv_pte *pte, + int expected_success_count) +{ + if (ofi_atomic_get32(&pte->le_linked_success_count) == + expected_success_count) + return true; + + if (ofi_atomic_get32(&pte->le_linked_failure_count) != 0) + return true; + + return false; +} + +static int cxip_rdzv_pte_wait_append(struct cxip_rdzv_pte *pte, + uint32_t expected_count) +{ + int ret = FI_SUCCESS; + + /* Poll until the LE is linked or a failure occurs. */ + do { + cxip_evtq_progress(&pte->txc->tx_evtq); + sched_yield(); + } while (!cxip_rdzv_pte_append_done(pte, expected_count)); + + if (ofi_atomic_get32(&pte->le_linked_failure_count)) { + ret = -FI_EIO; + CXIP_WARN("Failed to append zero byte put LE: %d:%s\n", ret, + fi_strerror(-ret)); + } + + return ret; +} + +static void cxip_rdzv_pte_src_reqs_free(struct cxip_rdzv_match_pte *pte) +{ + int i; + + /* The corresponding LE is not freed using an unlink command. Instead, + * this logic relies on the freeing of the hardware PtlTE to release the + * LEs. + */ + for (i = 0; i < RDZV_SRC_LES; i++) { + if (pte->src_reqs[i]) + cxip_evtq_req_free(pte->src_reqs[i]); + } +} + +/* caller should hold ep_obj->lock */ +int cxip_rdzv_pte_src_req_alloc(struct cxip_rdzv_match_pte *pte, int lac) +{ + int ret; + union cxip_match_bits mb; + union cxip_match_bits ib; + uint32_t le_flags; + int expected_success_count; + struct cxip_req *req; + struct cxip_rdzv_pte *base = &pte->base_pte; + + /* Reuse a previously allocated request whenever possible. 
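+ * One source request/LE is kept per LAC: it is created lazily on first use
+ * (the early return below) and released only when the matching PtlTE is
+ * freed, see cxip_rdzv_pte_src_reqs_free().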
*/ + if (pte->src_reqs[lac]) + return FI_SUCCESS; + + mb.raw = 0; + mb.rdzv_lac = lac; + ib.raw = ~0; + ib.rdzv_lac = 0; + le_flags = C_LE_UNRESTRICTED_BODY_RO | C_LE_UNRESTRICTED_END_RO | + C_LE_OP_GET | C_LE_EVENT_UNLINK_DISABLE; + + req = cxip_evtq_req_alloc(&base->txc->tx_evtq, 1, base); + if (!req) { + ret = -FI_EAGAIN; + CXIP_WARN("Failed to allocate %d rendezvous source request: %d:%s\n", + lac, ret, fi_strerror(-ret)); + return ret; + } + req->cb = cxip_rdzv_pte_src_cb; + + expected_success_count = + ofi_atomic_get32(&base->le_linked_success_count) + 1; + + ret = cxip_pte_append(base->pte, 0, -1ULL, lac, C_PTL_LIST_PRIORITY, + req->req_id, mb.raw, ib.raw, + CXI_MATCH_ID_ANY, 0, le_flags, NULL, + base->txc->rx_cmdq, true); + if (ret) { + CXIP_WARN("Failed to issue %d rendezvous source request LE append: %d:%s\n", + lac, ret, fi_strerror(-ret)); + goto err_free_req; + } + + ret = cxip_rdzv_pte_wait_append(base, expected_success_count); + if (ret != FI_SUCCESS) + goto err_free_req; + + pte->src_reqs[lac] = req; + + return FI_SUCCESS; + +err_free_req: + cxip_evtq_req_free(req); + + return ret; +} + +static void cxip_rdzv_pte_zbp_req_free(struct cxip_rdzv_match_pte *pte) +{ + /* The corresponding LE is not freed using an unlink command. Instead, + * this logic relies on the freeing of the hardware PtlTE to release the + * LEs. + */ + cxip_evtq_req_free(pte->zbp_req); +} + +static int cxip_rdzv_pte_zbp_req_alloc(struct cxip_rdzv_match_pte *pte) +{ + uint32_t le_flags = C_LE_UNRESTRICTED_BODY_RO | + C_LE_UNRESTRICTED_END_RO | C_LE_OP_PUT | + C_LE_EVENT_UNLINK_DISABLE; + union cxip_match_bits mb = { + .le_type = CXIP_LE_TYPE_ZBP, + }; + union cxip_match_bits ib = { + .tag = ~0, + .tx_id = ~0, + .cq_data = 1, + .tagged = 1, + .match_comp = 1, + .rdzv_done = 1, + }; + struct cxip_rdzv_pte *base = &pte->base_pte; + int ret; + int expected_success_count; + + pte->zbp_req = cxip_evtq_req_alloc(&base->txc->tx_evtq, 1, pte); + if (!pte->zbp_req) { + ret = -FI_ENOMEM; + CXIP_WARN("Failed to allocate zero byte put request: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + pte->zbp_req->cb = cxip_rdzv_pte_zbp_cb; + + expected_success_count = + ofi_atomic_get32(&base->le_linked_success_count) + 1; + + ret = cxip_pte_append(base->pte, 0, 0, 0, C_PTL_LIST_PRIORITY, + pte->zbp_req->req_id, mb.raw, ib.raw, + CXI_MATCH_ID_ANY, 0, le_flags, NULL, + base->txc->rx_cmdq, true); + if (ret) { + CXIP_WARN("Failed to issue zero byte put LE append: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_req; + } + + ret = cxip_rdzv_pte_wait_append(base, expected_success_count); + if (ret != FI_SUCCESS) + goto err_free_req; + + return FI_SUCCESS; + +err_free_req: + cxip_evtq_req_free(pte->zbp_req); + + return ret; +} + +static void cxip_rdzv_pte_free(struct cxip_rdzv_pte *pte) +{ + /* Freeing the PtlTE causes the PtlTE to be reset and all LEs to be + * freed. Thus, no need to issue disable and/or unlink commands. + */ + cxip_pte_free(pte->pte); + + /* Flush the CQ to ensure any events referencing the rendezvous requests + * are processed. + */ + cxip_evtq_progress(&pte->txc->tx_evtq); +} + +void cxip_rdzv_match_pte_free(struct cxip_rdzv_match_pte *pte) +{ + cxip_rdzv_pte_free(&pte->base_pte); + + /* Release all the rendezvous requests. 
*/ + cxip_rdzv_pte_src_reqs_free(pte); + cxip_rdzv_pte_zbp_req_free(pte); + + free(pte); +} + +void cxip_rdzv_nomatch_pte_free(struct cxip_rdzv_nomatch_pte *pte) +{ + cxip_rdzv_pte_free(&pte->base_pte); + cxip_evtq_req_free(pte->le_req); + + free(pte); +} + +static int cxip_rdzv_base_pte_alloc(struct cxip_txc *txc, + uint32_t write_pid_idx, bool write, + uint32_t read_pid_idx, bool read, + bool matching, + struct cxip_rdzv_pte *base_pte) +{ + int ret; + struct cxi_pt_alloc_opts pt_opts = { + .is_matching = matching, + }; + + base_pte->txc = txc; + ofi_atomic_initialize32(&base_pte->le_linked_success_count, 0); + ofi_atomic_initialize32(&base_pte->le_linked_failure_count, 0); + + if (matching && txc->ep_obj->av->symmetric) + pt_opts.use_logical = 1; + + /* Reserve the Rendezvous Send PTE */ + ret = cxip_pte_alloc_nomap(txc->ep_obj->ptable, txc->tx_evtq.eq, + &pt_opts, cxip_rdzv_pte_cb, txc, + &base_pte->pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to alloc base rendezvous PtlTE: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + if (write) { + ret = cxip_pte_map(base_pte->pte, write_pid_idx, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write PTE: %d\n", ret); + goto err_free_rdzv_pte; + } + } + + if (read) { + ret = cxip_pte_map(base_pte->pte, read_pid_idx, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map read PTE: %d\n", ret); + goto err_free_rdzv_pte; + } + } + + /* Set to enable, event will be processed on link */ + ret = cxip_pte_set_state(base_pte->pte, txc->rx_cmdq, + C_PTLTE_ENABLED, 0); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to enqueue enable command: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_free_rdzv_pte; + } + + return FI_SUCCESS; + +err_free_rdzv_pte: + cxip_pte_free(base_pte->pte); + base_pte->pte = NULL; + + return ret; +} + +/* ep_obj->lock should be held by caller */ +int cxip_rdzv_match_pte_alloc(struct cxip_txc *txc, + struct cxip_rdzv_match_pte **rdzv_pte) +{ + int ret; + struct cxip_rdzv_match_pte *match_pte; + uint32_t pid_idx = txc->domain->iface->dev->info.rdzv_get_idx; + struct cxip_rdzv_pte *base; + + match_pte = calloc(1, sizeof(*match_pte)); + if (!match_pte) { + ret = -ENOMEM; + CXIP_WARN("Rendzvous matching PtlTE allocation error: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + ret = cxip_rdzv_base_pte_alloc(txc, pid_idx, true, + CXIP_PTL_IDX_RDZV_DEST, true, + true, &match_pte->base_pte); + if (ret != FI_SUCCESS) + goto err_free_rdzv_pte_mem; + + /* Matching specific initialization */ + base = &match_pte->base_pte; + + ret = cxip_rdzv_pte_zbp_req_alloc(match_pte); + if (ret) { + CXIP_WARN("Failed to allocate zero byte put request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_rdzv_pte; + } + *rdzv_pte = match_pte; + + return FI_SUCCESS; + +err_free_rdzv_pte: + cxip_pte_free(base->pte); +err_free_rdzv_pte_mem: + free(match_pte); + + return ret; +} + +/* ep_obj->lock should be held by caller */ +int cxip_rdzv_nomatch_pte_alloc(struct cxip_txc *txc, int lac, + struct cxip_rdzv_nomatch_pte **rdzv_pte) +{ + int ret; + struct cxip_rdzv_nomatch_pte *nomatch_pte; + struct cxip_rdzv_pte *base; + uint32_t le_flags; + uint32_t expected_success_count; + uint64_t ib = 0; + uint32_t pid_idx = CXIP_PTL_IDX_RDZV_RESTRICTED(lac); + +#if ENABLE_DEBUG + /* Enable testing of fallback to default rendezvous protocol + * if unable to allocate required non-matching PTE/LE resources. 
+ */ + if (txc->force_err & CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC) + return -FI_ENOSPC; +#endif + nomatch_pte = calloc(1, sizeof(*nomatch_pte)); + if (!nomatch_pte) { + ret = -ENOMEM; + CXIP_WARN("Rdzv restricted PtlTE allocation error: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + ret = cxip_rdzv_base_pte_alloc(txc, 0, false, pid_idx, true, + false, &nomatch_pte->base_pte); + if (ret != FI_SUCCESS) + goto err_free_rdzv_pte_mem; + + /* Non-matching specific initialization */ + base = &nomatch_pte->base_pte; + + nomatch_pte->le_req = cxip_evtq_req_alloc(&txc->tx_evtq, 1, + nomatch_pte); + if (!nomatch_pte->le_req) { + ret = -FI_EAGAIN; + CXIP_WARN("Rdzv PtlTE LAC %d request allocation error: %d:%s\n", + lac, ret, fi_strerror(-ret)); + goto err_free_rdzv_pte; + } + + nomatch_pte->le_req->cb = cxip_rdzv_pte_src_cb; + + le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | + C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_GET; + ib = 1; + expected_success_count = + ofi_atomic_get32(&base->le_linked_success_count) + 1; + + ret = cxip_pte_append(base->pte, 0, -1ULL, lac, C_PTL_LIST_PRIORITY, + nomatch_pte->le_req->req_id, 0, ib, + CXI_MATCH_ID_ANY, 0, le_flags, NULL, + txc->rx_cmdq, true); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to enqueue append cmd: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_rdzv_pte; + } + + ret = cxip_rdzv_pte_wait_append(base, expected_success_count); + if (ret != FI_SUCCESS) + goto err_free_req; + + *rdzv_pte = nomatch_pte; + return FI_SUCCESS; + +err_free_req: + cxip_evtq_req_free(nomatch_pte->le_req); +err_free_rdzv_pte: + cxip_pte_free(nomatch_pte->base_pte.pte); +err_free_rdzv_pte_mem: + free(nomatch_pte); + + return ret; +} diff --git a/prov/cxi/src/cxip_repsum.c b/prov/cxi/src/cxip_repsum.c new file mode 100644 index 00000000000..6c0f5c93186 --- /dev/null +++ b/prov/cxi/src/cxip_repsum.c @@ -0,0 +1,283 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + */ + +/* Notes: + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cxip.h" + +/** + * @brief REPRODUCIBLE SUM IMPLEMENATION + * + * - Reference: + * - https://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-121.pdf + * Algorithm 7 + * - Example Code: + * - https://github.com/peterahrens/ReproBLAS.git + * + * This code supplies the software component of the RSDG Reproducible Sum + * collective reduction operation. + * + * Conceptually, the 52-bit mantissa of a double precision IEEE floating point + * value, extended to 53-bits to include the "hidden" bit, is placed in a + * register containing 2048 bits (the full possible range of IEEE double + * exponents) and shifted so that the MSBit of the mantissa is aligned with the + * 11-bit exponent. + * + * This large register is then divided into numbered "bins" of W bits. Each bin + * is then expanded by adding (64 - W) zero bits to the most-significant end of + * each bin, and these 64-bit quantities are copied into an array of Kt 64-bit + * registers, along with the bin number M in which the MSBit of the value is + * located. + * + * The extra space in each bin allow us to sum without carry from bin-to-bin + * until the end of the computation. With W=40, there are 24 bits of overflow, + * allowing at least 2^24 summations to occur before overflow can occur. + * + * If overflow does occur, both Rosetta and this software set an overflow flag, + * and the final result should be treated as invalid. 
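+ *
+ * Worked example (W=40, Kt=4): d = 1.5 has biased exponent e = 1023, so
+ * BIN(e) = 0 and OFF(e) = 0.  The 53-bit mantissa 0x18000000000000 splits
+ * into M = 0, T[3] = 0x1 (the hidden bit), T[2] = 0x8000000000 (the top 40
+ * fraction bits) and T[1] = T[0] = 0; cxip_rep_to_dbl() then reconstructs
+ * 1 * 2^0 + 2^39 * 2^-40 = 1.5.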
+ * + * Low order bits can be discarded in the process, and this will set an inexact + * flag. The result should still be reproducible, and accurate to within + * round-off error. + */ + +#define W 40 +#define Kt 4 + +/* special values of M for non-numbers */ +#define MNInf 125 +#define MInf 126 +#define MNaN 127 + +/** + * @Description + * + * BIN() converts the exponent 'e' to a W-bit bin number. + * + * OFF() provides the offset of exponent 'e' within the W-bit bin. + * + * MSK() provides a bitmask for the W LSBits. + */ +#define BIN(e) (((e) - 1023 + 1024*W)/W - 1024) +#define OFF(e) ((e) - 1023 - W*BIN(e)) +#define MSK(w) ((1ULL << w) - 1) + +/** + * @brief Convert double to repsum + * + * Rosetta expects T[0] to be the LSBits of the value, so we load from Kt-1 + * downward. Because W=40, T[0] will always be zero: 53 bits of mantissa cannot + * span more than three 40-bit registers, regardless of alignment. + * + * Note that injection of a sNaN will set the invalid bit. + * + * @param x returned repsum object + * @param d double to convert + */ +void cxip_dbl_to_rep(struct cxip_repsum *x, double d) +{ + unsigned long m; // double mantissa + int e; // double exponent + int s; // double sign + int w; // bin offset of MSbit + int lsh; // left-shift amount + int rem; // remaining bits to shift + int siz; // number of bits to keep + int i; + + memset(x, 0, sizeof(*x)); + _decompose_dbl(d, &s, &e, &m); + if (isnan(d)) { + // NaN, bit 51 clear is sNaN, sign ignored + x->M = MNaN; + w = 0; + m = 0; + // injecting sNaN sets the invalid bit + x->invalid = !(m & 0x0008000000000000); + } else if (isinf(d)) { + // inf, sign captured in x->M + x->M = (s < 0) ? MNInf : MInf; + w = 0; + m = 0; + // injecting inf sets the overflow bit + x->overflow = true; + x->overflow_id = 3; + } else if (e) { + // Normal values, extend m with implicit MSBit == 1 + x->M = BIN(e); + w = OFF(e); + m |= 1ULL << 52; + } else { + // Subnormal values, zero + x->M = BIN(1); + w = OFF(1); + } + + /** + * Copy the mantissa into the correct locations within T[]. + * + * T[3] should contain the w+1 MSBits of m, aligned to bit 0. + * T[2] should contain the next W bits, aligned to bit W-1. + * T[1] should contain any remaining bits, aligned to bit W-1. + * T[0] will always be zero. + */ + rem = 53; // number of bits to process + siz = w + 1; // bits to include in MSRegister + lsh = 0; // left-shift to align + i = Kt; // start with most significant + while (rem) { + x->T[--i] = s*((m >> (rem - siz)) << lsh); + rem -= siz; // siz MSBits consumed + m &= MSK(rem); // keep only rem LSBits + siz = (rem < W) ? rem : W; + lsh = W - siz; // align to bit W-1 + } + while (i) + x->T[--i] = 0; // clear remaining bins +} + +/** + * @brief Convert repsum back to double. + * + * Simply use scalbn() to scale the signed mantissas and add to the accumulator. + * + * @param x repsum object + * @return double returned value + */ +void cxip_rep_to_dbl(double *d, const struct cxip_repsum *x) +{ + int i, m; + + *d = 0.0; + switch (x->M) { + case MNaN: // quiet NaN only + *d = NAN; + return; + case MNInf: + *d = -INFINITY; + return; + case MInf: + *d = INFINITY; + return; + } + m = x->M; + for (i = Kt-1; i >= 0; i--) { + *d += scalbn(1.0*(int64_t)x->T[i], W*m); + m--; + } +} + +/** + * @brief Add two repsum objects, and return the result in x. 
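+ *
+ * Illustrative usage (a sketch; flag checks omitted):
+ *
+ *   struct cxip_repsum a, b;
+ *   double r;
+ *
+ *   cxip_dbl_to_rep(&a, 1.0e16);
+ *   cxip_dbl_to_rep(&b, 1.0);
+ *   cxip_rep_add(&a, &b);       // bin-wise add, no rounding yet
+ *   cxip_rep_to_dbl(&r, &a);    // rounds once, at the end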
+ * + * @param x accumulator + * @param y added to accumulator + */ +void cxip_rep_add(struct cxip_repsum *x, const struct cxip_repsum *y) +{ + struct cxip_repsum swap; + int i, j; + + /* swap x and y if necessary, to make x the largest M. + * NaN is largest, followed by +Inf, -Inf, and numbers + */ + if (y->M > x->M) { + memcpy(&swap, x, sizeof(struct cxip_repsum)); + memcpy(x, y, sizeof(struct cxip_repsum)); + y = (const struct cxip_repsum *)&swap; + } + /* +Inf > -Inf, and if added, promote to NaN */ + if (x->M == MInf && y->M == MNInf) { + x->M = MNaN; + /* subtracting infinities sets the invalid bit */ + x->invalid = true; + } + /* Handle the not-numbers */ + if (x->M == MNaN || x->M == MInf || x->M == MNInf) + return; + /* inexact always propagates, no matter how small */ + if (y->inexact) + x->inexact = true; + /* advance j until bins are aligned, note bits discarded */ + for (j = 0; j < Kt && j + y->M < x->M; j++) { + if (y->T[j]) + x->inexact = true; + } + /* any remaining overflow propagates */ + if (y->overflow && y->overflow_id >= j) { + x->overflow = true; + x->overflow_id = y->overflow_id - j; + } + /* Add remaining y to x in each aligned bin, check for overflow */ + for (i = 0; i < Kt && j < Kt; i++, j++) { + int sgn0, sgn1; + + sgn0 = x->T[i] >> 63; + x->T[i] += y->T[j]; + sgn1 = x->T[i] >> 63; + /* sign change in wrong direction */ + if (sgn0 != sgn1 && sgn1 != y->T[j] >> 63) { + x->inexact = true; + x->overflow = true; + x->overflow_id = MAX(x->overflow_id, i); + } + } +} + +/** + * @brief Add two doubles using the repsum method. + * + * @param d1 : operand 1 + * @param d2 : operand 2 + * @return double result + */ +double cxip_rep_add_dbl(double d1, double d2) +{ + struct cxip_repsum x, y; + + cxip_dbl_to_rep(&x, d1); + cxip_dbl_to_rep(&y, d2); + cxip_rep_add(&x, &y); + cxip_rep_to_dbl(&d1, &x); + + return d1; +} + +/** + * @brief Sum over a list of values. 
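+ *
+ * Because partial sums are held in fixed W-bit bins, the result is
+ * independent of the order of the input values, unlike a naive
+ * left-to-right double sum.  For example:
+ *
+ *   double v[] = { 1.0e16, 1.0, -1.0e16 };
+ *   double r = cxip_rep_sum(3, v);   // same r for any permutation of v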
+ * + * @param count : count of values + * @param values : array of values to sum + * @return double result + */ +double cxip_rep_sum(size_t count, double *values) +{ + struct cxip_repsum x, y; + double d; + size_t i; + + if (count <= 0) + return 0.0; + if (count == 1) + return values[0]; + + cxip_dbl_to_rep(&x, values[0]); + for (i = 1; i < count; i++) { + cxip_dbl_to_rep(&y, values[i]); + cxip_rep_add(&x, &y); + } + cxip_rep_to_dbl(&d, &x); + return d; +} diff --git a/prov/cxi/src/cxip_req_buf.c b/prov/cxi/src/cxip_req_buf.c new file mode 100644 index 00000000000..84c72cd3488 --- /dev/null +++ b/prov/cxi/src/cxip_req_buf.c @@ -0,0 +1,321 @@ +/* + * (C) Copyright 2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "config.h" +#include "cxip.h" + +static bool cxip_req_buf_is_head(struct cxip_ptelist_buf *buf) +{ + struct cxip_ptelist_buf *head_buf = + container_of(buf->pool->active_bufs.next, + struct cxip_ptelist_buf, buf_entry); + + return head_buf == buf; +} + +static bool cxip_req_buf_is_consumed(struct cxip_ptelist_buf *buf) +{ + return buf->unlink_length && buf->unlink_length == buf->cur_offset && + dlist_empty(&buf->request.pending_ux_list); +} + +static bool cxip_req_buf_is_next_put(struct cxip_ptelist_buf *buf, + const union c_event *event) +{ + return (CXI_VA_TO_IOVA(buf->md->md, buf->data) + buf->cur_offset) == + event->tgt_long.start; +} + +static void cxip_req_buf_get_header_info(struct cxip_ptelist_buf *buf, + struct cxip_ux_send *ux, + size_t *header_length, + uint64_t *remote_offset) +{ + struct c_port_fab_hdr *fab_hdr = + (void *)CXI_IOVA_TO_VA(buf->md->md, ux->put_ev.tgt_long.start); + struct c_port_unrestricted_hdr *unres_hdr = + (void *)((char *)fab_hdr + sizeof(*fab_hdr)); + + if (fab_hdr->ver != 4) + RXC_FATAL(buf->rxc, "Unsupported fabric header version: %u\n", + fab_hdr->ver); + + switch (unres_hdr->ver_pkt_type) { + case C_V4_PKT_UNRESTRICTED: + *header_length = sizeof(*fab_hdr) + + sizeof(struct c_port_unrestricted_hdr); + *remote_offset = + c_port_unrestricted_hdr_get_remote_offset(unres_hdr); + break; + case C_V4_PKT_SMALLMSG: + *header_length = sizeof(*fab_hdr) + + sizeof(struct c_port_small_msg_hdr); + *remote_offset = 0; + break; + default: + RXC_FATAL(buf->rxc, "Unsupported packet type: %u\n", + unres_hdr->ver_pkt_type); + } +} + +void cxip_req_buf_ux_free(struct cxip_ux_send *ux) +{ + _cxip_req_buf_ux_free(ux, true); +} + +static struct cxip_ux_send *cxip_req_buf_ux_alloc(struct cxip_ptelist_buf *buf, + const union c_event *event) +{ + struct cxip_ux_send *ux; + + ux = calloc(1, sizeof(*ux)); + if (!ux) + return NULL; + + ux->put_ev = *event; + ux->req = buf->req; + dlist_init(&ux->rxc_entry); + cxip_ptelist_buf_get(buf); + + RXC_DBG(buf->rxc, "Referenced REQ buf=%p ux=%p\n", buf, ux); + + return ux; +} + +/* Caller must hold ep_obj->lock */ +static int cxip_req_buf_process_ux(struct cxip_ptelist_buf *buf, + struct cxip_ux_send *ux) +{ + struct cxip_rxc *rxc = buf->rxc; + size_t header_length; + uint64_t remote_offset; + int ret; + size_t unlink_length; + bool unlinked = ux->put_ev.tgt_long.auto_unlinked; + + /* Pre-processing of unlink events. */ + if (unlinked) + unlink_length = ux->put_ev.tgt_long.start - + CXI_VA_TO_IOVA(buf->md->md, buf->data) + + ux->put_ev.tgt_long.mlength; + + buf->cur_offset += ux->put_ev.tgt_long.mlength; + + /* Fixed the put event to point to where the payload resides in the + * request buffer. 
In addition, extract the remote offset needed for + * rendezvous. + */ + cxip_req_buf_get_header_info(buf, ux, &header_length, &remote_offset); + assert((ssize_t)ux->put_ev.tgt_long.mlength - + (ssize_t)header_length >= 0); + + ux->put_ev.tgt_long.start += header_length; + ux->put_ev.tgt_long.mlength -= header_length; + ux->put_ev.tgt_long.remote_offset = remote_offset + + ux->put_ev.tgt_long.mlength; + + rxc->sw_ux_list_len++; + + ret = cxip_recv_ux_sw_matcher(ux); + switch (ret) { + /* Unexpected message needs to be processed again. Put event fields + * need to be reset. + */ + case -FI_EAGAIN: + ux->put_ev.tgt_long.mlength += header_length; + ux->put_ev.tgt_long.start -= header_length; + buf->cur_offset -= ux->put_ev.tgt_long.mlength; + + rxc->sw_ux_list_len--; + return -FI_EAGAIN; + + /* Unexpected message failed to match a user posted request. Need to + * queue the unexpected message for future processing. + */ + case -FI_ENOMSG: + /* Check to see if a PtlTE transition to software managed EP + * is in progress, and if so add to the pending UX list which + * will be appended to software UX message list following + * completion of the on-loading. + */ + if (rxc->state != RXC_ENABLED_SOFTWARE && + rxc->state != RXC_FLOW_CONTROL) { + rxc->sw_ux_list_len--; + dlist_insert_tail(&ux->rxc_entry, + &rxc->sw_pending_ux_list); + rxc->sw_pending_ux_list_len++; + + RXC_DBG(buf->rxc, + "rbuf=%p ux=%p sw_pending_ux_list_len=%u\n", + buf, ux, buf->rxc->sw_pending_ux_list_len); + } else { + dlist_insert_tail(&ux->rxc_entry, &rxc->sw_ux_list); + + RXC_DBG(buf->rxc, "rbuf=%p ux=%p sw_ux_list_len=%u\n", + buf, ux, buf->rxc->sw_ux_list_len); + } + break; + + /* Unexpected message successfully matched a user posted request. */ + case FI_SUCCESS: + break; + + default: + RXC_FATAL(rxc, "Unexpected cxip_recv_ux_sw_matcher() rc: %d\n", + ret); + } + + /* Once unexpected send has been accepted, complete processing of the + * unlink. + */ + if (unlinked) { + buf->unlink_length = unlink_length; + ofi_atomic_dec32(&buf->pool->bufs_linked); + + RXC_DBG(rxc, "rbuf=%p rxc_rbuf_linked=%u\n", buf, + ofi_atomic_get32(&buf->pool->bufs_linked)); + + /* Replenish to keep minimum linked */ + ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, false); + if (ret) + RXC_WARN(rxc, "Request replenish failed: %d\n", ret); + } + + RXC_DBG(rxc, "rbuf=%p processed ux_send=%p\n", buf, ux); + + return FI_SUCCESS; +} + +static void cxip_req_buf_progress_pending_ux(struct cxip_ptelist_buf *buf) +{ + struct cxip_ux_send *ux; + struct dlist_entry *tmp; + int ret; + +again: + dlist_foreach_container_safe(&buf->request.pending_ux_list, + struct cxip_ux_send, ux, rxc_entry, tmp) { + if (cxip_req_buf_is_next_put(buf, &ux->put_ev)) { + dlist_remove(&ux->rxc_entry); + + /* The corresponding event from the completion queue has + * already been consumed. Thus, -FI_EAGAIN cannot be + * returned. + */ + do { + ret = cxip_req_buf_process_ux(buf, ux); + } while (ret == -FI_EAGAIN); + + /* Previously processed unexpected messages may now be + * valid. Need to reprocess the entire list. 
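+ * Puts must be consumed in request buffer offset order, so completing the
+ * entry at the current offset may make another queued entry the new "next
+ * put"; restart the scan from the head of the pending list to pick it up.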
+ */ + goto again; + } + } +} + +static int cxip_req_buf_process_put_event(struct cxip_ptelist_buf *buf, + const union c_event *event) +{ + struct cxip_ux_send *ux; + int ret = FI_SUCCESS; + struct cxip_rxc *rxc = buf->rxc; + struct cxip_ptelist_bufpool *pool = buf->pool; + + assert(event->tgt_long.mlength >= CXIP_REQ_BUF_HEADER_MIN_SIZE); + + ux = cxip_req_buf_ux_alloc(buf, event); + if (!ux) { + RXC_WARN(rxc, "Memory allocation error\n"); + return -FI_EAGAIN; + } + + /* Target events can be out-of-order with respect to how they were + * matched on the PtlTE request list. To maintain the hardware matched + * order, software unexpected entries are only processed in the order in + * which they land in the request buffer. + */ + if (cxip_req_buf_is_head(buf) && cxip_req_buf_is_next_put(buf, event)) { + ret = cxip_req_buf_process_ux(buf, ux); + if (ret == -FI_EAGAIN) { + _cxip_req_buf_ux_free(ux, false); + return ret; + } + + /* Since events arrive out-of-order, it is possible that a + * non-head request buffer receive an event. Scrub all request + * buffers processing their pending unexpected lists until a + * request buffer is not consumed. + */ + while ((buf = dlist_first_entry_or_null(&pool->active_bufs, + struct cxip_ptelist_buf, + buf_entry))) { + cxip_req_buf_progress_pending_ux(buf); + + if (cxip_req_buf_is_consumed(buf)) { + RXC_DBG(rxc, "buf=%p consumed\n", buf); + cxip_ptelist_buf_consumed(buf); + } else { + break; + } + } + } else { + /* Out-of-order target event. Queue unexpected message on + * pending list until these addition events occur. + */ + dlist_insert_tail(&ux->rxc_entry, + &buf->request.pending_ux_list); + + RXC_DBG(rxc, "rbuf=%p pend ux_send=%p\n", buf, ux); + } + + return ret; +} + +static int cxip_req_buf_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_ptelist_buf *buf = req->req_ctx; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* Success events not requested */ + cxip_ptelist_buf_link_err(buf, cxi_event_rc(event)); + return FI_SUCCESS; + + case C_EVENT_UNLINK: + assert(!event->tgt_long.auto_unlinked); + cxip_ptelist_buf_unlink(buf); + return FI_SUCCESS; + + case C_EVENT_PUT: + return cxip_req_buf_process_put_event(buf, event); + + default: + RXC_FATAL(buf->rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +int cxip_req_bufpool_init(struct cxip_rxc *rxc) +{ + struct cxip_ptelist_bufpool_attr attr = { + .list_type = C_PTL_LIST_REQUEST, + .ptelist_cb = cxip_req_buf_cb, + .buf_size = cxip_env.req_buf_size, + .min_space_avail = CXIP_REQ_BUF_HEADER_MAX_SIZE + + rxc->max_eager_size, + .min_posted = cxip_env.req_buf_min_posted, + /* Allow growing the number request bufs posted */ + .max_posted = cxip_env.req_buf_min_posted << 3, + .max_cached = cxip_env.req_buf_max_cached, + }; + + return cxip_ptelist_bufpool_init(rxc, &rxc->req_list_bufpool, &attr); +} + +void cxip_req_bufpool_fini(struct cxip_rxc *rxc) +{ + return cxip_ptelist_bufpool_fini(rxc->req_list_bufpool); +} diff --git a/prov/cxi/src/cxip_rma.c b/prov/cxi/src/cxip_rma.c new file mode 100644 index 00000000000..25877a73b04 --- /dev/null +++ b/prov/cxi/src/cxip_rma.c @@ -0,0 +1,866 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. 
+ * Copyright (c) 2018,2021-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* + * cxip_rma_selective_completion_cb() - RMA selective completion callback. + */ +int cxip_rma_selective_completion_cb(struct cxip_req *req, + const union c_event *event) +{ + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + if (event->hdr.event_type == C_EVENT_SEND) { + CXIP_WARN("Unexpected %s event: rc=%s\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + int event_rc; + + event_rc = cxi_init_event_rc(event); + int ret_err; + + ret_err = proverr2errno(event_rc); + return cxip_cq_req_error(req, 0, ret_err, + cxi_event_rc(event), NULL, 0, + FI_ADDR_UNSPEC); +} + +/* + * cxip_rma_write_selective_completion_req() - Return request state associated + * with all RMA write with selective completion transactions on the transmit + * context. + * + * The request is freed when the TXC send CQ is closed. + */ +static struct cxip_req *cxip_rma_write_selective_completion_req(struct cxip_txc *txc) +{ + if (!txc->rma_write_selective_completion_req) { + struct cxip_req *req; + + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) + return NULL; + + req->cb = cxip_rma_selective_completion_cb; + req->context = (uint64_t)txc->context; + req->flags = FI_RMA | FI_WRITE; + req->addr = FI_ADDR_UNSPEC; + + txc->rma_write_selective_completion_req = req; + } + + return txc->rma_write_selective_completion_req; +} + +/* + * cxip_rma_read_selective_completion_req() - Return request state associated + * with all RMA read with selective completion transactions on the transmit + * context. + * + * The request is freed when the TXC send CQ is closed. + */ +static struct cxip_req *cxip_rma_read_selective_completion_req(struct cxip_txc *txc) +{ + if (!txc->rma_read_selective_completion_req) { + struct cxip_req *req; + + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) + return NULL; + + req->cb = cxip_rma_selective_completion_cb; + req->context = (uint64_t)txc->context; + req->flags = FI_RMA | FI_READ; + req->addr = FI_ADDR_UNSPEC; + + txc->rma_read_selective_completion_req = req; + } + + return txc->rma_read_selective_completion_req; +} + +/* + * cxip_rma_cb() - RMA event callback. + */ +static int cxip_rma_cb(struct cxip_req *req, const union c_event *event) +{ + int ret; + int event_rc; + int ret_err; + bool success_event = !!(req->flags & FI_COMPLETION); + struct cxip_txc *txc = req->rma.txc; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. 
+ */ + if (event->hdr.event_type == C_EVENT_SEND) { + TXC_WARN(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + req->flags &= (FI_RMA | FI_READ | FI_WRITE); + + if (req->rma.local_md) + cxip_unmap(req->rma.local_md); + + if (req->rma.ibuf) + cxip_txc_ibuf_free(txc, req->rma.ibuf); + + event_rc = cxi_init_event_rc(event); + if (event_rc == C_RC_OK) { + if (success_event) { + ret = cxip_cq_req_complete(req); + if (ret != FI_SUCCESS) + TXC_WARN(txc, + "Failed to report completion: %d\n", + ret); + } + } else { + ret_err = proverr2errno(event_rc); + ret = cxip_cq_req_error(req, 0, ret_err, event_rc, + NULL, 0, FI_ADDR_UNSPEC); + if (ret != FI_SUCCESS) + TXC_WARN(txc, "Failed to report error: %d\n", ret); + } + + ofi_atomic_dec32(&req->rma.txc->otx_reqs); + cxip_evtq_req_free(req); + + return FI_SUCCESS; +} + +static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len, + struct cxip_mr *mr, union c_fab_addr *dfa, + uint8_t *idx_ext, uint16_t vni, uint64_t addr, + uint64_t key, uint64_t data, uint64_t flags, + void *context, bool write, bool unr, + uint32_t tclass, + enum cxi_traffic_class_type tc_type, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr) +{ + struct cxip_req *req = NULL; + struct cxip_md *dma_md = NULL; + void *dma_buf; + struct c_full_dma_cmd dma_cmd = {}; + int ret; + struct cxip_domain *dom = txc->domain; + struct cxip_cntr *cntr; + void *inject_req; + + /* MR desc cannot be value unless hybrid MR desc is enabled. */ + if (!dom->hybrid_mr_desc) + mr = NULL; + + /* DMA commands always require a request structure regardless if + * FI_COMPLETION is set. This is due to the provider doing internally + * memory registration and having to clean up the registration on DMA + * operation completion. + */ + if ((len && (flags & FI_INJECT)) || (flags & FI_COMPLETION) || !mr) { + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, "Failed to allocate request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err; + } + + req->context = (uint64_t)context; + req->cb = cxip_rma_cb; + req->flags = FI_RMA | (write ? FI_WRITE : FI_READ) | + (flags & FI_COMPLETION); + req->rma.txc = txc; + req->type = CXIP_REQ_RMA; + req->trig_cntr = trig_cntr; + } + + if (len) { + /* If the operation is an DMA inject operation (which can occur + * when doing RMA commands to unoptimized MRs), a provider + * bounce buffer is always needed to store the user payload. + * + * Always prefer user provider MR over internally mapping the + * buffer. 
+ */ + if (flags & FI_INJECT) { + assert(req != NULL); + + req->rma.ibuf = cxip_txc_ibuf_alloc(txc); + if (!req->rma.ibuf) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate bounce buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_cq_req; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, req->rma.ibuf, + buf, len); + if (ret){ + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_rma_buf; + } + + dma_buf = (void *)req->rma.ibuf; + dma_md = cxip_txc_ibuf_md(req->rma.ibuf); + } else if (mr) { + dma_buf = (void *)buf; + dma_md = mr->md; + } else { + assert(req != NULL); + + ret = cxip_map(dom, buf, len, 0, &req->rma.local_md); + if (ret) { + TXC_WARN(txc, "Failed to map buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_cq_req; + } + + dma_buf = (void *)buf; + dma_md = req->rma.local_md; + } + } + + dma_cmd.command.cmd_type = C_CMD_TYPE_DMA; + dma_cmd.index_ext = *idx_ext; + dma_cmd.event_send_disable = 1; + dma_cmd.dfa = *dfa; + ret = cxip_adjust_remote_offset(&addr, key); + if (ret) { + TXC_WARN(txc, "Remote offset overflow\n"); + goto err_free_cq_req; + } + dma_cmd.remote_offset = addr; + dma_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + dma_cmd.match_bits = CXIP_KEY_MATCH_BITS(key); + + if (req) { + dma_cmd.user_ptr = (uint64_t)req; + } else { + if (write) + inject_req = cxip_rma_write_selective_completion_req(txc); + else + inject_req = cxip_rma_read_selective_completion_req(txc); + + if (!inject_req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate inject request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_rma_buf; + } + + dma_cmd.user_ptr = (uint64_t)inject_req; + dma_cmd.event_success_disable = 1; + } + + if (!unr) + dma_cmd.restricted = 1; + + if (write) { + dma_cmd.command.opcode = C_CMD_PUT; + + /* Triggered DMA operations have their own completion counter + * and the one associated with the TXC cannot be used. + */ + cntr = triggered ? comp_cntr : txc->write_cntr; + if (cntr) { + dma_cmd.event_ct_ack = 1; + dma_cmd.ct = cntr->ct->ctn; + } + + if (flags & (FI_DELIVERY_COMPLETE | FI_MATCH_COMPLETE)) + dma_cmd.flush = 1; + } else { + dma_cmd.command.opcode = C_CMD_GET; + + /* Triggered DMA operations have their own completion counter + * and the one associated with the TXC cannot be used. + */ + cntr = triggered ? comp_cntr : txc->read_cntr; + if (cntr) { + dma_cmd.event_ct_reply = 1; + dma_cmd.ct = cntr->ct->ctn; + } + } + + /* Only need to fill if DMA command address fields if MD is valid. 
*/ + if (dma_md) { + dma_cmd.lac = dma_md->md->lac; + dma_cmd.local_addr = CXI_VA_TO_IOVA(dma_md->md, dma_buf); + dma_cmd.request_len = len; + } + + ret = cxip_txc_emit_dma(txc, vni, cxip_ofi_to_cxi_tc(tclass), + tc_type, trig_cntr, trig_thresh, + &dma_cmd, flags); + if (ret) { + TXC_WARN(txc, "Failed to emit dma command: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_free_rma_buf; + } + + return FI_SUCCESS; + +err_free_rma_buf: + if (req && req->rma.ibuf) + cxip_txc_ibuf_free(txc, req->rma.ibuf); +err_free_cq_req: + if (req) + cxip_evtq_req_free(req); +err: + return ret; +} + +static int cxip_rma_emit_idc(struct cxip_txc *txc, const void *buf, size_t len, + union c_fab_addr *dfa, uint8_t *idx_ext, + uint16_t vni, uint64_t addr, uint64_t key, + uint64_t data, uint64_t flags, void *context, + bool unr, uint32_t tclass, + enum cxi_traffic_class_type tc_type) +{ + int ret; + struct cxip_req *req = NULL; + void *hmem_buf = NULL; + void *idc_buf; + struct c_cstate_cmd cstate_cmd = {}; + struct c_idc_put_cmd idc_put = {}; + void *inject_req; + + /* IDCs must be traffic if the user requests a completion event. */ + if (flags & FI_COMPLETION) { + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, "Failed to allocate request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err; + } + + req->context = (uint64_t)context; + req->cb = cxip_rma_cb; + req->flags = FI_RMA | FI_WRITE | (flags & FI_COMPLETION); + req->rma.txc = txc; + req->type = CXIP_REQ_RMA; + } + + /* If HMEM is request and since the buffer type may not be host memory, + * doing a memcpy could result in a segfault. Thus, an HMEM bounce + * buffer is required to ensure IDC payload is in host memory. + */ + if (txc->hmem && len) { + hmem_buf = cxip_txc_ibuf_alloc(txc); + if (!hmem_buf) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate bounce buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_cq_req; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, hmem_buf, buf, len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_hmem_buf; + } + + idc_buf = hmem_buf; + } else { + idc_buf = (void *)buf; + } + + cstate_cmd.event_send_disable = 1; + cstate_cmd.index_ext = *idx_ext; + cstate_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + + if (flags & (FI_DELIVERY_COMPLETE | FI_MATCH_COMPLETE)) + cstate_cmd.flush = 1; + + if (!unr) + cstate_cmd.restricted = 1; + + if (txc->write_cntr) { + cstate_cmd.event_ct_ack = 1; + cstate_cmd.ct = txc->write_cntr->ct->ctn; + } + + /* If the user has not request a completion, success events will be + * disabled. But, if for some reason the operation completes with an + * error, an event will occur. For this case, a TXC inject request is + * allocated. This request enables the reporting of failed operation to + * the completion queue. This request is freed when the TXC is closed. 
+ */ + if (req) { + cstate_cmd.user_ptr = (uint64_t)req; + } else { + inject_req = cxip_rma_write_selective_completion_req(txc); + if (!inject_req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate inject request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_hmem_buf; + } + + cstate_cmd.user_ptr = (uint64_t)inject_req; + cstate_cmd.event_success_disable = 1; + } + + idc_put.idc_header.dfa = *dfa; + + ret = cxip_adjust_remote_offset(&addr, key); + if (ret) { + TXC_WARN(txc, "Remote offset overflow\n"); + goto err_free_hmem_buf; + } + idc_put.idc_header.remote_offset = addr; + + ret = cxip_txc_emit_idc_put(txc, vni, cxip_ofi_to_cxi_tc(tclass), + tc_type, &cstate_cmd, &idc_put, idc_buf, + len, flags); + if (ret) { + TXC_WARN(txc, "Failed to emit idc_put command: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_free_hmem_buf; + } + + if (hmem_buf) + cxip_txc_ibuf_free(txc, hmem_buf); + + return FI_SUCCESS; + +err_free_hmem_buf: + if (hmem_buf) + cxip_txc_ibuf_free(txc, hmem_buf); +err_free_cq_req: + if (req) + cxip_evtq_req_free(req); +err: + return ret; +} + +static bool cxip_rma_is_unrestricted(struct cxip_txc *txc, uint64_t key, + uint64_t msg_order, bool write) +{ + /* Unoptimized keys are implemented with match bits and must always be + * unrestricted. + */ + if (!cxip_generic_is_mr_key_opt(key)) + return true; + + /* If MR indicates remote events are required unrestricted must be + * used. If the MR is a client key, we assume if FI_RMA_EVENTS are + * requested, the remote client key MR is attached to a counter or + * requires RMA events, so unrestricted is used. + */ + if (cxip_generic_is_mr_key_events(txc->ep_obj->caps, key)) + return true; + + /* If the operation is an RMA write and the user has requested fabric + * write after write ordering, unrestricted must be used. + */ + if (write && msg_order & (FI_ORDER_WAW | FI_ORDER_RMA_WAW)) + return true; + + return false; +} + +static bool cxip_rma_is_idc(struct cxip_txc *txc, uint64_t key, size_t len, + bool write, bool triggered, bool unr) +{ + size_t max_idc_size = unr ? CXIP_INJECT_SIZE : C_MAX_IDC_PAYLOAD_RES; + + /* IDC commands are not supported for unoptimized MR since the IDC + * small message format does not support remote offset which is needed + * for RMA commands. + */ + if (!cxip_generic_is_mr_key_opt(key)) + return false; + + /* IDC commands are only support with RMA writes. */ + if (!write) + return false; + + /* IDC commands only support a limited payload size. */ + if (len > max_idc_size) + return false; + + /* Triggered operations never can be issued with an IDC. */ + if (triggered) + return false; + + return true; +} + +/* + * cxip_rma_common() - Perform an RMA operation. + * + * Common RMA function. Performs RMA reads and writes of all kinds. + * + * Generally, operations are supported by Cassini DMA commands. IDC commands + * are used instead for Write operations smaller than the maximum IDC payload + * size. + * + * If the FI_COMPLETION flag is specified, the operation will generate a + * libfabric completion event. If an event is not requested and an IDC command + * is used, hardware success events will be suppressed. If a completion is + * required but an IDC can't be used, the provider tracks the request + * internally, but will suppress the libfabric event. The provider must track + * DMA commands in order to clean up the source buffer mapping on completion. 
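+ *
+ * Illustrative decision path (simplified; helper names from this file):
+ *
+ *   unr = cxip_rma_is_unrestricted(txc, key, msg_order, write);
+ *   idc = cxip_rma_is_idc(txc, key, len, write, triggered, unr);
+ *   if (idc)
+ *           cxip_rma_emit_idc(...);   // payload carried in the command
+ *   else
+ *           cxip_rma_emit_dma(...);   // may require memory registration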
+ */ +ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, + const void *buf, size_t len, void *desc, + fi_addr_t tgt_addr, uint64_t addr, uint64_t key, + uint64_t data, uint64_t flags, uint32_t tclass, + uint64_t msg_order, void *context, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr) +{ + struct cxip_addr caddr; + union c_fab_addr dfa; + uint8_t idx_ext; + uint32_t pid_idx; + enum cxi_traffic_class_type tc_type; + bool write = op == FI_OP_WRITE; + bool unr; + bool idc; + int ret; + uint16_t vni; + + if (len && !buf) { + TXC_WARN(txc, "Invalid buffer\n"); + return -FI_EINVAL; + } + + if ((flags & FI_INJECT) && len > CXIP_INJECT_SIZE) { + TXC_WARN(txc, "RMA inject size exceeds limit\n"); + return -FI_EMSGSIZE; + } + + if (len > CXIP_EP_MAX_MSG_SZ) { + TXC_WARN(txc, "RMA length exceeds limit\n"); + return -FI_EMSGSIZE; + } + + if (!cxip_generic_is_valid_mr_key(key)) { + TXC_WARN(txc, "Invalid remote key: 0x%lx\n", key); + return -FI_EKEYREJECTED; + } + + unr = cxip_rma_is_unrestricted(txc, key, msg_order, write); + idc = cxip_rma_is_idc(txc, key, len, write, triggered, unr); + + /* Build target network address. */ + ret = cxip_av_lookup_addr(txc->ep_obj->av, tgt_addr, &caddr); + if (ret) { + TXC_WARN(txc, "Failed to look up FI addr: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + if (txc->ep_obj->av_auth_key) + vni = caddr.vni; + else + vni = txc->ep_obj->auth_key.vni; + + pid_idx = cxip_generic_mr_key_to_ptl_idx(txc->domain, key, write); + cxi_build_dfa(caddr.nic, caddr.pid, txc->pid_bits, pid_idx, &dfa, + &idx_ext); + + /* Select the correct traffic class type within a traffic class. */ + if (!unr && (flags & FI_CXI_HRP)) + tc_type = CXI_TC_TYPE_HRP; + else if (!unr) + tc_type = CXI_TC_TYPE_RESTRICTED; + else + tc_type = CXI_TC_TYPE_DEFAULT; + + /* IDC commands are preferred wherever possible since the payload is + * written with the command thus avoiding all memory registration. In + * addition, this allows for success events to be surpressed if + * FI_COMPLETION is not requested. + */ + ofi_genlock_lock(&txc->ep_obj->lock); + if (idc) + ret = cxip_rma_emit_idc(txc, buf, len, &dfa, &idx_ext, vni, + addr, key, data, flags, context, unr, + tclass, tc_type); + else + ret = cxip_rma_emit_dma(txc, buf, len, desc, &dfa, &idx_ext, + vni, addr, key, data, flags, context, + write, unr, tclass, tc_type, + triggered, trig_thresh, + trig_cntr, comp_cntr); + ofi_genlock_unlock(&txc->ep_obj->lock); + + if (ret) + TXC_WARN(txc, + "%s RMA %s failed: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + idc ? "IDC" : "DMA", write ? "write" : "read", + buf, len, key, addr, caddr.nic, caddr.pid, pid_idx); + else + TXC_DBG(txc, + "%s RMA %s emitted: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + idc ? "IDC" : "DMA", write ? 
"write" : "read", + buf, len, key, addr, caddr.nic, caddr.pid, pid_idx); + + return ret; +} + +/* + * Libfabric APIs + */ +static ssize_t cxip_rma_write(struct fid_ep *fid_ep, const void *buf, + size_t len, void *desc, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_rma_common(FI_OP_WRITE, &ep->ep_obj->txc, buf, len, desc, + dest_addr, addr, key, 0, ep->tx_attr.op_flags, + ep->tx_attr.tclass, ep->tx_attr.msg_order, + context, false, 0, NULL, NULL); +} + +static ssize_t cxip_rma_writev(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + const void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_rma_common(FI_OP_WRITE, &ep->ep_obj->txc, buf, len, + mr_desc, dest_addr, addr, key, 0, + ep->tx_attr.op_flags, ep->tx_attr.tclass, + ep->tx_attr.msg_order, context, false, 0, NULL, + NULL); +} + +static ssize_t cxip_rma_writemsg(struct fid_ep *fid_ep, + const struct fi_msg_rma *msg, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + size_t len; + const void *buf; + void *mr_desc; + + if (!msg) { + TXC_WARN(txc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->rma_iov_count != 1) { + TXC_WARN(txc, "Invalid RMA iov\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | FI_CXI_HRP | + FI_CXI_WEAK_FENCE)) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_rma_common(FI_OP_WRITE, txc, buf, len, mr_desc, msg->addr, + msg->rma_iov[0].addr, msg->rma_iov[0].key, + msg->data, flags, ep->tx_attr.tclass, + ep->tx_attr.msg_order, msg->context, false, 0, + NULL, NULL); +} + +ssize_t cxip_rma_inject(struct fid_ep *fid_ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t addr, uint64_t key) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_rma_common(FI_OP_WRITE, &ep->ep_obj->txc, buf, len, NULL, + dest_addr, addr, key, 0, FI_INJECT, + ep->tx_attr.tclass, ep->tx_attr.msg_order, NULL, + false, 0, NULL, NULL); +} + +static ssize_t cxip_rma_read(struct fid_ep *fid_ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_rma_common(FI_OP_READ, &ep->ep_obj->txc, buf, len, desc, + src_addr, addr, key, 0, ep->tx_attr.op_flags, + ep->tx_attr.tclass, ep->tx_attr.msg_order, + context, false, 0, NULL, NULL); +} + +static ssize_t cxip_rma_readv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + const void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_rma_common(FI_OP_READ, &ep->ep_obj->txc, buf, len, mr_desc, + src_addr, addr, key, 0, ep->tx_attr.op_flags, + ep->tx_attr.tclass, ep->tx_attr.msg_order, + context, false, 0, NULL, NULL); +} + +static ssize_t cxip_rma_readmsg(struct fid_ep *fid_ep, + const struct fi_msg_rma *msg, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + size_t len; + const void *buf; + void *mr_desc; + + if (!msg) { + TXC_WARN(txc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->rma_iov_count != 1) { + TXC_WARN(txc, "Invalid RMA iov\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + if (flags & ~CXIP_READMSG_ALLOWED_FLAGS) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_rma_common(FI_OP_READ, txc, buf, len, mr_desc, msg->addr, + msg->rma_iov[0].addr, msg->rma_iov[0].key, + msg->data, flags, ep->tx_attr.tclass, + ep->tx_attr.msg_order, msg->context, false, 0, + NULL, NULL); +} + +struct fi_ops_rma cxip_ep_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = cxip_rma_read, + .readv = cxip_rma_readv, + .readmsg = cxip_rma_readmsg, + .write = cxip_rma_write, + .writev = cxip_rma_writev, + .writemsg = cxip_rma_writemsg, + .inject = cxip_rma_inject, + .injectdata = fi_no_rma_injectdata, + .writedata = fi_no_rma_writedata, +}; + +struct fi_ops_rma cxip_ep_rma_no_ops = { + .size = sizeof(struct fi_ops_rma), + .read = fi_no_rma_read, + .readv = fi_no_rma_readv, + .readmsg = fi_no_rma_readmsg, + .write = fi_no_rma_write, + .writev = fi_no_rma_writev, + .writemsg = fi_no_rma_writemsg, + .inject = fi_no_rma_inject, + .injectdata = fi_no_rma_injectdata, + .writedata = fi_no_rma_writedata, +}; diff --git a/prov/cxi/src/cxip_rxc.c b/prov/cxi/src/cxip_rxc.c new file mode 100644 index 00000000000..3fce655a6d7 --- /dev/null +++ b/prov/cxi/src/cxip_rxc.c @@ -0,0 +1,555 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2019,2020-2023 Hewlett Packard Enterprise Development LP + */ + +/* CXI RX Context Management */ + +#include "config.h" + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) + +#define CXIP_SC_STATS "FC/SC stats - EQ full: %d append fail: %d no match: %d"\ + " request full: %d unexpected: %d, NIC HW2SW unexp: %d"\ + " NIC HW2SW append fail: %d\n" + +/* + * cxip_rxc_msg_enable() - Enable RXC messaging. + * + * Change the RXC RX PtlTE to enabled state. Once in enabled state, messages + * will be accepted by hardware. Prepare all messaging resources before + * enabling the RX PtlTE. + * + * Caller must hold ep_obj->lock. + */ +int cxip_rxc_msg_enable(struct cxip_rxc *rxc, uint32_t drop_count) +{ + int ret; + + /* If transitioning from disabled to the software managed state a + * synchronous call is used which handles drop count mismatches. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE) { + ret = cxil_pte_transition_sm(rxc->rx_pte->pte, drop_count); + if (ret) + RXC_WARN(rxc, + "Error transitioning to SW EP %d %s\n", + ret, fi_strerror(-ret)); + return ret; + } + + return cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, + C_PTLTE_ENABLED, drop_count); +} + +/* + * rxc_msg_disable() - Disable RXC messaging. + * + * Change the RXC RX PtlTE to disabled state. Once in disabled state, the PtlTE + * will receive no additional events. + * + * Caller must hold rxc->ep_obj->lock. + */ +static int rxc_msg_disable(struct cxip_rxc *rxc) +{ + int ret; + + if (rxc->state != RXC_ENABLED && + rxc->state != RXC_ENABLED_SOFTWARE) + RXC_FATAL(rxc, "RXC in bad state to be disabled: state=%d\n", + rxc->state); + + rxc->state = RXC_DISABLED; + + ret = cxip_pte_set_state_wait(rxc->rx_pte, rxc->rx_cmdq, &rxc->rx_evtq, + C_PTLTE_DISABLED, 0); + if (ret == FI_SUCCESS) + CXIP_DBG("RXC PtlTE disabled: %p\n", rxc); + + return ret; +} + +#define RXC_RESERVED_FC_SLOTS 1 + +/* + * rxc_msg_init() - Initialize an RX context for messaging. 
+ * + * Allocates and initializes hardware resources used for receiving expected and + * unexpected message data. + * + * Caller must hold ep_obj->lock. + */ +static int rxc_msg_init(struct cxip_rxc *rxc) +{ + int ret; + struct cxi_pt_alloc_opts pt_opts = { + .use_long_event = 1, + .is_matching = 1, + .en_flowctrl = 1, + .lossless = cxip_env.msg_lossless, + }; + struct cxi_cq_alloc_opts cq_opts = {}; + + ret = cxip_ep_cmdq(rxc->ep_obj, false, FI_TC_UNSPEC, + rxc->rx_evtq.eq, &rxc->rx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate RX CMDQ, ret: %d\n", ret); + return -FI_EDOMAIN; + } + + /* For FI_TC_UNSPEC, reuse the TX context command queue if possible. If + * a specific traffic class is requested, allocate a new command queue. + * This is done to prevent performance issues with reusing the TX + * context command queue and changing the communication profile. + */ + if (cxip_env.rget_tc == FI_TC_UNSPEC) { + ret = cxip_ep_cmdq(rxc->ep_obj, true, FI_TC_UNSPEC, + rxc->rx_evtq.eq, &rxc->tx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate TX CMDQ, ret: %d\n", ret); + ret = -FI_EDOMAIN; + goto put_rx_cmdq; + } + } else { + cq_opts.count = rxc->ep_obj->txq_size * 4; + cq_opts.flags = CXI_CQ_IS_TX; + cq_opts.policy = cxip_env.cq_policy; + + ret = cxip_cmdq_alloc(rxc->ep_obj->domain->lni, + rxc->rx_evtq.eq, &cq_opts, + rxc->ep_obj->auth_key.vni, + cxip_ofi_to_cxi_tc(cxip_env.rget_tc), + CXI_TC_TYPE_DEFAULT, &rxc->tx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate CMDQ, ret: %d\n", ret); + ret = -FI_ENOSPC; + goto put_rx_cmdq; + } + } + + /* If applications AVs are symmetric, use logical FI addresses for + * matching. Otherwise, physical addresses will be used. + */ + if (rxc->ep_obj->av->symmetric) { + CXIP_DBG("Using logical PTE matching\n"); + pt_opts.use_logical = 1; + } + + ret = cxip_pte_alloc(rxc->ep_obj->ptable, + rxc->rx_evtq.eq, CXIP_PTL_IDX_RXQ, false, + &pt_opts, cxip_recv_pte_cb, rxc, &rxc->rx_pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate RX PTE: %d\n", ret); + goto put_tx_cmdq; + } + + /* One slot must be reserved to support hardware generated state change + * events. + */ + ret = cxip_evtq_adjust_reserved_fc_event_slots(&rxc->rx_evtq, + RXC_RESERVED_FC_SLOTS); + if (ret) { + CXIP_WARN("Unable to adjust RX reserved event slots: %d\n", + ret); + goto free_pte; + } + + return FI_SUCCESS; + +free_pte: + cxip_pte_free(rxc->rx_pte); +put_tx_cmdq: + if (cxip_env.rget_tc == FI_TC_UNSPEC) + cxip_ep_cmdq_put(rxc->ep_obj, true); + else + cxip_cmdq_free(rxc->tx_cmdq); +put_rx_cmdq: + cxip_ep_cmdq_put(rxc->ep_obj, false); + + return ret; +} + +/* + * rxc_msg_fini() - Finalize RX context messaging. + * + * Free hardware resources allocated when the RX context was initialized for + * messaging. + * + * Caller must hold ep_obj->lock. + */ +static int rxc_msg_fini(struct cxip_rxc *rxc) +{ + int ret __attribute__((unused)); + + cxip_pte_free(rxc->rx_pte); + + cxip_ep_cmdq_put(rxc->ep_obj, false); + + if (cxip_env.rget_tc == FI_TC_UNSPEC) + cxip_ep_cmdq_put(rxc->ep_obj, true); + else + cxip_cmdq_free(rxc->tx_cmdq); + + cxip_evtq_adjust_reserved_fc_event_slots(&rxc->rx_evtq, + -1 * RXC_RESERVED_FC_SLOTS); + + cxip_evtq_fini(&rxc->rx_evtq); + + return FI_SUCCESS; +} + +static void cxip_rxc_free_ux_entries(struct cxip_rxc *rxc) +{ + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + + /* TODO: Manage freeing of UX entries better. This code is redundant + * with the freeing in cxip_recv_sw_matcher(). 
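The logical PTE matching path selected above in rxc_msg_init() is only available when the application's address vector is symmetric. A hedged caller-side sketch of declaring that with the standard AV attributes; the helper name and counts are illustrative:

#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Sketch: open an address vector whose contents and ordering are identical
 * on every rank.  FI_SYMMETRIC is what allows a provider to use logical
 * (index-based) addressing such as the logical PTE matching above.
 */
static int open_symmetric_av(struct fid_domain *domain, size_t count,
			     struct fid_av **av)
{
	struct fi_av_attr attr = {
		.type = FI_AV_TABLE,	/* index-addressable table */
		.count = count,		/* expected number of peers */
		.flags = FI_SYMMETRIC,	/* same content on all ranks */
	};

	return fi_av_open(domain, &attr, av, NULL);
}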
+ */ + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + dlist_remove(&ux_send->rxc_entry); + if (ux_send->req && ux_send->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux_send); + else + free(ux_send); + + rxc->sw_ux_list_len--; + } + + if (rxc->sw_ux_list_len != 0) + CXIP_WARN("sw_ux_list_len %d != 0\n", rxc->sw_ux_list_len); + assert(rxc->sw_ux_list_len == 0); + + /* Free any pending UX entries waiting from the request list */ + dlist_foreach_container_safe(&rxc->sw_pending_ux_list, + struct cxip_ux_send, ux_send, + rxc_entry, tmp) { + dlist_remove(&ux_send->rxc_entry); + if (ux_send->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux_send); + else + free(ux_send); + + rxc->sw_pending_ux_list_len--; + } + + if (rxc->sw_pending_ux_list_len != 0) + CXIP_WARN("sw_pending_ux_list_len %d != 0\n", + rxc->sw_pending_ux_list_len); + assert(rxc->sw_pending_ux_list_len == 0); +} + +static size_t cxip_rxc_get_num_events(struct cxip_rxc *rxc) +{ + size_t num_events; + + /* Hardware will ensure incoming RDMA operations have event queue space. + * It is the responsibility of software to ensure that any SW initiated + * target commands which may generate an event (e.g. append with failure + * or search) have enough space in the EQ. This can be done in two ways. + * + * 1. Continually increase EQ buffer size until EQ overflows go away. + * This option is not ideal since many application variables are in play + * which impact number of events needed. + * + * 2. Use hybrid endpoint mode to preemptively transition to software + * endpoint when event queue space may be under pressure. When in + * software endpoint mode, software should not be issuing commands, like + * append and search/search & delete, which could result in events being + * generated. + * + * For both cases, RXC size will be used to size number of events. To + * accommodate a stream of unexpected puts and append failures, RXC size + * is added again. With correct credit control for hybrid endpoint to + * preemptively transition to software endpoint, 2* RXC size should be + * enough to prevent EQ overflow. For all other cases, EQ size needs to + * be increased. + */ + + num_events = rxc->attr.size * 2; + + /* Add 1 more event for software initiated state change. */ + num_events++; + + return num_events; +} + +/* + * cxip_rxc_enable() - Enable an RX context for use. + * + * Called via fi_enable(). The context could be used in a standard endpoint or + * a scalable endpoint. + */ +int cxip_rxc_enable(struct cxip_rxc *rxc) +{ + int ret; + int tmp; + size_t num_events; + enum c_ptlte_state state; + + if (rxc->state != RXC_DISABLED) + return FI_SUCCESS; + + if (!ofi_recv_allowed(rxc->attr.caps)) { + rxc->state = RXC_ENABLED; + return FI_SUCCESS; + } + + if (!rxc->recv_cq) { + CXIP_WARN("Undefined recv CQ\n"); + return -FI_ENOCQ; + } + + num_events = cxip_rxc_get_num_events(rxc); + ret = cxip_evtq_init(&rxc->rx_evtq, rxc->recv_cq, num_events, 1); + if (ret) { + CXIP_WARN("Failed to initialize RXC event queue: %d, %s\n", + ret, fi_strerror(-ret)); + return ret; + } + + ret = rxc_msg_init(rxc); + if (ret != FI_SUCCESS) { + CXIP_WARN("rxc_msg_init returned: %d\n", ret); + ret = -FI_EDOMAIN; + goto evtq_fini; + } + + /* If starting in or able to transition to software managed + * PtlTE, append request list entries first. 
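The software-managed and hybrid PtlTE behavior referenced above is normally chosen before the provider initializes. A sketch of doing so programmatically; the environment variable name and values follow the fi_cxi(7) page added by this patch and should be treated as assumptions here:

#include <stdlib.h>
#include <rdma/fabric.h>

/* Sketch: select the RX match mode before the provider loads.  The fi_cxi(7)
 * man page is the authoritative reference for the accepted values.
 */
static struct fi_info *get_cxi_info_hybrid(void)
{
	struct fi_info *info = NULL;

	/* Assumed variable name; must be set before provider init. */
	setenv("FI_CXI_RX_MATCH_MODE", "hybrid", 1);

	if (fi_getinfo(FI_VERSION(1, 15), NULL, NULL, 0, NULL, &info))
		return NULL;

	return info;
}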
+ */ + if (cxip_software_pte_allowed()) { + ret = cxip_req_bufpool_init(rxc); + if (ret != FI_SUCCESS) + goto err_msg_fini; + } + + if (rxc->msg_offload) { + state = C_PTLTE_ENABLED; + ret = cxip_oflow_bufpool_init(rxc); + if (ret != FI_SUCCESS) + goto err_req_buf_fini; + } else { + state = C_PTLTE_SOFTWARE_MANAGED; + } + + /* Start accepting Puts. */ + ret = cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, state, 0); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_pte_set_state returned: %d\n", ret); + goto err_oflow_buf_fini; + } + + /* Wait for PTE state change */ + do { + sched_yield(); + cxip_evtq_progress(&rxc->rx_evtq); + } while (rxc->rx_pte->state != state); + + rxc->pid_bits = rxc->domain->iface->dev->info.pid_bits; + CXIP_DBG("RXC messaging enabled: %p, pid_bits: %d\n", + rxc, rxc->pid_bits); + + return FI_SUCCESS; + +err_oflow_buf_fini: + if (rxc->msg_offload) + cxip_oflow_bufpool_fini(rxc); + +err_req_buf_fini: + if (cxip_software_pte_allowed()) + cxip_req_bufpool_fini(rxc); + +err_msg_fini: + tmp = rxc_msg_fini(rxc); + if (tmp != FI_SUCCESS) + CXIP_WARN("rxc_msg_fini returned: %d\n", tmp); + +evtq_fini: + cxip_evtq_fini(&rxc->rx_evtq); + + return ret; +} + +/* + * rxc_cleanup() - Attempt to free outstanding requests. + * + * Outstanding commands may be dropped when the RX Command Queue is freed. + * This leads to missing events. Attempt to gather all events before freeing + * the RX CQ. If events go missing, resources will be leaked until the + * Completion Queue is freed. + */ +static void rxc_cleanup(struct cxip_rxc *rxc) +{ + int ret; + uint64_t start; + int canceled = 0; + struct cxip_fc_drops *fc_drops; + struct dlist_entry *tmp; + + if (!ofi_atomic_get32(&rxc->orx_reqs)) + return; + + cxip_evtq_req_discard(&rxc->rx_evtq, rxc); + + do { + ret = cxip_evtq_req_cancel(&rxc->rx_evtq, rxc, 0, false); + if (ret == FI_SUCCESS) + canceled++; + } while (ret == FI_SUCCESS); + + if (canceled) + CXIP_DBG("Canceled %d Receives: %p\n", canceled, rxc); + + start = ofi_gettime_ms(); + while (ofi_atomic_get32(&rxc->orx_reqs)) { + sched_yield(); + cxip_evtq_progress(&rxc->rx_evtq); + + if (ofi_gettime_ms() - start > CXIP_REQ_CLEANUP_TO) { + CXIP_WARN("Timeout waiting for outstanding requests.\n"); + break; + } + } + + dlist_foreach_container_safe(&rxc->fc_drops, struct cxip_fc_drops, + fc_drops, rxc_entry, tmp) { + dlist_remove(&fc_drops->rxc_entry); + free(fc_drops); + } + + if (rxc->num_fc_eq_full || rxc->num_fc_no_match || + rxc->num_fc_req_full || rxc->num_fc_unexp || + rxc->num_fc_append_fail || rxc->num_sc_nic_hw2sw_unexp || + rxc->num_sc_nic_hw2sw_append_fail) + CXIP_INFO(CXIP_SC_STATS, rxc->num_fc_eq_full, + rxc->num_fc_append_fail, rxc->num_fc_no_match, + rxc->num_fc_req_full, rxc->num_fc_unexp, + rxc->num_sc_nic_hw2sw_unexp, + rxc->num_sc_nic_hw2sw_append_fail); +} + +static void cxip_rxc_dump_counters(struct cxip_rxc *rxc) +{ + int i; + int j; + int k; + size_t msg_size; + bool print_header; + int count; + + for (i = 0; i < CXIP_LIST_COUNTS; i++) { + for (j = 0; j < OFI_HMEM_MAX; j++) { + + print_header = true; + + for (k = 0; k < CXIP_COUNTER_BUCKETS; k++) { + if (k == 0) + msg_size = 0; + else + msg_size = (1ULL << (k - 1)); + + count = ofi_atomic_get32(&rxc->cntrs.msg_count[i][j][k]); + if (count) { + if (print_header) { + RXC_INFO(rxc, "Recv Message Size %s - %s Histogram\n", + c_ptl_list_strs[i], + fi_tostr(&j, FI_TYPE_HMEM_IFACE)); + RXC_INFO(rxc, "%-14s Count\n", "Size"); + print_header = false; + } + + RXC_INFO(rxc, "%-14lu %u\n", msg_size, + count); + } + } + } + + } +} + 
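As a reading aid for the histogram dump above, here is a small standalone sketch of the size-to-bucket mapping implied by the printed labels. The provider's actual recording routine is not part of this hunk, so the boundary behavior shown is an assumption:

#include <assert.h>
#include <stddef.h>

/* Illustrative helper, not part of the provider: bucket 0 holds zero-length
 * messages and bucket k (k >= 1) is labeled 1 << (k - 1), so sizes in
 * [2^n, 2^(n+1)) share the bucket labeled 2^n.
 */
static int msg_size_bucket(size_t size)
{
	int lg = 0;

	if (!size)
		return 0;

	while (size >>= 1)
		lg++;		/* lg = floor(log2(size)) */

	return lg + 1;
}

static void msg_size_bucket_example(void)
{
	assert(msg_size_bucket(0) == 0);	/* bucket labeled 0 */
	assert(msg_size_bucket(1) == 1);	/* bucket labeled 1 */
	assert(msg_size_bucket(8) == 4);	/* bucket labeled 8 */
	assert(msg_size_bucket(9) == 4);	/* still the bucket labeled 8 */
}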
+void cxip_rxc_struct_init(struct cxip_rxc *rxc, const struct fi_rx_attr *attr, + void *context) +{ + int i; + + dlist_init(&rxc->ep_list); + ofi_atomic_initialize32(&rxc->orx_hw_ule_cnt, 0); + ofi_atomic_initialize32(&rxc->orx_reqs, 0); + ofi_atomic_initialize32(&rxc->orx_tx_reqs, 0); + rxc->max_tx = cxip_env.sw_rx_tx_init_max; + + rxc->context = context; + rxc->attr = *attr; + + for (i = 0; i < CXIP_DEF_EVENT_HT_BUCKETS; i++) + dlist_init(&rxc->deferred_events.bh[i]); + + dlist_init(&rxc->fc_drops); + dlist_init(&rxc->replay_queue); + dlist_init(&rxc->sw_ux_list); + dlist_init(&rxc->sw_recv_queue); + dlist_init(&rxc->sw_pending_ux_list); + + rxc->max_eager_size = cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; + rxc->drop_count = rxc->ep_obj->asic_ver < CASSINI_2_0 ? -1 : 0; + + /* TODO make configurable */ + rxc->min_multi_recv = CXIP_EP_MIN_MULTI_RECV; + rxc->state = RXC_DISABLED; + rxc->msg_offload = cxip_env.msg_offload; + rxc->hmem = !!(attr->caps & FI_HMEM); + rxc->sw_ep_only = cxip_env.rx_match_mode == CXIP_PTLTE_SOFTWARE_MODE; + rxc->rget_align_mask = cxip_env.rdzv_aligned_sw_rget ? + cxip_env.cacheline_size - 1 : 0; + + cxip_msg_counters_init(&rxc->cntrs); +} + +/* + * cxip_rxc_disable() - Disable the RX context of an base endpoint object. + * + * Free hardware resources allocated when the context was enabled. Called via + * fi_close(). + */ +void cxip_rxc_disable(struct cxip_rxc *rxc) +{ + int ret; + + cxip_rxc_dump_counters(rxc); + + if (rxc->state == RXC_DISABLED) + return; + + if (ofi_recv_allowed(rxc->attr.caps)) { + /* Stop accepting Puts. */ + ret = rxc_msg_disable(rxc); + if (ret != FI_SUCCESS) + CXIP_WARN("rxc_msg_disable returned: %d\n", ret); + + cxip_rxc_free_ux_entries(rxc); + + rxc_cleanup(rxc); + + if (cxip_software_pte_allowed()) + cxip_req_bufpool_fini(rxc); + + if (cxip_env.msg_offload) + cxip_oflow_bufpool_fini(rxc); + + /* Free hardware resources. */ + ret = rxc_msg_fini(rxc); + if (ret != FI_SUCCESS) + CXIP_WARN("rxc_msg_fini returned: %d\n", ret); + } +} diff --git a/prov/cxi/src/cxip_telemetry.c b/prov/cxi/src/cxip_telemetry.c new file mode 100644 index 00000000000..6bbb16bea0c --- /dev/null +++ b/prov/cxi/src/cxip_telemetry.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "config.h" +#include "cxip.h" + +#define TELEMETRY_FILE_FMT "/sys/class/cxi/cxi%u/device/telemetry/%s" + +static long cxip_telemetry_entry_read_value(struct cxip_telemetry_entry *entry) +{ + long ret; + char path[FI_PATH_MAX]; + FILE *f; + long value; + long tstamp_sec; + long tstamp_nsec; + + ret = snprintf(path, FI_PATH_MAX, TELEMETRY_FILE_FMT, + entry->telemetry->dom->iface->info->dev_id, entry->name); + if (ret < 0) + return ret; + + f = fopen(path, "r"); + if (!f) + return -errno; + + /* Even though only value is needed, extract 3 values to ensure + * telemetry data is in the expected format. 
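For reference, the "value@seconds.nanoseconds" layout parsed above can be read outside the provider with plain stdio. A hedged standalone sketch; the device index and counter name are caller-supplied, and no particular counter name is assumed:

#include <errno.h>
#include <stdio.h>

/* Sketch: read one Cassini telemetry counter using the same sysfs path
 * layout as TELEMETRY_FILE_FMT and the same "%ld@%ld.%ld" format.
 */
static int read_cxi_counter(unsigned int dev_id, const char *name,
			    long *value)
{
	char path[512];
	long sec, nsec;
	FILE *f;
	int ret;

	snprintf(path, sizeof(path),
		 "/sys/class/cxi/cxi%u/device/telemetry/%s", dev_id, name);

	f = fopen(path, "r");
	if (!f)
		return -errno;

	ret = fscanf(f, "%ld@%ld.%ld", value, &sec, &nsec);
	fclose(f);

	return ret == 3 ? 0 : -EINVAL;
}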
+ */ + ret = fscanf(f, "%ld@%ld.%ld", &value, &tstamp_sec, &tstamp_nsec); + if (ret != 3) { + if (ret == EOF) + ret = -errno; + else + ret = -FI_EINVAL; + } else { + ret = value; + } + + fclose(f); + + return ret; +} + +static void cxip_telemetry_entry_dump_delta(struct cxip_telemetry_entry *entry) +{ + long delta; + + delta = cxip_telemetry_entry_read_value(entry); + if (delta < 0) { + DOM_WARN(entry->telemetry->dom, "Failed to read %s: %ld:%s\n", + entry->name, delta, fi_strerror(-delta)); + return; + } + + if (delta < entry->value) { + DOM_WARN(entry->telemetry->dom, + "Failed to perform delta due to %s reset\n", + entry->name); + return; + } + + delta -= entry->value; + + DOM_INFO(entry->telemetry->dom, "%s: %ld\n", entry->name, delta); +} + +static int cxip_telemetry_entry_reset_value(struct cxip_telemetry_entry *entry) +{ + long ret; + + ret = cxip_telemetry_entry_read_value(entry); + if (ret < 0) { + DOM_WARN(entry->telemetry->dom, "Failed to read %s: %ld:%s\n", + entry->name, ret, fi_strerror(-ret)); + return ret; + } + + entry->value = ret; + + return FI_SUCCESS; +} + +static void +cxip_telemetry_entry_free(struct cxip_telemetry_entry *entry) +{ + dlist_remove(&entry->telemetry_entry); + free(entry); +} + +static bool +cxip_telemetry_entry_validate_token_file(struct cxip_telemetry *telemetry, + const char *telemetry_token) +{ + char path[FI_PATH_MAX]; + int ret; + + ret = snprintf(path, FI_PATH_MAX, TELEMETRY_FILE_FMT, + telemetry->dom->iface->info->dev_id, telemetry_token); + if (ret < 0) + return false; + + /* Verify user read access to the telemetry file. */ + if (access(path, R_OK)) + return false; + + return true; +} + +static bool +cxip_telemetry_entry_validate_token(struct cxip_telemetry *telemetry, + const char *telemetry_token) +{ + /* The telemetry directory has an ALL-in-binary entry. This file is + * considered invalid for this telemetry implementation. + */ + if (strcmp(telemetry_token, "ALL-in-binary") == 0) + return false; + + return cxip_telemetry_entry_validate_token_file(telemetry, + telemetry_token); +} + +static int cxip_telemetry_entry_alloc(struct cxip_telemetry *telemetry, + const char *telemetry_token) +{ + struct cxip_telemetry_entry *entry; + int ret; + + if (!cxip_telemetry_entry_validate_token(telemetry, telemetry_token)) { + DOM_WARN(telemetry->dom, "Invalid telemetry: %s\n", + telemetry_token); + return -FI_EINVAL; + } + + entry = calloc(1, sizeof(*entry)); + if (!entry) + return -FI_ENOMEM; + + entry->telemetry = telemetry; + + strncpy(entry->name, telemetry_token, TELEMETRY_ENTRY_NAME_SIZE - 1); + entry->name[TELEMETRY_ENTRY_NAME_SIZE - 1] = '\0'; + + /* Revalidate the name after the memcpy. */ + if (!cxip_telemetry_entry_validate_token(telemetry, entry->name)) { + DOM_WARN(telemetry->dom, "Invalid telemetry: %s\n", + entry->name); + ret = FI_EINVAL; + goto err_free_entry; + } + + ret = cxip_telemetry_entry_reset_value(entry); + if (ret) + goto err_free_entry; + + dlist_insert_tail(&entry->telemetry_entry, &telemetry->telemetry_list); + + return FI_SUCCESS; + +err_free_entry: + free(entry); + + return ret; +} + +static int cxip_telemetry_sleep_duration(void) +{ + int ret; + int msec_sleep; + char *path = "/sys/module/cxi_core/parameters/cntr_refresh_interval"; + FILE *f; + + f = fopen(path, "r"); + if (!f) + return -errno; + + ret = fscanf(f, "%d", &msec_sleep); + if (ret != 1) { + if (ret == EOF) + ret = -errno; + else + ret = -FI_EINVAL; + } else { + /* Convert sleep duration to seconds. 
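The comma-separated list walked by cxip_telemetry_alloc() further below normally arrives through the environment. A sketch of supplying it from application code; the variable names mirror the cxip_env fields used here and the counter names are placeholders, so treat both as assumptions:

#include <stdlib.h>

/* Sketch: request delta reporting for two hypothetical counters and,
 * optionally, restrict reporting to one resource group ID.
 */
static void enable_cxi_telemetry_example(void)
{
	setenv("FI_CXI_TELEMETRY", "counter_a,counter_b", 1);
	setenv("FI_CXI_TELEMETRY_RGID", "0", 1);
}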
*/ + ret = msec_sleep / 1000; + if (msec_sleep % 1000) + ret++; + ret = MAX(ret, 1); + } + + fclose(f); + + return ret; +} + +void cxip_telemetry_dump_delta(struct cxip_telemetry *telemetry) +{ + struct cxip_telemetry_entry *entry; + int sleep_duration; + + /* Since sysfs telemetry entries are refreshed as some interval, we need + * to sleep for a refresh interval to get updates. Else, the application + * could run and telemetry deltas would be zero. + */ + sleep_duration = cxip_telemetry_sleep_duration(); + if (sleep_duration < 0) { + DOM_WARN(telemetry->dom, + "Failed to retrieve telemetry sleep duration: %d:%s\n", + sleep_duration, fi_strerror(-sleep_duration)); + return; + } + + sleep(sleep_duration); + + dlist_foreach_container(&telemetry->telemetry_list, + struct cxip_telemetry_entry, entry, + telemetry_entry) + cxip_telemetry_entry_dump_delta(entry); +} + +void cxip_telemetry_free(struct cxip_telemetry *telemetry) +{ + struct cxip_telemetry_entry *entry; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(&telemetry->telemetry_list, + struct cxip_telemetry_entry, + entry, telemetry_entry, tmp) + cxip_telemetry_entry_free(entry); + + free(telemetry); +} + +int cxip_telemetry_alloc(struct cxip_domain *dom, + struct cxip_telemetry **telemetry) +{ + struct cxip_telemetry *_telemetry; + const char *telemetry_token; + char *telemetry_copy; + int ret = FI_SUCCESS; + + /* If user defined telemtry string is NULL or RGID does not match, + * return -FI_ENOSYS signalling this function is not supported. + */ + if (!cxip_env.telemetry || + (cxip_env.telemetry_rgid >= 0 && + dom->lni->lni->id != cxip_env.telemetry_rgid)) + return -FI_ENOSYS; + + _telemetry = calloc(1, sizeof(*_telemetry)); + if (!_telemetry) + return -FI_ENOMEM; + + _telemetry->dom = dom; + dlist_init(&_telemetry->telemetry_list); + + telemetry_copy = malloc(strlen(cxip_env.telemetry) + 1); + if (!telemetry_copy) { + ret = -FI_ENOMEM; + goto err_free_telemetry; + } + + strcpy(telemetry_copy, cxip_env.telemetry); + + /* The following will parse the comma separated list and attempt to + * allocate a telemetry entry for any valid substring/token. If a + * telemetry entry fails to be allocated for a given substring/token, + * this is not considered fatal and parsing will continue. + */ + telemetry_token = strtok(telemetry_copy, ","); + while (telemetry_token != NULL) { + ret = cxip_telemetry_entry_alloc(_telemetry, telemetry_token); + if (ret) + DOM_WARN(dom, "Failed to allocated %s telemetry entry: %d:%s\n", + telemetry_token, ret, fi_strerror(-ret)); + else + DOM_INFO(dom, "Telemetry entry allocated for %s\n", + telemetry_token); + + telemetry_token = strtok(NULL, ","); + } + + free(telemetry_copy); + + if (dlist_empty(&_telemetry->telemetry_list)) { + DOM_WARN(dom, "Failed to allocated any telemetry entries\n"); + ret = -FI_EINVAL; + goto err_free_telemetry; + } + + *telemetry = _telemetry; + + return FI_SUCCESS; + +err_free_telemetry: + cxip_telemetry_free(_telemetry); + + return ret; +} diff --git a/prov/cxi/src/cxip_trace.c b/prov/cxi/src/cxip_trace.c new file mode 100644 index 00000000000..5d3a371b5f4 --- /dev/null +++ b/prov/cxi/src/cxip_trace.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +/** + * @brief TRACE function for producing runtime debugging logs + * + * The following should be inserted at the top of a code module to trace: + * + * #define TRACE(fmt, ...) 
CXIP_TRACE(<module>, fmt, ##__VA_ARGS__)
 + *
 + * If ENABLE_DEBUG is false at compile time, CXIP_TRACE is a syntactically
 + * robust NOOP which results in no code being emitted, ensuring that these
 + * trace calls do not affect performance in production, and none of the
 + * following comments apply.
 + *
 + * - cxip_trace_fn is the function that logs a trace message.
 + * - cxip_trace_flush_fn can be used to flush buffered trace messages.
 + * - cxip_trace_close_fn can be used to flush and close the output.
 + * - cxip_trace_enable_fn is used to enable/disable all tracing.
 + * - cxip_trace_set() is used to enable a tracing module.
 + * - cxip_trace_clr() is used to disable a tracing module.
 + *
 + * Modules are defined by the list of enum cxip_trace_module values, which
 + * can be extended as needed to provide finer control over tracing.
 + *
 + * The initial values are set in cxip_trace_init() below, using run-time
 + * environment variables. cxip_trace_enable() can be used to dynamically
 + * enable or disable tracing. cxip_trace_set() and cxip_trace_clr() can be
 + * used to dynamically modify which traces will generate output.
 + *
 + * Some initialization is driven by environment variables:
 + *
 + * Specifying the environment variable CXIP_TRACE_FILENAME will deliver
 + * output to a file with the specified name, followed by the PMI_RANK value
 + * (if there is one).
 + *
 + * Specifying CXIP_TRACE_APPEND in conjunction with CXIP_TRACE_FILENAME will
 + * open the file in append mode. This is important for NETSIM tests under
 + * Criterion, since each test is run in a separate process and closes all
 + * files at completion of each test.
 + *
 + * Specifying PMI_RANK as a rank value will apply a prefix to the trace lines
 + * that identifies the rank of the trace.
 + *
 + * Specifying PMI_SIZE will expand the prefix to show the number of ranks.
 + *
 + * cxip_trace_fid is exposed, and can be manipulated using the normal stream
 + * file functions. Default buffering is fully buffered output, which can
 + * result in delays in the appearance of logging information. Using
 + * setlinebuf() will run slower, but will display lines more quickly.
 + *
 + * cxip_trace_flush() forces all output to be flushed AND written to disk,
 + * but leaves the file open for more writing.
 + *
 + * cxip_trace_close() flushes all output and closes the file. 
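A compact sketch of how a provider source file opts into this tracing, mirroring the TRACE definition used by cxip_zbcoll.c further below; the environment settings would normally be exported by the launcher rather than set in code:

#include <stdlib.h>
#include "cxip.h"

#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_ZBCOLL, fmt, ##__VA_ARGS__)

static void trace_example(void)
{
	/* Shown here for completeness; typically set before launch. */
	setenv("CXIP_TRACE_ENABLE", "1", 1);
	setenv("CXIP_TRC_ZBCOLL", "1", 1);
	setenv("CXIP_TRACE_FILENAME", "trace", 1);

	TRACE("zbcoll tracing active on rank %d\n", 0);
}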
+ */ +#include "config.h" + +#include +#include +#include +#include +#include + +#include "cxip.h" + +bool cxip_trace_initialized; +bool cxip_trace_enabled; +bool cxip_trace_append; +bool cxip_trace_linebuf; // set line buffering for trace +int cxip_trace_rank; +int cxip_trace_numranks; +char *cxip_trace_filename; +FILE *cxip_trace_fid; +uint64_t cxip_trace_mask; + +/* Static initialization of default trace functions, can be overridden */ +cxip_trace_t cxip_trace_attr cxip_trace_fn = cxip_trace; +cxip_trace_flush_t cxip_trace_flush_fn = cxip_trace_flush; +cxip_trace_close_t cxip_trace_close_fn = cxip_trace_close; +cxip_trace_enable_t cxip_trace_enable_fn = cxip_trace_enable; + +/* Get environment variable as string representation of int */ +static int getenv_int(const char *name) +{ + char *env; + int value; + + value = -1; + env = getenv(name); + if (env) + sscanf(env, "%d", &value); + return value; +} + +void cxip_trace_init(void) +{ + const char *fname; + + if (cxip_trace_initialized) + return; + + cxip_trace_initialized = true; + cxip_trace_enabled = !!getenv("CXIP_TRACE_ENABLE"); + cxip_trace_append = !!getenv("CXIP_TRACE_APPEND"); + cxip_trace_linebuf = !!getenv("CXIP_TRACE_LINEBUF"); + cxip_trace_rank = getenv_int("PMI_RANK"); + cxip_trace_numranks = getenv_int("PMI_SIZE"); + cxip_trace_append = getenv("CXIP_TRACE_APPEND"); + fname = getenv("CXIP_TRACE_FILENAME"); + + cxip_trace_mask = 0L; + if (getenv("CXIP_TRC_CTRL")) + cxip_trace_set(CXIP_TRC_CTRL); + if (getenv("CXIP_TRC_ZBCOLL")) + cxip_trace_set(CXIP_TRC_ZBCOLL); + if (getenv("CXIP_TRC_CURL")) + cxip_trace_set(CXIP_TRC_CURL); + if (getenv("CXIP_TRC_COLL_PKT")) + cxip_trace_set(CXIP_TRC_COLL_PKT); + if (getenv("CXIP_TRC_COLL_JOIN")) + cxip_trace_set(CXIP_TRC_COLL_JOIN); + if (getenv("CXIP_TRC_COLL_DEBUG")) + cxip_trace_set(CXIP_TRC_COLL_DEBUG); + if (getenv("CXIP_TRC_TEST_CODE")) + cxip_trace_set(CXIP_TRC_TEST_CODE); + + if (!fname) + fname = "trace"; + if (fname) { + asprintf(&cxip_trace_filename, "./%s%d", + fname, cxip_trace_rank); + cxip_trace_fid = fopen(cxip_trace_filename, + cxip_trace_append ? "a" : "w"); + if (!cxip_trace_fid) { + fprintf(stderr, "open(%s) failed: %s\n", + cxip_trace_filename, strerror(errno)); + } + if (cxip_trace_linebuf && cxip_trace_fid) + setlinebuf(cxip_trace_fid); + } +} + +void cxip_trace_flush(void) +{ + cxip_trace_init(); + if (cxip_trace_fid) { + fflush(cxip_trace_fid); + fsync(fileno(cxip_trace_fid)); + } +} + +void cxip_trace_close(void) +{ + cxip_trace_init(); + if (cxip_trace_fid) { + cxip_trace_flush(); + fclose(cxip_trace_fid); + cxip_trace_fid = NULL; + cxip_trace_initialized = false; + } +} + +int cxip_trace_attr cxip_trace(const char *fmt, ...) +{ + va_list args; + char *str; + int len; + + cxip_trace_init(); + if (!cxip_trace_enabled) + return 0; + va_start(args, fmt); + len = vasprintf(&str, fmt, args); + va_end(args); + if (len >= 0) { + len = fprintf(cxip_trace_fid, "[%2d|%2d] %s", + cxip_trace_rank, cxip_trace_numranks, str); + free(str); + } + return len; +} + +bool cxip_trace_enable(bool enable) +{ + bool was_enabled = cxip_trace_enabled; + + cxip_trace_init(); + cxip_trace_enabled = enable; + return was_enabled; +} diff --git a/prov/cxi/src/cxip_txc.c b/prov/cxi/src/cxip_txc.c new file mode 100644 index 00000000000..a15ed8ee65b --- /dev/null +++ b/prov/cxi/src/cxip_txc.c @@ -0,0 +1,695 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation. All rights reserved. 
+ * Copyright (c) 2019-2023 Hewlett Packard Enterprise Development LP + */ + +/* CXI TX Context Management */ + +#include "config.h" + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* 8 Rendezvous, 2 RMA and 2 Atomic + 4 extra */ +#define CXIP_INTERNAL_TX_REQS 16 + +struct cxip_md *cxip_txc_ibuf_md(void *ibuf) +{ + return ofi_buf_hdr(ibuf)->region->context; +} + +/* + * cxip_txc_ibuf_alloc() - Allocate an inject buffer. + * + * Caller must hold txc->ep_obj.lock + */ +void *cxip_txc_ibuf_alloc(struct cxip_txc *txc) +{ + void *ibuf; + + ibuf = (struct cxip_req *)ofi_buf_alloc(txc->ibuf_pool); + if (ibuf) + CXIP_DBG("Allocated inject buffer: %p\n", ibuf); + else + CXIP_WARN("Failed to allocate inject buffer\n"); + + return ibuf; +} + +/* + * cxip_txc_ibuf_free() - Free an inject buffer. + * + * Caller must hold txc->ep_obj.lock + */ +void cxip_txc_ibuf_free(struct cxip_txc *txc, void *ibuf) +{ + ofi_buf_free(ibuf); + CXIP_DBG("Freed inject buffer: %p\n", ibuf); +} + +int cxip_ibuf_chunk_init(struct ofi_bufpool_region *region) +{ + struct cxip_txc *txc = region->pool->attr.context; + struct cxip_md *md; + int ret; + + ret = cxip_map(txc->domain, region->mem_region, + region->pool->region_size, OFI_MR_NOCACHE, &md); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map inject buffer chunk\n"); + return ret; + } + + region->context = md; + + return FI_SUCCESS; +} + +void cxip_ibuf_chunk_fini(struct ofi_bufpool_region *region) +{ + cxip_unmap(region->context); +} + +int cxip_txc_ibuf_create(struct cxip_txc *txc) +{ + struct ofi_bufpool_attr bp_attrs = {}; + int ret; + + bp_attrs.size = CXIP_INJECT_SIZE; + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT16_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.alloc_fn = cxip_ibuf_chunk_init; + bp_attrs.free_fn = cxip_ibuf_chunk_fini; + bp_attrs.context = txc; + + /* Avoid creating VA holes outside the buffer pool + * if CXI_FORK_SAFE/CXI_FORK_SAFE_HP is enabled. + */ + if (cxip_env.fork_safe_requested) + bp_attrs.flags = OFI_BUFPOOL_NONSHARED; + + ret = ofi_bufpool_create_attr(&bp_attrs, &txc->ibuf_pool); + if (ret) + ret = -FI_ENOMEM; + + return ret; +} + +/* + * cxip_tx_id_alloc() - Allocate a TX ID. + * + * TX IDs are assigned to Put operations that need to be tracked by the target. + * One example of this is a Send with completion that guarantees match + * completion at the target. This only applies to eager, unexpected Sends. + * + * Caller must hold txc->ep_obj.lock + */ +int cxip_tx_id_alloc(struct cxip_txc *txc, void *ctx) +{ + int id; + + id = ofi_idx_insert(&txc->tx_ids, ctx); + if (id < 0 || id >= CXIP_TX_IDS) { + CXIP_DBG("Failed to allocate TX ID: %d\n", id); + if (id > 0) + ofi_idx_remove(&txc->tx_ids, id); + + return -FI_ENOSPC; + } + + CXIP_DBG("Allocated ID: %d\n", id); + + return id; +} + +/* + * cxip_tx_id_free() - Free a TX ID. + * + * Caller must hold txc->ep_obj.lock + */ +int cxip_tx_id_free(struct cxip_txc *txc, int id) +{ + if (id < 0 || id >= CXIP_TX_IDS) + return -FI_EINVAL; + + ofi_idx_remove(&txc->tx_ids, id); + CXIP_DBG("Freed ID: %d\n", id); + + return FI_SUCCESS; +} + +/* Caller must hold txc->ep_obj.lock */ +void *cxip_tx_id_lookup(struct cxip_txc *txc, int id) +{ + return ofi_idx_lookup(&txc->tx_ids, id); +} + +/* + * cxip_rdzv_id_alloc() - Allocate a rendezvous ID. + * + * A Rendezvous ID are assigned to rendezvous Send operation. 
The ID is used by + * the target to differentiate rendezvous Send operations initiated by a source. + * + * Caller must hold txc->ep_obj->lock. + */ +int cxip_rdzv_id_alloc(struct cxip_txc *txc, struct cxip_req *req) +{ + struct indexer *rdzv_ids; + int max_rdzv_id; + int id_offset; + int id; + + /* FI_TAGGED sends by definition do not support FI_MULTI_RECV; + * they can utilize the pool of rendezvous ID [256 to 32K-1]. + * FI_MSG which supports FI_MULTI_RECV is restricted to a rendezvous + * ID range of [0 to 255]. + */ + if (req->send.tagged) { + rdzv_ids = &txc->rdzv_ids; + max_rdzv_id = CXIP_RDZV_IDS; + id_offset = CXIP_RDZV_IDS_MULTI_RECV; + } else { + rdzv_ids = &txc->msg_rdzv_ids; + max_rdzv_id = CXIP_RDZV_IDS_MULTI_RECV; + id_offset = 0; + } + + id = ofi_idx_insert(rdzv_ids, req); + if (id < 0 || id + id_offset >= max_rdzv_id) { + CXIP_DBG("Failed to allocate rdzv ID: %d\n", id); + if (id > 0) + ofi_idx_remove(rdzv_ids, id); + + return -FI_ENOSPC; + } + + id += id_offset; + CXIP_DBG("Allocated ID: %d\n", id); + + return id; +} + +/* + * cxip_rdzv_id_free() - Free a rendezvous ID. + * + * Caller must hold txc->ep_obj->lock. + */ +int cxip_rdzv_id_free(struct cxip_txc *txc, int id) +{ + if (id < 0 || id >= CXIP_RDZV_IDS) + return -FI_EINVAL; + + CXIP_DBG("Freed RDZV ID: %d\n", id); + + /* ID value indicates which pool it comes from */ + if (id >= CXIP_RDZV_IDS_MULTI_RECV) { + id -= CXIP_RDZV_IDS_MULTI_RECV; + ofi_idx_remove(&txc->rdzv_ids, id); + } else { + ofi_idx_remove(&txc->msg_rdzv_ids, id); + } + + return FI_SUCCESS; +} + +/* Caller must hold txc->ep_obj->lock. */ +void *cxip_rdzv_id_lookup(struct cxip_txc *txc, int id) +{ + + if (id >= CXIP_RDZV_IDS_MULTI_RECV) { + id -= CXIP_RDZV_IDS_MULTI_RECV; + return ofi_idx_lookup(&txc->rdzv_ids, id); + } + return ofi_idx_lookup(&txc->msg_rdzv_ids, id); +} + +/* + * txc_msg_init() - Initialize an RX context for messaging. + * + * Allocates and initializes hardware resources used for transmitting messages. + * + * Caller must hold ep_obj->lock + */ +static int txc_msg_init(struct cxip_txc *txc) +{ + int ret; + + /* Allocate TGQ for posting source data */ + ret = cxip_ep_cmdq(txc->ep_obj, false, FI_TC_UNSPEC, + txc->tx_evtq.eq, &txc->rx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate TGQ, ret: %d\n", ret); + return -FI_EDOMAIN; + } + + ret = cxip_rdzv_match_pte_alloc(txc, &txc->rdzv_pte); + if (ret) { + CXIP_WARN("Failed to allocate rendezvous PtlTE: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_put_rx_cmdq; + } + txc->rdzv_proto = cxip_env.rdzv_proto; + + CXIP_DBG("TXC RDZV PtlTE enabled: %p proto: %s\n", + txc, cxip_rdzv_proto_to_str(txc->rdzv_proto)); + + return FI_SUCCESS; + +err_put_rx_cmdq: + cxip_ep_cmdq_put(txc->ep_obj, false); + + return ret; +} + +/* + * txc_msg_fini() - Finalize TX context messaging. + * + * Free hardware resources allocated when the TX context was initialized for + * messaging. + * + * Caller must hold txc->ep_obj->lock. + */ +static int txc_msg_fini(struct cxip_txc *txc) +{ + int i; + + cxip_rdzv_match_pte_free(txc->rdzv_pte); + + for (i = 0; i < RDZV_NO_MATCH_PTES; i++) { + if (txc->rdzv_nomatch_pte[i]) + cxip_rdzv_nomatch_pte_free(txc->rdzv_nomatch_pte[i]); + } + + cxip_ep_cmdq_put(txc->ep_obj, false); + + return FI_SUCCESS; +} + +static size_t cxip_txc_get_num_events(struct cxip_txc *txc) +{ + size_t num_events; + + /* Need enough events to accommodate initiator credits which is + * based on TX attr size. 
+ */ + num_events = txc->attr.size; + + /* Worse case is an initiator credit needs two events (e.g. rendezvous + * send). + */ + num_events *= 2; + + /* For messaging, target initiator rendezvous gets has its own set of + * credits. These are always single event operations. + */ + num_events += cxip_env.sw_rx_tx_init_max; + + /* Account for internal operations. */ + num_events += CXIP_INTERNAL_TX_REQS; + + return num_events; +} + +/* + * cxip_txc_enable() - Enable a TX context for use. + * + * Called via fi_enable(). The context could be used in a standard endpoint or + * a scalable endpoint. + */ +int cxip_txc_enable(struct cxip_txc *txc) +{ + int ret = FI_SUCCESS; + size_t num_events; + + if (txc->enabled) + return FI_SUCCESS; + + if (!txc->send_cq) { + CXIP_WARN("Undefined send CQ\n"); + return -FI_ENOCQ; + } + + ret = cxip_txc_ibuf_create(txc); + if (ret) { + CXIP_WARN("Failed to create inject bufpool %d\n", ret); + return ret; + } + + /* Protected with ep_obj->lock */ + memset(&txc->rdzv_ids, 0, sizeof(txc->rdzv_ids)); + memset(&txc->msg_rdzv_ids, 0, sizeof(txc->msg_rdzv_ids)); + memset(&txc->tx_ids, 0, sizeof(txc->tx_ids)); + + num_events = cxip_txc_get_num_events(txc); + ret = cxip_evtq_init(&txc->tx_evtq, txc->send_cq, num_events, 0); + if (ret) { + CXIP_WARN("Failed to initialize TX event queue: %d, %s\n", + ret, fi_strerror(-ret)); + goto destroy_ibuf; + } + + ret = cxip_ep_cmdq(txc->ep_obj, true, txc->tclass, + txc->tx_evtq.eq, &txc->tx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate TX CMDQ, ret: %d\n", ret); + ret = -FI_EDOMAIN; + /* CQ disable will be done at CQ close */ + goto destroy_evtq; + } + + if (ofi_send_allowed(txc->attr.caps)) { + ret = txc_msg_init(txc); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to init TX CTX, ret: %d\n", ret); + goto put_tx_cmdq; + } + } + + txc->pid_bits = txc->domain->iface->dev->info.pid_bits; + txc->enabled = true; + + return FI_SUCCESS; + +put_tx_cmdq: + cxip_ep_cmdq_put(txc->ep_obj, true); +destroy_evtq: + cxip_evtq_fini(&txc->tx_evtq); +destroy_ibuf: + ofi_idx_reset(&txc->tx_ids); + ofi_idx_reset(&txc->rdzv_ids); + ofi_idx_reset(&txc->msg_rdzv_ids); + ofi_bufpool_destroy(txc->ibuf_pool); + + return ret; +} + +/* + * txc_cleanup() - Attempt to free outstanding requests. + * + * Outstanding commands may be dropped when the TX Command Queue is freed. + * This leads to missing events. Attempt to gather all events before freeing + * the TX CQ. If events go missing, resources will be leaked until the + * Completion Queue is freed. 
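The leak window described above is also visible to applications: completions that are never reaped cannot be recovered once queues are torn down. A caller-side sketch of draining a send CQ before fi_close(); the expected count and object names are illustrative:

#include <rdma/fabric.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_errno.h>

/* Sketch: poll the CQ until every outstanding operation has reported. */
static int drain_tx_completions(struct fid_cq *cq, size_t expected)
{
	struct fi_cq_tagged_entry entry;
	ssize_t ret;

	while (expected) {
		ret = fi_cq_read(cq, &entry, 1);
		if (ret == 1)
			expected--;
		else if (ret == -FI_EAGAIN)
			continue;	/* nothing ready yet, keep polling */
		else
			return (int)ret;	/* -FI_EAVAIL etc. on error */
	}

	return 0;
}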
+ */ +static void txc_cleanup(struct cxip_txc *txc) +{ + uint64_t start; + struct cxip_fc_peer *fc_peer; + struct dlist_entry *tmp; + + if (!ofi_atomic_get32(&txc->otx_reqs)) + goto free_fc_peers; + + cxip_evtq_req_discard(&txc->tx_evtq, txc); + + start = ofi_gettime_ms(); + while (ofi_atomic_get32(&txc->otx_reqs)) { + sched_yield(); + + cxip_evtq_progress(&txc->tx_evtq); + cxip_ep_ctrl_progress_locked(txc->ep_obj); + + if (ofi_gettime_ms() - start > CXIP_REQ_CLEANUP_TO) { + CXIP_WARN("Timeout waiting for outstanding requests.\n"); + break; + } + } + + assert(ofi_atomic_get32(&txc->otx_reqs) == 0); + +free_fc_peers: + dlist_foreach_container_safe(&txc->fc_peers, struct cxip_fc_peer, + fc_peer, txc_entry, tmp) { + dlist_remove(&fc_peer->txc_entry); + free(fc_peer); + } +} + +void cxip_txc_struct_init(struct cxip_txc *txc, const struct fi_tx_attr *attr, + void *context) +{ + dlist_init(&txc->ep_list); + ofi_atomic_initialize32(&txc->otx_reqs, 0); + dlist_init(&txc->msg_queue); + dlist_init(&txc->fc_peers); + + txc->context = context; + txc->attr = *attr; + txc->max_eager_size = cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; + txc->rdzv_eager_size = cxip_env.rdzv_eager_size; + txc->hmem = !!(attr->caps & FI_HMEM); +} + +/* + * cxip_txc_disable() - Disable a TX context for a base endpoint object. + * + * Free hardware resources allocated when the context was enabled. Called via + * fi_close(). + */ +void cxip_txc_disable(struct cxip_txc *txc) +{ + int ret; + + if (!txc->enabled) + return; + + txc->enabled = false; + txc_cleanup(txc); + + ofi_idx_reset(&txc->tx_ids); + ofi_idx_reset(&txc->rdzv_ids); + ofi_idx_reset(&txc->msg_rdzv_ids); + ofi_bufpool_destroy(txc->ibuf_pool); + + if (ofi_send_allowed(txc->attr.caps)) { + ret = txc_msg_fini(txc); + if (ret) + CXIP_WARN("Unable to destroy TX CTX, ret: %d\n", + ret); + } + + cxip_ep_cmdq_put(txc->ep_obj, true); + cxip_evtq_fini(&txc->tx_evtq); +} + +/* Caller must hold ep_obj->lock. */ +void cxip_txc_flush_msg_trig_reqs(struct cxip_txc *txc) +{ + struct cxip_req *req; + struct dlist_entry *tmp; + + /* Drain the message queue. */ + dlist_foreach_container_safe(&txc->msg_queue, struct cxip_req, req, + send.txc_entry, tmp) { + if (cxip_is_trig_req(req)) { + ofi_atomic_dec32(&txc->otx_reqs); + dlist_remove(&req->send.txc_entry); + cxip_unmap(req->send.send_md); + cxip_evtq_req_free(req); + } + } +} + +static bool cxip_txc_can_emit_op(struct cxip_txc *txc, + bool event_success_disabled) +{ + if (cxip_evtq_saturated(&txc->tx_evtq)) { + TXC_WARN(txc, "TX HW EQ saturated\n"); + return false; + } + + /* If taking a successful completion, limit outstanding operations */ + if (!event_success_disabled && + (ofi_atomic_get32(&txc->otx_reqs) >= txc->attr.size)) { + TXC_WARN(txc, "TXC attr size saturated\n"); + return false; + } + + return true; +} + +int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags) +{ + int ret; + + if (!cxip_txc_can_emit_op(txc, c_state->event_success_disable)) + return -FI_EAGAIN; + + /* Ensure correct traffic class is used. 
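The traffic class applied here originates in the endpoint's TX attributes. A sketch of how an application would request one through fi_getinfo() hints; FI_TC_LOW_LATENCY is only an example value, not a recommendation from this patch:

#include <string.h>
#include <rdma/fabric.h>

/* Sketch: build hints that ask the cxi provider for a low-latency class. */
static struct fi_info *cxi_hints_with_tclass(void)
{
	struct fi_info *hints = fi_allocinfo();

	if (!hints)
		return NULL;

	hints->fabric_attr->prov_name = strdup("cxi");
	hints->tx_attr->tclass = FI_TC_LOW_LATENCY;	/* example value */

	return hints;
}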
*/ + ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + if (ret) { + TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emit_idc_put(txc->tx_cmdq, c_state, put, buf, len, + flags); + if (ret) { + TXC_WARN(txc, "Failed to emit idc_put command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Kick the command queue. */ + cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), + ofi_atomic_get32(&txc->otx_reqs)); + + if (!c_state->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return FI_SUCCESS; +} + +int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_full_dma_cmd *dma, uint64_t flags) +{ + int ret; + + if (!cxip_txc_can_emit_op(txc, dma->event_success_disable)) + return -FI_EAGAIN; + + if (trig_cntr) { + ret = cxip_domain_dwq_emit_dma(txc->domain, vni, + tc, tc_type, trig_cntr, + trig_thresh, dma, flags); + if (ret) + TXC_WARN(txc, + "Failed to emit trigger dma command: %d:%s\n", + ret, fi_strerror(-ret)); + else if (!dma->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return ret; + } + + /* Ensure correct traffic class is used. */ + ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + if (ret) { + TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emit_dma(txc->tx_cmdq, dma, flags); + if (ret) { + TXC_WARN(txc, "Failed to emit dma command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Kick the command queue. */ + cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), + ofi_atomic_get32(&txc->otx_reqs)); + + if (!dma->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return FI_SUCCESS; +} + +int cxip_txc_emit_idc_amo(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush) +{ + int ret; + + if (!cxip_txc_can_emit_op(txc, c_state->event_success_disable)) + return -FI_EAGAIN; + + /* Ensure correct traffic class is used. */ + ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + if (ret) { + TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emic_idc_amo(txc->tx_cmdq, c_state, amo, flags, + fetching, flush); + if (ret) { + TXC_WARN(txc, "Failed to emit idc_put command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Kick the command queue. 
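FI_MORE, checked above when ringing the command queue, lets a sender batch several operations behind a single doorbell. A hedged caller-side sketch; the msgs array is assumed to be fully populated by the caller:

#include <rdma/fabric.h>
#include <rdma/fi_rma.h>

/* Sketch: all but the final operation carry FI_MORE so the provider may
 * defer kicking its command queue until the batch is complete.
 */
static ssize_t post_write_batch(struct fid_ep *ep,
				const struct fi_msg_rma *msgs, size_t count)
{
	size_t i;
	ssize_t ret;

	for (i = 0; i < count; i++) {
		uint64_t flags = (i + 1 < count) ? FI_MORE : 0;

		ret = fi_writemsg(ep, &msgs[i], flags);
		if (ret)
			return ret;
	}

	return 0;
}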
*/ + cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), + ofi_atomic_get32(&txc->otx_reqs)); + + if (!c_state->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return FI_SUCCESS; +} + +int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_dma_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush) +{ + int ret; + + if (!cxip_txc_can_emit_op(txc, amo->event_success_disable)) + return -FI_EAGAIN; + + if (trig_cntr) { + ret = cxip_domain_dwq_emit_amo(txc->domain, vni, tc, + CXI_TC_TYPE_DEFAULT, trig_cntr, + trig_thresh, amo, flags, + fetching, flush); + if (ret) + TXC_WARN(txc, + "Failed to emit trigger amo command: %d:%s\n", + ret, fi_strerror(-ret)); + else if (!amo->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return ret; + } + + /* Ensure correct traffic class is used. */ + ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + if (ret) { + TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emit_dma_amo(txc->tx_cmdq, amo, flags, fetching, flush); + if (ret) { + TXC_WARN(txc, "Failed to emit DMA amo command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Kick the command queue. */ + cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), + ofi_atomic_get32(&txc->otx_reqs)); + + if (!amo->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return FI_SUCCESS; +} diff --git a/prov/cxi/src/cxip_zbcoll.c b/prov/cxi/src/cxip_zbcoll.c new file mode 100644 index 00000000000..7f59b2ba599 --- /dev/null +++ b/prov/cxi/src/cxip_zbcoll.c @@ -0,0 +1,1686 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2022 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cxip.h" + +/* Distinctions: + * CXIP_DBG() is generally useless in a multi-node collective. Use TRACE(). + * CXIP_INFO() is generally useless in internal code of this sort. + * CXIP_WARN() is used to leave a log trace to identify failures. + * -FI_ENOMEM is not logged, since where it occurs is irrelevant: all + * memory allocation in this module is small, so heap exhaustion + * indicates a systemic failure. + * -FI_EAGAIN and -FI_EBUSY are not logged, as they are transient + */ +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_ZBCOLL, fmt, ##__VA_ARGS__) + +/* see data packing structures below */ +#define ZB_MAP_BITS 54 +#define ZB_GRPID_BITS 6 +#define ZB_SIM_BITS 5 +#define ZB_SIM_MAX (1 << (ZB_SIM_BITS)) +#define ZB_NEG_BIT ((ZB_MAP_BITS) - 1) + +static int zbdata_send_cb(struct cxip_ctrl_req *req, + const union c_event *event); + +/**************************************************************************** + * OVERVIEW + * + * There are two related components in this file. + * - An abstract radix tree constructor + * - A collective implemention built on the Zero-Buffer Put control channel. + * + * The basic operational flow is as follows: + * - cxip_zbcoll_init() prepares the system for zbcoll collectives. + * - cxip_zbcoll_alloc() allocates and configures a collective structure. 
+ * - cxip_zbcoll_getgroup() negotiates a collective identifier (one time). + * - cxip_zbcoll_barrier() performs a barrier (can be repeated). + * - cxip_zbcoll_broadcast() performs a broadcast (can be repeated). + * - cxip_zbcoll_progress() progresses getgroup/barrier/broadcast/reduce on ep. + * - cxip_zbcoll_free() releases the collective structure and identifier. + * - cxip_zbcoll_fini() releases all collectives and cleans up. + * + * Any number of collective structures can be created, spanning the same, or + * different node-sets. + * + * To enable the structure, it must acquire a group identifier using the + * getgroup operation, which is itself a collective operation. Getgroup + * negotiates for and acquires one of 53 possible group identifiers (43 in + * simulation). The group identifier remains with that structure until the + * structure is deleted, allowing it to be used for multiple collective + * operations without renegotiating. + * + * Collective operations are concurrent for groups with different group + * identifiers. Collective operations for a single group are serialized, + * returning -FI_EAGAIN if there is already a collective operation in progress + * for that group. + * + * The getgroup, barrier, and broadcast functions support a callback stack that + * allows caller-defined callback functions to be stacked for execution upon + * completion of a collective. The callback can initiate a new collective on the + * same object. + * + * Note that this is NOT a general-purpose collective implementation. + */ + +/**************************************************************************** + * ABSTRACT RADIX TREE + * + * We lay out all of the node indices (0..maxnodes-1) in layers, as follows: + * + * RADIX 1: + * row: nodeidx + * 0: 0 + * 1: 1 + * 2: 2 + * ... + * + * RADIX 2: + * row: nodeidx + * 0: 0 + * 1: 1, 2 + * 2: 3, 4, 5, 6 + * 3: 7, 8, 9, 10, 11, 12, 13, 14 + * ... + * + * RADIX 3: + * row: nodeidx + * 0: 0 + * 1: 1, 2, 3 + * 2: 4, 5, 6, 7, 8, 9, 10, 11, 12 + * 3: 13, 14, 15, 16, 17, 18, ... 38, 39 + * ... + * + * The parent of any node is in the row above it, and the children are in the + * row below it. The width of any row is (RADIX ^ row), so for every node, there + * can be up to RADIX children, and one parent, with the exception of the root + * node (no parent). + */ + +/** + * @brief Compute row and column for a given node index. + * + * @param radix : radix of tree + * @param nodeidx : node index + * @param row : returned row of this node + * @param col : returned offset of this node in the row + * @param siz : returned size of the row, (0 <= col < siz) + */ +void cxip_tree_rowcol(int radix, int nodeidx, int *row, int *col, int *siz) +{ + int rownum = 0; + int rowcum = 0; + int rowsiz = 1; + + *row = 0; + *col = 0; + *siz = rowsiz; + if (radix < 1) + return; + while (nodeidx > rowcum) { + rowsiz *= radix; + *row = rownum + 1; + *col = nodeidx - rowcum - 1; + *siz = rowsiz; + rowcum += rowsiz; + rownum += 1; + } +} + +/** + * @brief Compute the node index for a give row and column. + * + * Note that illegal columns can be specified for a row, which results + * in a return index of -1. 
+ * + * @param radix : radix of tree + * @param row : row of node + * @param col : column of node + * @param nodeidx : returned node index, or -1 if illegal + */ +void cxip_tree_nodeidx(int radix, int row, int col, int *nodeidx) +{ + int rownum = 0; + int rowcum = 0; + int rowsiz = 1; + + *nodeidx = 0; + while (radix && rownum < row) { + rowsiz *= radix; + *nodeidx = rowcum + col + 1; + rowcum += rowsiz; + rownum += 1; + } + if (col >= rowsiz) + *nodeidx = -1; +} + +/** + * @brief Provide the relatives (parent, children) of a node + * + * The rels array must be provided, and must have RADIX+1 entries. + * + * The parent position [0] will always be populated, but with -1 if the node is + * the root node. + * + * Only valid child positions in [1..RADIX] will be populated. + * + * This returns the total number of positions populated. + * + * If radix < 1, there can be no relatives, and this returns 0. + * + * @param radix : radix of tree + * @param nodeidx : index of node to find relatives for + * @param maxnodes : maximum valid node indices available + * @param rels : relative index array + * @return int : number of valid relatives found + */ +int cxip_tree_relatives(int radix, int nodeidx, int maxnodes, int *rels) +{ + int row, col, siz, idx, n; + + if (radix < 1 || !maxnodes || !rels) + return 0; + + cxip_tree_rowcol(radix, nodeidx, &row, &col, &siz); + + idx = 0; + if (row) + cxip_tree_nodeidx(radix, row - 1, col / radix, &rels[idx++]); + else + rels[idx++] = -1; + + cxip_tree_nodeidx(radix, row+1, col*radix, &nodeidx); + for (n = 0; n < radix; n++) { + if ((nodeidx + n) >= maxnodes) + break; + rels[idx++] = nodeidx + n; + } + + return idx; +} + +/**************************************************************************** + * @brief Zero-buffer collectives. + * + * ZB collectives are intended for implementation of the fi_join_collective() + * function. + * + * The ep_obj has a container structure of type cxip_ep_zbcoll_obj, which + * maintains endpoint-global state for all zb collectives on that NIC endpoint. + * We refer to this as the zbcoll object, and it is an extension of the endpoint + * itself. + * + * The zbcoll object contains dynamic zb objects, each representing a collective + * group. + * + * Each zb object contains one or more state structures, which support simulated + * operations on a single node. Production code will use only one state for the + * NID. + * + * Diagnostic counters are maintained: + * + * - ack_count == successful sends + * - err_count == failed sends + * - rcv_count == successful receives + * - dsc_count == discarded receives + */ + +static inline void _setbit(uint64_t *mask, int bit) +{ + *mask |= (1ULL << bit); +} + +static inline void _clrbit(uint64_t *mask, int bit) +{ + *mask &= ~(1ULL << bit); +} + +void cxip_zbcoll_get_counters(struct cxip_ep_obj *ep_obj, uint32_t *dsc, + uint32_t *err, uint32_t *ack, uint32_t *rcv) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + zbcoll = &ep_obj->zbcoll; + if (dsc) + *dsc = ofi_atomic_get32(&zbcoll->dsc_count); + if (err) + *err = ofi_atomic_get32(&zbcoll->err_count); + if (ack) + *ack = ofi_atomic_get32(&zbcoll->ack_count); + if (rcv) + *rcv = ofi_atomic_get32(&zbcoll->rcv_count); +} + +/** + * @brief Link a secondary zb object to a primary zb object. + * + * This is used with multi-zb object simulation. The basic (single-zb) model is + * that the zb object has num_caddrs state structures to manage the state of + * each simulated destination address, each of which has a backpointer to the + * containing zb object. 
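A worked example for the two tree helpers above, assuming their prototypes are visible through cxip.h: with radix 2 and 15 nodes, node 5 lands in row 2, column 2, with parent 2 and children 11 and 12, matching the RADIX 2 layout sketched earlier in this file.

#include <assert.h>
#include "cxip.h"

static void tree_example(void)
{
	int row, col, siz;
	int rels[3];	/* radix + 1 entries */
	int n;

	cxip_tree_rowcol(2, 5, &row, &col, &siz);
	assert(row == 2 && col == 2 && siz == 4);

	n = cxip_tree_relatives(2, 5, 15, rels);
	assert(n == 3);
	assert(rels[0] == 2);			/* parent */
	assert(rels[1] == 11 && rels[2] == 12);	/* children */
}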
+ * + * For the multi-zb simulation, the root zb (simrank == 0) has num_caddrs state + * structures, but each points back to a different zb object. When packets are + * received, the are initially received by the root zb, which determines the + * state structure to use from the simulated 'dst' address embedded in the + * packet, and that is then re-routed through the state[dst].zb pointer to the + * correct target zb object and state[dst] on that object. + * + * - zb[0]->state[0].zb -> zb[0] + * - zb[0]->state[1].zb -> zb[1] + * - ... + * - zb[0]->state[n].zb -> zb[n] + * + * This also modifies each of the other structures to backlink state[0] to the + * root structure. This allows data from the leaf nodes to be placed in the root + * structure for sending. + * + * - zb[1]->state[0].zb -> zb[0] + * - ... + * - zb[n]->state[0].zb -> zb[0] + * + * Note that only zb->state[0].zb is a "real" zb pointer. If the pointer + * reference is needed, use the BASEZB() macro below. + * + * @param zb0 : primary (root) zb structure + * @param zb : secondary zb structure to link to the root + * @return int error if conditions aren't met + */ + +#define BASEZB(zb) zb->state[0].zb + +int cxip_zbcoll_simlink(struct cxip_zbcoll_obj *zb0, + struct cxip_zbcoll_obj *zb1) +{ + int i; + + if (zb0 == zb1) + return FI_SUCCESS; + if (!zb0 || !zb1) { + CXIP_WARN("arguments cannot be NULL\n"); + return -FI_EINVAL; + } + if (zb0->num_caddrs != zb1->num_caddrs) { + CXIP_WARN("address counts do not match\n"); + return -FI_EINVAL; + } + for (i = 0; i < zb0->num_caddrs; i++) + if (!CXIP_ADDR_EQUAL(zb0->caddrs[i], zb1->caddrs[i])) { + CXIP_WARN("address values do not match caddr[%d]\n", i); + return -FI_EINVAL; + } + /* zb0 must be root */ + if (zb0->simrank != 0) { + CXIP_WARN("zb0 simrank != 0\n"); + return -FI_EINVAL; + } + /* zb1 must be valid simrank */ + if (zb1->simrank <= 0 || zb1->simrank >= zb1->num_caddrs) { + CXIP_WARN("zb1 simrank %d invalid, max = %d\n", + zb1->simrank, zb1->num_caddrs); + return -FI_EINVAL; + } + /* may only link once for a simrank */ + if (zb0->state[zb1->simrank].zb != zb0) { + CXIP_WARN("zb0 state[%d] cannot be re-linked\n", zb1->simrank); + return -FI_EINVAL; + } + /* may not re-link after linking is done */ + if (zb1->state[0].zb != zb1) { + CXIP_WARN("zb1 state[0] cannot be re-linked\n"); + return -FI_EINVAL; + } + + /* link each to the other */ + zb0->simref++; + zb0->state[zb1->simrank].zb = zb1; + zb1->state[zb0->simrank].zb = zb0; + + return FI_SUCCESS; +} + +/* utility to free a zbcoll object */ +static void _free_zbcoll(struct cxip_zbcoll_obj *zb) +{ + int i; + + if (zb->state) + for (i = 0; i < zb->simcount; i++) + free(zb->state[i].relatives); + cxip_zbcoll_rlsgroup(zb); + free(zb->caddrs); + free(zb->state); + free(zb->shuffle); + free(zb); +} + +/** + * @brief Free zb object. + * + * This flushes the callback stack, and releases the group identifier associated + * with this zb object. It also removes the backreference in the multi-zb + * simulation. + * + * In the multi-zb simulation, it must defer actual deletion until all of the zb + * objects in the collective have been deleted, since the tree may still be in + * use until all of have deleted. 
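+ *
+ * For example (illustrative numbers), in a four-object multi-zb simulation the
+ * root holds simref == 4 (1 from allocation plus 3 links), so the first three
+ * calls to cxip_zbcoll_free() only decrement that count; the fourth call frees
+ * the three linked objects and then the root itself.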
+ * + * @param zb : zb object to free + */ +void cxip_zbcoll_free(struct cxip_zbcoll_obj *zb) +{ + int i; + + if (!zb) + return; + + /* edge case in some tests */ + if (!zb->state) { + _free_zbcoll(zb); + return; + } + if (zb->simrank >= 0) { + zb = BASEZB(zb); + if (--zb->simref) + return; + for (i = 1; i < zb->simcount; i++) { + _free_zbcoll(zb->state[i].zb); + } + } + _free_zbcoll(zb); +} + +/* configure the zb object -- error frees zb in caller */ +static int _state_config(struct cxip_zbcoll_obj *zb) +{ + struct cxip_zbcoll_state *zbs; + int radix, n; + + radix = cxip_env.zbcoll_radix; + + zb->state = calloc(zb->simcount, sizeof(*zbs)); + if (!zb->state) + return -FI_ENOMEM; + + for (n = 0; n < zb->simcount; n++) { + zbs = &zb->state[n]; + zbs->zb = zb; + + /* do not create relatives if no addrs */ + if (!zb->num_caddrs) + continue; + + /* simcount == 1, production: user specifies rank + * simcount > 1, simulation: each state has its own rank + */ + zbs->grp_rank = (zb->simcount == 1) ? zb->simrank : n; + + /* create space for relatives */ + zbs->relatives = calloc(radix + 1, sizeof(*zbs->relatives)); + if (!zbs->relatives) + return -FI_ENOMEM; + + /* This produces indices in an abstract tree */ + zbs->num_relatives = + cxip_tree_relatives(radix, zbs->grp_rank, + zb->num_caddrs, zbs->relatives); + } + return FI_SUCCESS; +} + +/* sort out the various configuration cases -- error frees zb in caller */ +static int _zbcoll_config(struct cxip_zbcoll_obj *zb, int num_addrs, + fi_addr_t *fiaddrs) +{ + int i, ret; + + if (!num_addrs) { + /* test case: no nics, send-to-self only */ + zb->num_caddrs = 1; + zb->caddrs = calloc(zb->num_caddrs, sizeof(*zb->caddrs)); + if (!zb->caddrs) + return -FI_ENOMEM; + zb->caddrs[0] = zb->ep_obj->src_addr; + zb->simrank = 0; + zb->simcount = 1; + } else if (zb->simrank != ZB_NOSIM) { + /* test case: regression with simulated addresses */ + if (num_addrs > ZB_SIM_MAX || zb->simrank >= num_addrs) { + CXIP_WARN("Simulation maximum size = %d\n", + MIN(num_addrs, ZB_SIM_MAX)); + return -FI_EINVAL; + } + zb->num_caddrs = num_addrs; + zb->caddrs = calloc(zb->num_caddrs, sizeof(*zb->caddrs)); + if (!zb->caddrs) + return -FI_ENOMEM; + for (i = 0; i < num_addrs; i++) { + zb->caddrs[i].nic = i; + zb->caddrs[i].pid = zb->ep_obj->src_addr.pid; + } + zb->simcount = num_addrs; + } else { + /* production case: real addresses supplied */ + zb->num_caddrs = num_addrs; + zb->caddrs = calloc(zb->num_caddrs, sizeof(*zb->caddrs)); + if (!zb->caddrs) + return -FI_ENOMEM; + zb->simrank = -1; + for (i = 0; i < num_addrs; i++) { + ret = cxip_av_lookup_addr(zb->ep_obj->av, + fiaddrs[i], &zb->caddrs[i]); + if (ret) { + CXIP_WARN("Lookup on fiaddr=%ld failed\n", + fiaddrs[i]); + return -FI_ECONNREFUSED; + } + if (zb->simrank < 0 && + CXIP_ADDR_EQUAL(zb->caddrs[i], + zb->ep_obj->src_addr)) + zb->simrank = i; + } + if (zb->simrank < 0) { + CXIP_WARN("Endpoint addr not in addrs[]\n"); + return -FI_ECONNREFUSED; + } + zb->simcount = 1; + } + + /* find the index of the source address in the address list */ + return _state_config(zb); +} + +/** + * @brief Allocate and configure a zb object. + * + * The zb object represents a radix tree through multiple nics that can perform + * sequential synchronizing collectives. It can be reused. + * + * This supports several test modes. + * + * If num_nics == 0, the zb object can only be used to test cxip_zbcoll_send(), + * to exercise a send-to-self using the ctrl channel, and will work with NETSIM. 
+ * + * If simrank is ZB_NOSIM, this will be used to perform real collectives over + * the group specified by the specified nics. The self-address of the node + * calling this must be a member of this set. + * + * If simrank is ZB_ALLSIM, this will be used to perform an internal simulation + * of all the nics with a single call to a collective operation. + * + * If simrank is >= 0, then it represents the rank to be simulated by this zb + * object. The test will need to create num_nics zb objects, each with a + * different rank, and the zb collective operation will have to be initiated on + * each of these to complete the collective. + * + * Simulation is limited to (1 << ZB_SIM_BITS) simulated endpoints. Simulation + * also reduces the number of group identifiers that can be used. + * + * nid[0] is defined as the collective root nid. + * + * @param ep_obj : NIC endpoint object + * @param num_addrs : number of fabric addresses + * @param fiaddrs : fabric addresses + * @param simrank : simulated rank + * @param zbp : returned zb object + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_alloc(struct cxip_ep_obj *ep_obj, + int num_addrs, fi_addr_t *fiaddrs, int simrank, + struct cxip_zbcoll_obj **zbp) +{ + struct cxip_zbcoll_obj *zb; + int ret; + + if (!zbp) { + CXIP_WARN("zbp is NULL\n"); + return -FI_EINVAL; + } + + /* allocate the zb object */ + *zbp = NULL; + zb = calloc(1, sizeof(*zb)); + if (!zb) + return -FI_ENOMEM; + dlist_init(&zb->ready_link); + zb->ep_obj = ep_obj; + zb->grpmskp = &ep_obj->zbcoll.grpmsk; + zb->grpid = ZB_NEG_BIT; + zb->simrank = simrank; + zb->simref = 1; + + /* configure the zb object */ + ret = _zbcoll_config(zb, num_addrs, fiaddrs); + if (ret) { + cxip_zbcoll_free(zb); + CXIP_WARN("Failed to configure zb object = %s\n", + fi_strerror(-ret)); + return ret; + } + + /* return the zb object */ + *zbp = zb; + return FI_SUCCESS; +} + +/** + * Data packing structures. + * + * This defines the specific bit meanings in the 64-bit zb put packet. Bit + * mapping could be modified, see considerations below. + * + * Considerations for the (production) network field: + * + * - dat MUST hold a multicast address and hardware root data + * - grpid size limits the number of concurrent zbcoll operations + * - sim requires only one bit and applies only to devel testing + * - pad is fixed by the control channel implementation + * + * Implementation of the negotiation operation requires that dat contain a + * bitmap. The choice of 54 allows for 54 grpid values (0-53), which will fit + * into a 6-bit grpid value. This is a large number for concurrencies. The grpid + * field could be reduced to 5 bits, offering only 32 concurrent operations. The + * map bits should then be reduced to 32, which would free up 23 bits for other + * information during negotiation, should extra bits be required. + * + * For broadcast, the full dat field is available for multicast information. The + * multicast address is currently 13 bits. Future revisions of Rosetta may + * increase this. The remaining bits can be used for a representation of the + * root node. A full caddr would require 32 bits, while using a 32-bit index + * into the fi_av_set would allow for a collective spanning up to 4 billion + * endpoints. This allows the multicast address to expand by another 9 bits, for + * a total of 22 bits, or 4 million multicast addresses. 
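+ *
+ * For reference, assuming the sizing described above (ZB_MAP_BITS = 54,
+ * ZB_GRPID_BITS = 6, ZB_SIM_BITS = 5), both layouts account for all 64 bits:
+ *
+ * - net: 54 (dat) + 6 (grpid) + 1 (sim) + 3 (pad) = 64
+ * - sim: 44 (dat) + 5 (src) + 5 (dst) + 6 (grpid) + 1 (sim) + 3 (pad) = 64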
+ * + * Considerations for the simulation fields: + * + * - src and dst must have the same number of bits + * - src/dst bits constrain the size of the simulated zbcoll tree + */ +union packer { + struct { + uint64_t dat: (ZB_MAP_BITS - 2*ZB_SIM_BITS); + uint64_t src: ZB_SIM_BITS; + uint64_t dst: ZB_SIM_BITS; + uint64_t grpid: ZB_GRPID_BITS; + uint64_t sim: 1; + uint64_t pad: 3; + } sim __attribute__((__packed__)); + struct { + uint64_t dat: ZB_MAP_BITS; + uint64_t grpid: ZB_GRPID_BITS; + uint64_t sim: 1; + uint64_t pad: 3; + } net __attribute__((__packed__)); + uint64_t raw; +}; + + +/* pack data */ +static inline uint64_t zbpack(int sim, int src, int dst, int grpid, + uint64_t dat) +{ + union packer x = {.raw = 0}; + if (sim) { + x.sim.sim = 1; + x.sim.src = src; + x.sim.dst = dst; + x.sim.grpid = grpid; + x.sim.dat = dat; + } else { + x.sim.sim = 0; + x.net.grpid = grpid; + x.net.dat = dat; + } + return x.raw; +} + +/* unpack data */ +static inline int zbunpack(uint64_t data, int *src, int *dst, int *grpid, + uint64_t *dat) +{ + union packer x = {.raw = data}; + if (x.sim.sim) { + *src = x.sim.src; + *dst = x.sim.dst; + *grpid = x.sim.grpid; + *dat = x.sim.dat; + } else { + *src = 0; + *dst = 0; + *grpid = x.net.grpid; + *dat = x.net.dat; + } + return x.sim.sim; +} + +/** + * zbcoll state machine. + * + * The zbcollectives are intended to perform necessary synchronization among all + * NIDs participating in a fi_join_collective() operation. Every join will have + * its own set of NIDs, which may overlap with the NIDs used in another + * concurrently-executing fi_join_collective(). Thus, every NID may be + * participating simultaneously in a different number of join operations. + * + * Every process (NID) in the collective sits somewhere in a radix tree, with + * one parent as relative[0] (except for the root), and up to RADIX-1 children + * at relative[1,...]. + * + * The collective follows a two-stage data flow, first from children toward the + * root (upstream), then from root toward the children (downstream). + * + * Processes (NIDs) must wait for all children to report before forwarding their + * own contribution toward the root. When the children of the root all report, + * the root reflects the result back to its children, and completes. As each + * child receives from its parent, it propagates the result to its children, and + * completes. + * + * Packets are unrestricted, and thus receive confirmation ACK messages from the + * hardware, or NAK and retry if delivery fails. + * + * The leaf (childless) NIDs contribute immediately and send the zb->dataval + * data upstream. Each parent collects data from its children and bitwise-ANDs + * the data with its own zb->dataval. When all children have reported to the + * root, the root sends the root contents of *zb->dataptr downstream, and the + * children simply propagate the received data to the leaves. This fixed + * behavior covers all our use-cases. + * + * For the barrier operation, zb->dataptr is set to NULL, and zb->dataval is set + * to zero. Both are effectively ignored. + * + * For the broadcast operation, zb->dataptr is a caller-supplied pointer, and + * zb->dataval is ignored. When all contributions have arrived on the root, the + * user-supplied value of *zb->dataptr is sent downstream, and propagated to all + * leaves, overwriting *zb->dataptr on each endpoint. + * + * For the reduce operation, zb->dataptr is set to a caller-supplied pointer, + * and zb->dataval is set to the value contained in this pointer. 
All of these + * caller values are sent upstream and reduced using a bitwise-AND reduction. + * When all contributions have arrived on the root, the value of the root + * *zb->dataptr is overwritten with the reduced zb->dataval, and then propagated + * to all leaves. + * + * Barrier, broadcast, and reduce must be preceded by a getgroup operation, to + * obtain a grpid value for the zb object. + * + * For the getgroup operation, zb->dataptr points to &zb->dataval, and + * zb->dataval contains a copy of the endpoint zbcoll grpmsk, which has a bit + * set to 1 for every grpid that is available for that NID. NIDs may have + * different grpmsk values. All of these masks are passed upstream through + * zb->dataval in a bitwise-AND reduction. When it reaches the root, the set + * bits in zb->dataval are the grpid values still available across all of the + * NIDs in the group. Because zb->dataptr == &zb-dataval, *zb->dataptr on the + * root contains the final reduced value, which is then propagated to all the + * leaves. + * + * The negotiated group id is the lowest numbered bit still set, and every NID + * computes this from the bitmask. + * + * It is possible for all group ID values to be exhausted. In this case, the + * getgroup operation will report -FI_EBUSY, and the caller should retry until a + * join operation completes, releasing one of the group ID values. If zb + * collective objects are never released, new operations will be blocked + * indefinitely. + * + * Getgroup operations are always serialized across the entire endpoint. + * Attempting a second getgroup on any (new) zb object before the first has + * completed will return -FI_EAGAIN. This is required to prevent race conditions + * that would issue the same group id to multiple zbcoll objects. + * + * We are externally guaranteed that all fi_join_collective() operations will + * observe proper collective ordering. Specifically, if any two joins share two + * or more NIDs, those joins will be initiated in the same order on all shared + * NIDs (possibly interspersed with other joins for unrelated groups). This + * behavior is necessary to ensure that all NIDs in a group obtain the same + * grpid value. + */ + +/* send a zbcoll packet -- wrapper for cxip_ctrl_msg_send(). + * + * Caller must hold ep_obj->lock. + */ +static void zbsend(struct cxip_ep_obj *ep_obj, uint32_t dstnic, uint32_t dstpid, + uint64_t mbv) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_ctrl_req *req; + int ret; + + zbcoll = &ep_obj->zbcoll; + + req = calloc(1, sizeof(*req)); + if (!req) { + CXIP_WARN("failed request allocation\n"); + ofi_atomic_inc32(&zbcoll->err_count); + return; + } + + req->ep_obj = ep_obj; + req->cb = zbdata_send_cb; + req->send.nic_addr = dstnic; + req->send.pid = dstpid; + req->send.mb.raw = mbv; + req->send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; + req->send.mb.ctrl_msg_type = CXIP_CTRL_MSG_ZB_DATA; + + /* If we can't send, collective cannot complete, just spin */ + do { + ret = cxip_ctrl_msg_send(req); + if (ret == -FI_EAGAIN) + cxip_ep_ctrl_progress_locked(ep_obj); + } while (ret == -FI_EAGAIN); + if (ret) { + CXIP_WARN("failed CTRL message send\n"); + ofi_atomic_inc32(&zbcoll->err_count); + } +} + +/* send a rejection packet */ +static void reject(struct cxip_ep_obj *ep_obj, int dstnic, int dstpid, + int sim, int src, int dst, int grpid) +{ + union cxip_match_bits mb; + + mb.raw = zbpack(sim, src, dst, grpid, 0); + zbsend(ep_obj, dstnic, dstpid, mb.raw); +} + +/** + * @brief Send a zero-buffer collective packet. 
+ * + * Creates a request packet that must be freed (or retried) in callback. + * + * This can physically send ONLY from the endpoint source address, but the src + * address can be provided for simulation. + * + * Only the lower bits of the 64-bit payload will be delivered, depending on the + * specific packing model. Upper control bits will be overwritten as necessary. + * + * @param zb : indexed zb structure + * @param srcidx : source address index (ignored unless simulating) + * @param dstidx : destination address index (required) + * @param payload : packet value to send + */ +void cxip_zbcoll_send(struct cxip_zbcoll_obj *zb, int srcidx, int dstidx, + uint64_t payload) +{ + union cxip_match_bits mb = {.raw = 0}; + struct cxip_addr dstaddr; + + /* resolve NETSIM testcase */ + TRACE("SND %04x->%04x %016lx\n", srcidx, dstidx, payload); + if (zb->simcount > 1) { + if (dstidx >= zb->simcount) { + ofi_atomic_inc32(&zb->ep_obj->zbcoll.err_count); + return; + } + /* alter the data to pass srcaddr/dstaddr */ + mb.zb_data = zbpack(1, srcidx, dstidx, zb->grpid, payload); + dstaddr = zb->ep_obj->src_addr; + } else { + /* srcidx, dstaddr are discarded in zbpack() */ + if (dstidx >= zb->num_caddrs) { + ofi_atomic_inc32(&zb->ep_obj->zbcoll.err_count); + return; + } + mb.zb_data = zbpack(0, 0, 0, zb->grpid, payload); + dstaddr = zb->caddrs[dstidx]; + } + zbsend(zb->ep_obj, dstaddr.nic, dstaddr.pid, mb.raw); +} + +/* set the group ID */ +static void setgrpid(struct cxip_zbcoll_obj *zb, uint64_t mask) +{ + uint64_t v; + int grpid; + + TRACE("search for grpid in %016lx\n", mask); + for (grpid = 0, v = 1; grpid <= ZB_NEG_BIT; grpid++, v<<=1) + if (v & mask) + break; + TRACE("found grpid = %d\n", grpid); + + /* manage a rejection due to a transient race condition */ + if (grpid > ZB_NEG_BIT) { + /* race condition reported */ + TRACE("cancel: getgroup transient race\n"); + zb->error = -FI_EAGAIN; + return; + } + + /* manage failure due to all grpid values in-use */ + if (grpid == ZB_NEG_BIT) { + /* no group IDs available */ + TRACE("cancel: getgroup no grpid available\n"); + zb->error = -FI_EBUSY; + return; + } + + /* we found our group ID */ + TRACE("set grpid = %d\n", grpid); + zb->grpid = grpid; + _clrbit(zb->grpmskp, grpid); +} + +/* mark a collective operation done */ +static inline void zbdone(struct cxip_zbcoll_state *zbs, uint64_t mbv) +{ + struct cxip_zbcoll_obj *zb; + struct cxip_ep_zbcoll_obj *zbcoll; + + /* getgroup: + * single-zb sim: refcnt=1, busy=N + * multi-zb sim: refcnt=N, busy=1 + * production : refcnt=1, busy=1 + * reduction: + * single-zb sim: refcnt=0, busy=N + * multi-zb sim: refcnt=0, busy=1 + * production : refcnt=0, busy=1 + */ + zb = zbs->zb; + zbcoll = &zbs->zb->ep_obj->zbcoll; + TRACE("%s: zb[%d] contribs=%d\n", __func__, zb->simrank, zbs->contribs); + + ofi_spin_lock(&zbcoll->lock); + zbs->contribs = 0; + TRACE("--REFCNT=%d in %s\n", zbcoll->refcnt, __func__); + TRACE("--BUSY =%d in %s\n", zb->busy, __func__); + /* Reduce the refcnt when we are no longer busy */ + if (zb->busy && !--zb->busy) { + if (zb->grpid == ZB_NEG_BIT) + setgrpid(zb, mbv); + /* Complete the negotiation on the last reference */ + if (!zbcoll->refcnt || !--zbcoll->refcnt) { + if (zbcoll->grptbl[ZB_NEG_BIT] == BASEZB(zb)) { + TRACE("GETGROUP FINISHED\n"); + zbcoll->grptbl[zb->grpid] = BASEZB(zb); + zbcoll->grptbl[ZB_NEG_BIT] = NULL; + } + } + TRACE(".. 
append to zb[%d]\n", zb->simrank); + dlist_insert_tail(&zb->ready_link, &zbcoll->ready_list); + } + ofi_spin_unlock(&zbcoll->lock); +} + +/* mark a collective send failure and end the collective */ +static void zbsend_fail(struct cxip_zbcoll_state *zbs, + struct cxip_ctrl_req *req, int ret) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + /* highly unexpected ret == -FI_EIO */ + zbcoll = &zbs->zb->ep_obj->zbcoll; + ofi_atomic_inc32(&zbcoll->err_count); + zbs->zb->error = ret; + free(req); +} + +/* root has no parent */ +static inline bool isroot(struct cxip_zbcoll_state *zbs) +{ + return (zbs->relatives[0] < 0); +} + +/* receive is complete when all contributors have spoken */ +static inline bool rcvcomplete(struct cxip_zbcoll_state *zbs) +{ + return (zbs->contribs == zbs->num_relatives); +} + +/* send upstream to the parent */ +static void zbsend_up(struct cxip_zbcoll_state *zbs, + uint64_t mbv) +{ + TRACE("%04x->%04x: %-10s %-10s %d/%d\n", + zbs->grp_rank, zbs->relatives[0], "", __func__, + zbs->contribs, zbs->num_relatives); + cxip_zbcoll_send(zbs->zb, zbs->grp_rank, zbs->relatives[0], mbv); + } + +/* send downstream to all of the children */ +static void zbsend_dn(struct cxip_zbcoll_state *zbs, + uint64_t mbv) +{ + int relidx; + + for (relidx = 1; relidx < zbs->num_relatives; relidx++) { + TRACE("%04x->%04x: %-10s %-10s\n", + zbs->grp_rank, zbs->relatives[relidx], + __func__, ""); + cxip_zbcoll_send(zbs->zb, zbs->grp_rank, + zbs->relatives[relidx], mbv); + } +} + +/* advance the upstream data flow, reverse direction at root */ +static void advance(struct cxip_zbcoll_state *zbs, uint64_t mbv) +{ + union cxip_match_bits mb = {.raw = mbv}; + + if (!rcvcomplete(zbs)) + return; + + if (isroot(zbs)) { + /* Reduction overwrites root data */ + if (zbs->dataptr && zbs->zb->reduce) + *zbs->dataptr = zbs->dataval; + /* The root always reflects its data down */ + mb.zb_data = (zbs->dataptr) ? (*zbs->dataptr) : 0; + zbsend_dn(zbs, mb.raw); + zbdone(zbs, mbv); + } else { + /* completed children send up */ + zbsend_up(zbs, mbv); + } +} + +/* standard message for discarding a packet (should be rare) */ +static void discard_msg(uint32_t inic, uint32_t ipid, char *msg) +{ + CXIP_WARN("discard: INI=%04x PID=%d: %s\n", inic, ipid, msg); + TRACE("discard: INI=%04x PID=%d: %s\n", inic, ipid, msg); +} + +/** + * @brief zbcoll message receive callback. + * + * This is called by the cxip_ctrl handler when a ZB collective packet is + * received. This function is "installed" at ep initialization, so it can begin + * receiving packets before a zb object has been allocated to receive the data. + * Races are handled by issuing a rejection packet back to the sender, which + * results in a retry. + * + * All incoming packets pass through this function. The group identifier is part + * of the packet format, and directs the packet to the zb object in the grptbl[] + * associated with that grpid, which allows for multiple concurrent collective + * operations. + * + * For the production case, there is only one zb associated with a grpid, with + * one state entry. The source address is provided to us by the NIC, and the + * destination is (obviously) this NIC. + * + * For the single-zb simulation case, there is only one zb associated with a + * grpid, with a state entry for each simulated collective endpoint. The + * simulated source and destination is present in the packet format, and this is + * used to identify the source, and direct the packet to the correct destination + * state object. 
The actual source address (always this NIC) is ignored. + * + * In the multi-zb simulation, there are multiple (linked) zb objects associated + * with the grpid, each with a state entry for each simulated endpoint. The + * grptbl[] only selects a single zb, which is the root (simrank=0) zb. Each + * state in this object contains a backpointer that normally points to the + * containing zb, but the linking operation modifies this to point to the + * separate zb objects. So a simple redirection through the state backpointer + * gets us to the correct zb and state within that zb. The linking operation + * also modifies the state[0] entry in each of the different zb objects to point + * back to the simrank=0 zb. The other state entries are unused. While this + * requires an O(N^2) memory where only O(N) is used, we are fundamentally + * limited to N=32 simulated endpoints by the space available in the packet for + * addresses, so the waste is negligible. + * + * Calling code does not handle error returns gracefully, so handle all errors, + * and return FI_SUCCESS. + * + * @param ep_obj : endpoint + * @param init_nic : received (actual) initiator NIC + * @param init_pid : received (actual) initiator PID + * @param mbv : received match bits + * @return int : FI_SUCCESS (formal return) + */ +int cxip_zbcoll_recv_cb(struct cxip_ep_obj *ep_obj, uint32_t init_nic, + uint32_t init_pid, uint64_t mbv) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_zbcoll_obj *zb; + struct cxip_zbcoll_state *zbs; + int sim, src, dst, grpid; + uint32_t inic, ipid; + uint64_t dat; + union cxip_match_bits mb = {.raw = mbv}; + int relidx; + + zbcoll = &ep_obj->zbcoll; + /* src, dst always zero for production */ + sim = zbunpack(mbv, &src, &dst, &grpid, &dat); + /* determine the initiator to use */ + if (sim) { + inic = src; + ipid = ep_obj->src_addr.pid; + } else { + inic = init_nic; + ipid = init_pid; + } + TRACE("RCV INI=%04x PID=%04x sim=%d %d->%d grp=%d dat=%016lx\n", + inic, ipid, sim, src, dst, grpid, dat); + + /* discard if grpid is explicitly invalid (bad packet) */ + if (grpid > ZB_NEG_BIT) { + discard_msg(inic, ipid, "rejected by target"); + ofi_atomic_inc32(&zbcoll->dsc_count); + return FI_SUCCESS; + } + /* low-level packet test */ + if (zbcoll->disable) { + /* Attempting a low-level test */ + ofi_atomic_inc32(&zbcoll->rcv_count); + return FI_SUCCESS; + } + /* resolve the zb object */ + zb = zbcoll->grptbl[grpid]; + if (grpid == ZB_NEG_BIT) { + /* This is a negotiation packet */ + if (!zb) { + /* mask from downstream node, we aren't ready */ + TRACE("reject: getgroup negotiation conflict %08lx\n", + dat); + reject(ep_obj, inic, ipid, sim, dst, src, grpid); + return FI_SUCCESS; + } + if (!dat) { + /* negotiation rejection from upstream node */ + zbs = &zb->state[dst]; + zbs->dataval = *zb->grpmskp; + zbs->dataptr = &zbs->dataval; + mb.zb_data = zbs->dataval; + TRACE("rejected: re-send %016lx\n", mb.raw); + zbsend_up(zbs, mb.zb_data); + return FI_SUCCESS; + } + /* upstream/downstream mask for negotiating zb */ + } else { + /* This is a collective packet */ + if (!zb) { + /* Received packet for unknown group */ + discard_msg(inic, ipid, "reject unknown group ID"); + reject(ep_obj, inic, ipid, sim, dst, src, ZB_MAP_BITS); + ofi_atomic_inc32(&zbcoll->dsc_count); + return FI_SUCCESS; + } + /* upstream/downstream data for collective zb */ + } + /* discard bad state indices */ + if (src >= zb->simcount || dst >= zb->simcount) { + TRACE("discard: simsrc=%d simdst=%d\n", src, dst); + CXIP_WARN("Bad simulation: src=%d 
dst=%d max=%d\n", + src, dst, zb->simcount); + ofi_atomic_inc32(&zbcoll->dsc_count); + return FI_SUCCESS; + } + /* set the state object */ + zbs = &zb->state[dst]; + /* simulation redirection for multi-zb simulation */ + if (zbs->zb != zb) { + zb = zbs->zb; + zbs = &zb->state[dst]; + } + /* raw send test case, we are done */ + if (!zbs->num_relatives) { + TRACE("ZBCOLL no relatives: test case\n"); + return FI_SUCCESS; + } + /* determine which relative this came from (upstream or downstream) */ + for (relidx = 0; relidx < zbs->num_relatives; relidx++) { + if (inic == zb->caddrs[zbs->relatives[relidx]].nic && + ipid == zb->caddrs[zbs->relatives[relidx]].pid) + break; + } + if (relidx == zbs->num_relatives) { + /* not a relative */ + discard_msg(inic, ipid, "reject initiator not in tree"); + reject(ep_obj, inic, ipid, sim, dst, src, grpid); + ofi_atomic_inc32(&zbcoll->dsc_count); + return FI_SUCCESS; + } + /* data received, increment the counter */ + ofi_atomic_inc32(&zbcoll->rcv_count); + + /* advance the state */ + if (relidx == 0) { + /* downstream recv from parent */ + + /* copy the data to the zbs */ + zbs->dataval = dat; + if (zbs->dataptr) + *zbs->dataptr = dat; + TRACE("%04x<-%04x: %-10s %-10s %d/%d (%016lx)\n", + zbs->grp_rank, zbs->relatives[0], "dn_recvd", "", + zbs->contribs, zbs->num_relatives, dat); + + /* send downstream to children */ + zbsend_dn(zbs, mb.raw); + zbdone(zbs, mb.raw); + } else { + /* upstream recv from child */ + + /* bitwise-AND the upstream data value */ + zbs->dataval &= mb.raw; + mb.zb_data = zbs->dataval; + /* upstream packets contribute */ + zbs->contribs += 1; + TRACE("%04x<-%04x: %-10s %-10s %d/%d\n", + zbs->grp_rank, inic, "", "up_recvd", zbs->contribs, + zbs->num_relatives); + + /* advance the collective */ + advance(zbs, mb.raw); + } + return FI_SUCCESS; +} + +/** + * @brief Send callback function to manage source ACK. + * + * The request must be retried, or freed. + * + * NETSIM will simply drop packets sent to non-existent addresses, which leaks + * the request packet. + * + * Calling code does not handle error returns gracefully. Handle all errors, and + * return FI_SUCCESS. 
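+ *
+ * Disposition summary (mirrors the switch statement below):
+ *
+ * - C_EVENT_ACK with C_RC_OK: success, free the request.
+ * - C_EVENT_ACK with C_RC_ENTRY_NOT_FOUND or C_RC_PTLTE_NOT_FOUND: wait
+ *   fc_retry_usec_delay and resend the request.
+ * - Any other event or return code: -FI_EIO, zbsend_fail() counts the error
+ *   and frees the request.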
+ * + * @param req : original request + * @param event : CXI driver event + * @return int : FI_SUCCESS (formal return) + */ +static int zbdata_send_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_zbcoll_obj *zb; + struct cxip_zbcoll_state *zbs; + int src, dst, grpid; + int sim __attribute__((unused)); + uint64_t dat; + int ret; + + sim = zbunpack(req->send.mb.zb_data, &src, &dst, &grpid, &dat); + TRACE("ACK sim=%d %d->%d grp=%d dat=%016lx\n", + sim, src, dst, grpid, dat); + + zbcoll = &req->ep_obj->zbcoll; + ofi_atomic_inc32(&zbcoll->ack_count); + + if (grpid > ZB_NEG_BIT) { + /* rejection packet sent */ + TRACE("ACK: rejection sent\n"); + goto done; + } + zb = zbcoll->grptbl[grpid]; + if (!zb) { + /* Low-level testing, or ack is late */ + TRACE("ACK: late arrival\n"); + goto done; + } + if (src >= zb->simcount || dst >= zb->simcount) { + TRACE("ACK: bad simulation\n"); + goto done; + } + zbs = &zb->state[dst]; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + switch (cxi_event_rc(event)) { + case C_RC_OK: + ret = FI_SUCCESS; + free(req); + break; + case C_RC_ENTRY_NOT_FOUND: + /* likely a target queue is full, retry */ + CXIP_WARN("Target dropped packet, retry\n"); + usleep(cxip_env.fc_retry_usec_delay); + ret = cxip_ctrl_msg_send(req); + break; + case C_RC_PTLTE_NOT_FOUND: + /* could be a race during setup, retry */ + CXIP_WARN("Target connection failed, retry\n"); + usleep(cxip_env.fc_retry_usec_delay); + ret = cxip_ctrl_msg_send(req); + break; + default: + CXIP_WARN("ACK return code = %d, failed\n", + cxi_event_rc(event)); + ret = -FI_EIO; + break; + } + break; + default: + /* fail the send */ + CXIP_WARN(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + ret = -FI_EIO; + break; + } + if (ret != FI_SUCCESS) + zbsend_fail(zbs, req, ret); + + return FI_SUCCESS; +done: + free(req); + return FI_SUCCESS; +} + +/** + * @brief Define the user callback function to execute on completion. + * + * @param zb + * @param userfunc + * @param userptr + * @return int + */ +void cxip_zbcoll_set_user_cb(struct cxip_zbcoll_obj *zb, + zbcomplete_t userfunc, void *userptr) +{ + zb->userfunc = userfunc; + zb->userptr = userptr; +} + +/** + * @brief Return the maximum number of groups for concurrent zbcoll operations. + * + * Maximum slots are ZB_NEG_BIT+1, with one reserved for negotiation. Using + * simulation reduces the number of bits available for negotiation. + * + * @param sim : true if nics are simulated + * @return int maximum group ID value + */ +int cxip_zbcoll_max_grps(bool sim) +{ + return (!sim) ? ZB_NEG_BIT : ZB_NEG_BIT - 2*ZB_SIM_BITS; +} + +/* used in each loop over states for each collective operation */ +static bool _skip_or_shuffle(struct cxip_zbcoll_obj *zb, int i, int *n) +{ + /* default is that this returns n as value of i */ + *n = i; + /* production means proceed over loop (of 1) with n = i */ + if (zb->simcount == 1) + return false; + /* multi-zb simulation should skip unless simrank == i */ + if (zb->simrank >= 0 && zb->simrank != i) + return true; + /* single-zb simulation simulates all values, with shuffling */ + if (zb->shuffle) + *n = zb->shuffle[i]; + return false; +} + +/** + * @brief Negotiate a group id among participants. + * + * We are guaranteed that any two negotiations that take place on any two zb + * objects will occur in the same order. 
However, either of those negotiations + * could be separated by an arbitrary number of other negotiations for other + * collectives that don't involve both of those zb objects. E.g. + * + * - zb1: A1 A2 + * - zb2: A1 B1 A2 + * + * zb1 is able to start negotiation A2 as soon as A1 completes, but zb2 cannot + * begin until B1 has completed. To prevent issuing the same grpid to two + * different groups, or issuing different grpids to a single group, all getgroup + * collectives are serialized over the NIC endpoint. Thus, attempting to + * negotiate for A2 on zb2 before B1 has completed will result in -FI_EAGAIN. + * + * In production, each zb represents a different process, on a different NIC + * endpoint, and these typically represent different compute nodes. + * + * In the single-zb and multi-zb simulations, the entire simulation is + * single-threaded in a single process, in a common memory space. + * + * In the single-zb simulation, there is only one zb, and each zb->state[] + * represents the different simulated collective endpoints. Operations across + * all simulated endpoints are done sequentially, though the ordering is + * randomized using the shuffle[] array. + * + * In the multi-zb simulation, there is a separate zb for each collective + * endpoint. The same collective operation must be called independently on each + * zb object, and all zb objects in that group must be called. Ordering is + * controlled by the ordering of the operations using each zb. + * + * In production and the single-zb simulation, this is a simple first-come + * first-served use of the NIC endpoint zbcoll->grptbl[ZB_NEG_BIT] pointer. + * Serialization is guaranteed by simply testing whether grptbl[ZB_NEG_BIT] is + * NULL. + * + * In the multi-zb simulation, acquiring zbcoll->grptbl[ZB_NEG_BIT] is a + * multi-step process that requires multiple calls to getgroup, using different + * (linked) zb objects. Serialization means that multiple calls must be allowed, + * provided that they all belong to the same set of linked zb objects, until all + * endpoints have been called. We use the refcnt value to determine when all + * calls have been made. 
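+ *
+ * Illustrative production call pattern (a sketch only; the real caller is the
+ * fi_join_collective() path, which also handles retries and completion
+ * callbacks):
+ *
+ *   ret = cxip_zbcoll_getgroup(zb);
+ *   if (ret == -FI_EAGAIN)
+ *       ...another negotiation owns the endpoint; retry later...
+ *   ...progress with cxip_ep_zbcoll_progress() until the user callback runs...
+ *   if (zb->error == FI_SUCCESS)
+ *       ...zb->grpid now holds the negotiated group id...
+ *   else if (zb->error == -FI_EBUSY)
+ *       ...all group ids are in use; retry after another join completes...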
+ * + * @param zb : zbcoll structure + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_getgroup(struct cxip_zbcoll_obj *zb) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_zbcoll_state *zbs; + union cxip_match_bits mb = {.raw = 0}; + int i, n, ret; + + /* function could be called by non-participating nodes */ + if (!zb) { + TRACE("zb is NULL\n"); + CXIP_WARN("zb is NULL\n"); + return -FI_EINVAL; + } + + /* if disabled, exit */ + zbcoll = &zb->ep_obj->zbcoll; + if (zbcoll->disable) { + TRACE("Disabled zb\n"); + return FI_SUCCESS; + } + + /* check for already grouped */ + if (zb->grpid != ZB_NEG_BIT) { + TRACE("grpid already set = %d\n", zb->grpid); + CXIP_WARN("Cannot acquire a second group id\n"); + return -FI_EINVAL; + } + + /* getgroup operations must be serialized */ + ret = FI_SUCCESS; + ofi_spin_lock(&zbcoll->lock); + if (!zbcoll->grptbl[ZB_NEG_BIT]) { + /* free to start negotiating */ + zbcoll->grptbl[ZB_NEG_BIT] = BASEZB(zb); + zbcoll->refcnt++; + } else if (zbcoll->grptbl[ZB_NEG_BIT] == BASEZB(zb) && + zbcoll->refcnt < zb->simcount && + zb->busy < zb->simcount) { + /* single-zb sim, refcnt=1, busy=simcount + * multi-zb sim, refcnt=simcount, busy=1 + */ + zbcoll->refcnt++; + TRACE("continue grpid negotiation, refcnt=%d\n", + zbcoll->refcnt); + } else { + /* any other attempt has to wait */ + ret = -FI_EAGAIN; + TRACE("failed grpid negotiation, retry later\n"); + } + ofi_spin_unlock(&zbcoll->lock); + TRACE("++REFCNT=%d ret=%d in %s\n", zbcoll->refcnt, ret, __func__); + if (ret) + return ret; + + /* process all states */ + zb->error = FI_SUCCESS; + zb->reduce = false; + for (i = 0; i < zb->simcount; i++) { + if (_skip_or_shuffle(zb, i, &n)) + continue; + zbs = &zb->state[n]; + zbs->dataval = *zb->grpmskp; + zbs->dataptr = &zbs->dataval; + zbs->contribs++; + zb->busy++; + TRACE("%s: zb[%d] contribs=%d\n", __func__, i, zbs->contribs); + /* if terminal leaf node, will send up immediately */ + mb.zb_data = zbs->dataval; + advance(zbs, mb.raw); + } + TRACE("++BUSY =%d in %s\n", zb->busy, __func__); + return FI_SUCCESS; +} + +/** + * @brief Release negotiated group id. 
+ * + * @param zb : zbcoll structure + */ +void cxip_zbcoll_rlsgroup(struct cxip_zbcoll_obj *zb) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + if (!zb || zb->grpid > ZB_NEG_BIT) + return; + + zbcoll = &zb->ep_obj->zbcoll; + + ofi_spin_lock(&zbcoll->lock); + _setbit(zb->grpmskp, zb->grpid); + zbcoll->grptbl[zb->grpid] = NULL; + zb->grpid = ZB_NEG_BIT; + ofi_spin_unlock(&zbcoll->lock); +} + +/* All exported functions are variants of this core function */ +static int _zbreduce(struct cxip_zbcoll_obj *zb, uint64_t *dataptr, bool reduce) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_zbcoll_state *zbs; + union cxip_match_bits mb = {.raw = 0}; + int i, n; + + /* function could be called on non-participating NIDs */ + if (!zb) { + TRACE("[-] zb is NULL\n"); + CXIP_WARN("zb is NULL\n"); + return -FI_EINVAL; + } + + /* low level testing */ + zbcoll = &zb->ep_obj->zbcoll; + if (zbcoll->disable) { + TRACE("[%d] Disabled zb\n", zb->simrank); + return FI_SUCCESS; + } + + /* operations on a single zb_obj are serialized */ + if (zb->busy) { + TRACE("[%d] busy\n", zb->simrank); + return -FI_EAGAIN; + } + + /* check for not grouped */ + if (zb->grpid >= ZB_NEG_BIT) { + TRACE("[%d] Requires a group ID\n", zb->simrank); + CXIP_WARN("Requires group id\n"); + return -FI_EINVAL; + } + TRACE("[%d] grpid = %d\n", zb->simrank, zb->grpid); + + /* process all states */ + zb->error = FI_SUCCESS; + zb->reduce = reduce; + /* Note that for simulation, dataptr must be an array */ + for (i = 0; i < zb->simcount; i++) { + if (_skip_or_shuffle(zb, i, &n)) + continue; + zbs = &zb->state[n]; + zbs->dataval = (dataptr) ? *dataptr : 0; + zbs->dataptr = (dataptr) ? dataptr++ : NULL; + zbs->contribs++; + zb->busy++; + TRACE("%s: zb[%d] contribs=%d\n", __func__, i, zbs->contribs); + /* if terminal leaf node, will send up immediately */ + mb.zb_data = zbs->dataval; + advance(zbs, mb.raw); + TRACE("%s: zb[%d] contribs=%d\n", __func__, i, zbs->contribs); + } + TRACE("%s: busy=%d\n", __func__, zb->busy); + + return FI_SUCCESS; +} + +/** + * @brief Initiate a bitwise-AND reduction. + * + * All participants call this. + * + * On entry, *dataptr contains the data to be reduced. On return, *dataptr + * contains the reduced data. + * + * NOTE: When testing in simulation, dataptr should reference an array of + * uint64_t with one item for each endpoint. + * + * @param zb : zbcoll structure + * @param dataptr : pointer to return data + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_reduce(struct cxip_zbcoll_obj *zb, uint64_t *dataptr) +{ + return _zbreduce(zb, dataptr, true); +} + +/** + * @brief Initiate a broadcast from root to leaves. + * + * All participants call this. + * + * On entry, *dataptr on root contains the data to be broadcast. + * On return, *dataptr contains the broadcast data from root. + * + * NOTE: When testing in simulation, dataptr should reference an array of + * uint64_t with one item for each endpoint. + * + * @param zb : zbcoll structure + * @param dataptr : pointer to return data + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_broadcast(struct cxip_zbcoll_obj *zb, uint64_t *dataptr) +{ + return _zbreduce(zb, dataptr, false); +} + +/** + * @brief Initiate a no-data barrier. + * + * All participants call this. + * + * @param zb : zbcoll structure + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_barrier(struct cxip_zbcoll_obj *zb) +{ + return _zbreduce(zb, NULL, false); +} + +/** + * @brief Progress completion. 
+ * + * This is called from cxip_coll_progress_join(), which is called when reading + * the endpoint EQ as part of progressing the zb collective operation. + * + * The callback function can thus initiate new operations without concerns about + * recursion. + * + * @param ep_obj : endpoint + * + * Caller holds eq_obj->lock. + */ +void cxip_ep_zbcoll_progress(struct cxip_ep_obj *ep_obj) +{ + struct cxip_zbcoll_obj *zb; + struct cxip_ep_zbcoll_obj *zbcoll; + + zbcoll = &ep_obj->zbcoll; + while (true) { + /* progress the underlying ctrl transfers */ + cxip_ep_ctrl_progress_locked(ep_obj); + + /* see if there is a zb ready to be advanced */ + zb = NULL; + ofi_spin_lock(&zbcoll->lock); + if (!dlist_empty(&zbcoll->ready_list)) + dlist_pop_front(&zbcoll->ready_list, + struct cxip_zbcoll_obj, + zb, ready_link); + ofi_spin_unlock(&zbcoll->lock); + if (!zb) + break; + TRACE("SAW COMPLETION on zb[%d], error=%d!!!\n", + zb->simrank, zb->error); + if (zb->userfunc) + (zb->userfunc)(zb, zb->userptr); + } +} + +/** + * @brief Intialize the zbcoll system. + * + * @param ep_obj : endpoint + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_init(struct cxip_ep_obj *ep_obj) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + zbcoll = &ep_obj->zbcoll; + memset(zbcoll, 0, sizeof(*zbcoll)); + dlist_init(&zbcoll->ready_list); + zbcoll->grpmsk = -1ULL; + zbcoll->grptbl = calloc(ZB_MAP_BITS, sizeof(void *)); + if (!zbcoll->grptbl) + return -FI_ENOMEM; + ofi_spin_init(&zbcoll->lock); + ofi_atomic_initialize32(&zbcoll->dsc_count, 0); + ofi_atomic_initialize32(&zbcoll->err_count, 0); + ofi_atomic_initialize32(&zbcoll->ack_count, 0); + ofi_atomic_initialize32(&zbcoll->rcv_count, 0); + + return FI_SUCCESS; +} + +/** + * @brief Cleanup all operations in progress. + * + * @param ep_obj : endpoint + */ +void cxip_zbcoll_fini(struct cxip_ep_obj *ep_obj) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + int i; + + zbcoll = &ep_obj->zbcoll; + for (i = 0; i < ZB_MAP_BITS; i++) + cxip_zbcoll_free(zbcoll->grptbl[i]); + free(zbcoll->grptbl); + zbcoll->grptbl = NULL; +} + +/** + * @brief Reset the endpoint counters. + * + * @param ep : endpoint + */ +void cxip_zbcoll_reset_counters(struct cxip_ep_obj *ep_obj) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + zbcoll = &ep_obj->zbcoll; + ofi_atomic_set32(&zbcoll->dsc_count, 0); + ofi_atomic_set32(&zbcoll->err_count, 0); + ofi_atomic_set32(&zbcoll->ack_count, 0); + ofi_atomic_set32(&zbcoll->rcv_count, 0); +} diff --git a/prov/cxi/test/README.md b/prov/cxi/test/README.md new file mode 100644 index 00000000000..fac6ef7d7a4 --- /dev/null +++ b/prov/cxi/test/README.md @@ -0,0 +1,16 @@ +*SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP* + +# Libfabric CXI Provider Tests + +All tests in this directory are built under the Criterion tool. See [https://criterion.readthedocs.io/en/master/index.html](url). + +Common setup/teardown routines are found in cxip_test_common.c. + +Collections of related tests are found in the other files. + +The build produces an executable cxitest, which runs the pre-supplied Criterion main() function, and supports selecting launch of individual tests, or the entire test suite. + +## Running Tests + +See the test.sh file for examples of launching tests with cxitest. 
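+
+For illustration, with the standard Criterion command line options a full run is
+`./cxitest --verbose`, and a single test can be selected with a filter such as
+`./cxitest --filter="atomic/simple_amo"` (the `atomic` suite's `simple_amo` test
+from atomic.c). The exact options used in automation are in test.sh.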
diff --git a/prov/cxi/test/atomic.c b/prov/cxi/test/atomic.c new file mode 100644 index 00000000000..d87d361ea0e --- /dev/null +++ b/prov/cxi/test/atomic.c @@ -0,0 +1,4433 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +#define AMO_DISABLED false + +#define RMA_WIN_LEN 64 +#define RMA_WIN_KEY 2 +#define RMA_WIN_ACCESS (FI_REMOTE_READ | FI_REMOTE_WRITE) +#define MR_KEY_STD 200 + +/* Create MR -- works like a "remote_calloc()" */ +static void *_cxit_create_mr(struct mem_region *mr, uint64_t *key) +{ + int ret; + + mr->mem = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(mr->mem); + + ret = fi_mr_reg(cxit_domain, mr->mem, RMA_WIN_LEN, RMA_WIN_ACCESS, 0, + *key, 0, &mr->mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed %d", ret); + + ret = fi_mr_bind(mr->mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind(ep) failed %d", ret); + + if (cxit_fi->caps & FI_RMA_EVENT && cxit_rem_cntr) { + ret = fi_mr_bind(mr->mr, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind(cntr) failed %d", + ret); + } + + ret = fi_mr_enable(mr->mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed %d", ret); + + *key = fi_mr_key(mr->mr); + + return mr->mem; +} + +/* Destroy MR -- works like a "remote_free()" */ +static void _cxit_destroy_mr(struct mem_region *mr) +{ + fi_close(&mr->mr->fid); + + free(mr->mem); +} + +/* Test failures associated with bad call parameters. + */ +TestSuite(atomic_invalid, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = AMO_DISABLED, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(atomic_invalid, invalid_amo) +{ + uint64_t operand1 = 0; + struct fi_ioc iov = { + .addr = &operand1, + .count = 1 + }; + int ret; + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_ATOMIC_OP_LAST, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, -1, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, + FI_DATATYPE_LAST, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, + -1, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 0, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 2, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, 0, 1, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + + ret = fi_atomicv(cxit_ep, &iov, 0, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomicv(cxit_ep, &iov, 0, 2, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 0; + ret = fi_atomicv(cxit_ep, &iov, 0, 1, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 2; + ret = fi_atomicv(cxit_ep, &iov, 0, 1, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomicv(cxit_ep, 0, 0, 1, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); +} + +Test(atomic_invalid, invalid_fetch) +{ + uint64_t operand1 = 0; + uint64_t result = 0; + struct fi_ioc iov = { 
+ .addr = &operand1, + .count = 1 + }; + struct fi_ioc riov = { + .addr = &result, + .count = 1 + }; + int ret; + + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, + FI_ATOMIC_OP_LAST, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, -1, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_DATATYPE_LAST, FI_SUM, + 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, -1, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, 0, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 0, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 2, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, 0, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + + + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 2, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, 0, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 0, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 2, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, 0, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 0; + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 2; + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + riov.count = 1; + iov.count = 0; + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 2; + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 1; + cr_assert_eq(ret, -FI_EINVAL); +} + +Test(atomic_invalid, invalid_swap) +{ + uint64_t operand1 = 0; + uint64_t compare = 0; + uint64_t result = 0; + struct fi_ioc iov = { + .addr = &operand1, + .count = 1 + }; + struct fi_ioc ciov = { + .addr = &compare, + .count = 1 + }; + struct fi_ioc riov = { + .addr = &result, + .count = 1 + }; + int ret; + + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_ATOMIC_OP_LAST, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, -1, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_DATATYPE_LAST, FI_CSWAP_NE, 
NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + -1, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + 0, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + 0, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 2, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 0, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + 0, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 2, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 2, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 0, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 2, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 0, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 2; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 0; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 1; + ciov.count = 2; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ciov.count = 0; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ciov.count = 1; + iov.count = 2; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 0; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 1; +} + +/* Test simple operations: AMO SUM UINT64_T, FAMO SUM UINT64_T, and CAMO SWAP_NE + * UINT64_T. If this doesn't work, nothing else will. 
+ */ +TestSuite(atomic, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = AMO_DISABLED, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(atomic, simple_amo) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + int i; + uint64_t key; + + /* Test standard and optimized MRs. */ + for (i = 0; i < 2; i++) { + key = 199 + i; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 3; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 9; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + _cxit_destroy_mr(&mr); + } +} + +/* Test atomic inject interface */ +Test(atomic, simple_inject) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + int count = 0; + uint64_t key = RMA_WIN_KEY; + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 3; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 9; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure no events were delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); + + /* Try using standard MR */ + + exp_remote = 
0; + key = 1000; + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure no events were delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); +} + +Test(atomic, simple_fetch) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t exp_result; + uint64_t *rma; + uint64_t *loc; + int ret; + int i; + uint64_t key; + + for (i = 0; i < 2; i++) { + key = 199 + i; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + exp_result = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + fi_cntr_set(cxit_read_cntr, 0); + while (fi_cntr_read(cxit_read_cntr)); + + operand1 = 1; + *loc = -1; + exp_result = exp_remote; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + operand1 = 3; + *loc = -1; + exp_result = exp_remote; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + operand1 = 9; + *loc = -1; + exp_result = exp_remote; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + while (fi_cntr_read(cxit_read_cntr) != 3) + ; + + free(loc); + _cxit_destroy_mr(&mr); + } +} + +Test(atomic, simple_fetch_read) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t exp_remote; + uint64_t exp_result; + uint64_t *rma; + uint64_t *loc; + int ret; + int i; + uint64_t key; + + for (i = 0; i < 2; i++) { + key = 199 + i; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + exp_result = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + loc = 
calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + fi_cntr_set(cxit_read_cntr, 0); + while (fi_cntr_read(cxit_read_cntr)) + ; + *rma = 1; + *loc = -1; + exp_remote = *rma; + exp_result = exp_remote; + + ret = fi_fetch_atomic(cxit_ep, NULL, 1, NULL, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_ATOMIC_READ, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Read Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + *rma = 10; + *loc = -1; + exp_remote = *rma; + exp_result = exp_remote; + + ret = fi_fetch_atomic(cxit_ep, NULL, 1, NULL, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_ATOMIC_READ, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Read Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + *rma = 0x0123456789abcdef; + *loc = -1; + exp_remote = *rma; + exp_result = exp_remote; + + ret = fi_fetch_atomic(cxit_ep, NULL, 1, NULL, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_ATOMIC_READ, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Read Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + while (fi_cntr_read(cxit_read_cntr) != 3) + ; + + free(loc); + _cxit_destroy_mr(&mr); + } +} + +Test(atomic, simple_swap) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t compare; + uint64_t exp_remote; + uint64_t exp_result; + uint64_t *rma; + uint64_t *loc; + int ret; + int i; + uint64_t key; + + for (i = 0; i < 2; i++) { + key = 199 + i; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + exp_result = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + *rma = 0; /* remote == 0 */ + operand1 = 1; /* change remote to 1 */ + compare = 2; /* if remote != 2 (true) */ + *loc = -1; /* initialize result */ + exp_remote = 1; /* expect remote == 1 */ + exp_result = 0; /* expect result == 0 */ + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + *rma = 2; /* remote == 2 */ + operand1 = 1; /* change remote to 1 */ + compare = 2; /* if remote != 2 (false) */ + *loc = -1; /* initialize result */ + exp_remote = 2; /* expect remote == 2 
(no op) */ + exp_result = 2; /* expect result == 2 (does return value) */ + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + free(loc); + _cxit_destroy_mr(&mr); + } +} + +/* Perform a full combinatorial test suite. + */ +#define MAX_TEST_SIZE 16 + +/** + * Compare a seen value with an expected value, with 'len' valid bytes. This + * checks the seen buffer all the way to MAX_TEST_SIZE, and looks for a + * predefined value in every byte, to ensure that there is no overflow. + * The seen buffer will always be either the rma or the loc buffer, which have + * 64 bytes of space in them. + * + * Summation of real and complex types is trickier. Every decimal constant is + * internally represented by a binary approximation, and summation can + * accumulate errors. With only a single sum with two arguments, the error could + * be +1 or -1 in the LSBit. + * + * @param saw 'seen' buffer + * @param exp 'expected' value + * @param len number of valid bytes + * + * @return bool true if successful, false if comparison fails + */ +static bool _compare(void *saw, void *exp, int len, + enum fi_op op, enum fi_datatype dt) +{ + uint8_t *bval = saw; + uint8_t *bexp = exp; + uint64_t uval = 0; + uint64_t uexp = 0; + int i; + + /* Test MS pad bits */ + for (i = MAX_TEST_SIZE-1; i >= len; i--) { + if (bval[i] != bexp[i]) + return false; + } + if (op == FI_SUM) { + switch (dt) { + case FI_FLOAT: + case FI_DOUBLE: + /* Copy to UINT64, adjust diff (-1,1) to (0,2) */ + memcpy(&uval, bval, len); + memcpy(&uexp, bexp, len); + if ((uval - uexp) + 1 > 2) + return false; + return true; + case FI_FLOAT_COMPLEX: + case FI_DOUBLE_COMPLEX: + /* Do real and imag parts separately */ + memcpy(&uval, bval, len/2); + memcpy(&uexp, bexp, len/2); + if (uval - uexp + 1 > 2) + return false; + memcpy(&uval, bval+len/2, len/2); + memcpy(&uexp, bexp+len/2, len/2); + if (uval - uexp + 1 > 2) + return false; + return true; + default: + break; + } + } + /* Test LS value bits */ + for (i = len-1; i >= 0; i--) { + if (bval[i] != bexp[i]) + return false; + } + return true; +} + +/** + * Generates a useful error message. + * + * @param op opcode + * @param dt dtcode + * @param saw 'seen' buffer + * @param exp 'expected' value + * @param len number of valid bytes + * @param buf buffer to fill with message + * @param siz buffer size + * + * @return const char* returns the buf pointer + */ +static const char *_errmsg(enum fi_op op, enum fi_datatype dt, + void *saw, void *exp, int len, + char *buf, size_t siz) +{ + char *p = &buf[0]; + char *e = &buf[siz]; + uint8_t *bsaw = saw; + uint8_t *bexp = exp; + int i; + + p += snprintf(p, e-p, "%d:%d: saw=", op, dt); + for (i = MAX_TEST_SIZE-1; i >= 0; i--) + p += snprintf(p, e-p, "%02x%s", bsaw[i], i == len ? "/" : ""); + p += snprintf(p, e-p, " exp="); + for (i = MAX_TEST_SIZE-1; i >= 0; i--) + p += snprintf(p, e-p, "%02x%s", bexp[i], i == len ? "/" : ""); + return buf; +} + +/** + * The general AMO test. 
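+ * Depending on which pointers are supplied, this issues a single-element
+ * simple AMO (no loc), fetching AMO (loc only), or compare AMO (compare and
+ * loc), then validates the remote buffer, and the local fetch buffer when
+ * one is used, against the expected values.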
+ * + * @param index value used to help identify the test if error + * @param dt FI datatype + * @param op FI operation + * @param err 0 if success expected, 1 if failure expected + * @param operand1 operation data value pointer + * @param compare operation compare value pointer + * @param loc operation result (local) buffer pointer + * @param loc_init operation result initialization value pointer + * @param rma operation rma (remote) buffer pointer + * @param rma_init operation rma initialization value pointer + * @param rma_expect operation rma (remote) expected value pointer + */ +static void _test_amo(int index, enum fi_datatype dt, enum fi_op op, int err, + void *operand1, + void *compare, + void *loc, void *loc_init, + void *rma, void *rma_init, void *rma_expect, + uint64_t key) +{ + struct fi_cq_tagged_entry cqe; + char msgbuf[128]; + char opstr[64]; + char dtstr[64]; + uint8_t rexp[MAX_TEST_SIZE]; + uint8_t lexp[MAX_TEST_SIZE]; + void *rma_exp = rexp; + void *loc_exp = lexp; + int len = ofi_datatype_size(dt); + int ret; + + strcpy(opstr, fi_tostr(&op, FI_TYPE_ATOMIC_OP)); + strcpy(dtstr, fi_tostr(&dt, FI_TYPE_ATOMIC_TYPE)); + + cr_log_info("Testing %s %s (%d)\n", opstr, dtstr, len); + + memset(rma, -1, MAX_TEST_SIZE); + memset(rma_exp, -1, MAX_TEST_SIZE); + memcpy(rma, rma_init, len); + memcpy(rma_exp, rma_expect, len); + + if (loc && loc_init) { + memset(loc, -1, MAX_TEST_SIZE); + memset(loc_exp, -1, MAX_TEST_SIZE); + memcpy(loc, loc_init, len); + memcpy(loc_exp, rma_init, len); + } + if (compare && loc) { + /* This is a compare command */ + ret = fi_compare_atomic(cxit_ep, operand1, 1, 0, + compare, 0, loc, 0, + cxit_ep_fi_addr, 0, key, dt, + op, NULL); + } else if (loc) { + /* This is a fetch command */ + ret = fi_fetch_atomic(cxit_ep, operand1, 1, 0, loc, 0, + cxit_ep_fi_addr, 0, key, dt, op, + NULL); + } else { + /* This is a simple command */ + ret = fi_atomic(cxit_ep, operand1, 1, 0, + cxit_ep_fi_addr, 0, key, dt, op, NULL); + } + + if (err) { + /* Expected an error. Tests only invoke "unsupported" failures, + * so any other error is fatal. Success is also fatal if we + * expect a failure. + */ + cr_assert_eq(ret, -FI_EOPNOTSUPP, + "rtn #%d:%d:%d saw=%d exp=%d\n", + index, op, dt, ret, -FI_EOPNOTSUPP); + return; + } + + + /* If we weren't expecting an error, any error is fatal */ + cr_assert_eq(ret, 0, + "rtn #%d:%d:%d saw=%d exp=%d\n", + index, op, dt, ret, err); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | (loc ? FI_READ : FI_WRITE), NULL); + + /* We expect the RMA effect to be as predicted */ + cr_expect(_compare(rma, rma_exp, len, op, dt), + "rma #%d:%s\n", index, + _errmsg(op, dt, rma, rma_exp, len, msgbuf, + sizeof(msgbuf))); + + /* We expect the local result to be as predicted, if there is one */ + if (loc && loc_init) { + cr_expect(_compare(loc, loc_exp, len, op, dt), + "loc #%d:%s\n", index, + _errmsg(op, dt, loc, loc_exp, len, msgbuf, + sizeof(msgbuf))); + } +} + +/* Every parameter list can create an OR of the following values, to indicate + * what forms should be attempted. + */ +#define _AMO 1 +#define _FAMO 2 +#define _CAMO 4 + +/* The INT tests test 8, 16, 32, and 64 bits for each line item. 
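+ * Each entry is run for every integer datatype from FI_INT8 through
+ * FI_UINT64, and the parameter table is duplicated at setup so every entry
+ * also runs against a standard MR key (with 1000 added to its index).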
+ */ +struct test_int_parms { + int opmask; + int index; + enum fi_op op; + int err; + uint64_t comp; + uint64_t o1; + uint64_t rini; + uint64_t rexp; + uint64_t key; +}; + +static struct test_int_parms int_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 0, 0, 123, 120, 120 }, + { _AMO|_FAMO, 12, FI_MIN, 0, 0, 120, 123, 120 }, + { _AMO|_FAMO, 21, FI_MAX, 0, 0, 123, 120, 123 }, + { _AMO|_FAMO, 22, FI_MAX, 0, 0, 120, 123, 123 }, + { _AMO|_FAMO, 31, FI_SUM, 0, 0, 1, 0, 1 }, + { _AMO|_FAMO, 32, FI_SUM, 0, 0, 1, 10, 11 }, + { _AMO|_FAMO, 33, FI_SUM, 0, 0, 2, -1, 1 }, + { _AMO|_FAMO, 41, FI_LOR, 0, 0, 0, 0, 0 }, + { _AMO|_FAMO, 42, FI_LOR, 0, 0, 128, 0, 1 }, + { _AMO|_FAMO, 43, FI_LOR, 0, 0, 0, 128, 1 }, + { _AMO|_FAMO, 44, FI_LOR, 0, 0, 64, 128, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 0, 0, 0, 0, 0 }, + { _AMO|_FAMO, 52, FI_LAND, 0, 0, 128, 0, 0 }, + { _AMO|_FAMO, 53, FI_LAND, 0, 0, 0, 128, 0 }, + { _AMO|_FAMO, 54, FI_LAND, 0, 0, 64, 128, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 0, 0, 0, 0, 0 }, + { _AMO|_FAMO, 62, FI_LXOR, 0, 0, 128, 0, 1 }, + { _AMO|_FAMO, 63, FI_LXOR, 0, 0, 0, 128, 1 }, + { _AMO|_FAMO, 64, FI_LXOR, 0, 0, 64, 128, 0 }, + { _AMO|_FAMO, 71, FI_BOR, 0, 0, + 0xf0e1f2e3f4e5f6e7, + 0x1818181818181818, + 0xf8f9fafbfcfdfeff }, + { _AMO|_FAMO, 81, FI_BAND, 0, 0, + 0xf0e1f2e3f4e5f6e7, + 0x1818181818181818, + 0x1000100010001000 }, + { _AMO|_FAMO, 91, FI_BXOR, 0, 0, + 0xf0e1f2e3f4e5f6e7, + 0x1818181818181818, + 0xe8f9eafbecfdeeff }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, 0, + 0x1234123412341234, + 0xabcdabcdabcdabcd, + 0x1234123412341234 }, + { _AMO|_FAMO, 102, FI_ATOMIC_WRITE, 0, 0, + 0x1234123412341234, + 0x1234123412341234, + 0x1234123412341234 }, + { _FAMO, 111, FI_ATOMIC_READ, 0, 0, + 0x1010101010101010, + 0x4321432143214321, + 0x4321432143214321 }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, 120, 123, 100, 100 }, + { _CAMO, 122, FI_CSWAP, 0, 100, 123, 100, 123 }, + { _CAMO, 131, FI_CSWAP_NE, 0, 120, 123, 100, 123 }, + { _CAMO, 132, FI_CSWAP_NE, 0, 100, 123, 100, 100 }, + { _CAMO, 141, FI_CSWAP_LE, 0, 101, 123, 100, 100 }, + { _CAMO, 142, FI_CSWAP_LE, 0, 100, 123, 100, 123 }, + { _CAMO, 143, FI_CSWAP_LE, 0, 99, 123, 100, 123 }, + { _CAMO, 151, FI_CSWAP_LT, 0, 101, 123, 100, 100 }, + { _CAMO, 152, FI_CSWAP_LT, 0, 100, 123, 100, 100 }, + { _CAMO, 153, FI_CSWAP_LT, 0, 99, 123, 100, 123 }, + { _CAMO, 161, FI_CSWAP_GE, 0, 101, 123, 100, 123 }, + { _CAMO, 162, FI_CSWAP_GE, 0, 100, 123, 100, 123 }, + { _CAMO, 163, FI_CSWAP_GE, 0, 99, 123, 100, 100 }, + { _CAMO, 171, FI_CSWAP_GT, 0, 101, 123, 100, 123 }, + { _CAMO, 173, FI_CSWAP_GT, 0, 100, 123, 100, 100 }, + { _CAMO, 173, FI_CSWAP_GT, 0, 99, 123, 100, 100 }, + { _CAMO, 181, FI_MSWAP, 0, + 0xf0f0f0f0f0f0f0f0, + 0xaaaaaaaaaaaaaaaa, + 0x1111111111111111, + 0xa1a1a1a1a1a1a1a1 + }, +}; + +ParameterizedTestParameters(atomic, test_int) +{ + struct test_int_parms *params; + int tests = ARRAY_SIZE(int_parms); + int i; + + params = malloc(sizeof(int_parms) * 2); + + memcpy(params, int_parms, sizeof(int_parms)); + memcpy((uint8_t *)params + sizeof(int_parms), int_parms, + sizeof(int_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_int_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_int_parms *p, atomic, test_int) +{ + struct mem_region mr; + enum fi_datatype dt; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + + rma = _cxit_create_mr(&mr, &p->key); + + loc 
= calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + for (dt = FI_INT8; dt <= FI_UINT64; dt++) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + p->key); + } + } + + if (p->opmask & _FAMO) { + for (dt = FI_INT8; dt <= FI_UINT64; dt++) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + p->key); + } + } + + if (p->opmask & _CAMO) { + for (dt = FI_INT8; dt <= FI_UINT64; dt++) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, + p->key); + } + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +/* The FLT tests only test the float type. + */ +struct test_flt_parms { + int opmask; + int index; + enum fi_op op; + int err; + float comp; + float o1; + float rini; + float rexp; + uint64_t key; +}; + +static struct test_flt_parms flt_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 0, 0.0f, 12.3f, 12.0f, 12.0f }, + { _AMO|_FAMO, 12, FI_MIN, 0, 0.0f, 12.0f, 12.3f, 12.0f }, + { _AMO|_FAMO, 21, FI_MAX, 0, 0.0f, 12.3f, 12.0f, 12.3f }, + { _AMO|_FAMO, 22, FI_MAX, 0, 0.0f, 12.0f, 12.3f, 12.3f }, + { _AMO|_FAMO, 31, FI_SUM, 0, 0.0f, 1.1f, 1.2f, (1.1f + 1.2f) }, + { _AMO|_FAMO, 32, FI_SUM, 0, 0.0f, 0.4f, 1.7f, (0.4f + 1.7f) }, + { _AMO|_FAMO, 41, FI_LOR, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 1 }, + { _AMO|_FAMO, 71, FI_BOR, 1 }, + { _AMO|_FAMO, 81, FI_BAND, 1 }, + { _AMO|_FAMO, 91, FI_BXOR, 1 }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, 0.0f, 10.2f, 96.6f, 10.2f }, + { _FAMO, 111, FI_ATOMIC_READ, 0, 0.0f, 1.1f, 10.2f, 10.2f }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, 12.0f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 122, FI_CSWAP, 0, 10.0f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 131, FI_CSWAP_NE, 0, 12.0f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 132, FI_CSWAP_NE, 0, 10.0f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 141, FI_CSWAP_LE, 0, 10.1f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 142, FI_CSWAP_LE, 0, 10.0f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 143, FI_CSWAP_LE, 0, 9.9f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 151, FI_CSWAP_LT, 0, 10.1f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 152, FI_CSWAP_LT, 0, 10.0f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 153, FI_CSWAP_LT, 0, 9.9f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 161, FI_CSWAP_GE, 0, 10.1f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 162, FI_CSWAP_GE, 0, 10.0f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 163, FI_CSWAP_GE, 0, 9.9f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 171, FI_CSWAP_GT, 0, 10.1f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 172, FI_CSWAP_GT, 0, 10.0f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 173, FI_CSWAP_GT, 0, 9.9f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 181, FI_MSWAP, 1 }, +}; + +ParameterizedTestParameters(atomic, test_flt) +{ + struct test_flt_parms *params; + int tests = ARRAY_SIZE(flt_parms); + int i; + + params = malloc(sizeof(flt_parms) * 2); + + memcpy(params, flt_parms, sizeof(flt_parms)); + memcpy((uint8_t *)params + sizeof(flt_parms), flt_parms, + sizeof(flt_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_flt_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_flt_parms *p, atomic, test_flt) +{ + struct mem_region mr; + enum fi_datatype dt = FI_FLOAT; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + + rma = _cxit_create_mr(&mr, &p->key); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + _test_amo(p->index, 
dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _FAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _CAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, + p->key); + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +/* The DBL tests only test the double type. + */ +struct test_dbl_parms { + int opmask; + int index; + enum fi_op op; + int err; + double comp; + double o1; + double rini; + double rexp; + uint64_t key; +}; + +static struct test_dbl_parms dbl_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 0, 0.0, 12.3, 12.0, 12.0 }, + { _AMO|_FAMO, 12, FI_MIN, 0, 0.0, 12.0, 12.3, 12.0 }, + { _AMO|_FAMO, 21, FI_MAX, 0, 0.0, 12.3, 12.0, 12.3 }, + { _AMO|_FAMO, 22, FI_MAX, 0, 0.0, 12.0, 12.3, 12.3 }, + { _AMO|_FAMO, 31, FI_SUM, 0, 0.0, 1.1, 1.2, (1.1 + 1.2) }, + { _AMO|_FAMO, 32, FI_SUM, 0, 0.0, 0.4, 1.7, (0.4 + 1.7) }, + { _AMO|_FAMO, 41, FI_LOR, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 1 }, + { _AMO|_FAMO, 71, FI_BOR, 1 }, + { _AMO|_FAMO, 81, FI_BAND, 1 }, + { _AMO|_FAMO, 91, FI_BXOR, 1 }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, 0.0, 10.2, 123.4, 10.2 }, + { _FAMO, 111, FI_ATOMIC_READ, 0, 0.0, 1.1, 10.2, 10.2 }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, 12.0, 12.3, 10.0, 10.0 }, + { _CAMO, 122, FI_CSWAP, 0, 10.0, 12.3, 10.0, 12.3 }, + { _CAMO, 131, FI_CSWAP_NE, 0, 12.0, 12.3, 10.0, 12.3 }, + { _CAMO, 132, FI_CSWAP_NE, 0, 10.0, 12.3, 10.0, 10.0 }, + { _CAMO, 141, FI_CSWAP_LE, 0, 10.1, 12.3, 10.0, 10.0 }, + { _CAMO, 142, FI_CSWAP_LE, 0, 10.0, 12.3, 10.0, 12.3 }, + { _CAMO, 143, FI_CSWAP_LE, 0, 9.9, 12.3, 10.0, 12.3 }, + { _CAMO, 151, FI_CSWAP_LT, 0, 10.1, 12.3, 10.0, 10.0 }, + { _CAMO, 152, FI_CSWAP_LT, 0, 10.0, 12.3, 10.0, 10.0 }, + { _CAMO, 153, FI_CSWAP_LT, 0, 9.9, 12.3, 10.0, 12.3 }, + { _CAMO, 161, FI_CSWAP_GE, 0, 10.1, 12.3, 10.0, 12.3 }, + { _CAMO, 162, FI_CSWAP_GE, 0, 10.0, 12.3, 10.0, 12.3 }, + { _CAMO, 163, FI_CSWAP_GE, 0, 9.9, 12.3, 10.0, 10.0 }, + { _CAMO, 171, FI_CSWAP_GT, 0, 10.1, 12.3, 10.0, 12.3 }, + { _CAMO, 172, FI_CSWAP_GT, 0, 10.0, 12.3, 10.0, 10.0 }, + { _CAMO, 173, FI_CSWAP_GT, 0, 9.9, 12.3, 10.0, 10.0 }, + { _CAMO, 181, FI_MSWAP, 1 }, +}; + +ParameterizedTestParameters(atomic, test_dbl) +{ + struct test_dbl_parms *params; + int tests = ARRAY_SIZE(dbl_parms); + int i; + + params = malloc(sizeof(dbl_parms) * 2); + + memcpy(params, dbl_parms, sizeof(dbl_parms)); + memcpy((uint8_t *)params + sizeof(dbl_parms), dbl_parms, + sizeof(dbl_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_dbl_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_dbl_parms *p, atomic, test_dbl) +{ + struct mem_region mr; + enum fi_datatype dt = FI_DOUBLE; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + + rma = _cxit_create_mr(&mr, &p->key); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _FAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _CAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, 
+ p->key); + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +/* The CMPLX tests only test the float complex type. + */ +struct test_cplx_parms { + int opmask; + int index; + enum fi_op op; + int err; + + float complex comp; + float complex o1; + float complex rini; + float complex rexp; + uint64_t key; +}; + +static struct test_cplx_parms cplx_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 1 }, + { _AMO|_FAMO, 21, FI_MAX, 1 }, + { _AMO|_FAMO, 31, FI_SUM, 0, 0.0, 1.1, 1.2, (1.1 + 1.2) }, + { _AMO|_FAMO, 32, FI_SUM, 0, 0.0, 0.4, 1.7, (0.4 + 1.7) }, + { _AMO|_FAMO, 31, FI_SUM, 0, + 0.0f, 1.1f+I*0.4f, 1.2f+I*1.7f, (1.1f+I*0.4f + 1.2f+I*1.7f) }, + { _AMO|_FAMO, 32, FI_SUM, 0, + 0.0f, 1.1f+I*1.7f, 1.2f+I*0.4f, (1.1f+I*1.7f + 1.2f+I*0.4f) }, + { _AMO|_FAMO, 41, FI_LOR, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 1 }, + { _AMO|_FAMO, 71, FI_BOR, 1 }, + { _AMO|_FAMO, 81, FI_BAND, 1 }, + { _AMO|_FAMO, 91, FI_BXOR, 1 }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, + 0.0f, 10.2f+I*1.1f, 0.3f+I*2.2f, 10.2f+I*1.1f }, + { _FAMO, 111, FI_ATOMIC_READ, 0, + 0.0f, 1.1f+I*1.1f, 10.2f+I*1.1f, 10.2f+I*1.1f }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, + 12.0f+I*1.1f, 12.3f+I*1.1f, 10.0f+I*1.1f, 10.0f+I*1.1f }, + { _CAMO, 122, FI_CSWAP, 0, + 10.0f+I*1.1f, 12.3f+I*1.1f, 10.0f+I*1.1f, 12.3f+I*1.1f }, + { _CAMO, 131, FI_CSWAP_NE, 0, + 12.0f+I*1.1f, 12.3f+I*1.1f, 10.0f+I*1.1f, 12.3f+I*1.1f }, + { _CAMO, 132, FI_CSWAP_NE, 0, + 10.0f+I*1.1f, 12.3f+I*1.1f, 10.0f+I*1.1f, 10.0f+I*1.1f }, + { _CAMO, 141, FI_CSWAP_LE, 1 }, + { _CAMO, 151, FI_CSWAP_LT, 1 }, + { _CAMO, 161, FI_CSWAP_GE, 1 }, + { _CAMO, 171, FI_CSWAP_GT, 1 }, + { _CAMO, 181, FI_MSWAP, 1 }, +}; + +ParameterizedTestParameters(atomic, test_cplx) +{ + struct test_cplx_parms *params; + int tests = ARRAY_SIZE(cplx_parms); + int i; + + params = malloc(sizeof(cplx_parms) * 2); + + memcpy(params, cplx_parms, sizeof(cplx_parms)); + memcpy((uint8_t *)params + sizeof(cplx_parms), cplx_parms, + sizeof(cplx_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_cplx_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_cplx_parms *p, atomic, test_cplx) +{ + struct mem_region mr; + enum fi_datatype dt = FI_FLOAT_COMPLEX; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + uint64_t key = 0; + + rma = _cxit_create_mr(&mr, &key); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + key); + } + + if (p->opmask & _FAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + key); + } + + if (p->opmask & _CAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, + key); + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +/* The DCMPLX tests only test the double complex type. 
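+ * Entries with err == 1 mark operations that are unsupported for this type;
+ * _test_amo() expects those to fail with -FI_EOPNOTSUPP.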
+ */ + +struct test_dcplx_parms { + int opmask; + int index; + enum fi_op op; + int err; + + double complex comp; + double complex o1; + double complex rini; + double complex rexp; + uint64_t key; +}; + +static struct test_dcplx_parms dcplx_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 1 }, + { _AMO|_FAMO, 21, FI_MAX, 1 }, + { _AMO|_FAMO, 31, FI_SUM, 0, + 0.0, 1.1+I*0.4, 1.2+I*1.7, (1.1+I*0.4 + 1.2+I*1.7) }, + { _AMO|_FAMO, 32, FI_SUM, 0, + 0.0, 1.1+I*1.7, 1.2+I*0.4, (1.1+I*1.7 + 1.2+I*0.4) }, + { _AMO|_FAMO, 41, FI_LOR, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 1 }, + { _AMO|_FAMO, 71, FI_BOR, 1 }, + { _AMO|_FAMO, 81, FI_BAND, 1 }, + { _AMO|_FAMO, 91, FI_BXOR, 1 }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, + 0.0, 10.2+I*1.1, 0.3+I*2.2, 10.2+I*1.1 }, + { _FAMO, 111, FI_ATOMIC_READ, 0, + 0.0, 1.1+I*1.1, 10.2+I*1.1, 10.2+I*1.1 }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, + 12.0+I*1.1, 12.3+I*1.1, 10.0+I*1.1, 10.0+I*1.1 }, + { _CAMO, 122, FI_CSWAP, 0, + 10.0+I*1.1, 12.3+I*1.1, 10.0+I*1.1, 12.3+I*1.1 }, + { _CAMO, 131, FI_CSWAP_NE, 0, + 12.0+I*1.1, 12.3+I*1.1, 10.0+I*1.1, 12.3+I*1.1 }, + { _CAMO, 132, FI_CSWAP_NE, 0, + 10.0+I*1.1, 12.3+I*1.1, 10.0+I*1.1, 10.0+I*1.1 }, + { _CAMO, 141, FI_CSWAP_LE, 1 }, + { _CAMO, 151, FI_CSWAP_LT, 1 }, + { _CAMO, 161, FI_CSWAP_GE, 1 }, + { _CAMO, 171, FI_CSWAP_GT, 1 }, + { _CAMO, 181, FI_MSWAP, 1 }, +}; + +ParameterizedTestParameters(atomic, test_dcplx) +{ + struct test_dcplx_parms *params; + int tests = ARRAY_SIZE(dcplx_parms); + int i; + + params = malloc(sizeof(dcplx_parms) * 2); + + memcpy(params, dcplx_parms, sizeof(dcplx_parms)); + memcpy((uint8_t *)params + sizeof(dcplx_parms), dcplx_parms, + sizeof(dcplx_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_dcplx_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_dcplx_parms *p, atomic, test_dcplx) +{ + struct mem_region mr; + enum fi_datatype dt = FI_DOUBLE_COMPLEX; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + + rma = _cxit_create_mr(&mr, &p->key); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _FAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _CAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, + p->key); + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +Test(atomic, amo_cleanup) +{ + int ret; + long i; + uint8_t *send_buf; + int win_len = 0x1000; + int writes = 50; + struct mem_region mr; + uint64_t operand1 = 0; + uint64_t key = RMA_WIN_KEY; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + for (i = 0; i < win_len; i++) + send_buf[i] = 0xb1 * i; + + _cxit_create_mr(&mr, &key); + + /* Send 8 bytes from send buffer data to RMA window 0 */ + for (i = 0; i < writes; i++) { + do { + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_tx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + } + + _cxit_destroy_mr(&mr); + + /* Exit without gathering events. */ +} + +/* Perform a batch of AMOs. 
A C_STATE update is required for each transaction + * since each transaction in the batch uses a unique internal request. + */ +Test(atomic, amo_batch) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + int ret; + int i; + uint64_t key = RMA_WIN_KEY; + + _cxit_create_mr(&mr, &key); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + while (fi_cntr_read(cxit_write_cntr) != 4) + ; + + for (i = 0; i < 4; i++) { + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + } + + _cxit_destroy_mr(&mr); +} + +void cxit_setup_amo_selective_completion(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = FI_COMPLETION; + cxit_setup_rma(); +} + +/* Test selective completion behavior with AMOs. */ +Test(atomic_sel, selective_completion, + .init = cxit_setup_amo_selective_completion, + .fini = cxit_teardown_rma) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t compare; + uint64_t result; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc compare_ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + int count = 0; + uint64_t key = RMA_WIN_KEY; + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + result_ioc.addr = &result; + result_ioc.count = 1; + + compare_ioc.addr = &compare; + compare_ioc.count = 1; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Non-fetching AMOs */ + + /* Completion requested by default. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Completion explicitly requested. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion. 
*/ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Inject never generates an event */ + operand1 = 1; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Fetching AMOs */ + count = 0; + + /* Completion requested by default. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + &result, NULL, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Completion explicitly requested. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + /* Completion explicitly requested with inject. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_COMPLETION | FI_INJECT); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion with inject. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_INJECT); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Comp AMOs */ + + /* Completion requested by default. 
*/ + ret = fi_compare_atomic(cxit_ep, &operand1, 1, 0, + &compare, NULL, + &result, NULL, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_CSWAP, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Completion explicitly requested. */ + msg.op = FI_CSWAP; + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Suppress completion. */ + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); +} + +void cxit_setup_amo_selective_completion_suppress(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = 0; + cxit_setup_rma(); +} + +/* Test selective completion behavior with RMA. */ +Test(atomic_sel, selective_completion_suppress, + .init = cxit_setup_amo_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t compare; + uint64_t result; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc compare_ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + int count = 0; + uint64_t key = RMA_WIN_KEY; + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + result_ioc.addr = &result; + result_ioc.count = 1; + + compare_ioc.addr = &compare; + compare_ioc.count = 1; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Non-fetching AMOs */ + + /* Completion suppressed by default. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Completion explicitly requested. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + count++; + + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion. 
*/ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Inject never generates an event */ + operand1 = 1; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Fetching AMOs */ + count = 0; + + /* Completion suppressed by default. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + &result, NULL, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Completion explicitly requested. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Comp AMOs */ + + /* Completion suppressed by default. */ + ret = fi_compare_atomic(cxit_ep, &operand1, 1, 0, + &compare, NULL, + &result, NULL, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_CSWAP, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Completion explicitly requested. */ + msg.op = FI_CSWAP; + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Suppress completion. */ + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + /* Completion explicitly requested with inject. 
*/ + msg.op = FI_CSWAP; + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, + FI_COMPLETION | FI_INJECT); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Suppress completion with inject. */ + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, FI_INJECT); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); +} + +/* Test remote counter events with AMOs */ +Test(atomic, rem_cntr) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + int count = 0; + uint64_t key = RMA_WIN_KEY; + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Wait for remote counter event, then check data */ + count++; + + while (fi_cntr_read(cxit_rem_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + operand1 = 3; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Wait for remote counter event, then check data */ + count++; + + while (fi_cntr_read(cxit_rem_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + operand1 = 9; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Wait for remote counter event, then check data */ + count++; + + while (fi_cntr_read(cxit_rem_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + _cxit_destroy_mr(&mr); +} + +/* Test simple operations: AMO SUM UINT64_T, FAMO SUM UINT64_T, and CAMO SWAP_NE + * UINT64_T. If this doesn't work, nothing else will. + */ +TestSuite(atomic_flush, .init = cxit_setup_rma_disable_fi_rma_event, + .fini = cxit_teardown_rma, .disabled = AMO_DISABLED, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Perform a fetching AMO with flush at target. 
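+ * The flush is verified by sampling the C_CNTR_IXE_DMAWR_FLUSH_REQS counter
+ * before and after the operation and asserting that it increased.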
*/ +Test(atomic_flush, fetch_flush) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t fetch_remote = 4; + uint64_t exp_remote = fetch_remote; + uint64_t *rma; + int ret; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + uint64_t result = 0; + struct fi_ioc result_ioc = { .count = 1, .addr = &result }; + int count = 0; + uint64_t flushes_start; + uint64_t flushes_end; + uint64_t key = RMA_WIN_KEY; + bool enable = false; + + /* If FI_MR_PROV_KEY disable the remote provider key cache */ + fi_control(&cxit_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, + &enable); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + rma = _cxit_create_mr(&mr, &key); + *rma = fetch_remote; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(result, fetch_remote, + "Result = %ld, expected = %ld", + result, fetch_remote); + + _cxit_destroy_mr(&mr); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_assert(flushes_end > flushes_start); +} + +/* Perform a fetching AMO with flush at target, but use an illegal + * RMA offset. Verify that an error is returned in the CQE even though + * the subsequent flush succeeds. 
+ */ +Test(atomic_flush, fetch_flush_bounds_err) +{ + struct mem_region mr; + struct fi_cq_err_entry err; + struct fi_cq_tagged_entry cqe; + uint64_t operand1 = 1; + uint64_t result = 0; + uint64_t *rma; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + struct fi_ioc result_ioc = { .count = 1, .addr = &result }; + uint64_t key = RMA_WIN_KEY; + int ret; + bool enable = false; + + /* If FI_MR_PROV_KEY disable the remote provider key cache */ + fi_control(&cxit_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, + &enable); + + rma = _cxit_create_mr(&mr, &key); + cr_assert_not_null(rma); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = RMA_WIN_LEN + 1; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic flush success"); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "fi_cq_readerr error %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value: %d", err.err); + + _cxit_destroy_mr(&mr); +} + +/* Perform an AMO that uses a flushing ZBR at the target. */ +Test(atomic, flush) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + int count = 0; + uint64_t flushes_start; + uint64_t flushes_end; + uint64_t key = RMA_WIN_KEY; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + _cxit_destroy_mr(&mr); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_assert(flushes_end > flushes_start); +} + +/* Test AMO FI_MORE */ +Test(atomic, more) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + int i = 0; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 
1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + operand1 = 3; + exp_remote += operand1; + + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Wait for two events. */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* Validate sent data */ + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + _cxit_destroy_mr(&mr); +} + +/* Test AMO FI_FENCE */ +Test(atomic, fence) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_FENCE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* Validate sent data */ + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + _cxit_destroy_mr(&mr); +} + +void cxit_amo_setup_nofence(void) +{ + cxit_setup_getinfo(); + cxit_fi_hints->caps = CXIP_EP_PRI_CAPS; + cxit_setup_rma(); +} + +/* Test AMO without FI_FENCE */ +Test(atomic_nofence, nofence, + .init = cxit_amo_setup_nofence, + .fini = cxit_teardown_rma) +{ + struct mem_region mr; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_FENCE); + cr_assert(ret == -FI_EINVAL); + + _cxit_destroy_mr(&mr); +} + +void cxit_setup_amo_opt(void) +{ + cxit_setup_getinfo(); + + /* Explicitly request unordered RMA */ + cxit_fi_hints->caps = FI_ATOMIC; + cxit_fi_hints->tx_attr->msg_order = 0; + + cxit_setup_rma(); +} + 
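+/* The amo_opt suite runs with tx_attr->msg_order cleared (see
+ * cxit_setup_amo_opt() above). Its tests exercise the CXI-specific
+ * FI_CXI_UNRELIABLE and FI_CXI_HRP flags and cross-check the results against
+ * the restricted-packet and HRP ACK hardware counters.
+ */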
+TestSuite(amo_opt, .init = cxit_setup_amo_opt, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test Unreliable/HRP AMOs */ +Test(amo_opt, hrp) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + uint64_t res_start; + uint64_t res_end; + uint64_t hrp_acks_start; + uint64_t hrp_acks_end; + struct cxip_ep *cxi_ep; + uint64_t compare; + uint64_t result; + struct fi_ioc compare_ioc = { .count = 1, .addr = &compare }; + struct fi_ioc result_ioc = { .count = 1, .addr = &result }; + + /* HRP not supported in netsim */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + if (is_netsim(cxi_ep->ep_obj)) + return; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &res_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + ret = cxit_dom_read_cntr(C_CNTR_HNI_HRP_ACK, + &hrp_acks_start, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + + exp_remote += operand1; + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_UNRELIABLE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* Validate sent data */ + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* HRP requires UNRELIABLE */ + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_HRP); + cr_assert(ret == -FI_EINVAL, "Return code = %d", ret); + + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* HRP FAMO is invalid */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == -FI_EBADFLAGS, "Return code = %d", ret); + + /* Try unreliable FAMO */ + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* wait a second to check the operation was performed. The HRP response + * returns before the request hits the NIC. 
+ */ + usleep(1000); + + /* Validate sent data */ + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* HRP compare AMO is invalid */ + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == -FI_EBADFLAGS, "Return code = %d", ret); + + /* Try unreliable compare AMO. */ + msg.op = FI_CSWAP; + compare = exp_remote; + operand1 = exp_remote + 1; + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, FI_CXI_UNRELIABLE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + sleep(1); + + /* Validate data */ + cr_assert_eq(*rma, operand1, + "Result = %ld, expected = %ld", + *rma, operand1); + cr_assert_eq(result, exp_remote, + "Result = %ld, expected = %ld", + result, exp_remote); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &res_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + ret = cxit_dom_read_cntr(C_CNTR_HNI_HRP_ACK, + &hrp_acks_end, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + cr_assert_eq(hrp_acks_end - hrp_acks_start, 1, + "unexpected hrp_acks count: %lu\n", + hrp_acks_end - hrp_acks_start); + cr_assert_eq(res_end - res_start, 4, + "unexpected restricted packets count: %lu\n", + res_end - res_start); + + /* HRP does not support Fetching AMOS. */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == -FI_EBADFLAGS, "Return code = %d", ret); + + ret = fi_compare_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == -FI_EBADFLAGS, "Return code = %d", ret); + + _cxit_destroy_mr(&mr); +} + +Test(atomic, std_mr_inject) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + int count = 0; + int i; + uint64_t win_key = CXIP_PTL_IDX_MR_OPT_CNT; + + rma = _cxit_create_mr(&mr, &win_key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + + for (i = 0; i < 10; i++) { + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, win_key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + } + + /* Corrupt the user operand buffer to make sure the NIC is not using it + * for an inject. 
+ */ + operand1 = 0; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure no events were delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); +} + +/* Test ERRATA-2794 32bit non-fetch AMO with HRP work-around */ +Test(amo_opt, errata_2794) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + union { + uint32_t _32bit; + uint64_t _64bit; + } operand, exp_remote, *rma; + int ret; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + struct cxip_ep *cxi_ep; + + /* HRP not supported in netsim */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + if (is_netsim(cxi_ep->ep_obj)) + return; + + rma = _cxit_create_mr(&mr, &key); + + ioc.addr = &operand; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Use 64-bit to make sure we are using a HRP communication profile */ + exp_remote._64bit = 0; + cr_assert_eq(rma->_64bit, exp_remote._64bit, + "Result = %" PRId64 ", expected = %" PRId64, + rma->_64bit, exp_remote._64bit); + + operand._64bit = 1UL; + exp_remote._64bit += operand._64bit; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* wait a second to check the operation was performed. The HRP response + * returns before the request hits the NIC. Validate data and that + * CQ configured for HRP. + */ + usleep(1000); + cr_assert_eq(rma->_64bit, exp_remote._64bit, + "Result = %" PRId64 ", expected = %" PRId64, + rma->_64bit, exp_remote._64bit); + + /* ERRATA-2794 */ + rma->_32bit = 0; + exp_remote._32bit = 0; + msg.datatype = FI_UINT32; + + operand._32bit = 1; + exp_remote._32bit += operand._32bit; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* wait a second to check the operation was performed, and validate + * data. + */ + usleep(1000); + cr_assert_eq(rma->_32bit, exp_remote._32bit, + "Result = %d, expected = %d", + rma->_32bit, exp_remote._32bit); + + /* Perform successive 32-bit unsigned non-fetching atomic, no + * communication profile change would be required. + */ + exp_remote._32bit += operand._32bit; + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* wait a second to check the operation was performed, and + * validate data. 
+ */ + usleep(1000); + cr_assert_eq(rma->_32bit, exp_remote._32bit, + "Result = %d, expected = %d", + rma->_32bit, exp_remote._32bit); + + _cxit_destroy_mr(&mr); +} + +static void amo_hybrid_mr_desc_test_runner(bool fetching, bool compare, + bool cq_events, bool buf_mr, + bool compare_mr, bool result_mr, + bool mswap, bool read, bool flush) +{ + struct mem_region buf_window; + struct mem_region compare_window; + struct mem_region result_window; + struct mem_region remote_window; + uint64_t remote_key = 0x1; + uint64_t buf_key = 0x2; + uint64_t compare_key = 0x3; + uint64_t result_key = 0x4; + int win_len = 1; + void *buf_desc[1] = {}; + void *compare_desc[1] = {}; + void *result_desc[1] = {}; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_ioc fetch_ioc = {}; + struct fi_ioc compare_ioc = {}; + int ret; + uint64_t cqe_flags = fetching ? FI_ATOMIC | FI_READ : + FI_ATOMIC | FI_WRITE; + struct fid_cntr *cntr = fetching ? cxit_read_cntr : cxit_write_cntr; + struct fi_cq_tagged_entry cqe; + uint64_t amo_flags = cq_events ? FI_COMPLETION : 0; + + if (flush) + amo_flags |= FI_DELIVERY_COMPLETE; + else + amo_flags |= FI_TRANSMIT_COMPLETE; + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &buf_key, + &buf_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &compare_key, + &compare_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &result_key, + &result_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_REMOTE_READ | FI_REMOTE_WRITE, 0x3, + &remote_key, &remote_window); + cr_assert(ret == FI_SUCCESS); + + if (buf_mr) + buf_desc[0] = fi_mr_desc(buf_window.mr); + + if (compare_mr) + compare_desc[0] = fi_mr_desc(compare_window.mr); + + if (result_mr) + result_desc[0] = fi_mr_desc(result_window.mr); + + ioc.addr = buf_window.mem; + ioc.count = 1; + + rma_ioc.count = 1; + rma_ioc.key = remote_key; + + msg.msg_iov = &ioc; + msg.desc = buf_desc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + + if (!compare) { + msg.datatype = FI_UINT8; + + if (fetching && read) + msg.op = FI_ATOMIC_READ; + else + msg.op = FI_SUM; + + *buf_window.mem = 1; + *result_window.mem = 0; + *remote_window.mem = 1; + + if (fetching) { + fetch_ioc.addr = result_window.mem; + fetch_ioc.count = 1; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, + result_desc, 1, amo_flags); + cr_assert(ret == FI_SUCCESS); + } else { + ret = fi_atomicmsg(cxit_ep, &msg, amo_flags); + cr_assert(ret == FI_SUCCESS); + } + + while (1) { + ret = fi_cntr_wait(cntr, 1, 1000); + if (ret == FI_SUCCESS) + break; + } + + if (!read) + cr_assert_eq(*remote_window.mem, 2, + "Data mismatch: expected=2 got=%d\n", + *remote_window.mem); + + if (fetching) + cr_assert_eq(*result_window.mem, 1, + "Data mismatch: expected=1 got=%d\n", + *result_window.mem); + } else if (mswap) { + msg.datatype = FI_UINT8; + msg.op = FI_MSWAP; + + compare_ioc.addr = compare_window.mem; + compare_ioc.count = 1; + + fetch_ioc.addr = result_window.mem; + fetch_ioc.count = 1; + + *buf_window.mem = 0xA0; + *compare_window.mem = 0xB; + *result_window.mem = 1; + *remote_window.mem = 0xF; + + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, + compare_desc, 1, &fetch_ioc, + result_desc, 1, amo_flags); + cr_assert_eq(ret, FI_SUCCESS, "Bad rc=%d\n", ret); + + while (1) { + ret = fi_cntr_wait(cntr, 1, 1000); + if (ret == FI_SUCCESS) + break; + } + + 
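+		/* Expected FI_MSWAP results (comment added for clarity),
+		 * following the fi_atomic(3) mask-swap definition
+		 * dest = (buf & compare) | (dest & ~compare): the remote byte
+		 * becomes (0xA0 & 0x0B) | (0x0F & ~0x0B) = 0x04, and the fetch
+		 * buffer receives the prior remote value 0x0F.
+		 */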
cr_assert_eq(*remote_window.mem, 4, + "Data mismatch: expected=4 got=%d\n", + *remote_window.mem); + + cr_assert_eq(*result_window.mem, 0xF, + "Data mismatch: expected=0xF got=%d\n", + *result_window.mem); + } else { + msg.datatype = FI_UINT8; + msg.op = FI_CSWAP; + + compare_ioc.addr = compare_window.mem; + compare_ioc.count = 1; + + fetch_ioc.addr = result_window.mem; + fetch_ioc.count = 1; + + *buf_window.mem = 3; + *compare_window.mem = 1; + *result_window.mem = 0; + *remote_window.mem = 1; + + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, + compare_desc, 1, &fetch_ioc, + result_desc, 1, amo_flags); + cr_assert_eq(ret, FI_SUCCESS, "Bad rc=%d\n", ret); + + while (1) { + ret = fi_cntr_wait(cntr, 1, 1000); + if (ret == FI_SUCCESS) + break; + } + + cr_assert_eq(*remote_window.mem, 3, + "Data mismatch: expected=3 got=%d\n", + *remote_window.mem); + + cr_assert_eq(*result_window.mem, 1, + "Data mismatch: expected=1 got=%d\n", + *result_window.mem); + } + + if (cq_events) { + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, cqe_flags, NULL); + } + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&remote_window); + mr_destroy(&result_window); + mr_destroy(&compare_window); + mr_destroy(&buf_window); +} + +TestSuite(amo_hybrid_mr_desc, .init = cxit_setup_rma_hybrid_mr_desc, + .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(amo_hybrid_mr_desc, non_fetching_no_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(false, false, false, false, false, + false, false, false, false); +} + +Test(amo_hybrid_mr_desc, non_fetching_buf_result_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(false, false, false, true, false, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, fetching_no_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, fetching_buf_result_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, true, false, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, non_fetching_no_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(false, false, true, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, non_fetching_buf_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(false, false, true, true, false, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, fetching_no_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, fetching_buf_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, true, false, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_no_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_buf_compare_result_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, true, true, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_no_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_buf_compare_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, true, true, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_mswap_buf_compare_result_mr_desc_no_cqe) +{ + 
amo_hybrid_mr_desc_test_runner(true, true, false, true, true, true, + true, false, false); +} + +Test(amo_hybrid_mr_desc, compare_mswap_buf_compare_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, true, true, true, + true, false, false); +} + +Test(amo_hybrid_mr_desc, read_buf_result_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, true, false, true, + false, true, false); +} + +Test(amo_hybrid_mr_desc, read_buf_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, true, false, true, + false, true, false); +} + +Test(amo_hybrid_mr_desc, non_fetching_no_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(false, false, false, false, false, + false, false, false, true); +} + +Test(amo_hybrid_mr_desc, non_fetching_buf_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(false, false, false, true, false, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, fetching_no_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, fetching_buf_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, true, false, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, non_fetching_no_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(false, false, true, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, non_fetching_buf_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(false, false, true, true, false, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, fetching_no_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, fetching_buf_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, true, false, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_no_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_buf_compare_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, true, true, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_no_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_buf_compare_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, true, true, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_mswap_buf_compare_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, true, true, true, + true, false, true); +} + +Test(amo_hybrid_mr_desc, compare_mswap_buf_compare_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, true, true, true, + true, false, true); +} + +Test(amo_hybrid_mr_desc, read_buf_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, true, false, true, + false, true, true); +} + +Test(amo_hybrid_mr_desc, read_buf_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, true, false, true, + false, true, true); +} + +Test(amo_hybrid_mr_desc, fetching_amo_failure) +{ + struct mem_region buf_window; + struct mem_region result_window; + uint64_t remote_key = 0x1; + uint64_t buf_key = 0x2; + uint64_t result_key = 0x4; + int win_len = 1; + void *buf_desc[1] = {}; + void 
*result_desc[1] = {}; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_ioc fetch_ioc = {}; + int ret; + struct fid_cntr *cntr = cxit_read_cntr; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry cq_err; + uint64_t amo_flags = FI_TRANSMIT_COMPLETE; + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &buf_key, + &buf_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &result_key, + &result_window); + cr_assert(ret == FI_SUCCESS); + + buf_desc[0] = fi_mr_desc(buf_window.mr); + result_desc[0] = fi_mr_desc(result_window.mr); + + ioc.addr = buf_window.mem; + ioc.count = 1; + + rma_ioc.count = 1; + rma_ioc.key = remote_key; + + msg.msg_iov = &ioc; + msg.desc = buf_desc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.datatype = FI_UINT8; + msg.op = FI_SUM; + + fetch_ioc.addr = result_window.mem; + fetch_ioc.count = 1; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, result_desc, 1, + amo_flags); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cntr) != 1) + ; + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == -FI_EAVAIL); + + ret = fi_cq_readerr(cxit_tx_cq, &cq_err, 0); + cr_assert(ret == 1); + + cr_assert(cq_err.flags == (FI_ATOMIC | FI_READ)); + cr_assert(cq_err.op_context == NULL); + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&result_window); + mr_destroy(&buf_window); +} + +Test(amo_hybrid_mr_desc, amo_failure) +{ + struct mem_region buf_window; + uint64_t remote_key = 0x1; + uint64_t buf_key = 0x2; + int win_len = 1; + void *buf_desc[1] = {}; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + int ret; + struct fid_cntr *cntr = cxit_write_cntr; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry cq_err; + uint64_t amo_flags = FI_TRANSMIT_COMPLETE; + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &buf_key, + &buf_window); + cr_assert(ret == FI_SUCCESS); + + buf_desc[0] = fi_mr_desc(buf_window.mr); + + ioc.addr = buf_window.mem; + ioc.count = 1; + + rma_ioc.count = 1; + rma_ioc.key = remote_key; + + msg.msg_iov = &ioc; + msg.desc = buf_desc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.datatype = FI_UINT8; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, amo_flags); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cntr) != 1) + ; + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == -FI_EAVAIL); + + ret = fi_cq_readerr(cxit_tx_cq, &cq_err, 0); + cr_assert(ret == 1); + + cr_assert(cq_err.flags == (FI_ATOMIC | FI_WRITE)); + cr_assert(cq_err.op_context == NULL); + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&buf_window); +} + +Test(amo_hybrid_mr_desc, invalid_addr_fetching_amo_failure) +{ + struct mem_region buf_window; + struct mem_region result_window; + uint64_t remote_key = 0x1; + uint64_t result_key = 0x4; + int win_len = 1; + void *buf_desc[1] = {}; + void *result_desc[1] = {}; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_ioc fetch_ioc = {}; + int ret; + struct fid_cntr *cntr = cxit_read_cntr; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry cq_err; + uint64_t amo_flags = FI_TRANSMIT_COMPLETE; + + ret = mr_create(win_len, 
FI_REMOTE_READ | FI_REMOTE_WRITE, + 0xa, &remote_key, &buf_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &result_key, + &result_window); + cr_assert(ret == FI_SUCCESS); + + buf_desc[0] = fi_mr_desc(buf_window.mr); + result_desc[0] = fi_mr_desc(result_window.mr); + + ioc.addr = buf_window.mem; + ioc.count = 1; + + rma_ioc.count = 1; + rma_ioc.key = remote_key; + + msg.msg_iov = &ioc; + msg.desc = buf_desc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.datatype = FI_UINT8; + msg.op = FI_SUM; + + fetch_ioc.addr = result_window.mem + 0xffffffffff; + fetch_ioc.count = 1; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, result_desc, 1, + amo_flags); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cntr) != 1) + ; + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == -FI_EAVAIL); + + ret = fi_cq_readerr(cxit_tx_cq, &cq_err, 0); + cr_assert(ret == 1); + + cr_assert(cq_err.flags == (FI_ATOMIC | FI_READ)); + cr_assert(cq_err.op_context == NULL); + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&result_window); + mr_destroy(&buf_window); +} + +struct fi_query_atomic_test { + enum fi_datatype datatype; + enum fi_op op; + bool valid_atomic_attr; + uint64_t flags; + int expected_rc; + int amo_remap_to_pcie_fadd; +}; + +ParameterizedTestParameters(atomic, query_atomic) +{ + static struct fi_query_atomic_test params[] = { + /* NULL atomic attributes. */ + { + .datatype = FI_INT8, + .op = FI_MIN, + .valid_atomic_attr = false, + .flags = 0, + .expected_rc = -FI_EINVAL, + }, + /* Bad dataype. */ + { + .datatype = 0xffff, + .op = FI_MIN, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EINVAL, + }, + /* Bad op. */ + { + .datatype = FI_INT8, + .op = 0xffff, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EINVAL, + }, + /* Bad flags. */ + { + .datatype = FI_INT8, + .op = FI_MIN, + .valid_atomic_attr = true, + .flags = FI_COMPARE_ATOMIC | FI_FETCH_ATOMIC, + .expected_rc = -FI_EINVAL, + }, + /* Valid SUM FI_INT8. */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = FI_SUCCESS, + }, + /* Valid SUM FI_INT8 fetching. */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + } + }; + size_t param_sz = ARRAY_SIZE(params); + + return cr_make_param_array(struct fi_query_atomic_test, params, + param_sz); +} + +ParameterizedTest(struct fi_query_atomic_test *params, atomic, query_atomic) +{ + int ret; + struct fi_atomic_attr atomic_attr; + struct fi_atomic_attr *attr = + params->valid_atomic_attr ? &atomic_attr : NULL; + + ret = fi_query_atomic(cxit_domain, params->datatype, params->op, attr, + params->flags); + + cr_assert_eq(ret, params->expected_rc, + "Unexpected fi_query_atomic() rc: expected=%d got=%d\n", + params->expected_rc, ret); +} + +TestSuite(pcie_atomic, .init = reset_amo_remap_to_pcie_fadd, + .fini = reset_amo_remap_to_pcie_fadd, + .timeout = CXIT_DEFAULT_TIMEOUT); + +ParameterizedTestParameters(pcie_atomic, query_atomic) +{ + static struct fi_query_atomic_test params[] = { + /* Valid SUM FI_INT8. */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = -1, + }, + + /* Invalid PCIe SUM FI_INT8. Only 32 and 64 bit operations are + * supported. 
+ */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = -1, + }, + + /* Valid SUM FI_INT32. */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = -1, + }, + + /* Invalid PCIe SUM FI_INT32 due to amo_remap_to_pcie_fadd being + * -1. + */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = -1, + }, + + /* Invalid PCIe SUM FI_INT32 due to missing FI_FETCH_ATOMIC. + */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_INT32 since FI_COMPARE_ATOMIC is invalid. + */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_COMPARE_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid PCIe SUM FI_INT32 remapping C_AMO_OP_MIN. */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid PCIe SUM FI_UINT32 remapping C_AMO_OP_MIN. */ + { + .datatype = FI_UINT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid PCIe SUM FI_INT64 remapping C_AMO_OP_MIN. */ + { + .datatype = FI_INT64, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid PCIe SUM FI_UINT64 remapping C_AMO_OP_MIN. */ + { + .datatype = FI_UINT64, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_INT8. */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_UINT8. */ + { + .datatype = FI_UINT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_INT16. */ + { + .datatype = FI_INT16, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_UINT16. */ + { + .datatype = FI_UINT16, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_FLOAT. */ + { + .datatype = FI_FLOAT, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_DOUBLE. 
*/ + { + .datatype = FI_DOUBLE, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_FLOAT_COMPLEX. */ + { + .datatype = FI_FLOAT_COMPLEX, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_DOUBLE_COMPLEX. */ + { + .datatype = FI_DOUBLE_COMPLEX, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_LONG_DOUBLE. */ + { + .datatype = FI_LONG_DOUBLE, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_LONG_DOUBLE_COMPLEX. */ + { + .datatype = FI_LONG_DOUBLE_COMPLEX, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid FI_MIN operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_MIN, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid FI_MAX operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_MAX, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MAX, + }, + + /* Invalid FI_SUM operation without PCIe AMO since it is + * remapped. + */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_SUM, + }, + + /* Invalid FI_LOR operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_LOR, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_LOR, + }, + + /* Invalid FI_LAND operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_LAND, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_LAND, + }, + + /* Invalid FI_BOR operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_BOR, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_BOR, + }, + + /* Invalid FI_BAND operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_BAND, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_BAND, + }, + + /* Invalid FI_LXOR operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_LXOR, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_LXOR, + }, + + /* Invalid FI_BXOR operation since it is remapped. 
*/ + { + .datatype = FI_INT8, + .op = FI_BXOR, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_BXOR, + }, + }; + size_t param_sz = ARRAY_SIZE(params); + + return cr_make_param_array(struct fi_query_atomic_test, params, + param_sz); +} + +ParameterizedTest(struct fi_query_atomic_test *params, pcie_atomic, + query_atomic) +{ + int ret; + struct fi_atomic_attr atomic_attr; + struct fi_atomic_attr *attr = + params->valid_atomic_attr ? &atomic_attr : NULL; + + /* The AMO remap value must be set before the libfabric domain is allocated. + * Otherwise, an inconsistent view of the AMO remap value will be read. + */ + set_amo_remap_to_pcie_fadd(params->amo_remap_to_pcie_fadd); + cxit_setup_rma(); + + ret = fi_query_atomic(cxit_domain, params->datatype, params->op, attr, + params->flags); + + cr_assert_eq(ret, params->expected_rc, + "Unexpected fi_query_atomic() rc: expected=%d got=%d\n", + params->expected_rc, ret); + + cxit_teardown_rma(); +} + +struct fi_pcie_fadd_test { + enum fi_datatype dt; + union { + uint64_t u64_src; + int64_t s64_src; + uint32_t u32_src; + int32_t s32_src; + } src; + union { + uint64_t u64_dst; + int64_t s64_dst; + uint32_t u32_dst; + int32_t s32_dst; + } dst; + union { + uint64_t u64_result; + int64_t s64_result; + uint32_t u32_result; + int32_t s32_result; + } result; + int amo_remap_to_pcie_fadd; +}; + +ParameterizedTestParameters(pcie_atomic, fadd) +{ + static struct fi_pcie_fadd_test params[] = { + /* Integer overflow. */ + { + .dt = FI_INT32, + .src.s32_src = 2147483647, + .dst.s32_dst = 1, + .result.s32_result = -2147483648, + .amo_remap_to_pcie_fadd = C_AMO_OP_SWAP, + }, + + /* Unsigned integer overflow. */ + { + .dt = FI_UINT32, + .src.u32_src = 0xFFFFFFFF, + .dst.u32_dst = 1, + .result.u32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_SWAP, + }, + + /* Long overflow. */ + { + .dt = FI_INT64, + .src.s64_src = 9223372036854775807, + .dst.s64_dst = 1, + .result.u64_result = 0x8000000000000000, + .amo_remap_to_pcie_fadd = C_AMO_OP_SWAP, + }, + + /* Unsigned long overflow. */ + { + .dt = FI_UINT64, + .src.u64_src = 0xFFFFFFFFFFFFFFFF, + .dst.u64_dst = 1, + .result.u64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_SWAP, + }, + + /* Valid 32-bit AMO with C_AMO_OP_MIN remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid 64-bit AMO with C_AMO_OP_MIN remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid 32-bit AMO with C_AMO_OP_MAX remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_MAX, + }, + + /* Valid 64-bit AMO with C_AMO_OP_MAX remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_MAX, + }, + + /* Valid 32-bit AMO with C_AMO_OP_SUM remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_SUM, + }, + + /* Valid 64-bit AMO with C_AMO_OP_SUM remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_SUM, + }, + + /* Valid 32-bit AMO with C_AMO_OP_LOR remapped. 
*/ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LOR, + }, + + /* Valid 64-bit AMO with C_AMO_OP_LOR remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LOR, + }, + + /* Valid 32-bit AMO with C_AMO_OP_LAND remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LAND, + }, + + /* Valid 64-bit AMO with C_AMO_OP_LAND remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LAND, + }, + + /* Valid 32-bit AMO with C_AMO_OP_BOR remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BOR, + }, + + /* Valid 64-bit AMO with C_AMO_OP_BOR remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BOR, + }, + + /* Valid 32-bit AMO with C_AMO_OP_BAND remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BAND, + }, + + /* Valid 64-bit AMO with C_AMO_OP_BAND remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BAND, + }, + + /* Valid 32-bit AMO with C_AMO_OP_LXOR remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LXOR, + }, + + /* Valid 64-bit AMO with C_AMO_OP_LXOR remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LXOR, + }, + + /* Valid 32-bit AMO with C_AMO_OP_BXOR remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BXOR, + }, + + /* Valid 64-bit AMO with C_AMO_OP_BXOR remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BXOR, + }, + }; + size_t param_sz = ARRAY_SIZE(params); + + return cr_make_param_array(struct fi_pcie_fadd_test, params, + param_sz); +} + +ParameterizedTest(struct fi_pcie_fadd_test *params, pcie_atomic, fadd) +{ + int ret; + size_t amo_size; + uint64_t rkey = 0x1; + uint64_t nic_rkey = 0x2; + struct mem_region remote_window; + struct mem_region nic_remote_window; + union { + uint64_t u64_fetch; + int64_t s64_fetch; + uint32_t u32_fetch; + int32_t s32_fetch; + } fetch; + union { + uint64_t u64_fetch; + int64_t s64_fetch; + uint32_t u32_fetch; + int32_t s32_fetch; + } nic_fetch; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_ioc fetch_ioc = {}; + struct fi_cq_tagged_entry cqe; + uint64_t cur_cpu_fetch_cntr; + uint64_t new_cpu_fetch_cntr; + struct cxip_ep *cxi_ep; + + if (params->dt == FI_INT32 || params->dt == FI_UINT32) + amo_size = 4; + else + amo_size = 8; + + /* The AMO remap value must be set before the libfabric domain is allocated. + * Otherwise, an inconsistent view of the AMO remap value will be read. 
+ */ + set_amo_remap_to_pcie_fadd(params->amo_remap_to_pcie_fadd); + cxit_setup_rma(); + + /* PCIe AMOs not supported on netsim. */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + if (is_netsim(cxi_ep->ep_obj)) + goto teardown; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_CPU_FTCH_AMO_REQS, + &cur_cpu_fetch_cntr, NULL, true); + cr_assert(ret == 0); + + /* Create target MR and copy destination contents into it. */ + ret = mr_create(amo_size, FI_REMOTE_READ | FI_REMOTE_WRITE, 0, &rkey, + &remote_window); + cr_assert(ret == FI_SUCCESS); + memcpy(remote_window.mem, &params->dst, amo_size); + + /* Create another target MR to be used for NIC AMO SUM comparison to the + * PCIe AMO. + */ + ret = mr_create(amo_size, FI_REMOTE_READ | FI_REMOTE_WRITE, 0, + &nic_rkey, &nic_remote_window); + cr_assert(ret == FI_SUCCESS); + memcpy(nic_remote_window.mem, &params->dst, amo_size); + + /* Fill in fetching AMO descriptors. */ + ioc.addr = &params->src; + ioc.count = 1; + + rma_ioc.key = rkey; + rma_ioc.count = 1; + + fetch_ioc.addr = &fetch; + fetch_ioc.count = 1; + + msg.datatype = params->dt; + msg.op = FI_SUM; + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + + /* Issue PCIe fetch add. */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, NULL, 1, + FI_TRANSMIT_COMPLETE | FI_COMPLETION | + FI_CXI_PCIE_AMO); + cr_assert(ret == FI_SUCCESS); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Issue NIC fetching SUM AMO. */ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + rma_ioc.key = nic_rkey; + fetch_ioc.addr = &nic_fetch; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, NULL, 1, + FI_TRANSMIT_COMPLETE | FI_COMPLETION); + cr_assert(ret == FI_SUCCESS); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + } + + if (params->dt == FI_INT32) { + /* Compare PCIe FADD to the expected values. */ + cr_assert_eq(*((int32_t *)remote_window.mem), + params->result.s32_result, + "Unexpected remote AMO result: got=%d expected=%d\n", + *((int32_t *)remote_window.mem), + params->result.s32_result); + cr_assert_eq(fetch.s32_fetch, params->dst.s32_dst, + "Unexpected fetch AMO result: got=%d expected=%d\n", + fetch.s32_fetch, params->dst.s32_dst); + + /* Compare PCIe FADD to the NIC fetch add/sum values. */ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + cr_assert_eq(*((int32_t *)remote_window.mem), + *((int32_t *)nic_remote_window.mem), + "Unexpected remote AMO result: got=%d expected=%d\n", + *((int32_t *)remote_window.mem), + *((int32_t *)nic_remote_window.mem)); + cr_assert_eq(fetch.s32_fetch, nic_fetch.s32_fetch, + "Unexpected fetch AMO result: got=%d expected=%d\n", + fetch.s32_fetch, nic_fetch.s32_fetch); + } + } else if (params->dt == FI_UINT32) { + /* Compare PCIe FADD to the expected values. */ + cr_assert_eq(*((uint32_t *)remote_window.mem), + params->result.u32_result, + "Unexpected remote AMO result: got=%u expected=%u\n", + *((uint32_t *)remote_window.mem), + params->result.u32_result); + cr_assert_eq(fetch.u32_fetch, params->dst.u32_dst, + "Unexpected fetch AMO result: got=%u expected=%u\n", + fetch.u32_fetch, params->dst.u32_dst); + + /* Compare PCIe FADD to the NIC fetch add/sum values. 
*/ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + cr_assert_eq(*((uint32_t *)remote_window.mem), + *((uint32_t *)nic_remote_window.mem), + "Unexpected remote AMO result: got=%u expected=%u\n", + *((uint32_t *)remote_window.mem), + *((uint32_t *)nic_remote_window.mem)); + cr_assert_eq(fetch.u32_fetch, nic_fetch.u32_fetch, + "Unexpected fetch AMO result: got=%u expected=%u\n", + fetch.u32_fetch, nic_fetch.u32_fetch); + } + } else if (params->dt == FI_INT64) { + /* Compare PCIe FADD to the expected values. */ + cr_assert_eq(*((int64_t *)remote_window.mem), + params->result.s64_result, + "Unexpected remote AMO result: got=%ld expected=%ld\n", + *((int64_t *)remote_window.mem), + params->result.s64_result); + cr_assert_eq(fetch.s64_fetch, params->dst.s64_dst, + "Unexpected fetch AMO result: got=%ld expected=%ld\n", + fetch.s64_fetch, params->dst.s64_dst); + + /* Compare PCIe FADD to the NIC fetch add/sum values. */ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + cr_assert_eq(*((int64_t *)remote_window.mem), + *((int64_t *)nic_remote_window.mem), + "Unexpected remote AMO result: got=%ld expected=%ld\n", + *((int64_t *)remote_window.mem), + *((int64_t *)nic_remote_window.mem)); + cr_assert_eq(fetch.s64_fetch, nic_fetch.s64_fetch, + "Unexpected fetch AMO result: got=%ld expected=%ld\n", + fetch.s64_fetch, nic_fetch.s64_fetch); + } + } else { + /* Compare PCIe FADD to the expected values. */ + cr_assert_eq(*((uint64_t *)remote_window.mem), + params->result.u64_result, + "Unexpected remote AMO result: got=%lu expected=%lu\n", + *((uint64_t *)remote_window.mem), + params->result.u64_result); + cr_assert_eq(fetch.u64_fetch, params->dst.u64_dst, + "Unexpected fetch AMO result: got=%lu expected=%lu\n", + fetch.u64_fetch, params->dst.u64_dst); + + /* Compare PCIe FADD to the NIC fetch add/sum values. */ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + cr_assert_eq(*((uint64_t *)remote_window.mem), + *((uint64_t *)nic_remote_window.mem), + "Unexpected remote AMO result: got=%lu expected=%lu\n", + *((uint64_t *)remote_window.mem), + *((uint64_t *)nic_remote_window.mem)); + cr_assert_eq(fetch.u64_fetch, nic_fetch.u64_fetch, + "Unexpected fetch AMO result: got=%lu expected=%lu\n", + fetch.u64_fetch, nic_fetch.u64_fetch); + } + } + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_CPU_FTCH_AMO_REQS, + &new_cpu_fetch_cntr, NULL, true); + cr_assert(ret == 0); + + cr_assert(cur_cpu_fetch_cntr + 1 == new_cpu_fetch_cntr); + + mr_destroy(&nic_remote_window); + mr_destroy(&remote_window); + +teardown: + cxit_teardown_rma(); +} diff --git a/prov/cxi/test/auth_key.c b/prov/cxi/test/auth_key.c new file mode 100644 index 00000000000..57c30d37a15 --- /dev/null +++ b/prov/cxi/test/auth_key.c @@ -0,0 +1,2940 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + */ +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +void *memdup(const void *src, size_t n) +{ + void *dest; + + dest = malloc(n); + if (dest == NULL) + return NULL; + + return memcpy(dest, src, n); +} + +TestSuite(auth_key, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test fi_getinfo() verification of hints argument. 
*/ +Test(auth_key, invalid_auth_key_size_domain_attr_hints) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key_size = 1; + hints->domain_attr->auth_key = memdup(&auth_key, 1); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, missing_auth_key_size_domain_attr_hints) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key = memdup(&auth_key, 1); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, invalid_auth_key_size_ep_attr_hints) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->ep_attr->auth_key_size = 1; + hints->ep_attr->auth_key = memdup(&auth_key, 1); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, missing_auth_key_size_ep_attr_hints) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->ep_attr->auth_key = memdup(&auth_key, 1); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Verify fi_getinfo() correctly echoes back a valid auth_key hint using the + * default svc_id. 
+ */ +Test(auth_key, valid_default_domain_auth_key_hint) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_not_null(info->domain_attr->auth_key, "NULL domain auth_key"); + cr_assert_eq(hints->domain_attr->auth_key_size, + info->domain_attr->auth_key_size, + "fi_getinfo returned auth_key_size does not match hints"); + + ret = memcmp(hints->domain_attr->auth_key, info->domain_attr->auth_key, + hints->domain_attr->auth_key_size); + cr_assert_eq(ret, 0, "fi_getinfo returned auth_key does not match hints"); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +/* Verify fi_getinfo() correctly echos back a valid auth_key hint using the + * default svc_id. + */ +Test(auth_key, valid_default_ep_auth_key_hint) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->ep_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->ep_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_not_null(info->ep_attr->auth_key, "NULL ep auth_key"); + cr_assert_eq(hints->ep_attr->auth_key_size, + info->ep_attr->auth_key_size, + "fi_getinfo returned auth_key_size does not match hints"); + + ret = memcmp(hints->ep_attr->auth_key, info->ep_attr->auth_key, + hints->ep_attr->auth_key_size); + cr_assert_eq(ret, 0, "fi_getinfo returned auth_key does not match hints"); + + /* Since hints domain auth_key is NULL, CXI provider should echo the + * hints ep auth_key into info domain auth_key. This is the behavior + * some MPICH versions expect. + */ + cr_assert_not_null(info->domain_attr->auth_key, "NULL domain auth_key"); + cr_assert_eq(hints->ep_attr->auth_key_size, + info->domain_attr->auth_key_size, + "fi_getinfo returned auth_key_size does not match hints"); + + ret = memcmp(hints->ep_attr->auth_key, info->domain_attr->auth_key, + hints->ep_attr->auth_key_size); + cr_assert_eq(ret, 0, "fi_getinfo returned auth_key does not match hints"); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +/* Verify fi_getinfo() rejects a svc_id which has not been allocated thus making + * the auth_key invalid. 
+ */ +Test(auth_key, invalid_user_defined_domain_svc_id_hint) +{ + struct cxi_auth_key auth_key = { + .svc_id = 0xffff, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Verify fi_getinfo() rejects a svc_id which has not been allocated thus making + * the auth_key invalid. + */ +Test(auth_key, invalid_user_defined_ep_svc_id_hint) +{ + struct cxi_auth_key auth_key = { + .svc_id = 0xffff, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->ep_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->ep_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Verify fi_domain() rejects an invalid auth_key. */ +Test(auth_key, invalid_user_defined_domain_svc_id) +{ + struct cxi_auth_key auth_key = { + .svc_id = 0xffff, + .vni = 1, + }; + int ret; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + /* Override auth_key with bad auth_key. */ + if (info->domain_attr->auth_key) + free(info->domain_attr->auth_key); + info->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + info->domain_attr->auth_key_size = sizeof(auth_key); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_domain failed: %d", ret); + + fi_close(&fab->fid); + fi_freeinfo(info); +} + +/* Verify fi_endpoint() rejects an invalid auth_key. */ +Test(auth_key, invalid_user_defined_ep_svc_id) +{ + struct cxi_auth_key auth_key = { + .svc_id = 0xffff, + .vni = 1, + }; + int ret; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_ep *ep; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + /* Override auth_key with bad auth_key. 
*/ + if (info->domain_attr->auth_key) + free(info->domain_attr->auth_key); + info->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + info->domain_attr->auth_key_size = sizeof(auth_key); + + ret = fi_endpoint(dom, info, &ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_endpoint failed: %d", ret); + + fi_close(&dom->fid); + fi_close(&fab->fid); + fi_freeinfo(info); +} + +/* Valid service ID but invalid VNI for the service ID. */ +Test(auth_key, valid_user_defined_svc_id_invalid_vni_hints) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + uint16_t valid_vni = 0x120; + struct fi_info *hints; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 0x123, + }; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = valid_vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + auth_key.svc_id = svc_desc.svc_id; + hints->ep_attr->auth_key_size = sizeof(auth_key); + hints->ep_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +/* Valid service ID but invalid VNI for the service ID. */ +Test(auth_key, valid_user_defined_svc_id_invalid_vni_dom_attr) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + uint16_t valid_vni = 0x120; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 0x123, + }; + struct fid_fabric *fab; + struct fid_domain *dom; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = valid_vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + /* Override auth_key with bad auth_key. 
*/ + auth_key.svc_id = svc_desc.svc_id; + + if (info->domain_attr->auth_key) + free(info->domain_attr->auth_key); + info->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + info->domain_attr->auth_key_size = sizeof(auth_key); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_domain failed: %d", ret); + + fi_close(&fab->fid); + fi_freeinfo(info); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +/* Valid service ID but invalid VNI for the service ID. */ +Test(auth_key, valid_user_defined_svc_id_invalid_vni_ep_attr) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + uint16_t valid_vni = 0x120; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 0x123, + }; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_ep *ep; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = valid_vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + /* Override auth_key with bad auth_key. 
 */
+	auth_key.svc_id = svc_desc.svc_id;
+
+	if (info->domain_attr->auth_key)
+		free(info->domain_attr->auth_key);
+	info->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key));
+	info->domain_attr->auth_key_size = sizeof(auth_key);
+
+	ret = fi_endpoint(dom, info, &ep, NULL);
+	cr_assert_eq(ret, -FI_EINVAL, "fi_endpoint failed: %d", ret);
+
+	fi_close(&dom->fid);
+	fi_close(&fab->fid);
+	fi_freeinfo(info);
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret);
+	cxil_close_device(dev);
+}
+
+static void alloc_endpoint(struct fi_info *info, struct fid_fabric **fab,
+		struct fid_domain **dom, struct fid_av **av,
+		struct fid_cq **cq, struct fid_ep **ep)
+{
+	int ret;
+	struct fi_cq_attr cq_attr = {
+		.format = FI_CQ_FORMAT_TAGGED,
+	};
+	struct fi_av_attr av_attr = {};
+
+	ret = fi_fabric(info->fabric_attr, fab, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret);
+
+	ret = fi_domain(*fab, info, dom, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret);
+
+	ret = fi_cq_open(*dom, &cq_attr, cq, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open failed: %d", ret);
+
+	ret = fi_av_open(*dom, &av_attr, av, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret);
+
+	ret = fi_endpoint(*dom, info, ep, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_endpoint failed: %d", ret);
+
+	ret = fi_ep_bind(*ep, &(*av)->fid, 0);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind failed: %d", ret);
+
+	ret = fi_ep_bind(*ep, &(*cq)->fid, FI_TRANSMIT | FI_RECV);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind failed: %d", ret);
+
+	ret = fi_enable(*ep);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_enable failed: %d", ret);
+}
+
+Test(auth_key, valid_user_defined_svc_id_valid_vni_verify_vni_enforcement)
+{
+	int ret;
+	struct cxil_dev *dev;
+	struct cxi_svc_fail_info fail_info = {};
+	struct cxi_svc_desc svc_desc = {};
+	struct fi_info *hints;
+	struct fi_info *default_info;
+	struct fi_info *user_info;
+	struct cxi_auth_key auth_key = {};
+	uint16_t valid_vni = 0x1234;
+	struct fid_fabric *default_fab;
+	struct fid_domain *default_dom;
+	struct fid_av *default_av;
+	struct fid_cq *default_cq;
+	struct fid_ep *default_ep;
+	struct fid_fabric *user_fab;
+	struct fid_domain *user_dom;
+	struct fid_av *user_av;
+	struct fid_cq *user_cq;
+	struct fid_ep *user_ep;
+	char buf[256];
+	fi_addr_t target_default_ep;
+	struct fi_cq_tagged_entry event;
+	struct fi_cq_err_entry error;
+
+	/* Allocate infos for RDMA test. default_info uses the provider-assigned
+	 * default auth_key while user_info uses the user-defined auth_key.
+	 */
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		"0", FI_SOURCE, NULL, &default_info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Need to allocate a service to be used by libfabric.
+	 */
+	ret = cxil_open_device(0, &dev);
+	cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret);
+
+	svc_desc.restricted_vnis = 1;
+	svc_desc.enable = 1;
+	svc_desc.num_vld_vnis = 1;
+	svc_desc.vnis[0] = valid_vni;
+
+	ret = cxil_alloc_svc(dev, &svc_desc, &fail_info);
+	cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret);
+	svc_desc.svc_id = ret;
+
+	hints = fi_allocinfo();
+	cr_assert_not_null(hints, "fi_allocinfo failed");
+
+	hints->domain_attr->mr_mode = FI_MR_ENDPOINT;
+
+	hints->fabric_attr->prov_name = strdup("cxi");
+	cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed");
+
+	auth_key.svc_id = svc_desc.svc_id;
+	auth_key.vni = valid_vni;
+	hints->domain_attr->auth_key_size = sizeof(auth_key);
+	hints->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key));
+	cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed");
+
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		"255", FI_SOURCE, hints, &user_info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Allocate endpoints using different service IDs and VNIs. */
+	alloc_endpoint(default_info, &default_fab, &default_dom, &default_av,
+		&default_cq, &default_ep);
+	alloc_endpoint(user_info, &user_fab, &user_dom, &user_av,
+		&user_cq, &user_ep);
+
+	/* Insert the default EP address into the user AVs. */
+	ret = fi_av_insert(user_av, default_info->src_addr, 1,
+		&target_default_ep, 0, NULL);
+	cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret);
+
+	/* These two endpoints should not be able to talk due to operating in
+	 * different VNIs. This should result in an I/O error at the initiator.
+	 */
+	ret = fi_recv(default_ep, buf, sizeof(buf), NULL, FI_ADDR_UNSPEC, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret);
+
+	ret = fi_send(user_ep, buf, sizeof(buf), NULL, target_default_ep, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret);
+
+	do {
+		ret = fi_cq_read(user_cq, &event, 1);
+	} while (ret == -FI_EAGAIN);
+
+	cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_read failed: %d", ret);
+
+	ret = fi_cq_readerr(user_cq, &error, 0);
+	cr_assert_eq(ret, 1, "fi_cq_readerr failed: %d", ret);
+
+	/* Since these tests are loopback on the same NIC, RC_PTLTE_NOT_FOUND
+	 * is returned instead of RC_VNI_NOT_FOUND since the VNI is valid.
+	 * Non-loopback should return RC_VNI_NOT_FOUND.
+	 */
+	cr_assert_eq(error.prov_errno, C_RC_PTLTE_NOT_FOUND,
+		"Bad error.prov_errno: got=%d expected=%d",
+		error.prov_errno, C_RC_PTLTE_NOT_FOUND);
+
+	fi_close(&user_ep->fid);
+	fi_close(&user_cq->fid);
+	fi_close(&user_av->fid);
+	fi_close(&user_dom->fid);
+	fi_close(&user_fab->fid);
+	fi_close(&default_ep->fid);
+	fi_close(&default_cq->fid);
+	fi_close(&default_av->fid);
+	fi_close(&default_dom->fid);
+	fi_close(&default_fab->fid);
+	fi_freeinfo(user_info);
+	fi_freeinfo(hints);
+	fi_freeinfo(default_info);
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret);
+	cxil_close_device(dev);
+}
+
+/* Use the Slingshot plugin environment variables to generate an auth_key. Only
+ * a single entry per environment variable is specified.
+ */ +Test(auth_key, ss_plugin_env_vars_single_entry) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 288, + }; + char svc_id_str[256]; + struct fid_fabric *fab; + struct fid_domain *dom; + struct cxip_nic_attr *nic_attr; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = auth_key.vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + auth_key.svc_id = ret; + + ret = setenv("SLINGSHOT_VNIS", "288", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_DEVICES", "cxi0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + sprintf(svc_id_str, "%d", auth_key.svc_id); + ret = setenv("SLINGSHOT_SVC_IDS", svc_id_str, 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, auth_key.svc_id, + "fi_getinfo returned auth_key does not match Slingshot env vars"); + cr_assert_eq(nic_attr->default_vni, auth_key.vni, + "fi_getinfo returned auth_key does not match Slingshot env vars"); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + fi_close(&dom->fid); + fi_close(&fab->fid); + fi_freeinfo(info); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +/* Use the Slingshot plugin environment variables to generate an auth_key. + * Multiple values per environment variable are specified. + */ +Test(auth_key, ss_plugin_env_vars_multiple_entries) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 288, + }; + char svc_id_str[256]; + struct fid_fabric *fab; + struct fid_domain *dom; + struct cxip_nic_attr *nic_attr; + + /* Need to allocate a service to be used by libfabric. 
*/ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = auth_key.vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + auth_key.svc_id = ret; + + ret = setenv("SLINGSHOT_VNIS", "288,999", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_DEVICES", "cxi1,cxi15,cxi4,cxi0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + sprintf(svc_id_str, "1024,1025,1026,%d", auth_key.svc_id); + ret = setenv("SLINGSHOT_SVC_IDS", svc_id_str, 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, auth_key.svc_id, + "fi_getinfo returned auth_key does not match Slingshot env vars"); + cr_assert_eq(nic_attr->default_vni, auth_key.vni, + "fi_getinfo returned auth_key does not match Slingshot env vars"); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + fi_close(&dom->fid); + fi_close(&fab->fid); + fi_freeinfo(info); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +#define DEFAULT_SERVICE_ID 1U + +/* Use the Slingshot plugin environment variables to define auth_keys for a + * cxi device which does not exist. + */ +Test(auth_key, ss_plugin_env_vars_no_nic) +{ + struct fi_info *info; + int ret; + struct cxip_nic_attr *nic_attr; + + ret = setenv("SLINGSHOT_VNIS", "288,999", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_DEVICES", "cxi1,cxi15,cxi4", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_SVC_IDS", "1024,1025,1026", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, DEFAULT_SERVICE_ID, + "Unexpected svc_id: %d", nic_attr->default_rgroup_id); + + fi_freeinfo(info); +} + +/* Define valid Slingshot plugin environment variables and verify that user + * provided auth_key is honored before using Slingshot plugin environment + * variables to generate auth_key. + */ +Test(auth_key, ss_plugin_auth_key_priority) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct fi_info *hints; + char svc_id_str[256]; + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + struct fid_fabric *fab; + struct fid_domain *dom; + + /* Need to allocate a service to be used by libfabric. 
*/ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = auth_key.vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + ret = setenv("SLINGSHOT_VNIS", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_DEVICES", "cxi0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + sprintf(svc_id_str, "%d", auth_key.svc_id); + ret = setenv("SLINGSHOT_SVC_IDS", svc_id_str, 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = memcmp(hints->domain_attr->auth_key, info->domain_attr->auth_key, + hints->domain_attr->auth_key_size); + cr_assert_eq(ret, 0, "fi_getinfo returned auth_key does not match hints"); + cr_assert_eq(info->domain_attr->auth_key_size, sizeof(auth_key)); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + fi_close(&dom->fid); + fi_close(&fab->fid); + fi_freeinfo(info); + fi_freeinfo(hints); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +/* Restrict the auth_key to a specific UID. */ +Test(auth_key, uid_valid_service) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + uid_t test_uid = 65530; + uint64_t test_vni = 12345; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_members = 1; + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = test_vni; + svc_desc.members[0].type = CXI_SVC_MEMBER_UID; + svc_desc.members[0].svc_member.uid = test_uid; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + auth_key.svc_id = svc_desc.svc_id; + auth_key.vni = test_vni; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + /* Ensure that returned auth_key does not contain allocated service ID + * since this is restricted to specific UID. + * + * Return auth_key hint should be NULL. NIC attr should not contain the + * service ID and VNI. 
+	 */
+	cr_assert_eq(info->domain_attr->auth_key, NULL);
+	cr_assert_eq(info->domain_attr->auth_key_size, 0);
+
+	nic_attr = info->nic->prov_attr;
+	cr_assert_neq(nic_attr->default_rgroup_id, auth_key.svc_id);
+	cr_assert_neq(nic_attr->default_vni, auth_key.vni);
+
+	fi_freeinfo(info);
+
+	ret = seteuid(test_uid);
+	cr_assert_eq(ret, 0, "seteuid failed: %d", errno);
+
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		NULL, FI_SOURCE, NULL, &info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Ensure that returned auth_key does contain the allocated service ID
+	 * since the effective UID now matches the restricted UID.
+	 *
+	 * Return auth_key hint should be NULL. NIC attr should contain the
+	 * service ID and VNI.
+	 */
+	cr_assert_eq(info->domain_attr->auth_key, NULL);
+	cr_assert_eq(info->domain_attr->auth_key_size, 0);
+
+	nic_attr = info->nic->prov_attr;
+	cr_assert_eq(nic_attr->default_rgroup_id, auth_key.svc_id);
+	cr_assert_eq(nic_attr->default_vni, auth_key.vni);
+
+	ret = fi_fabric(info->fabric_attr, &fab, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret);
+
+	ret = fi_domain(fab, info, &dom, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret);
+
+	fi_close(&dom->fid);
+	fi_close(&fab->fid);
+	fi_freeinfo(info);
+
+	/* Make sure a non-root user cannot destroy the service. */
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_neq(ret, 0, "cxil_destroy_svc did not fail");
+
+	ret = seteuid(0);
+	cr_assert_eq(ret, 0, "seteuid failed: %d", errno);
+
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret);
+	cxil_close_device(dev);
+}
+
+/* Restrict the auth_key to a specific GID. */
+Test(auth_key, gid_valid_service)
+{
+	int ret;
+	struct cxil_dev *dev;
+	struct cxi_svc_fail_info fail_info = {};
+	struct cxi_svc_desc svc_desc = {};
+	struct fi_info *info;
+	struct fid_fabric *fab;
+	struct fid_domain *dom;
+	uid_t test_gid = 32766;
+	uint64_t test_vni = 12345;
+	struct cxi_auth_key auth_key = {};
+	struct cxip_nic_attr *nic_attr;
+
+	/* Need to allocate a service to be used by libfabric. */
+	ret = cxil_open_device(0, &dev);
+	cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret);
+
+	svc_desc.restricted_members = 1;
+	svc_desc.restricted_vnis = 1;
+	svc_desc.enable = 1;
+	svc_desc.num_vld_vnis = 1;
+	svc_desc.vnis[0] = test_vni;
+	svc_desc.members[0].type = CXI_SVC_MEMBER_GID;
+	svc_desc.members[0].svc_member.gid = test_gid;
+
+	ret = cxil_alloc_svc(dev, &svc_desc, &fail_info);
+	cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret);
+	svc_desc.svc_id = ret;
+
+	auth_key.svc_id = svc_desc.svc_id;
+	auth_key.vni = test_vni;
+
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		NULL, FI_SOURCE, NULL, &info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Ensure that returned auth_key does not contain allocated service ID
+	 * since this is restricted to a specific GID.
+	 *
+	 * Return auth_key hint should be NULL. NIC attr should not contain the
+	 * service ID and VNI.
+	 */
+	cr_assert_eq(info->domain_attr->auth_key, NULL);
+	cr_assert_eq(info->domain_attr->auth_key_size, 0);
+
+	nic_attr = info->nic->prov_attr;
+	cr_assert_neq(nic_attr->default_rgroup_id, auth_key.svc_id);
+	cr_assert_neq(nic_attr->default_vni, auth_key.vni);
+
+	fi_freeinfo(info);
+
+	ret = setegid(test_gid);
+	cr_assert_eq(ret, 0, "setegid failed: %d", errno);
+
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		NULL, FI_SOURCE, NULL, &info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Ensure that returned auth_key does contain allocated service ID
+	 * since the effective GID now matches the restricted GID.
+	 *
+	 * Return auth_key hint should be NULL. NIC attr should contain the
+	 * service ID and VNI.
+	 */
+	cr_assert_eq(info->domain_attr->auth_key, NULL);
+	cr_assert_eq(info->domain_attr->auth_key_size, 0);
+
+	nic_attr = info->nic->prov_attr;
+	cr_assert_eq(nic_attr->default_rgroup_id, auth_key.svc_id);
+	cr_assert_eq(nic_attr->default_vni, auth_key.vni);
+
+	ret = fi_fabric(info->fabric_attr, &fab, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret);
+
+	ret = fi_domain(fab, info, &dom, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret);
+
+	fi_close(&dom->fid);
+	fi_close(&fab->fid);
+	fi_freeinfo(info);
+
+	ret = setegid(0);
+	cr_assert_eq(ret, 0, "setegid failed: %d", errno);
+
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret);
+	cxil_close_device(dev);
+}
+
+/* Verify that the priority between UID, GID, and unrestricted services is
+ * honored.
+ */
+Test(auth_key, uid_gid_default_service_id_priority)
+{
+	int ret;
+	struct cxil_dev *dev;
+	struct cxi_svc_fail_info fail_info = {};
+	struct cxi_svc_desc svc_desc = {};
+	struct fi_info *info;
+	uid_t test_uid = 65530;
+	uint64_t test_uid_vni = 12345;
+	uid_t test_gid = 32766;
+	uint64_t test_gid_vni = 12344;
+	struct cxi_auth_key uid_auth_key = {};
+	struct cxi_auth_key gid_auth_key = {};
+	struct cxip_nic_attr *nic_attr;
+
+	/* Need to allocate a service to be used by libfabric. */
+	ret = cxil_open_device(0, &dev);
+	cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret);
+
+	svc_desc.restricted_members = 1;
+	svc_desc.restricted_vnis = 1;
+	svc_desc.enable = 1;
+	svc_desc.num_vld_vnis = 1;
+	svc_desc.vnis[0] = test_uid_vni;
+	svc_desc.members[0].type = CXI_SVC_MEMBER_UID;
+	svc_desc.members[0].svc_member.uid = test_uid;
+
+	ret = cxil_alloc_svc(dev, &svc_desc, &fail_info);
+	cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret);
+
+	uid_auth_key.svc_id = ret;
+	uid_auth_key.vni = test_uid_vni;
+
+	svc_desc.vnis[0] = test_gid_vni;
+	svc_desc.members[0].type = CXI_SVC_MEMBER_GID;
+	svc_desc.members[0].svc_member.gid = test_gid;
+
+	ret = cxil_alloc_svc(dev, &svc_desc, &fail_info);
+	cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret);
+
+	gid_auth_key.svc_id = ret;
+	gid_auth_key.vni = test_gid_vni;
+
+	/* Since UID and GID have not changed, auth_key with default service ID
+	 * should be returned.
+ */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, DEFAULT_SERVICE_ID, + "Default service ID was not returned: svc_id=%d", + nic_attr->default_rgroup_id); + + fi_freeinfo(info); + + /* Changing GID should result in GID auth_key being returned. */ + ret = setegid(test_gid); + cr_assert_eq(ret, 0, "setegid failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, gid_auth_key.svc_id); + cr_assert_eq(nic_attr->default_vni, gid_auth_key.vni); + + fi_freeinfo(info); + + /* Changing the UID should result in UID auth_key being returned. */ + ret = seteuid(test_uid); + cr_assert_eq(ret, 0, "seteuid failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, uid_auth_key.svc_id); + cr_assert_eq(nic_attr->default_vni, uid_auth_key.vni); + + fi_freeinfo(info); + + ret = seteuid(0); + cr_assert_eq(ret, 0, "seteuid failed: %d", errno); + + ret = setegid(0); + cr_assert_eq(ret, 0, "setegid failed: %d", errno); + + ret = cxil_destroy_svc(dev, gid_auth_key.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + + ret = cxil_destroy_svc(dev, uid_auth_key.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + + cxil_close_device(dev); +} + +/* Test disabling the default service ID. */ +Test(auth_key, default_service_id_disabled) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + + /* Disable the default service ID. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + ret = cxil_get_svc(dev, DEFAULT_SERVICE_ID, &svc_desc); + cr_assert_eq(ret, 0, "cxil_get_svc failed: %d", ret); + cr_assert_eq(svc_desc.enable, 1, + "Default service ID unexpectedly disabled"); + + svc_desc.enable = 0; + + ret = cxil_update_svc(dev, &svc_desc, &fail_info); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + /* With the default service ID disabled, NULL auth_key should be + * returned. 
+ */ + cr_assert_null(info->domain_attr->auth_key, "Domain auth_key not NULL"); + cr_assert_null(info->ep_attr->auth_key, "EP auth_key not NULL"); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_neq(ret, FI_SUCCESS, "fi_domain did not fail"); + + fi_close(&fab->fid); + fi_freeinfo(info); + + /* Restore default service. */ + svc_desc.enable = 1; + ret = cxil_update_svc(dev, &svc_desc, &fail_info); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + cxil_close_device(dev); +} + +#define DEFAULT_MAX_EP_AUTH_KEY 4 + +Test(auth_key, max_ep_auth_key_null_hints) +{ + int ret; + struct fi_info *info; + struct fi_info *tmp; + int i = 0; + size_t expected_ep_auth_key; + + ret = setenv("FI_CXI_COMPAT", "0", 1); + cr_assert(ret == 0); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + tmp = info; + while (tmp) { + /* The first 2 fi_info's should have max_ep_auth_key == 1*/ + if (i < 2) + expected_ep_auth_key = 1; + else + expected_ep_auth_key = DEFAULT_MAX_EP_AUTH_KEY; + + cr_assert_eq(tmp->domain_attr->max_ep_auth_key, + expected_ep_auth_key, + "Invalid max_ep_auth_key: expected=%ld got=%ld info_count=%d", + expected_ep_auth_key, + tmp->domain_attr->max_ep_auth_key, i); + tmp = tmp->next; + i++; + } + + fi_freeinfo(info); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, zero_max_ep_auth_key_null_hint) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + struct fi_info *tmp; + int i = 0; + size_t expected_ep_auth_key; + + ret = setenv("FI_CXI_COMPAT", "0", 1); + cr_assert(ret == 0); + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->max_ep_auth_key = 0; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + tmp = info; + while (tmp) { + /* The first 2 fi_info's should have max_ep_auth_key == 1*/ + if (i < 2) + expected_ep_auth_key = 1; + else + expected_ep_auth_key = DEFAULT_MAX_EP_AUTH_KEY; + + cr_assert_eq(tmp->domain_attr->max_ep_auth_key, + expected_ep_auth_key, + "Invalid max_ep_auth_key: expected=%ld got=%ld info_count=%d", + expected_ep_auth_key, + tmp->domain_attr->max_ep_auth_key, i); + tmp = tmp->next; + i++; + } + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +/* Test fi_getinfo() verification of hints argument. 
*/ +Test(auth_key, valid_max_ep_auth_key_null_hint) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + struct fi_info *tmp; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->max_ep_auth_key = 1; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + tmp = info; + while (tmp) { + cr_assert_eq(tmp->domain_attr->max_ep_auth_key, + hints->domain_attr->max_ep_auth_key, + "Invalid max_ep_auth_key: expected=%ld got=%ld", + hints->domain_attr->max_ep_auth_key, + tmp->domain_attr->max_ep_auth_key); + tmp = tmp->next; + } + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, invalid_max_ep_auth_key_null_hint) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->max_ep_auth_key = 12345678; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +TestSuite(av_auth_key, .timeout = CXIT_DEFAULT_TIMEOUT); + +static void open_av_auth_key(struct fi_info *info, struct fid_fabric **fab, + struct fid_domain **dom, struct fid_av **av) +{ + int ret; + struct fi_av_attr av_attr = {}; + + ret = fi_fabric(info->fabric_attr, fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(*fab, info, dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + ret = fi_av_open(*dom, &av_attr, av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret); +} + +static void close_av_auth_key(struct fid_fabric *fab, struct fid_domain *dom, + struct fid_av *av) +{ + int ret; + + ret = fi_close(&av->fid); + cr_assert_eq(ret, FI_SUCCESS); + + ret = fi_close(&dom->fid); + cr_assert_eq(ret, FI_SUCCESS); + + ret = fi_close(&fab->fid); + cr_assert_eq(ret, FI_SUCCESS); +} + +Test(av_auth_key, insert_without_av_auth_key_set) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, lookup_without_av_auth_key_set) +{ + 
struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + size_t size = sizeof(auth_key); + fi_addr_t addr_key = 0; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_lookup_auth_key(av, addr_key, &auth_key, &size); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_lookup_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +/* Insert multiple auth_keys. */ +#define NUM_VNIS 4U +Test(av_auth_key, insert_lookup_valid_auth_key) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxi_auth_key lookup_auth_key = {}; + size_t auth_key_size; + fi_addr_t addr_key; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + int i; + + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = NUM_VNIS; + + for (i = 0; i < NUM_VNIS; i++) + svc_desc.vnis[i] = 123 + i; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = NUM_VNIS; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + for (i = 0; i < NUM_VNIS; i++) { + auth_key.vni = svc_desc.vnis[i]; + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), + &addr_key, 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_av_insert_auth_key failed: %d", ret); + + auth_key_size = sizeof(lookup_auth_key); + ret = fi_av_lookup_auth_key(av, addr_key, &lookup_auth_key, + &auth_key_size); + cr_assert_eq(ret, FI_SUCCESS, + "fi_av_lookup_auth_key failed: %d", ret); + + cr_assert_eq(auth_key_size, sizeof(lookup_auth_key), + "Invalid auth_key_size returned"); + cr_assert_eq(lookup_auth_key.vni, auth_key.vni, + "Incorrect auth_key.vni returned"); + } + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); + + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +Test(av_auth_key, insert_invalid_null_auth_key) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo 
failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, NULL, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, insert_invalid_null_fi_addr) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), NULL, 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, insert_invalid_flags) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0x123); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, insert_invalid_vni) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, 
FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + auth_key.vni = 0x1234; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, insert_max_ep_auth_key_bounds_check) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, -FI_ENOSPC, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, lookup_null_auth_key) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + fi_addr_t addr_key = 0; + size_t auth_key_size = sizeof(auth_key); + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + auth_key.vni = 0x1234; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_lookup_auth_key(av, addr_key, NULL, &auth_key_size); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_lookup_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, lookup_null_auth_key_size) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + fi_addr_t addr_key = 0; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + 
auth_key.vni = 0x1234; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_lookup_auth_key(av, addr_key, &auth_key, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_lookup_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, remove) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_remove(av, &addr_key, 1, FI_AUTH_KEY); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_remove failed: %d", ret); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, valid_insert_auth_key_addr) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_addr addr = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + size_t addr_key_size = sizeof(addr); + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_insert(av, &addr, 1, &addr_key, FI_AUTH_KEY, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + + ret = fi_av_lookup(av, addr_key, &addr, &addr_key_size); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_lookup failed: %d", ret); + + cr_assert_eq(addr.vni, auth_key.vni, + "Invalid auth_key vni: expected=%u got=%u", + auth_key.vni, addr.vni); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, miss_auth_key_insert_flag) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_addr addr = {}; + 
struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_insert(av, &addr, 1, &addr_key, 0, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, invalid_user_id_flag) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_addr addr = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_insert(av, &addr, 1, &addr_key, FI_AV_USER_ID, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, invalid_user_id_auth_key_flags) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxip_addr addr = {}; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert(av, &addr, 1, &addr_key, + (FI_AV_USER_ID | FI_AUTH_KEY), NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, null_auth_key_addr) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct 
cxip_addr addr = {}; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert(av, &addr, 1, NULL, FI_AUTH_KEY, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, invalid_multiple_auth_keys_per_ep_with_directed_recv_cap) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct fid_ep *ep; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 2; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + info->caps |= FI_DIRECTED_RECV; + ret = fi_endpoint(dom, info, &ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_endpoint failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, invalid_multiple_auth_keys_per_ep_with_directed_recv_rx_cap) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct fid_ep *ep; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 2; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + info->rx_attr->caps |= FI_DIRECTED_RECV; + ret = fi_endpoint(dom, info, &ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_endpoint failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +#define NUM_VNIS 4U +#define NUM_TX_EPS NUM_VNIS + +static struct cxil_dev *dev; +static struct cxi_svc_desc svc_desc = { + .restricted_vnis = 1, + .enable = 1, + .num_vld_vnis = NUM_VNIS, + .vnis = {1234, 1235, 1236, 1237}, +}; + +static struct fid_fabric *fab; +static struct fid_domain *dom; +static struct fid_cq *cq; +static struct fid_av *av; +static volatile uint64_t rx_mr_buf; +static struct fid_mr *rx_mr; +static struct fid_ep *rx_ep; +static fi_addr_t auth_keys[NUM_VNIS]; +static fi_addr_t init_addrs[NUM_TX_EPS]; + +static char *rx_ep_pid = "0"; +static char *tx_ep_pids[] = {"128", "129", "130", "131"}; +static unsigned int nic_addr; + +static struct fid_domain *tx_dom; 
+static struct fid_cq *tx_cq; +static struct fid_av *tx_av; +static struct fid_ep *tx_ep[NUM_TX_EPS]; +static volatile uint64_t tx_mr_buf[NUM_TX_EPS]; +static struct fid_mr *tx_mr[NUM_TX_EPS]; +static fi_addr_t target_addr; + +static void av_auth_key_test_tx_ep_init(unsigned int num_vnis) +{ + struct fi_info *hints; + static struct fi_info *info; + int ret; + struct fi_cq_attr cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + }; + struct fi_av_attr av_attr = { + .type = FI_AV_TABLE, + }; + int i; + struct cxi_auth_key key = {}; + char node[64]; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps |= FI_SOURCE | FI_SOURCE_ERR | FI_MSG | FI_SEND | FI_RECV | + FI_RMA | FI_ATOMIC; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED | + FI_MR_PROV_KEY; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_domain(fab, info, &tx_dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + fi_freeinfo(info); + + ret = fi_cq_open(tx_dom, &cq_attr, &tx_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open failed: %d", ret); + + ret = fi_av_open(tx_dom, &av_attr, &tx_av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret); + + sprintf(node, "%u", nic_addr); + ret = fi_av_insertsvc(tx_av, node, rx_ep_pid, &target_addr, 0, NULL); + cr_assert_eq(ret, 1, "fi_av_insertsvc failed: %d", ret); + + for (i = 0; i < num_vnis; i++) { + key.vni = svc_desc.vnis[i]; + key.svc_id = svc_desc.svc_id; + + hints->ep_attr->auth_key = (void *)&key; + hints->ep_attr->auth_key_size = sizeof(key); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + "cxi0", tx_ep_pids[i], FI_SOURCE, hints, + &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_endpoint(tx_dom, info, &tx_ep[i], NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_endpoint failed: %d", ret); + + ret = fi_ep_bind(tx_ep[i], &tx_cq->fid, FI_TRANSMIT | FI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind CQ failed: %d", ret); + + ret = fi_ep_bind(tx_ep[i], &tx_av->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind AV failed: %d", ret); + + ret = fi_enable(tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_enable failed: %d", ret); + + ret = fi_mr_reg(tx_dom, (void *)&tx_mr_buf[i], + sizeof(tx_mr_buf[i]), + FI_WRITE | FI_READ | FI_REMOTE_WRITE | FI_REMOTE_READ, + 0, 0, 0, &tx_mr[i], NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_mr_bind(tx_mr[i], &tx_ep[i]->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed: %d", ret); + + ret = fi_mr_enable(tx_mr[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed: %d", ret); + + fi_freeinfo(info); + } + + hints->ep_attr->auth_key = NULL; + fi_freeinfo(hints); +} + +static void av_auth_key_test_rx_ep_init(bool source_err, unsigned int num_vnis, + bool directed_recv, bool av_user_id) +{ + struct fi_info *hints; + static struct fi_info *info; + struct cxi_svc_fail_info fail_info = {}; + int ret; + struct fi_cq_attr cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + }; + struct fi_av_attr av_attr = { + .type = FI_AV_TABLE, + }; + int i; + struct cxi_auth_key key = {}; + size_t key_size; + char node[64]; + + /* Need to allocate a service to be used by libfabric. 
*/ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + nic_addr = dev->info.nic_addr; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + if (av_user_id) { + av_attr.flags = FI_AV_USER_ID; + hints->caps |= FI_AV_USER_ID; + } + + hints->caps |= FI_SOURCE | FI_SOURCE_ERR | FI_MSG | FI_SEND | FI_RECV | + FI_RMA | FI_ATOMIC; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED | + FI_MR_PROV_KEY; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = num_vnis; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + if (directed_recv) + hints->caps |= FI_DIRECTED_RECV; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + rx_ep_pid, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + ret = fi_cq_open(dom, &cq_attr, &cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open failed: %d", ret); + + ret = fi_av_open(dom, &av_attr, &av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret); + + ret = fi_endpoint(dom, info, &rx_ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_endpoint failed: %d", ret); + + ret = fi_ep_bind(rx_ep, &cq->fid, FI_TRANSMIT | FI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind CQ failed: %d", ret); + + ret = fi_ep_bind(rx_ep, &av->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind AV failed: %d", ret); + + for (i = 0; i < num_vnis; i++) { + key.vni = svc_desc.vnis[i]; + key_size = sizeof(key); + + ret = fi_av_insert_auth_key(av, &key, key_size, &auth_keys[i], + 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_av_insert_auth_key failed: %d", ret); + + if (source_err) + continue; + + sprintf(node, "%u", nic_addr); + init_addrs[i] = auth_keys[i]; + ret = fi_av_insertsvc(av, node, tx_ep_pids[i], &init_addrs[i], + FI_AUTH_KEY, NULL); + cr_assert_eq(ret, 1, "fi_av_insertsvc failed: %d", ret); + } + + ret = fi_enable(rx_ep); + cr_assert_eq(ret, FI_SUCCESS, "fi_enable failed: %d", ret); + + ret = fi_mr_reg(dom, (void *)&rx_mr_buf, sizeof(rx_mr_buf), + FI_WRITE | FI_READ | FI_REMOTE_WRITE | FI_REMOTE_READ, + 0, 0, 0, &rx_mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_mr_bind(rx_mr, &rx_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed: %d", ret); + + ret = fi_mr_enable(rx_mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed: %d", ret); + + fi_freeinfo(info); +} + +static void av_auth_key_tx_ep_fini(unsigned int num_vnis) +{ + int i; + int ret; + + for (i = 0; i < num_vnis; i++) { + ret = fi_close(&tx_mr[i]->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + ret = fi_close(&tx_ep[i]->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close EP failed: %d", ret); + } + + ret = fi_close(&tx_av->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close AV failed: %d", ret); + + ret = fi_close(&tx_cq->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close CQ failed: %d", ret); + + ret = fi_close(&tx_dom->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close dom failed: %d", ret); +} + +static void 
av_auth_key_test_rx_ep_fini(void) +{ + int ret; + + ret = fi_close(&rx_mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + ret = fi_close(&rx_ep->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close EP failed: %d", ret); + + ret = fi_close(&av->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close AV failed: %d", ret); + + ret = fi_close(&cq->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close CQ failed: %d", ret); + + ret = fi_close(&dom->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close dom failed: %d", ret); + + ret = fi_close(&fab->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close fab failed: %d", ret); + + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +TestSuite(data_transfer_av_auth_key, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(data_transfer_av_auth_key, successful_inject_transfer_source) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + fi_addr_t src_addr; + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_inject(tx_ep[i], NULL, 0, target_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(src_addr, init_addrs[i], "Bad source addr"); + + ret = fi_inject(rx_ep, NULL, 0, src_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(tx_ep[i], NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(tx_cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(src_addr, target_addr, "Bad source addr"); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, successful_rdzv_transfer_source) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + fi_addr_t src_addr; + void *buf; + size_t buf_size = 1024 * 1024; + + buf = malloc(buf_size); + cr_assert(buf != NULL); + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. 
+ */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_send(tx_ep[i], buf, buf_size, NULL, target_addr, + tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, buf, buf_size, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(src_addr, init_addrs[i], "Bad source addr"); + + do { + ret = fi_cq_read(tx_cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + + ret = fi_send(rx_ep, buf, buf_size, NULL, src_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(tx_ep[i], buf, buf_size, NULL, FI_ADDR_UNSPEC, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(tx_cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(src_addr, target_addr, "Bad source addr"); + + do { + ret = fi_cq_read(cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); + + free(buf); +} + +Test(data_transfer_av_auth_key, successful_transfer_source_err) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + struct fi_cq_err_entry error = {}; + fi_addr_t src_addr; + + av_auth_key_test_rx_ep_init(true, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_send(tx_ep[i], NULL, 0, NULL, target_addr, tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_readfrom failed: %d", ret); + + ret = fi_cq_readerr(cq, &error, 0); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(error.err, FI_EADDRNOTAVAIL, "Bad error.err"); + cr_assert_eq(error.src_addr, auth_keys[i], + "Bad error.src_addr: got=%lx expected=%lx", + error.src_addr, auth_keys[i]); + + do { + ret = fi_cq_read(tx_cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, single_auth_key_with_directed_recv) +{ + int ret; + int i; + struct fi_cq_tagged_entry event; + fi_addr_t src_addr; + fi_addr_t from_src_addr; + struct cxip_addr addr; + size_t addr_size = sizeof(struct cxip_addr); + + av_auth_key_test_rx_ep_init(false, 1, true, false); + av_auth_key_test_tx_ep_init(1); + + ret = fi_getname(&rx_ep->fid, &addr, &addr_size); + cr_assert_eq(ret, FI_SUCCESS, "fi_getname failed: %d", ret); + + /* Insert a AV entry for the RX EP. */ + src_addr = auth_keys[0]; + ret = fi_av_insert(av, &addr, 1, &src_addr, FI_AUTH_KEY, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + + /* Queue FI_DIRECTED_RECV to match only the RX EP. 
*/ + ret = fi_recv(rx_ep, NULL, 0, NULL, src_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + /* Queue a zero byte message which should not match. */ + ret = fi_send(tx_ep[0], NULL, 0, NULL, target_addr, tx_ep[0]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + /* Arbitrary amount of loops to ensure no recv events. */ + for (i = 0; i < 100; i++) { + ret = fi_cq_readfrom(cq, &event, 1, &from_src_addr); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read failed: %d", ret); + } + + /* Post matching send. */ + ret = fi_send(rx_ep, NULL, 0, NULL, src_addr, rx_ep); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + /* Two events should occur: a send and a recv. */ + for (i = 0; i < 2; i++) { + do { + ret = fi_cq_readfrom(cq, &event, 1, &from_src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + + if (event.flags & FI_RECV) + cr_assert_eq(src_addr, from_src_addr, + "Bad source addr"); + } + + av_auth_key_tx_ep_fini(1); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, av_user_id_source_err_missing_auth_key_user_id) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + struct fi_cq_err_entry error = {}; + fi_addr_t src_addr; + + av_auth_key_test_rx_ep_init(true, NUM_VNIS, false, true); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_send(tx_ep[i], NULL, 0, NULL, target_addr, tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_readfrom failed: %d", ret); + + ret = fi_cq_readerr(cq, &error, 0); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(error.err, FI_EADDRNOTAVAIL, "Bad error.err"); + cr_assert_eq(error.src_addr, FI_ADDR_UNSPEC, + "Bad error.src_addr: got=%lx expected=%lx", + error.src_addr, FI_ADDR_UNSPEC); + + do { + ret = fi_cq_read(tx_cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, av_user_id_source_err_auth_key_user_id) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + struct fi_cq_err_entry error = {}; + fi_addr_t src_addr; + fi_addr_t user_id[NUM_VNIS] = {0x1234, 0x1235, 0x1236, 0x1237}; + + av_auth_key_test_rx_ep_init(true, NUM_VNIS, false, true); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + for (i = 0; i < NUM_VNIS; i++) { + ret = fi_av_set_user_id(av, auth_keys[i], user_id[i], + FI_AUTH_KEY); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_set_user_id failed: %d", + ret); + } + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. 
+ */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_send(tx_ep[i], NULL, 0, NULL, target_addr, tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_readfrom failed: %d", ret); + + ret = fi_cq_readerr(cq, &error, 0); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(error.err, FI_EADDRNOTAVAIL, "Bad error.err"); + cr_assert_eq(error.src_addr, user_id[i], + "Bad error.src_addr: got=%lx expected=%lx", + error.src_addr, user_id[i]); + + do { + ret = fi_cq_read(tx_cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, rma_write_successful_transfer) +{ + int i; + int ret; + volatile uint64_t rma_value; + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Issue ping-pong + * RMA from each TX MR to RX MR. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + rma_value = i + 1; + + ret = fi_write(tx_ep[i], + (void *) &rma_value, sizeof(rma_value), NULL, + target_addr, 0, fi_mr_key(rx_mr), NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_write failed: %d", ret); + + while (rx_mr_buf != rma_value) {} + + ret = fi_write(rx_ep, + (void *) &rma_value, sizeof(rma_value), NULL, + init_addrs[i], 0, fi_mr_key(tx_mr[i]), NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_write failed: %d", ret); + + while (tx_mr_buf[i] != rma_value) {} + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, rma_read_successful_transfer) +{ + int i; + int ret; + volatile uint64_t rma_value; + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Issue ping-pong + * RMA from each TX MR to RX MR. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + rx_mr_buf = i + 1; + + ret = fi_read(tx_ep[i], + (void *) &rma_value, sizeof(rma_value), NULL, + target_addr, 0, fi_mr_key(rx_mr), NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read failed: %d", ret); + + while (rx_mr_buf != rma_value) {} + + tx_mr_buf[i] = i + 1; + ret = fi_read(rx_ep, + (void *) &rma_value, sizeof(rma_value), NULL, + init_addrs[i], 0, fi_mr_key(tx_mr[i]), NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read failed: %d", ret); + + while (tx_mr_buf[i] != rma_value) {} + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, amo_inject_successful_transfer) +{ + int i; + int ret; + uint64_t amo_value = 1; + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Issue ping-pong + * AMO from each TX MR to RX MR. 
+ */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_inject_atomic(tx_ep[i], &amo_value, 1, target_addr, 0, + fi_mr_key(rx_mr), FI_UINT64, FI_SUM); + cr_assert_eq(ret, FI_SUCCESS, "fi_atomic failed: %d", ret); + + while (rx_mr_buf != i + 1) {} + + ret = fi_inject_atomic(rx_ep, &amo_value, 1, init_addrs[i], 0, + fi_mr_key(tx_mr[i]), FI_UINT64, FI_SUM); + cr_assert_eq(ret, FI_SUCCESS, "fi_atomic failed: %d", ret); + + while (tx_mr_buf[i] != 1) {} + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, amo_successful_transfer_opt_disabled) +{ + int i; + int ret; + uint64_t amo_value = 1; + + ret = setenv("FI_CXI_OPTIMIZED_MRS", "0", 1); + cr_assert(ret == 0); + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Issue ping-pong + * AMO from each TX MR to RX MR. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_atomic(tx_ep[i], &amo_value, 1, NULL, + target_addr, 0, fi_mr_key(rx_mr), FI_UINT64, + FI_SUM, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_atomic failed: %d", ret); + + while (rx_mr_buf != i + 1) {} + + ret = fi_atomic(rx_ep, &amo_value, 1, NULL, + init_addrs[i], 0, fi_mr_key(tx_mr[i]), + FI_UINT64, FI_SUM, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_atomic failed: %d", ret); + + while (tx_mr_buf[i] != 1) {} + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} diff --git a/prov/cxi/test/av.c b/prov/cxi/test/av.c new file mode 100644 index 00000000000..84de0a6fdf1 --- /dev/null +++ b/prov/cxi/test/av.c @@ -0,0 +1,557 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2015-2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +static struct cxip_addr *test_addrs; +fi_addr_t *test_fi_addrs; +#define AV_COUNT 1024 +int naddrs = AV_COUNT * 10; + +static char *nic_to_amac(uint32_t nic) +{ + struct ether_addr mac = {}; + + mac.ether_addr_octet[5] = nic; + mac.ether_addr_octet[4] = nic >> 8; + mac.ether_addr_octet[3] = nic >> 16; + + return ether_ntoa(&mac); +} + +/* This allocates memory for naddrs FSAs (test_addrs), and naddrs tokens + * (test_fi_addrs), and initializes the FSAs to unique addresses. + */ +static void +test_addrs_init(void) +{ + int i; + + test_addrs = malloc(naddrs * sizeof(struct cxip_addr)); + cr_assert(test_addrs != NULL); + + test_fi_addrs = calloc(naddrs, sizeof(fi_addr_t)); + cr_assert(test_fi_addrs != NULL); + + for (i = 0; i < naddrs; i++) { + test_addrs[i].nic = i; + test_addrs[i].pid = i + 1; + } +} + +/* Clean up the FSA and token memory. + */ +static void +test_addrs_fini(void) +{ + free(test_fi_addrs); + free(test_addrs); +} + +/* This creates an AV with 'count' objects, and peeks at internals to ensure + * that the structure is sound. If 'count' is 0, this should default to + * cxip_av_dev_sz. + */ +static void +test_create(size_t count) +{ + cxit_av_attr.count = count; + cxit_create_av(); + + /* Should allocate a structure */ + cr_assert(cxit_av != NULL, + "cxit_av=%p", cxit_av); + + cxit_destroy_av(); +} + +/* This inserts 'count' FSAs, looks up all of them, then removes all of them. It + * repeats this 'iters' times without destroying the AV. 
+ */ +static void +__test_insert(int count, int iters) +{ + int j, i, ret; + struct cxip_addr addr; + size_t addrlen; + + /* Can't test addresses we haven't set up */ + cr_assert(naddrs >= count, "Invalid test case"); + + cxit_create_av(); + test_addrs_init(); + + for (j = 0; j < iters; j++) { + /* Insert addresses */ + for (i = 0; i < count; i++) { + ret = fi_av_insert(cxit_av, &test_addrs[i], 1, + &test_fi_addrs[i], 0, NULL); + /* Should have inserted 1 item */ + cr_assert(ret == 1, + "fi_av_insert() iter=%d, idx=%d, ret=%d\n", + j, i, ret); + /* Returned tokens should match insertion order */ + cr_assert(test_fi_addrs[i] == i, + "fi_av_insert() iter=%d, idx=%d, index=%ld\n", + j, i, test_fi_addrs[i]); + } + + /* Lookup addresses */ + for (i = 0; i < count; i++) { + addrlen = sizeof(struct cxip_addr); + ret = fi_av_lookup(cxit_av, test_fi_addrs[i], &addr, + &addrlen); + /* Should succeed */ + cr_assert(ret == FI_SUCCESS, + "fi_av_lookup() iter=%d, idx=%d, ret=%d", + j, i, ret); + /* Address should match what we expect */ + cr_assert(addr.nic == test_addrs[i].nic, + "fi_av_lookup() iter=%d, count=%d, i=%d, index=%ld, nic=%d, exp=%d", + j, count, i, test_fi_addrs[i], addr.nic, + test_addrs[i].nic); + cr_assert(addr.pid == test_addrs[i].pid, + "fi_av_lookup() iter=%d, idx=%d, pid=%d", + j, i, addr.pid); + } + + /* Spot-check. If we remove an arbitrary entry, and then insert + * a new address, it should always fill the hole left by the + * removal. + */ + + /* Remove an arbitrary item in the middle */ + i = count / 2; + ret = fi_av_remove(cxit_av, &test_fi_addrs[i], 1, 0); + cr_assert(ret == FI_SUCCESS, + "fi_av_remove() mid iter=%d, idx=%d, ret=%d\n", + j, i, ret); + + /* Insert an address */ + ret = fi_av_insert(cxit_av, &test_addrs[i], 1, + &test_fi_addrs[i], 0, NULL); + cr_assert(ret == 1, + "fi_av_insert() mid iter=%d, idx=%d, ret=%d\n", + j, i, ret); + cr_assert(test_fi_addrs[i] == i, + "fi_av_insert() mid iter=%d, idx=%d, index=%ld\n", + j, i, test_fi_addrs[i]); + + addrlen = sizeof(struct cxip_addr); + ret = fi_av_lookup(cxit_av, test_fi_addrs[i], &addr, + &addrlen); + cr_assert(ret == FI_SUCCESS, + "fi_av_lookup() mid iter=%d, idx=%d, ret=%d", + j, i, ret); + cr_assert(addr.nic == test_addrs[i].nic, + "fi_av_lookup() mid iter=%d, count=%d, i=%d, index=%ld, nic=%d, exp=%d", + j, count, i, test_fi_addrs[i], addr.nic, + test_addrs[i].nic); + cr_assert(addr.pid == test_addrs[i].pid, + "fi_av_lookup() mid iter=%d, idx=%d, pid=%d", + j, i, addr.pid); + + /* Remove all of the entries */ + for (i = 0; i < count; i++) { + ret = fi_av_remove(cxit_av, &test_fi_addrs[i], 1, 0); + /* Should succeed */ + cr_assert(ret == 0, + "fi_av_remove() iter=%d, idx=%d, ret=%d", + j, i, ret); + } + } + + test_addrs_fini(); + cxit_destroy_av(); +} + +/* Wrapper for insert test. + * + * The first call in each group only fills half of the initially allocated + * space. + * + * The second call fills the entire initially allocated space. + * + * The third call requires multiple memory reallocations to expand the memory as + * this inserts. 
+ */ +static void +test_insert(void) +{ + int iters = 1; + + __test_insert(AV_COUNT / 2, iters); + __test_insert(AV_COUNT, iters); + __test_insert(naddrs, iters); + + iters = 3; + + __test_insert(AV_COUNT / 2, iters); + __test_insert(AV_COUNT, iters); + __test_insert(naddrs, iters); +} + +TestSuite(av, .init = cxit_setup_av, .fini = cxit_teardown_av, + .timeout = CXIT_DEFAULT_TIMEOUT); + +ReportHook(TEST_CRASH)(struct criterion_test_stats *stats) +{ + printf("signal = %d\n", stats->signal); +} + +/* Test AV creation syntax error */ +Test(av, av_open_invalid) +{ + int ret; + + ret = fi_av_open(cxit_domain, NULL, NULL, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV all NULL = %d", ret); + + ret = fi_av_open(cxit_domain, &cxit_av_attr, NULL, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV NULL av = %d", ret); + + ret = fi_av_open(cxit_domain, NULL, &cxit_av, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV NULL av_attr = %d", ret); + + cxit_av_attr.type = 99; + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV bad type = %d", ret); + cxit_av_attr.type = 0; + + /* NOTE: FI_READ means read-only */ + cxit_av_attr.flags = FI_READ; + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV FI_READ with no name = %d", + ret); + cxit_av_attr.flags = 0; + + cxit_av_attr.rx_ctx_bits = CXIP_EP_MAX_CTX_BITS + 1; + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV too many bits = %d", ret); + cxit_av_attr.rx_ctx_bits = 0; +} + +/* Test AV bind not supported */ +Test(av, av_bind_invalid) +{ + int ret; + + cxit_create_av(); + + ret = fi_av_bind(cxit_av, NULL, 0); + cr_assert(ret == -FI_ENOSYS, "fi_av_bind() = %d", ret); + + cxit_destroy_av(); +} + +/* Test AV control not supported */ +Test(av, av_control_invalid) +{ + int ret; + + cxit_create_av(); + + ret = fi_control(&cxit_av->fid, 0, NULL); + cr_assert(ret == -FI_ENOSYS, "fi_control() = %d", ret); + + cxit_destroy_av(); +} + +/* Test AV open_ops not supported */ +Test(av, av_open_ops_invalid) +{ + int ret; + + cxit_create_av(); + + ret = fi_open_ops(&cxit_av->fid, NULL, 0, NULL, NULL); + cr_assert(ret == -FI_ENOSYS, "fi_open_ops() = %d", ret); + + cxit_destroy_av(); +} + +/* Test basic AV table creation */ +Test(av, table_create) +{ + cxit_av_attr.type = FI_AV_TABLE; + test_create(0); + test_create(1024); +} + +/* Test basic AV map creation */ +Test(av, map_create) +{ + cxit_av_attr.type = FI_AV_MAP; + test_create(0); + test_create(1024); +} + +/* Test basic AV default creation */ +Test(av, unspecified_create) +{ + cxit_av_attr.type = FI_AV_UNSPEC; + test_create(0); + test_create(1024); +} + +/* Test basic AV table insert */ +Test(av, table_insert) +{ + cxit_av_attr.count = AV_COUNT; + cxit_av_attr.type = FI_AV_TABLE; + naddrs = cxit_av_attr.count * 10; + + test_insert(); +} + +/* Test basic AV map insert */ +Test(av, map_insert) +{ + cxit_av_attr.count = AV_COUNT; + cxit_av_attr.type = FI_AV_MAP; + naddrs = cxit_av_attr.count * 10; + + test_insert(); +} + +/* Test address conversion to string */ +Test(av, straddr) +{ + uint32_t addr = 0xabcd1234; + size_t len = 0; + char *buf = NULL; + const char *tmp_buf; + + cxit_create_av(); + + tmp_buf = fi_av_straddr(cxit_av, &addr, buf, &len); + cr_assert_null(tmp_buf, "fi_av_straddr() buffer not null %p", tmp_buf); + + buf = malloc(len); + cr_assert(buf != NULL); + + tmp_buf = fi_av_straddr(cxit_av, &addr, buf, &len); + 
cr_assert_not_null(tmp_buf, "fi_av_straddr() buffer is null"); + cr_assert_str_eq(tmp_buf, buf, + "fi_av_straddr() buffer failure: '%s' != '%s'", tmp_buf, buf); + + free(buf); + + cxit_destroy_av(); +} + +Test(av, insertsvc) +{ + int i, ret; + struct cxip_addr addr; + size_t addrlen; + char pid_str[256]; + + cxit_create_av(); + test_addrs_init(); + + ret = fi_av_insertsvc(cxit_av, NULL, pid_str, &test_fi_addrs[0], 0, + NULL); + cr_assert(ret == -FI_EINVAL); + + ret = fi_av_insertsvc(cxit_av, nic_to_amac(test_addrs[0].nic), NULL, + &test_fi_addrs[0], 0, NULL); + cr_assert(ret == -FI_EINVAL); + + ret = fi_av_insertsvc(cxit_av, NULL, NULL, &test_fi_addrs[0], 0, NULL); + cr_assert(ret == -FI_EINVAL); + + /* Insert addresses */ + for (i = 0; i < naddrs; i++) { + ret = sprintf(pid_str, "%d", test_addrs[i].pid); + cr_assert(ret > 0); + + ret = fi_av_insertsvc(cxit_av, nic_to_amac(test_addrs[i].nic), + pid_str, &test_fi_addrs[i], 0, NULL); + /* Should have inserted 1 item */ + cr_assert(ret == 1, + "fi_av_insertsvc() idx=%d, ret=%d\n", + i, ret); + /* Returned tokens should match insertion order */ + cr_assert(test_fi_addrs[i] == i, + "fi_av_insertsvc() idx=%d, fi_addr=%ld\n", + i, test_fi_addrs[i]); + } + + /* Lookup addresses */ + for (i = 0; i < naddrs; i++) { + addrlen = sizeof(struct cxip_addr); + ret = fi_av_lookup(cxit_av, test_fi_addrs[i], &addr, + &addrlen); + /* Should succeed */ + cr_assert(ret == FI_SUCCESS, + "fi_av_lookup() idx=%d, ret=%d", + i, ret); + /* Address should match what we expect */ + cr_assert(addr.nic == test_addrs[i].nic, + "fi_av_lookup() naddrs=%d, i=%d, index=%ld, nic=%d, exp=%d", + naddrs, i, test_fi_addrs[i], addr.nic, + test_addrs[i].nic); + cr_assert(addr.pid == test_addrs[i].pid, + "fi_av_lookup() idx=%d, pid=%d", + i, addr.pid); + } + + /* Spot-check. If we remove an arbitrary entry, and then insert + * a new address, it should always fill the hole left by the + * removal. + */ + + /* Remove an arbitrary item in the middle */ + i = naddrs / 2; + ret = fi_av_remove(cxit_av, &test_fi_addrs[i], 1, 0); + cr_assert(ret == FI_SUCCESS, + "fi_av_remove() mid idx=%d, ret=%d\n", + i, ret); + + /* Insert an address */ + ret = fi_av_insert(cxit_av, &test_addrs[i], 1, + &test_fi_addrs[i], 0, NULL); + cr_assert(ret == 1, + "fi_av_insert() mid idx=%d, ret=%d\n", + i, ret); + cr_assert(test_fi_addrs[i] == i, + "fi_av_insert() mid idx=%d, index=%ld\n", + i, test_fi_addrs[i]); + + addrlen = sizeof(struct cxip_addr); + ret = fi_av_lookup(cxit_av, test_fi_addrs[i], &addr, + &addrlen); + cr_assert(ret == FI_SUCCESS, + "fi_av_lookup() mid idx=%d, ret=%d", + i, ret); + cr_assert(addr.nic == test_addrs[i].nic, + "fi_av_lookup() mid naddrs=%d, i=%d, index=%ld, nic=%d, exp=%d", + naddrs, i, test_fi_addrs[i], addr.nic, + test_addrs[i].nic); + cr_assert(addr.pid == test_addrs[i].pid, + "fi_av_lookup() mid idx=%d, pid=%d", + i, addr.pid); + + /* Remove all of the entries */ + for (i = 0; i < naddrs; i++) { + ret = fi_av_remove(cxit_av, &test_fi_addrs[i], 1, 0); + /* Should succeed */ + cr_assert(ret == 0, + "fi_av_remove() idx=%d, ret=%d", + i, ret); + } + + test_addrs_fini(); + cxit_destroy_av(); +} + +static double diff_timespec(const struct timespec *time1, + const struct timespec *time0) { + return (time1->tv_sec - time0->tv_sec) + + (time1->tv_nsec - time0->tv_nsec) / 1000000000.0; +} + +/* Verify that reserve lookup is O(1). 
*/ +Test(av, reverse_lookup) +{ + int i; + int ret; + struct cxip_av *av; + struct cxip_addr addr = {}; + struct timespec start; + struct timespec end; + double timestamp1; + double timestamp2; + fi_addr_t fi_addr; + + cxit_create_av(); + + av = container_of(cxit_av, struct cxip_av, av_fid.fid); + + /* Insert lots of addresses into the AV. */ + for (i = 0; i < 10000; i++) { + addr.nic = i; + + ret = fi_av_insert(cxit_av, &addr, 1, NULL, 0, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + } + + /* Verify that reverse lookup is not linear. Verify this by checking + * that the two lookup times are within 5% of each other. + */ + addr.nic = 0; + clock_gettime(CLOCK_MONOTONIC, &start); + fi_addr = cxip_av_lookup_fi_addr(av, &addr); + clock_gettime(CLOCK_MONOTONIC, &end); + + cr_assert_neq(fi_addr, FI_ADDR_NOTAVAIL, + "cxip_av_lookup_fi_addr failed"); + timestamp1 = diff_timespec(&end, &start); + + addr.nic = i - 1; + clock_gettime(CLOCK_MONOTONIC, &start); + fi_addr = cxip_av_lookup_fi_addr(av, &addr); + clock_gettime(CLOCK_MONOTONIC, &end); + + cr_assert_neq(fi_addr, FI_ADDR_NOTAVAIL, + "cxip_av_lookup_fi_addr failed"); + timestamp2 = diff_timespec(&end, &start); + + cr_assert((timestamp1 * 1.05) > timestamp2, "O(1) verification failed"); + + cxit_destroy_av(); +} + +Test(av, av_user_id_invalid_insert_with_symmetric) +{ + int ret; + struct cxip_addr addr = {}; + fi_addr_t fi_addr = 0; + + cxit_av_attr.flags |= FI_SYMMETRIC; + cxit_create_av(); + + ret = fi_av_insert(cxit_av, &addr, 1, &fi_addr, FI_AV_USER_ID, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Bad fi_av_insert rc: %d", ret); + + cxit_destroy_av(); +} + +Test(av, av_user_id_invalid_null_fi_addr) +{ + int ret; + struct cxip_addr addr = {}; + + cxit_create_av(); + + ret = fi_av_insert(cxit_av, &addr, 1, NULL, FI_AV_USER_ID, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Bad fi_av_insert rc: %d", ret); + + cxit_destroy_av(); +} + +Test(av, invalid_fi_av_user_id_flag) +{ + int ret; + struct cxip_addr addr = {}; + fi_addr_t fi_addr = 0; + + cxit_av_attr.flags = FI_AV_USER_ID; + cxit_create_av(); + + ret = fi_av_insert(cxit_av, &addr, 1, &fi_addr, FI_AV_USER_ID, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Bad fi_av_insert rc: %d", ret); + + cxit_destroy_av(); +} diff --git a/prov/cxi/test/avset.c b/prov/cxi/test/avset.c new file mode 100644 index 00000000000..9dbe691b58c --- /dev/null +++ b/prov/cxi/test/avset.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(avset, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* + * Simple test to ensure that any attempt to close the AV before closing any AV + * Set will fail with -FI_EBUSY. 
+ */ +Test(avset, av_set_refcnt) +{ + // Make sure open AV sets preclude closing AV + struct fi_av_set_attr attr = {.flags=FI_UNIVERSE}; + struct fid_av_set *set; + int ret; + + ret = fi_av_set(cxit_av, &attr, &set, NULL); + cr_expect_eq(ret, 0, "fi_av_set failed, ret=%d", ret); + + ret = fi_close(&cxit_av->fid); + cr_expect_eq(ret, -FI_EBUSY, "premature AV close failed, ret=%d", ret); + + ret = fi_close(&set->fid); + cr_expect_eq(ret, 0, "fi_close(set) failed, ret=%d", ret); +} + +/* + * Test of AVSet operations + * + * We choose by-two and by-three spans to explore union, intersection, diff + */ +static bool is_div_2(fi_addr_t addr) +{ + return (addr & 1) == 0; +} + +static bool is_div_3(fi_addr_t addr) +{ + return ((addr / 3) * 3) == addr; +} + +static bool is_not2_and_3(fi_addr_t addr) +{ + return !is_div_2(addr) && is_div_3(addr); +} + +static bool is_2_and_3(fi_addr_t addr) +{ + return is_div_2(addr) && is_div_3(addr); +} + +static bool is_2_or_3(fi_addr_t addr) +{ + return is_div_2(addr) || is_div_3(addr); +} + +static bool is_2_and_not14(fi_addr_t addr) +{ + return is_div_2(addr) && addr != 14; +} + +static int _comp_fi_addr(const void *a, const void *b) +{ + // for sorting unsigned + if (*(fi_addr_t *)a < *(fi_addr_t *)b) return -1; + if (*(fi_addr_t *)a > *(fi_addr_t *)b) return 1; + return 0; +} + +static int check_av_set(const char *name, struct fid_av_set *set, int max, + bool (*func)(fi_addr_t), bool is_ordered) +{ + // ensure all elements of set satisfy expectations + struct cxip_av_set *cxi_set; + fi_addr_t *local; + int locidx = 0; + int errors = 0; + int i; + + cxi_set = container_of(set, struct cxip_av_set, av_set_fid); + + // Create the expected result + local = calloc(max, sizeof(fi_addr_t)); + cr_assert_not_null(local, "calloc failure"); + for (i = 0; i < max; i++) { + if ((*func)(i)) + local[locidx++] = i; + } + + // If set is not ordered, sort into order to test + if (! 
is_ordered) + qsort(cxi_set->fi_addr_ary, cxi_set->fi_addr_cnt, + sizeof(fi_addr_t), _comp_fi_addr); + + // Traverse maximum span, ensuring that allowed addr is the next addr + if (locidx != cxi_set->fi_addr_cnt) { + errors++; + } else { + for (i = 0; i < locidx; i++) { + if (local[i] != cxi_set->fi_addr_ary[i]) { + errors++; + break; + } + } + } + if (errors) { + printf("%s: bad set:\n", name); + printf(" exp act\n"); + for (i = 0; i < locidx && i < cxi_set->fi_addr_cnt; i++) { + printf(" %3ld %3ld\n", local[i], cxi_set->fi_addr_ary[i]); + } + for ( ; i < locidx; i++) { + printf(" %3ld ---\n", local[i]); + } + for ( ; i < cxi_set->fi_addr_cnt; i++) { + printf(" --- %3ld\n", cxi_set->fi_addr_ary[i]); + } + } + free(local); + return errors; +} + +enum { + ordered = true, + unordered = false +}; + +Test(avset, basics) +{ + // Test basic set operations + struct fi_av_set_attr attr2 = { + .count = 20, .start_addr = 0, .end_addr = 19, .stride = 2 + }; + struct fi_av_set_attr attr3 = { + .count = 20, .start_addr = 0, .end_addr = 19, .stride = 3 + }; + struct fid_av_set *set2; + struct fid_av_set *setX; + int errors; + int i, ret; + + errors = 0; + + // Expand the AV, so we have enough addresses to test + for (i = 0; i < 20; i++) { + struct cxip_addr fake_addr = { .nic = i, .pid = 0xff }; + int inserted; + + inserted = fi_av_insert(cxit_av, (void *)&fake_addr, + 1, NULL, 0, NULL); + cr_expect_eq(inserted, 1, + "fi_av_insert[%2d] failed, inserted=%d", + i, inserted); + } + + // Create a stride of every second element + ret = fi_av_set(cxit_av, &attr2, &set2, NULL); + cr_expect_eq(ret, 0, "1 fi_av_set set2 failed, ret=%d", ret); + errors += check_av_set("1 two", set2, 20, is_div_2, ordered); + + // Create a stride of every third element + ret = fi_av_set(cxit_av, &attr3, &setX, NULL); + cr_expect_eq(ret, 0, "1 fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("1 three", setX, 20, is_div_3, ordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "1 fi_close(setX) failed, ret=%d", ret); + + // 3 union 2 + ret = fi_av_set(cxit_av, &attr3, &setX, NULL); + cr_expect_eq(ret, 0, "2 fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("2 dst", setX, 20, is_div_3, ordered); + + ret = fi_av_set_union(setX, set2); + cr_expect_eq(ret, 0, "2 fi_av_set set_union failed, ret=%d", ret); + errors += check_av_set("2 union", setX, 20, is_2_or_3, unordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "2 fi_close(setX) failed, ret=%d", ret); + + // 3 diff 2 + ret = fi_av_set(cxit_av, &attr3, &setX, NULL); + cr_expect_eq(ret, 0, "3 fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("3 dst", setX, 20, is_div_3, ordered); + + ret = fi_av_set_diff(setX, set2); + cr_expect_eq(ret, 0, "3 fi_av_set set_diff failed, ret=%d", ret); + errors += check_av_set("3 diff", setX, 20, is_not2_and_3, ordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "3 fi_close(setX) failed, ret=%d", ret); + + // 3 intersect 2 + ret = fi_av_set(cxit_av, &attr3, &setX, NULL); + cr_expect_eq(ret, 0, "4 fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("4 dst", setX, 20, is_div_3, ordered); + + ret = fi_av_set_intersect(setX, set2); + cr_expect_eq(ret, 0, "4 fi_av_set set_intersect failed, ret=%d", ret); + errors += check_av_set("4 intersect", setX, 20, is_2_and_3, ordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "4 fi_close(setX) failed, ret=%d", ret); + + // remove address 14 + ret = fi_av_set(cxit_av, &attr2, &setX, NULL); + cr_expect_eq(ret, 0, "5 
fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("5 dst", setX, 20, is_div_2, ordered); + + ret = fi_av_set_remove(setX, 14); + cr_expect_eq(ret, 0, "5 fi_av_set fi_av_set_remove failed, ret=%d", ret); + errors += check_av_set("4 remove", setX, 20, is_2_and_not14, ordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "4 fi_close(setX) failed, ret=%d", ret); + + // clean up + ret = fi_close(&set2->fid); + cr_expect_eq(ret, 0, "fi_close(set2) failed, ret=%d", ret); + + cr_expect_eq(errors, 0, "Errors detected"); +} + + diff --git a/prov/cxi/test/cntr.c b/prov/cxi/test/cntr.c new file mode 100644 index 00000000000..9ab420b9993 --- /dev/null +++ b/prov/cxi/test/cntr.c @@ -0,0 +1,720 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(cntr, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = 5); + +Test(cntr, mod) +{ + int ret; + int i; + uint64_t val = 0; + uint64_t errval = 0; + struct fid_cntr *tmp_cntr; + struct fi_cntr_attr attr = { + .wait_obj = FI_WAIT_NONE, + }; + + ret = fi_cntr_open(cxit_domain, &attr, &tmp_cntr, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (send)"); + + ret = fi_cntr_add(tmp_cntr, 1); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_read(tmp_cntr) != 1) + sched_yield(); + + /* fi_cntr_wait() is invalid with FI_WAIT_NONE */ + ret = fi_cntr_wait(tmp_cntr, 1, -1); + cr_assert(ret == -FI_EINVAL); + + fi_close(&tmp_cntr->fid); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + /* Test invalid values */ + ret = fi_cntr_add(cxit_write_cntr, FI_CXI_CNTR_SUCCESS_MAX + 1); + cr_assert(ret == -FI_EINVAL); + + ret = fi_cntr_set(cxit_write_cntr, FI_CXI_CNTR_SUCCESS_MAX + 1); + cr_assert(ret == -FI_EINVAL); + + ret = fi_cntr_adderr(cxit_write_cntr, FI_CXI_CNTR_FAILURE_MAX + 1); + cr_assert(ret == -FI_EINVAL); + + ret = fi_cntr_seterr(cxit_write_cntr, FI_CXI_CNTR_FAILURE_MAX + 1); + cr_assert(ret == -FI_EINVAL); + + for (i = 0; i < 10; i++) { + val += 10; + ret = fi_cntr_add(cxit_write_cntr, 10); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_read(cxit_write_cntr) != val) + sched_yield(); + + errval += 30; + ret = fi_cntr_adderr(cxit_write_cntr, 30); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cxit_write_cntr) != errval) + sched_yield(); + + val = 5; + ret = fi_cntr_set(cxit_write_cntr, val); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_read(cxit_write_cntr) != val) + sched_yield(); + + errval = 15; + ret = fi_cntr_seterr(cxit_write_cntr, errval); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cxit_write_cntr) != errval) + sched_yield(); + } +} + +/* Test RMA with counters */ +Test(cntr, write) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = 0x1f; + struct fi_cq_tagged_entry cqe; + int writes = 10; + int i; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + for (i = 0; i < send_len; i++) + send_buf[i] = 0xab + i; + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + for (i = 0; i < writes; i++) { + int off = i * send_len; + + ret = fi_inject_write(cxit_ep, send_buf + off, send_len, + cxit_ep_fi_addr, off, key_val); + cr_assert(ret == FI_SUCCESS); + } + + while (fi_cntr_read(cxit_write_cntr) != writes) + sched_yield(); + + /* 
Validate sent data */ + for (int i = 0; i < writes * send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + /* Make sure no events were delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test all sizes of RMA transactions with counters */ +Test(cntr, write_sizes) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = 0x1f; + struct fi_cq_tagged_entry cqe; + int writes = 0; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "ret=%d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + writes++; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + while (fi_cntr_read(cxit_write_cntr) != writes) + sched_yield(); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test fi_read with counters */ +Test(cntr, read) +{ + int ret; + uint8_t *local; + int remote_len = 0x1000; + int local_len = 8; + uint64_t key_val = 0xa; + struct fi_cq_tagged_entry cqe; + struct mem_region remote; + + local = calloc(1, local_len); + cr_assert_not_null(local, "local alloc failed"); + + mr_create(remote_len, FI_REMOTE_READ, 0xc0, &key_val, &remote); + + cr_assert(!fi_cntr_read(cxit_read_cntr)); + + /* Get 8 bytes from the source buffer to the receive buffer */ + ret = fi_read(cxit_ep, local, local_len, NULL, cxit_ep_fi_addr, 0, + key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < local_len; i++) + cr_expect_eq(local[i], remote.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + local[i], remote.mem[i]); + + while (fi_cntr_read(cxit_read_cntr) != 1) + sched_yield(); + + mr_destroy(&remote); + free(local); +} + +/* Test send/recv counters */ +Test(cntr, ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + cr_assert(!fi_cntr_read(cxit_send_cntr)); + cr_assert(!fi_cntr_read(cxit_recv_cntr)); + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, 
send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + while (fi_cntr_read(cxit_send_cntr) != 1) + sched_yield(); + + while (fi_cntr_read(cxit_recv_cntr) != 1) + sched_yield(); + + free(send_buf); + free(recv_buf); +} + +int wait_for_cnt(struct fid_cntr *cntr, int cnt, + uint64_t (*cntr_read)(struct fid_cntr *cntr)) +{ + uint64_t cntr_value; + time_t timeout = time(NULL) + 3; + + while ((cntr_value = cntr_read(cntr)) != cnt) { + if (time(NULL) > timeout) { + printf("Timeout waiting for cnt:%d cntr_value:%lx\n", + cnt, cntr_value); + return -1; + } + sched_yield(); + } + + return 0; +} + +int wait_for_value(uint64_t compare_value, uint64_t *wb_buf) +{ + time_t timeout = time(NULL) + 2; + + while (compare_value != *wb_buf) { + if (time(NULL) > timeout) { + printf("Timeout waiting for compare_value:%lx wb:%lx\n", + compare_value, *wb_buf); + return -1; + } + sched_yield(); + } + + return 0; +} + +static void deferred_rma_test(enum fi_op_type op) +{ + int ret; + uint8_t *send_buf; + struct mem_region mem_window; + struct iovec iov = {}; + struct fi_rma_iov rma_iov = {}; + struct fi_op_rma rma = {}; + struct fi_deferred_work work = {}; + struct fid_cntr *trig_cntr = cxit_write_cntr; + + size_t xfer_size = 8; + uint64_t trig_thresh = 1; + uint64_t key = 0xbeef; + + uint64_t cxi_value; + struct fi_cxi_cntr_ops *cntr_ops; + struct cxip_cntr *cxi_cntr; + + ret = fi_open_ops(&trig_cntr->fid, FI_CXI_COUNTER_OPS, 0, + (void **)&cntr_ops, NULL); + cr_assert(ret == FI_SUCCESS); + cxi_cntr = container_of(&trig_cntr->fid, struct cxip_cntr, + cntr_fid.fid); + cr_assert_not_null(cxi_cntr, "cxi_cntr is null"); + + send_buf = calloc(1, xfer_size); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(xfer_size, FI_REMOTE_WRITE | FI_REMOTE_READ, 0xa0, &key, + &mem_window); + + iov.iov_base = send_buf; + iov.iov_len = xfer_size; + + rma_iov.key = key; + + rma.ep = cxit_ep; + rma.msg.msg_iov = &iov; + rma.msg.iov_count = 1; + rma.msg.addr = cxit_ep_fi_addr; + rma.msg.rma_iov = &rma_iov; + rma.msg.rma_iov_count = 1; + rma.flags = FI_CXI_CNTR_WB; + + work.threshold = trig_thresh; + work.triggering_cntr = trig_cntr; + work.completion_cntr = trig_cntr; + work.op_type = op; + work.op.rma = &rma; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + ret = fi_cntr_add(trig_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + ret = fi_cxi_gen_cntr_success(trig_thresh + 1, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + + 
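/* Fail the test if the counter writeback was not observed before the timeout; wait_for_value() returns 0 on success and -1 on timeout. */ + cr_assert_eq(ret, 0, "deferred RMA counter writeback timed out"); + 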
mr_destroy(&mem_window); + free(send_buf); +} + +Test(cntr, deferred_wb_rma_write) +{ + deferred_rma_test(FI_OP_WRITE); +} + +Test(cntr, deferred_wb_rma_read) +{ + deferred_rma_test(FI_OP_READ); +} + +Test(cntr, op_cntr_wb1) +{ + int ret; + struct fid_cntr *cntr; + uint64_t trig_thresh = 1; + uint64_t cxi_value; + struct cxip_cntr *cxi_cntr; + + ret = fi_cntr_open(cxit_domain, NULL, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + cxi_cntr = container_of(&cntr->fid, struct cxip_cntr, cntr_fid.fid); + + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + ret = fi_cntr_add(cntr, trig_thresh); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + fi_cntr_read(cntr); + + ret = fi_cxi_gen_cntr_success(trig_thresh, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + + ret = fi_close(&cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close cntr"); +} + +Test(cntr, op_cntr_wb2) +{ + int ret; + void *mmio_addr; + size_t mmio_len; + uint64_t cxi_value; + uint64_t threshold = 1; + struct fid_cntr *cntr; + struct cxip_cntr *cxi_cntr; + struct fi_cxi_cntr_ops *cntr_ops; + struct c_ct_writeback *wb_buf = NULL; + int wb_len = sizeof(*wb_buf); + + ret = fi_cntr_open(cxit_domain, NULL, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, + (void **)&cntr_ops, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + cxi_cntr = container_of(&cntr->fid, struct cxip_cntr, cntr_fid.fid); + + ret = fi_cntr_add(cntr, threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + ret = cntr_ops->get_mmio_addr(&cntr->fid, &mmio_addr, &mmio_len); + cr_assert(ret == FI_SUCCESS); + + ret = fi_cxi_gen_cntr_success(threshold, &cxi_value); + cr_assert(ret == FI_SUCCESS); + fi_cntr_read(cntr); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + cr_assert(ret == 0); + + cr_assert(fi_cxi_cntr_wb_read(cxi_cntr->wb) == threshold); + + fi_cxi_cntr_set(mmio_addr, 0); + fi_cxi_gen_cntr_success(0, &cxi_value); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + cr_assert(ret == 0); + + threshold = 10; + ret = fi_cntr_add(cntr, threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + ret = fi_cxi_gen_cntr_success(threshold, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, threshold, fi_cntr_read); + cr_assert(ret == 0); + + fi_cxi_cntr_set(mmio_addr, 0); + fi_cxi_gen_cntr_success(0, &cxi_value); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + cr_assert(ret == 0); + + /* Change to a new writeback buffer */ + wb_buf = aligned_alloc(s_page_size, wb_len); + cr_assert_not_null(wb_buf, "wb_buf alloc failed"); + ret = cntr_ops->set_wb_buffer(&cntr->fid, wb_buf, wb_len); + cr_assert(ret == FI_SUCCESS); + + /* Use the new wb buffer */ + threshold = 20; + ret = fi_cntr_add(cntr, threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + ret = fi_cxi_gen_cntr_success(threshold, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, threshold, fi_cntr_read); + cr_assert(ret == 0); + + // Use instead of fi_cxi_cntr_set() + *(uint64_t*)(fi_cxi_get_cntr_reset_addr(mmio_addr)) = 0; + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + ret = fi_close(&cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close cntr"); + + free(wb_buf); +} + +Test(cntr, counter_ops) +{ + int ret; + int cnt; + uint64_t *addr; + uint64_t 
cxi_value; + struct fid_cntr *cntr; + struct fi_cxi_cntr_ops *cntr_ops; + struct cxip_cntr *cxi_cntr; + + struct c_ct_writeback *wb_buf = NULL; + int wb_len = sizeof(*wb_buf); + void *mmio_addr; + size_t mmio_len; + + ret = fi_cntr_open(cxit_domain, NULL, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, + (void **)&cntr_ops, NULL); + cr_assert(ret == FI_SUCCESS); + + cxi_cntr = container_of(&cntr->fid, struct cxip_cntr, cntr_fid.fid); + + wb_buf = aligned_alloc(s_page_size, wb_len); + cr_assert_not_null(wb_buf, "wb_buf alloc failed"); + + ret = cntr_ops->set_wb_buffer(&cntr->fid, wb_buf, wb_len); + cr_assert(ret == FI_SUCCESS); + + /* enables counter */ + ret = fi_cntr_set(cntr, 0); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + ret = cntr_ops->get_mmio_addr(&cntr->fid, &mmio_addr, &mmio_len); + cr_assert(ret == FI_SUCCESS); + + cr_assert(fi_cxi_cntr_wb_read(cxi_cntr->wb) == 0); + + cnt = 10; + ret = fi_cntr_add(cntr, cnt); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, cnt, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cxi_cntr_wb_read(wb_buf) == cnt); + + fi_cxi_cntr_set(mmio_addr, 0); + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cntr_read(cntr) == 0, "read:%ld", fi_cntr_read(cntr)); + + ret = fi_cxi_cntr_set(mmio_addr, 15); + cr_assert(ret != FI_SUCCESS, "fi_cxi_cntr_set should fail:%d", ret); + + cnt = 5; + ret = fi_cntr_add(cntr, cnt); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, cnt, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cxi_cntr_wb_read(wb_buf) == cnt); + + fi_cxi_cntr_set(mmio_addr, 0); + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cntr_read(cntr) == 0, "read:%ld", fi_cntr_read(cntr)); + + fi_cxi_cntr_seterr(mmio_addr, 0); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, 0, fi_cntr_readerr); + cr_assert(ret == 0); + + cnt = 1; + ret = fi_cxi_cntr_adderr(mmio_addr, cnt); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, cnt, fi_cntr_readerr); + cr_assert(ret == 0); + cr_assert(fi_cntr_readerr(cntr) == cnt); + cr_assert(fi_cxi_cntr_wb_readerr(wb_buf) == cnt); + + fi_cxi_cntr_set(mmio_addr, 0); + cr_assert(ret == FI_SUCCESS); + + fi_cxi_cntr_seterr(mmio_addr, 0); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, 0, fi_cntr_readerr); + cr_assert(ret == 0); + + cnt = 50; + ret = fi_cxi_cntr_add(mmio_addr, cnt); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, cnt, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cntr_read(cntr) == cnt, "cntr:%ld", fi_cntr_read(cntr)); + + fi_cxi_cntr_set(mmio_addr, 0); + cr_assert(ret == FI_SUCCESS); + ret = fi_cxi_gen_cntr_success(0, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_value(cxi_value, (uint64_t *)wb_buf); + cr_assert(ret == 0); + + // Use instead of fi_cxi_cntr_set() + *(uint64_t*)(fi_cxi_get_cntr_reset_addr(mmio_addr)) = 0; + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + cnt = 12; + *(uint64_t*)(fi_cxi_get_cntr_adderr_addr(mmio_addr)) = cnt; + /* Error transition from 0 causes a writeback */ + while(fi_cxi_cntr_wb_readerr(wb_buf) != cnt) + sched_yield(); + + cr_assert(fi_cxi_cntr_wb_readerr(wb_buf) == cnt); + + addr = fi_cxi_get_cntr_reseterr_addr(mmio_addr); + *addr = 0; + ret = fi_cxi_gen_cntr_success(0, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_value(cxi_value, (uint64_t *)wb_buf); + cr_assert(ret == 
FI_SUCCESS);
+
+	cr_assert(fi_cntr_readerr(cntr) == 0);
+
+	ret = fi_close(&cntr->fid);
+	cr_assert(ret == FI_SUCCESS, "fi_close cntr");
+
+	free(wb_buf);
+}
+
+Test(cntr, cntr_wait_timeout)
+{
+	struct fid_cntr *cntr;
+	struct fi_cntr_attr attr = {
+		.wait_obj = FI_WAIT_UNSPEC,
+	};
+	int timeout = 2999;
+	uint64_t thresh = 0x1234;
+	int ret;
+
+	ret = fi_cntr_open(cxit_domain, &attr, &cntr, NULL);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_cntr_wait(cntr, thresh, timeout);
+	cr_assert(ret == -FI_ETIMEDOUT);
+
+	ret = fi_close(&cntr->fid);
+	cr_assert(ret == FI_SUCCESS);
+}
+
+Test(cntr, cntr_wait)
+{
+	struct fid_cntr *cntr;
+	struct fi_cntr_attr attr = {
+		.wait_obj = FI_WAIT_UNSPEC,
+	};
+	void *mmio_addr;
+	size_t mmio_len;
+	struct fi_cxi_cntr_ops *cntr_ops;
+	int timeout = 2000;
+	uint64_t thresh = 0x1234;
+	int ret;
+
+	ret = fi_cntr_open(cxit_domain, &attr, &cntr, NULL);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0,
+			  (void **)&cntr_ops, NULL);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = cntr_ops->get_mmio_addr(&cntr->fid, &mmio_addr, &mmio_len);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_cntr_wait(cntr, thresh, timeout);
+	cr_assert(ret == -FI_ETIMEDOUT);
+
+	fi_cxi_cntr_add(mmio_addr, thresh);
+
+	ret = fi_cntr_wait(cntr, thresh, timeout);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_close(&cntr->fid);
+	cr_assert(ret == FI_SUCCESS);
+}
+
+Test(cntr, cntr_wait_bad_threshold)
+{
+	struct fid_cntr *cntr;
+	struct fi_cntr_attr attr = {
+		.wait_obj = FI_WAIT_UNSPEC,
+	};
+	int timeout = 2000;
+	uint64_t thresh = (1ULL << 49);
+	int ret;
+
+	ret = fi_cntr_open(cxit_domain, &attr, &cntr, NULL);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_cntr_wait(cntr, thresh, timeout);
+	cr_assert(ret == -FI_EINVAL);
+
+	ret = fi_close(&cntr->fid);
+	cr_assert(ret == FI_SUCCESS);
+}
diff --git a/prov/cxi/test/coll.c b/prov/cxi/test/coll.c
new file mode 100644
index 00000000000..b6cc6732253
--- /dev/null
+++ b/prov/cxi/test/coll.c
@@ -0,0 +1,2376 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
+ *
+ * Copyright (c) 2017-2019 Intel Corporation. All rights reserved.
+ * Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP
+ */
+
+/*
+ * NOTE: This is a standalone test that uses the COMM_KEY_RANK model, and thus
+ * consists of a single process driving multiple data objects sequentially to
+ * simulate network transfers. It can be run under NETSIM, and is part of the
+ * standard Jenkins validation integration with Git check-in, allowing this to
+ * serve as an automated regression test.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "cxip.h"
+#include "cxip_test_common.h"
+
+/* If not compiled with DEBUG=1, this is a no-op */
+#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__)
+
+#define MIN(a,b) (((a)<(b))?(a):(b))
+
+/***************************************/
+/**
+ * Sanity tests for proper integration with EP, enable/disable checks.
+ */
+
+TestSuite(coll_init, .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT);
+
+/* Test EP close without explicitly enabling collectives.
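+ * In addition to checking that ep_obj->coll.enabled is set at startup, this
+ * sanity-checks that the opaque struct cxip_coll_accumulator handed to users
+ * (e.g. with FI_CXI_PRE_REDUCED/FI_MORE in _allreduceop() further below) is
+ * at least as large as the internal struct cxip_coll_data, presumably so
+ * pre-reduced state can be carried in user-supplied buffers.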
+ */ +Test(coll_init, noop) +{ + struct cxip_ep *ep; + + cxit_setup_rma(); + ep = container_of(cxit_ep, struct cxip_ep, ep); + cr_assert(ep->ep_obj->coll.enabled, + "coll not enabled on startup\n"); + + cr_assert(sizeof(struct cxip_coll_accumulator) >= + sizeof(struct cxip_coll_data), + "sizeof(cxip_coll_accumulator=%ld <" + "sizeof(cxip_coll_data=%ld", + sizeof(struct cxip_coll_accumulator), + sizeof(struct cxip_coll_data)); + + cxit_teardown_rma(); +} + +/* Test EP close after explicitly enabling collectives. + */ +Test(coll_init, enable) +{ + struct cxip_ep *ep; + int ret; + + cxit_setup_rma(); + ep = container_of(cxit_ep, struct cxip_ep, ep); + + ret = cxip_coll_enable(ep); + cr_assert(ret == 0, "cxip_coll_enable failed: %d\n", ret); + cr_assert(ep->ep_obj->coll.enabled, + "coll not enabled after enabling\n"); + cxit_teardown_rma(); +} + +/* Test EP close after disabling collectives. + */ +Test(coll_init, disable) +{ + struct cxip_ep *ep; + int ret; + + cxit_setup_rma(); + ep = container_of(cxit_ep, struct cxip_ep, ep); + + ret = cxip_coll_enable(ep); + cr_assert(ret == 0, "cxip_coll_enable failed: %d\n", ret); + ret = cxip_coll_disable(ep->ep_obj); + cr_assert(ret == 0, "cxip_coll_disable failed: %d\n", ret); + cr_assert(!ep->ep_obj->coll.enabled, + "coll enabled after disabling\n"); + cxit_teardown_rma(); +} + +/* Test EP close after disabling/re-enabling collectives. + */ +Test(coll_init, reenable) +{ + struct cxip_ep *ep; + int ret; + + cxit_setup_rma(); + ep = container_of(cxit_ep, struct cxip_ep, ep); + + ret = cxip_coll_enable(ep); + cr_assert(ret == 0, "cxip_coll_enable failed: %d\n", ret); + ret = cxip_coll_disable(ep->ep_obj); + cr_assert(ret == 0, "cxip_coll_disable failed: %d\n", ret); + ret = cxip_coll_enable(ep); + cr_assert(ret == 0, "cxip_coll_enable failed: %d\n", ret); + cr_assert(ep->ep_obj->coll.enabled, + "coll not enabled after enabling\n"); + cxit_teardown_rma(); +} + +/***************************************/ +/** + * JOIN testing. 
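+ *
+ * The join tests below share a common pattern (see the helpers that follow):
+ * _create_netsim_collective() builds one COMM_KEY_RANK av_set per simulated
+ * rank and calls cxip_join_collective() for each, and _wait_for_join() then
+ * polls the collective EQ until FI_JOIN_COMPLETE has been seen for every
+ * rank, draining any error through fi_eq_readerr() when fi_eq_read()
+ * returns -FI_EAVAIL.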
+ */ +TestSuite(coll_join, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT); + +struct cxip_addr caddr_base; +void insert_out(struct cxip_addr *addr, struct cxip_addr *addr_out) +{ + *addr = caddr_base; +} + +/* expand AV and create av_sets for collectives */ +static void _create_av_set(int count, int rank, bool rx_discard, + struct fid_av_set **av_set_fid) +{ + struct cxip_ep *ep; + struct cxip_comm_key comm_key = { + .keytype = COMM_KEY_RANK, + .rank.rank = rank, + .rank.hwroot_idx = 0, + .rank.rx_discard = rx_discard + }; + struct fi_av_set_attr attr = { + .count = 0, + .start_addr = FI_ADDR_NOTAVAIL, + .end_addr = FI_ADDR_NOTAVAIL, + .stride = 1, + .comm_key_size = sizeof(comm_key), + .comm_key = (void *)&comm_key, + .flags = 0, + }; + struct cxip_addr caddr; + int i, ret; + + ep = container_of(cxit_ep, struct cxip_ep, ep); + + /* lookup initiator caddr as set in test framework */ + ret = cxip_av_lookup_addr(ep->ep_obj->av, cxit_ep_fi_addr, &caddr); + cr_assert(ret == 0, "bad lookup on address %ld: %d\n", + cxit_ep_fi_addr, ret); + caddr_base = caddr; + + /* create empty av_set */ + ret = fi_av_set(&ep->ep_obj->av->av_fid, &attr, av_set_fid, NULL); + cr_assert(ret == 0, "av_set creation failed: %d\n", ret); + + /* add source address as multiple av entries */ + for (i = count - 1; i >= 0; i--) { + fi_addr_t fi_addr; + + ret = fi_av_insert(&ep->ep_obj->av->av_fid, &caddr, 1, + &fi_addr, 0, NULL); + cr_assert(ret == 1, "%d cxip_av_insert failed: %d\n", i, ret); + ret = fi_av_set_insert(*av_set_fid, fi_addr); + cr_assert(ret == 0, "%d fi_av_set_insert failed: %d\n", i, ret); + caddr.nic++; + } +} + +void _create_netsim_collective(int count, bool discard, int exp) +{ + int i, ret; + + /* replace the insertion/lookup model */ + cxip_av_addr_out = insert_out; + + TRACE("========================\n%s: entry\n", __func__); + TRACE("%s: count=%d\n", __func__, count); + cxit_coll_mc_list.count = count; + cxit_coll_mc_list.av_set_fid = calloc(cxit_coll_mc_list.count, + sizeof(struct fid_av_set *)); + cxit_coll_mc_list.mc_fid = calloc(cxit_coll_mc_list.count, + sizeof(struct fid_mc *)); + + for (i = 0; i < cxit_coll_mc_list.count; i++) { + TRACE("%s: ==== create %d\n", __func__, i); + TRACE("create av_set rank %d\n", i); + _create_av_set(cxit_coll_mc_list.count, i, discard, + &cxit_coll_mc_list.av_set_fid[i]); + TRACE("join collective\n"); + ret = cxip_join_collective(cxit_ep, FI_ADDR_NOTAVAIL, + cxit_coll_mc_list.av_set_fid[i], + 0, &cxit_coll_mc_list.mc_fid[i], + NULL); + TRACE("ret=%d\n", ret); + cr_assert(ret == exp, + "cxip_coll_enable failed: exp %s saw %s\n", + fi_strerror(-exp), fi_strerror(-ret)); + } + TRACE("%s: exit\n========================\n", __func__); +} + +void _destroy_netsim_collective(void) +{ + int i; + + for (i = cxit_coll_mc_list.count - 1; i >= 0; i--) { + TRACE("closing %d\n", i); + if (cxit_coll_mc_list.mc_fid[i]) + fi_close(&cxit_coll_mc_list.mc_fid[i]->fid); + if (cxit_coll_mc_list.av_set_fid[i]) + fi_close(&cxit_coll_mc_list.av_set_fid[i]->fid); + } + TRACE("cleanup\n"); + free(cxit_coll_mc_list.mc_fid); + free(cxit_coll_mc_list.av_set_fid); + cxit_coll_mc_list.mc_fid = NULL; + cxit_coll_mc_list.av_set_fid = NULL; +} + +static void _wait_for_join(int count, int eq_err, int prov_errno) +{ + struct cxip_ep *ep; + struct fid_cq *txcq, *rxcq; + struct fid_eq *eq; + struct fi_cq_err_entry cqd = {}; + struct fi_eq_err_entry eqd = {}; + uint32_t event; + int ret, err, provcnt; + + ep = container_of(cxit_ep, struct 
cxip_ep, ep); + rxcq = &ep->ep_obj->coll.rx_evtq->cq->util_cq.cq_fid; + txcq = &ep->ep_obj->coll.tx_evtq->cq->util_cq.cq_fid; + eq = &ep->ep_obj->coll.eq->util_eq.eq_fid; + provcnt = 0; + + do { + sched_yield(); + err = -FI_EINVAL; + ret = fi_eq_read(eq, &event, &eqd, sizeof(eqd), 0); + if (ret == -FI_EAVAIL) { + TRACE("=== error available!\n"); + ret = fi_eq_readerr(eq, &eqd, 0); + cr_assert(ret >= 0, + "-FI_EAVAIL but fi_eq_readerr()=%d\n", ret); + TRACE(" event = %d\n", event); + TRACE(" fid = %p\n", eqd.fid); + TRACE(" context = %p\n", eqd.context); + TRACE(" data = %lx\n", eqd.data); + TRACE(" err = %s (%d)\n", + fi_strerror(-eqd.err), eqd.err); + TRACE(" prov_err= %d\n", eqd.prov_errno); + TRACE(" err_data= %p\n", eqd.err_data); + TRACE(" err_size= %ld\n", eqd.err_data_size); + TRACE(" readerr = %d\n", ret); + err = eqd.err; + event = eqd.data; + if (eqd.prov_errno != prov_errno) { + TRACE("prov_err exp=%d saw=%d\n", + prov_errno, eqd.prov_errno); + provcnt++; + } + TRACE("===\n"); + } else if (ret >= 0) { + TRACE("=== EQ SUCCESS!\n"); + err = FI_SUCCESS; + } else { + err = ret; + } + if (err != -FI_EAGAIN) { + TRACE("eq_err = %d, err = %d\n", eq_err, err); + if (eq_err != err) { + cr_assert(eq_err == err, + "FAILED TEST: eq_err = '%s' saw '%s'\n", + fi_strerror(-eq_err), fi_strerror(-err)); + break; + } + if (event == FI_JOIN_COMPLETE) { + TRACE("FI_JOIN_COMPLETE seen\n"); + count--; + } + } + + ret = fi_cq_read(rxcq, &cqd, sizeof(cqd)); + if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(rxcq, &cqd, sizeof(cqd)); + break; + } + + ret = fi_cq_read(txcq, &cqd, sizeof(cqd)); + if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(txcq, &cqd, sizeof(cqd)); + break; + } + } while (count > 0); + TRACE("wait done\n"); + cr_assert(provcnt == 0, "Mismatched provider errors\n"); +} + +/* Basic test of single NETSIM join. + */ +Test(coll_join, join1) +{ + TRACE("=========================\n"); + TRACE("join1\n"); + _create_netsim_collective(1, true, FI_SUCCESS); + _wait_for_join(1, FI_SUCCESS, 0); + _destroy_netsim_collective(); +} + +/* Basic test of two NETSIM joins. + */ +Test(coll_join, join2) +{ + TRACE("=========================\n"); + TRACE("join2\n"); + _create_netsim_collective(2, true, FI_SUCCESS); + _wait_for_join(2, FI_SUCCESS, 0); + _destroy_netsim_collective(); +} + +/* Basic test of three NETSIM joins. + */ +Test(coll_join, join3) +{ + TRACE("=========================\n"); + TRACE("join3\n"); + _create_netsim_collective(3, true, FI_SUCCESS); + _wait_for_join(3, FI_SUCCESS, 0); + _destroy_netsim_collective(); +} + +/* Basic test of maximum NETSIM joins. 
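+ * NETSIM is limited to 32 simulated endpoints, which is why the largest join
+ * test stops here (the reduction tests below likewise note "max nodes == 32
+ * under NETSIM").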
+ */ +Test(coll_join, join32) +{ + TRACE("=========================\n"); + TRACE("join32\n"); + _create_netsim_collective(32, true, FI_SUCCESS); + _wait_for_join(32, FI_SUCCESS, 0); + _destroy_netsim_collective(); +} + +#if ENABLE_DEBUG +/* The following tests verify DEBUG-ONLY capabilities */ + +/* Confirm that -FI_EAGAIN is harmless on all zbcoll stages */ +Test(coll_join, retry_getgroup) { + int node; + + TRACE("=========================\n"); + TRACE("join retry getgroup\n"); + for (node = 0; node < 5; node++) { + cxip_trap_set(node, CXIP_TRAP_GETGRP, -FI_EAGAIN); + _create_netsim_collective(5, true, FI_SUCCESS); + _wait_for_join(5, FI_SUCCESS, 0); + _destroy_netsim_collective(); + cxip_trap_close(); + } +} + +Test(coll_join, retry_broadcast) { + int node; + + TRACE("=========================\n"); + TRACE("join retry broadcast\n"); + for (node = 0; node < 5; node++) { + cxip_trap_set(node, CXIP_TRAP_BCAST, -FI_EAGAIN); + _create_netsim_collective(5, true, FI_SUCCESS); + _wait_for_join(5, FI_SUCCESS, 0); + _destroy_netsim_collective(); + cxip_trap_close(); + } +} + +Test(coll_join, retry_reduce) { + int node; + + TRACE("=========================\n"); + TRACE("join retry reduce\n"); + for (node = 0; node < 5; node++) { + cxip_trap_set(node, CXIP_TRAP_REDUCE, -FI_EAGAIN); + _create_netsim_collective(5, true, FI_SUCCESS); + _wait_for_join(5, FI_SUCCESS, 0); + _destroy_netsim_collective(); + cxip_trap_close(); + } +} + +Test(coll_join, fail_ptlte) { + int node; + + TRACE("=========================\n"); + TRACE("join fail mixed errors\n"); + for (node = 0; node < 5; node++) { + cxip_trap_set(node, CXIP_TRAP_INITPTE, -FI_EFAULT); + _create_netsim_collective(5, true, FI_SUCCESS); + _wait_for_join(5, -FI_EAVAIL, CXIP_PROV_ERRNO_PTE); + _destroy_netsim_collective(); + cxip_trap_close(); + } +} +#endif + +/***************************************/ +/** + * Basic send/receive testing. + */ + +TestSuite(coll_put, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* 50-byte packet */ +struct fakebuf { + uint64_t count[6]; + uint16_t pad; +} __attribute__((packed)); + +/* Progression is needed because the test runs in a single execution thread with + * NETSIM. This waits for completion of PROGRESS_COUNT messages on the simulated + * (loopback) target. It needs to be called periodically during the test run, or + * the netsim resources run out and this gets blocked. + */ +#define PROGRESS_COUNT 10 +void _progress_put(struct cxip_cq *cq, int sendcnt, uint64_t *dataval) +{ + struct fi_cq_tagged_entry entry[PROGRESS_COUNT]; + struct fi_cq_err_entry err; + int i, ret; + + while (sendcnt > 0) { + do { + int cnt = MIN(PROGRESS_COUNT, sendcnt); + sched_yield(); + ret = fi_cq_read(&cq->util_cq.cq_fid, entry, cnt); + } while (ret == -FI_EAGAIN); + if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(&cq->util_cq.cq_fid, &err, 0); + memcpy(&entry[0], &err, sizeof(entry[0])); + } + for (i = 0; i < ret; i++) { + struct fakebuf *fb = entry[i].buf; + cr_assert(entry[i].len == sizeof(*fb), + "fb->len exp %ld, saw %ld\n", + sizeof(*fb), entry[i].len); + cr_assert(fb->count[0] == *dataval, + "fb->count[0] exp %ld, saw %ld\n", + fb->count[0], *dataval); + cr_assert(fb->count[5] == *dataval, + "fb->count[5] exp %ld, saw %ld\n", + fb->count[5], *dataval); + cr_assert(fb->pad == (uint16_t)*dataval, + "fb_pad exp %x, saw %x\n", + fb->pad, (uint16_t)*dataval); + (*dataval)++; + } + sendcnt -= ret; + } +} + +/* Put count packets, and verify them. 
This sends count packets from one + * NETSIM multicast resource to another. + */ +void _put_data(int count, int from_rank, int to_rank) +{ + struct cxip_coll_mc *mc_obj_send, *mc_obj_recv; + struct cxip_coll_reduction *reduction; + struct cxip_ep *ep; + struct fakebuf *buf; + void *buffers; + int sendcnt, cnt; + uint64_t dataval; + int i, j, ret; + + ep = container_of(cxit_ep, struct cxip_ep, ep); + + /* from and to (may be the same mc_obj) */ + mc_obj_send = container_of(cxit_coll_mc_list.mc_fid[from_rank], + struct cxip_coll_mc, mc_fid); + mc_obj_recv = container_of(cxit_coll_mc_list.mc_fid[to_rank], + struct cxip_coll_mc, mc_fid); + + TRACE("%s: mc_obj_send = %p\n", __func__, mc_obj_send); + TRACE("%s: mc_obj_recv = %p\n", __func__, mc_obj_recv); + + /* clear any prior values */ + TRACE("%s: reset mc_ctrs\n", __func__); + cxip_coll_reset_mc_ctrs(&mc_obj_send->mc_fid); + cxip_coll_reset_mc_ctrs(&mc_obj_recv->mc_fid); + + /* from_rank reduction */ + reduction = &mc_obj_send->reduction[0]; + + /* must persist until _progress called, for validation */ + buffers = calloc(PROGRESS_COUNT, sizeof(*buf)); + + buf = buffers; + sendcnt = 0; + dataval = 0; + TRACE("%s: iteration over %p\n", __func__, buf); + for (i = 0; i < count; i++) { + for (j = 0; j < 6; j++) + buf->count[j] = i; + buf->pad = i; + TRACE("call cxip_coll_send()\n"); + ret = cxip_coll_send(reduction, to_rank, buf, sizeof(*buf), + NULL); + cr_assert(ret == 0, "cxip_coll_send failed: %d\n", ret); + + buf++; + sendcnt++; + if (sendcnt >= PROGRESS_COUNT) { + _progress_put(ep->ep_obj->coll.rx_evtq->cq, sendcnt, + &dataval); + buf = buffers; + sendcnt = 0; + } + } + TRACE("call _progress_put\n"); + _progress_put(ep->ep_obj->coll.rx_evtq->cq, sendcnt, &dataval); + + /* check final counts */ + TRACE("check counts\n"); + if (count * sizeof(*buf) > + ep->ep_obj->coll.buffer_size - ep->ep_obj->rxc.min_multi_recv) { + cnt = ofi_atomic_get32(&mc_obj_recv->coll_pte->buf_swap_cnt); + cr_assert(cnt > 0, "Did not recirculate buffers\n"); + } + + TRACE("check atomic counts\n"); + cnt = ofi_atomic_get32(&mc_obj_send->send_cnt); + cr_assert(cnt == count, + "Expected mc_obj[%d] send_cnt == %d, saw %d", + from_rank, count, cnt); + + cnt = ofi_atomic_get32(&mc_obj_recv->coll_pte->recv_cnt); + cr_assert(cnt == count, + "Expected mc_obj raw recv_cnt == %d, saw %d", + count, cnt); + + cnt = ofi_atomic_get32(&mc_obj_recv->recv_cnt); + cr_assert(cnt == 0, + "Expected mc_obj[%d]->[%d] recv_cnt == %d, saw %d", + from_rank, to_rank, count, cnt); + cnt = ofi_atomic_get32(&mc_obj_recv->pkt_cnt); + cr_assert(cnt == 0, + "Expected mc_obj[%d]->[%d] pkt_cnt == %d, saw %d", + from_rank, to_rank, 0, cnt); + + TRACE("free buffers\n"); + free(buffers); +} + +/* Attempt to send from rank 0 to rank 3 (does not exist). + */ +Test(coll_put, put_bad_rank) +{ + struct cxip_coll_mc *mc_obj; + struct cxip_coll_reduction *reduction; + struct fakebuf buf; + int ret; + + _create_netsim_collective(2, false, FI_SUCCESS); + _wait_for_join(2, FI_SUCCESS, 0); + + mc_obj = container_of(cxit_coll_mc_list.mc_fid[0], + struct cxip_coll_mc, mc_fid); + reduction = &mc_obj->reduction[0]; + + ret = cxip_coll_send(reduction, 3, &buf, sizeof(buf), NULL); + cr_assert(ret == -FI_EINVAL, "cxip_coll_set bad error = %d\n", ret); + + _destroy_netsim_collective(); +} + +/* Basic test with one packet from rank 0 to rank 0. 
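+ * The single-rank case is pure loopback through one multicast object:
+ * _put_data() verifies the packet contents in _progress_put() and checks the
+ * raw PTE receive count against the number of packets sent.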
+ */ +Test(coll_put, put_one) +{ + _create_netsim_collective(1, false, FI_SUCCESS); + _wait_for_join(1, FI_SUCCESS, 0); + _put_data(1, 0, 0); + _destroy_netsim_collective(); +} + +/* Basic test with one packet from each rank to another rank. + * Exercises NETSIM rank-based target addressing. + */ +Test(coll_put, put_ranks) +{ + _create_netsim_collective(2, false, FI_SUCCESS); + _wait_for_join(2, FI_SUCCESS, 0); + TRACE("call _put_data()\n"); + _put_data(1, 0, 0); + _put_data(1, 0, 1); + _put_data(1, 1, 0); + _put_data(1, 1, 1); + _destroy_netsim_collective(); +} + +/* Test a lot of packets to force buffer rollover. + */ +Test(coll_put, put_many) +{ + _create_netsim_collective(1, false, FI_SUCCESS); + _wait_for_join(1, FI_SUCCESS, 0); + _put_data(4000, 0, 0); + _destroy_netsim_collective(); +} + +/* Progress the reduction packet send. + */ +void _progress_red_pkt(struct cxip_cq *cq, int sendcnt, uint64_t *dataval) +{ + struct fi_cq_tagged_entry entry[PROGRESS_COUNT]; + struct fi_cq_err_entry err; + int i, ret; + + while (sendcnt > 0) { + do { + int cnt = MIN(PROGRESS_COUNT, sendcnt); + sched_yield(); + ret = fi_cq_read(&cq->util_cq.cq_fid, entry, cnt); + } while (ret == -FI_EAGAIN); + if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(&cq->util_cq.cq_fid, &err, 0); + memcpy(&entry[0], &err, sizeof(entry[0])); + } + for (i = 0; i < ret; i++) + (*dataval)++; + sendcnt -= ret; + } +} + +/* Test red_pkt sends. With only one node, root sends to self. + */ +void _put_red_pkt(int count) +{ + struct cxip_coll_mc *mc_obj; + struct cxip_coll_reduction *reduction; + struct cxip_coll_data coll_data = {.red_cnt = 1}; + int sendcnt, cnt; + uint64_t dataval; + int i, ret; + + _create_netsim_collective(1, false, FI_SUCCESS); + _wait_for_join(1, FI_SUCCESS, 0); + + mc_obj = container_of(cxit_coll_mc_list.mc_fid[0], + struct cxip_coll_mc, mc_fid); + + /* clear counters */ + cxip_coll_reset_mc_ctrs(&mc_obj->mc_fid); + + sendcnt = 0; + dataval = 0; + coll_data.intval.ival[0] = dataval; + reduction = &mc_obj->reduction[0]; + reduction->coll_state = CXIP_COLL_STATE_NONE; + for (i = 0; i < count; i++) { + ret = cxip_coll_send_red_pkt(reduction, &coll_data, + false, false); + cr_assert(ret == FI_SUCCESS, + "Packet send from root failed: %d\n", ret); + + sendcnt++; + if (sendcnt >= PROGRESS_COUNT) { + _progress_red_pkt(mc_obj->ep_obj->coll.rx_evtq->cq, + sendcnt, &dataval); + sendcnt = 0; + } + } + _progress_red_pkt(mc_obj->ep_obj->coll.rx_evtq->cq, sendcnt, &dataval); + + cnt = ofi_atomic_get32(&mc_obj->send_cnt); + cr_assert(cnt == count, "Bad send counter on root: %d, exp %d\n", cnt, count); + cnt = ofi_atomic_get32(&mc_obj->recv_cnt); + cr_assert(cnt == count, "Bad recv counter on root: %d, exp %d\n", cnt, count); + cnt = ofi_atomic_get32(&mc_obj->pkt_cnt); + cr_assert(cnt == count, "Bad pkt counter on root: %d, exp %d\n", cnt, count); + + _destroy_netsim_collective(); +} + +/* Test of a single red_pkt from root to root. + */ +Test(coll_put, put_red_pkt_one) +{ + _put_red_pkt(1); +} + +/* Test of a many red_pkts from root to root. + */ +Test(coll_put, put_red_pkt_many) +{ + _put_red_pkt(4000); +} + +/* Test of the reduction packet code distribution under NETSIM. + * Exercises distribution root->leaves, leaves->root, single packet. 
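+ * With the 5 simulated ranks used below, the expected counters are: the root
+ * send fans out to the 4 leaves (root send_cnt == 4, each leaf recv_cnt == 1),
+ * and the return direction is 4 individual leaf sends (each leaf
+ * send_cnt == 1, root recv_cnt == 4).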
+ */ +Test(coll_put, put_red_pkt_distrib) +{ + struct cxip_coll_mc *mc_obj[5]; + struct cxip_cq *rx_cq; + struct cxip_coll_reduction *reduction; + struct cxip_coll_data coll_data = {.red_cnt = 1}; + struct fi_cq_data_entry entry; + int i, cnt, ret; + + _create_netsim_collective(5, false, FI_SUCCESS); + _wait_for_join(5, FI_SUCCESS, 0); + + for (i = 0; i < 5; i++) { + mc_obj[i] = container_of(cxit_coll_mc_list.mc_fid[i], + struct cxip_coll_mc, mc_fid); + mc_obj[i]->reduction[0].coll_state = CXIP_COLL_STATE_NONE; + cxip_coll_reset_mc_ctrs(&mc_obj[i]->mc_fid); + } + + rx_cq = mc_obj[0]->ep_obj->coll.rx_evtq->cq; + + coll_data.intval.ival[0] = 0; + reduction = &mc_obj[0]->reduction[0]; + ret = cxip_coll_send_red_pkt(reduction, &coll_data, + false, false); + cr_assert(ret == FI_SUCCESS, + "Packet send from root failed: %d\n", ret); + cnt = ofi_atomic_get32(&mc_obj[0]->send_cnt); + cr_assert(cnt == 4, "Bad send counter on root: %d\n", cnt); + for (i = 1; i < 5; i++) { + do { + sched_yield(); + ret = fi_cq_read(&rx_cq->util_cq.cq_fid, &entry, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1, "Bad CQ response[%d]: %d\n", i, ret); + cnt = ofi_atomic_get32(&mc_obj[i]->recv_cnt); + cr_assert(cnt == 1, + "Bad recv counter on leaf[%d]: %d\n", i, cnt); + } + + /* Send data from leaf (!0) to root */ + for (i = 0; i < 5; i++) + cxip_coll_reset_mc_ctrs(&mc_obj[i]->mc_fid); + for (i = 1; i < 5; i++) { + coll_data.intval.ival[0] = i; + reduction = &mc_obj[i]->reduction[0]; + ret = cxip_coll_send_red_pkt(reduction, &coll_data, + false, false); + cr_assert(ret == FI_SUCCESS, + "Packet send from leaf[%d] failed: %d\n", i, ret); + cnt = ofi_atomic_get32(&mc_obj[i]->send_cnt); + cr_assert(cnt == 1, + "Bad send counter on leaf[%d]: %d\n", i, cnt); + do { + sched_yield(); + ret = fi_cq_read(&rx_cq->util_cq.cq_fid, &entry, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1, "Bad CQ response[%d]: %d\n", i, ret); + } + + cnt = ofi_atomic_get32(&mc_obj[0]->recv_cnt); + cr_assert(cnt == 4, + "Bad recv counter on root: %d\n", cnt); + + _destroy_netsim_collective(); +} + +/***************************************/ +/** + * Test reduction concurrency. 
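+ *
+ * Each injected reduction is tracked with a user_context. Completions can
+ * arrive in any order, so _allreduce_wait() takes whatever the TX CQ reports
+ * (copying fi_cq_readerr() data into the context on error), parks contexts
+ * it was not asked for on done_list, and checks done_list first on the next
+ * call, so a specific context can be waited on later. Roughly (a sketch,
+ * not executable test code):
+ *
+ *   cxip_allreduce(ep, &data, 1, NULL, &rslt, NULL, (fi_addr_t)mc,
+ *                  FI_UINT64, FI_BOR, 0, &ctx);  // may return -FI_EAGAIN
+ *   _allreduce_wait(rx_cq, tx_cq, NULL);         // progress, queue strays
+ *   _allreduce_wait(rx_cq, tx_cq, &ctx);         // wait for this context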
+ */ +TestSuite(coll_reduce, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = false, .timeout = 2*CXIT_DEFAULT_TIMEOUT); + +/* Simulated user context, specifically to return error codes */ +struct user_context { + struct dlist_entry entry; + int node; // reduction simulated node (MC object) + int seqno; // reduction sequence number + int red_id; // reduction ID + int errcode; // reduction error code + int hw_rc; // reduction hardware failure code + uint64_t expval; // expected reduction value +}; + +static struct dlist_entry done_list; +static int dlist_initialized; +static int max_queue_depth; +static int queue_depth; +static int rx_count; +static int tx_count; + +static ssize_t _allreduce_poll(struct fid_cq *rx_cq_fid, + struct fid_cq *tx_cq_fid, + struct fi_cq_data_entry *entry) +{ + ssize_t ret; + + /* poll once for RX and TX, report only TX event */ + sched_yield(); + ret = fi_cq_read(rx_cq_fid, entry, 1); + if (ret == FI_SUCCESS) + rx_count++; + ret = fi_cq_read(tx_cq_fid, entry, 1); + if (ret == FI_SUCCESS) + tx_count++; + return ret; +} + +static void _allreduce_wait(struct fid_cq *rx_cq_fid, struct fid_cq *tx_cq_fid, + struct user_context *context) +{ + struct dlist_entry *done; + struct fi_cq_data_entry entry; + struct fi_cq_err_entry err_entry; + struct user_context *ctx; + int ret; + + /* initialize the static locals on first use */ + if (! dlist_initialized) { + dlist_init(&done_list); + dlist_initialized = 1; + } + + /* search for prior detection of context (on queue) */ + dlist_foreach(&done_list, done) { + if ((void *)context == (void *)done) { + dlist_remove(done); + return; + } + } + + do { + /* Wait for a tx CQ completion event, rx CQ may get behind */ + do { + ret = _allreduce_poll(rx_cq_fid, tx_cq_fid, &entry); + } while (context && ret == -FI_EAGAIN); + + ctx = NULL; + if (ret == -FI_EAVAIL) { + /* tx CQ posted an error, copy to user context */ + ret = fi_cq_readerr(tx_cq_fid, &err_entry, 1); + cr_assert(ret == 1, "fi_cq_readerr failed: %d\n", ret); + ctx = err_entry.op_context; + ctx->errcode = err_entry.err; + ctx->hw_rc = err_entry.prov_errno; + cr_assert(err_entry.err != 0, + "Failure with good return\n"); + queue_depth--; + } else if (ret == 1) { + /* tx CQ posted a normal completion */ + ctx = entry.op_context; + ctx->errcode = 0; + ctx->hw_rc = 0; + queue_depth--; + } else { + /* We should only see a 'no-event' error */ + cr_assert(ret == -FI_EAGAIN, "Improper return %d\n", ret); + } + + /* context we are looking for, NULL matches no-event */ + if (ctx == context) + return; + + /* if we did see a ctx == context, record it */ + if (ctx) + dlist_insert_tail(&ctx->entry, &done_list); + + } while (context); + +} + +/* extract and verify mcs and cqs across NETSIM collective group */ +void _resolve_group(const char *label, int nodes, + struct cxip_coll_mc **mc_obj, + struct fid_cq **rx_cq_fid, + struct fid_cq **tx_cq_fid) +{ + struct cxip_ep_obj *ep_obj; + int node; + + /* scan mc_fid[], convert to mc_obj[], and extract ep_obj pointer */ + ep_obj = NULL; + for (node = 0; node < nodes; node++) { + mc_obj[node] = container_of(cxit_coll_mc_list.mc_fid[node], + struct cxip_coll_mc, mc_fid); + /* all mc_obj[] must have the same ep_obj */ + if (!ep_obj) + ep_obj = mc_obj[node]->ep_obj; + cr_assert(mc_obj[node]->ep_obj == ep_obj, + "%s Mismatched endpoints\n", label); + } + cr_assert(ep_obj != NULL, + "%s Did not find an endpoint object\n", label); + /* extract rx and tx cq fids */ + *rx_cq_fid = &ep_obj->coll.rx_evtq->cq->util_cq.cq_fid; + *tx_cq_fid = 
&ep_obj->coll.tx_evtq->cq->util_cq.cq_fid; +} + +/** + * @brief Exercise the collective state machine. + * + * This is a single-threaded test, intended for use with NETSIM. + * + * We initiate the collective in sequence, beginning with 'start_node', and + * wrapping around. If start_node is zero, the root node initiates first, + * otherwise a leaf node initiates first. + * + * We perform 'concur' reductions concurrently. When we hit the maximum of + * concurrent injections, the reduction attempt should return -FI_EAGAIN. When + * this happens, we poll to see if a completion has occurred, then try again. + * Since we don't know the order of completions, we wait for ANY completion, + * which is then saved in a queue. We can then (later) look for a specific + * completion, which searches the queue before waiting for new completions. + * + * We inject an error by specifying a 'bad' node in the range of nodes. If + * bad_node is outside the range (e.g. -1), no errors will be injected. The + * error injection is done by choosing to send the wrong reduction operation + * code for the bad node, which causes the entire reduction to fail. + * + * We perform 'concur' reductions to exercise the round-robin reduction ID + * handling and blocking. This should be tested for values > 8. + * + * We generate different results for each concurrent reduction, to ensure that + * there is no mixing of the packets in each reduction channel. + * + * @param start_node - node (rank) to start the reduction + * @param bad_node - node to inject a bad reduction, or -1 to succeed + * @param concur - number of reductions to start before polling + */ +void _allreduce(int start_node, int bad_node, int concur) +{ + struct cxip_coll_mc **mc_obj; + struct user_context **context; + struct cxip_intval **rslt; + struct cxip_intval *data; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, first, last, base; + char label[128]; + uint64_t result; + ssize_t size; + int i, node, ret; + + TRACE("\n===== %s rank=%d bad=%d concur=%d\n", + __func__, start_node, bad_node, concur); + concur = MAX(concur, 1); + nodes = cxit_coll_mc_list.count; + context = calloc(nodes, sizeof(**context)); + mc_obj = calloc(nodes, sizeof(**mc_obj)); + rslt = calloc(nodes, sizeof(**rslt)); + data = calloc(nodes, sizeof(*data)); + start_node %= nodes; + snprintf(label, sizeof(label), "{%2d,%2d,%2d}", + start_node, bad_node, concur); + + _resolve_group(label, nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + for (node = 0; node < nodes; node++) { + context[node] = calloc(concur, sizeof(struct user_context)); + rslt[node] = calloc(concur, sizeof(struct cxip_intval)); + } + + /* Inject all of the collectives */ + first = 0; + last = 0; + base = 1; + result = 0; + + /* last advances from 0 to concur */ + while (last < concur) { + uint64_t undone = (1 << nodes) - 1; + + /* use different values on each concurrency */ + base <<= 1; + if (base > 16) + base = 1; + + /* FI_EAGAIN results will force reordering */ + result = 0; + while (undone) { + /* Polls once if we have free reduction IDs */ + _allreduce_wait(rx_cq_fid, tx_cq_fid, NULL); + /* Initiates a single BAND reduction across the nodes */ + for (i = 0; i < nodes; i++) { + enum fi_op op; + uint64_t mask; + + node = (start_node + i) % nodes; + mask = 1LL << node; + op = (node == bad_node) ? FI_BAND : FI_BOR; + + /* Don't repeat nodes that succeeded */ + if (! 
(mask & undone)) + continue; + + /* Each node contributes a bit */ + data[node].ival[0] = (base << node); + result |= data[node].ival[0]; + context[node][last].node = node; + context[node][last].seqno = last; + + cxip_capture_red_id(&context[node][last].red_id); + size = cxip_allreduce(cxit_ep, + &data[node], 1, NULL, + &rslt[node][last], NULL, + (fi_addr_t)mc_obj[node], + FI_UINT64, op, 0, + &context[node][last]); + if (size == -FI_EAGAIN) + continue; + + /* Completed this one */ + undone &= ~mask; + + /* Event queue should be one deeper */ + if (ret != -FI_EAGAIN && + max_queue_depth < ++queue_depth) + max_queue_depth = queue_depth; + } + } + + /* record the final expected result */ + for (node = 0; node < nodes; node++) + context[node][last].expval = result; + + /* Ensure these all used the same reduction ID */ + ret = 0; + for (node = 1; node < nodes; node++) + if (context[0][last].red_id != + context[node][last].red_id) + ret = -1; + if (ret) + cr_assert(true, "%s reduction ID mismatch\n", label); + + last++; + } + + /* Wait for all reductions to complete */ + while (first < last) { + struct user_context *ctx; + int red_id0, fi_err0, rc_err0; + uint64_t expval, actval; + + /* If there was a bad node, all reductions should fail */ + rc_err0 = (bad_node < 0) ? 0 : CXIP_COLL_RC_OP_MISMATCH; + for (node = 0; node < nodes; node++) { + _allreduce_wait(rx_cq_fid, tx_cq_fid, + &context[node][first]); + ctx = &context[node][first]; + + /* Use the root values as definitive */ + if (node == 0) { + red_id0 = ctx->red_id; + fi_err0 = ctx->errcode; + expval = ctx->expval; + } + actval = rslt[node][first].ival[0]; + + /* Test values */ + if (ctx->node != node || + ctx->seqno != first || + ctx->red_id != red_id0 || + ctx->errcode != fi_err0 || + ctx->hw_rc != rc_err0 || + (!fi_err0 && expval != actval)) { + TRACE("%s =====\n", label); + TRACE(" node %3d, exp %3d\n", + ctx->node, node); + TRACE(" seqno %3d, exp %3d\n", + ctx->seqno, first); + TRACE(" red_id %3d, exp %3d\n", + ctx->red_id, red_id0); + TRACE(" errcode %3d, exp %3d\n", + ctx->errcode, fi_err0); + TRACE(" hw_rc %3d, exp %3d\n", + ctx->hw_rc, rc_err0); + TRACE(" value %08lx, exp %08lx\n", + actval, expval); + cr_assert(true, "%s context failure\n", + label); + } + } + first++; + } + cr_assert(!rx_count && !tx_count, + "rx_count=%d tx_count=%d should be 0\n", rx_count, tx_count); + + for (node = 0; node < nodes; node++) { + TRACE("tmout[%d] = %d\n", node, + ofi_atomic_get32(&mc_obj[node]->tmout_cnt)); + } + + /* make sure we got them all */ + cr_assert(dlist_empty(&done_list), "Pending contexts\n"); + cr_assert(queue_depth == 0, "queue_depth = %d\n", queue_depth); + TRACE("completed\n"); + + for (node = 0; node < nodes; node++) { + free(rslt[node]); + free(context[node]); + } + free(context); + free(rslt); + free(data); + free(mc_obj); +} + +void _reduce_test_set(int concur) +{ + _create_netsim_collective(31, true, FI_SUCCESS); + _wait_for_join(31, FI_SUCCESS, 0); + /* success with each of the nodes starting */ + _allreduce(0, -1, concur); + _allreduce(1, -1, concur); + _allreduce(2, -1, concur); + _allreduce(3, -1, concur); + _allreduce(4, -1, concur); + _allreduce(27, -1, concur); + _allreduce(28, -1, concur); + _allreduce(29, -1, concur); + _allreduce(30, -1, concur); + /* failure with root starting */ + _allreduce(0, 0, concur); + _allreduce(0, 1, concur); + /* failure with leaf starting */ + _allreduce(1, 0, concur); + _allreduce(1, 1, concur); + _destroy_netsim_collective(); +} + +Test(coll_reduce, concur1) +{ + _reduce_test_set(1); 
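+	/* A single reduction in flight at a time; per the _allreduce() comment
+	 * above, concurrency values > 8 (e.g. concurN below) are what exercise
+	 * round-robin reduction ID reuse.
+	 */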
+} + +Test(coll_reduce, concur2) +{ + _reduce_test_set(2); +} + +Test(coll_reduce, concur3) +{ + _reduce_test_set(3); +} + +Test(coll_reduce, concur8) +{ + _reduce_test_set(8); +} + +Test(coll_reduce, concurN) +{ + _reduce_test_set(29); +} + +/***************************************/ +/* Collective operation testing */ +#define REDUCE_NODES 10 + +void setup_coll(void) +{ + cxit_setup_rma(); + _create_netsim_collective(REDUCE_NODES, true, FI_SUCCESS); + _wait_for_join(REDUCE_NODES, FI_SUCCESS, 0); +} + +void teardown_coll(void) { + _destroy_netsim_collective(); + cxit_teardown_rma(); +} + +TestSuite(coll_reduce_ops, .init = setup_coll, .fini = teardown_coll, + .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test barrier */ +Test(coll_reduce_ops, barrier) +{ + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, node; + ssize_t size; + struct user_context *context; + + nodes = cxit_coll_mc_list.count; + mc_obj = calloc(nodes, sizeof(**mc_obj)); + context = calloc(nodes, sizeof(*context)); + _resolve_group("barrier", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + + /* test bad parameters */ + cr_assert(-FI_EINVAL == cxip_barrier(NULL, 0L, NULL)); + cr_assert(-FI_EINVAL == cxip_barrier(cxit_ep, 0L, NULL)); + + /* 'parallel' injection across nodes */ + for (node = 0; node < nodes; node++) { + size = cxip_barrier(cxit_ep, (fi_addr_t)mc_obj[node], + &context[node]); + cr_assert(size == FI_SUCCESS, + "cxip_barrier[%d]=%ld\n", node, size); + } + + /* 'parallel' wait for all to complete */ + for (node = 0; node < nodes; node++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node]); + + free(context); + free(mc_obj); +} + +/* Test broadcast */ +Test(coll_reduce_ops, broadcast) +{ + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, node, root; + fi_addr_t fi_root; + struct cxip_intval *data; + struct user_context *context; + ssize_t size; + int i, err; + + nodes = cxit_coll_mc_list.count; + mc_obj = calloc(nodes, sizeof(**mc_obj)); + context = calloc(nodes, sizeof(*context)); + data = calloc(nodes, sizeof(*data)); + _resolve_group("broadcast", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + + /* test bad parameters */ + cr_assert(-FI_EINVAL == cxip_broadcast(NULL, NULL, 0L, NULL, + 0L, -1L, -1L, -1L, NULL)); + cr_assert(-FI_EINVAL == cxip_broadcast(cxit_ep, NULL, 0L, NULL, + 0L, -1L, -1L, -1L, NULL)); + cr_assert(-FI_EINVAL == cxip_broadcast(cxit_ep, data, 0L, NULL, + 0L, -1L, -1L, -1L, NULL)); + cr_assert(-FI_EINVAL == cxip_broadcast(cxit_ep, data, 4L, NULL, + 0L, -1L, -1L, -1L, NULL)); + + /* repeat for each node serving as root */ + for (root = 0; root < nodes; root++) { + /* set root data to be different from other data */ + memset(data, -1, nodes*sizeof(*data)); + for (i = 0; i < 4; i++) + data[root].ival[i] = root; + /* convert root rank to root fi_addr */ + fi_root = (fi_addr_t)root; + /* 'parallel' injection across nodes */ + for (node = 0; node < nodes; node++) { + size = cxip_broadcast(cxit_ep, &data[node], 4, NULL, + (fi_addr_t)mc_obj[node], + fi_root, FI_UINT64, 0L, + &context[node]); + cr_assert(size == FI_SUCCESS, + "cxip_broadcast[%d]=%ld\n", node, size); + } + /* 'parallel' wait for all to complete */ + for (node = 0; node < nodes; node++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node]); + /* ensure broadcast worked */ + err = 0; + for (node = 0; node < nodes; node++) { + for (i = 0; i < 4; i++) { + if (data[node].ival[i] != root) + err++; + } + } + if (err) { + printf("FAILED on node=%d, ival=%d\n", node, i); + 
for (node = 0; node < nodes; node++) { + printf("root=%d node=%2d [", root, node); + for (i = 0; i < 4; i++) { + printf("%016lx ", data[node].ival[i]); + } + printf("]\n"); + } + cr_assert(1, "failed\n"); + } + } + + free(data); + free(context); + free(mc_obj); +} + +/* Test reduce */ +Test(coll_reduce_ops, reduce) +{ + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, node, root; + fi_addr_t fi_root; + struct cxip_intval *data, rslt; + struct user_context *context; + uint64_t testval; + ssize_t size; + int i; + + /* test bad parameters */ + cr_assert(-FI_EINVAL == cxip_reduce(NULL, NULL, 0L, NULL, NULL, NULL, + 0L, -1L, -1L, -1L, 0L, NULL)); + cr_assert(-FI_EINVAL == cxip_reduce(cxit_ep, NULL, 0L, NULL, NULL, NULL, + 0L, -1L, -1L, -1L, 0L, NULL)); + + nodes = cxit_coll_mc_list.count; + mc_obj = calloc(nodes, sizeof(**mc_obj)); + context = calloc(nodes, sizeof(*context)); + data = calloc(nodes, sizeof(*data)); + _resolve_group("reduce", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + + /* repeat for each node serving as root */ + for (root = 0; root < nodes; root++) { + /* set root data to be different from other data */ + memset(data, -1, nodes*sizeof(*data)); + /* convert root rank to root fi_addr */ + fi_root = (fi_addr_t)root; + /* 'parallel' injection across nodes */ + for (node = 0; node < nodes; node++) { + data[node].ival[0] = (1L << node); + data[node].ival[1] = (1L << node) << 1; + data[node].ival[2] = (1L << node) << 2; + data[node].ival[3] = (1L << node) << 3; + size = cxip_reduce(cxit_ep, &data[node], 4, NULL, + (node == root) ? &rslt : NULL, NULL, + (fi_addr_t)mc_obj[node], + fi_root, FI_UINT64, FI_BOR, 0L, + &context[node]); + cr_assert(size == FI_SUCCESS, + "cxip_broadcast[%d]=%ld\n", node, size); + } + /* 'parallel' wait for all to complete */ + for (node = 0; node < nodes; node++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node]); + /* ensure reduce worked */ + testval = (1L << nodes) - 1; + for (i = 0; i < 4; i++) { + cr_assert(rslt.ival[i] == testval, + "ival[%d] %016lx != %016lx\n", + i, rslt.ival[i], testval); + testval <<= 1; + } + } + + free(data); + free(context); + free(mc_obj); +} + +/* Perform reduction operation with data, wait for result */ +int _allreduceop(enum fi_op opcode, + enum fi_datatype typ, + uint64_t flags, + void *data, + void *rslt, + size_t count, + struct user_context *context) +{ + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, node, datawidth, rsltwidth, ret; + ssize_t size; + + datawidth = (flags & FI_CXI_PRE_REDUCED) ? + sizeof(struct cxip_coll_accumulator) : + sizeof(struct cxip_intval); + rsltwidth = (flags & FI_MORE) ? 
+ sizeof(struct cxip_coll_accumulator) : + sizeof(struct cxip_intval); + nodes = cxit_coll_mc_list.count; + mc_obj = calloc(nodes, sizeof(**mc_obj)); + _resolve_group("reduce", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + /* 'parallel' injection across nodes */ + ret = 0; + for (node = 0; node < nodes; node++) { + size = cxip_allreduce(cxit_ep, + (char *)data + (node*datawidth), count, NULL, + (char *)rslt + (node*rsltwidth), NULL, + (fi_addr_t)mc_obj[node], + typ, opcode, flags, + &context[node]); + if (size != FI_SUCCESS) { + printf("%s cxip_allreduce()[%d]=%ld\n", + __func__, node, size); + ret = 1; + goto done; + } + } + + /* 'parallel' wait for all to complete */ + if (!(flags & FI_MORE)) { + for (node = 0; node < nodes; node++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node]); + } + +done: + free(mc_obj); + return ret; +} + +/* Signaling NaN generation, for testing. + * Linux feature requires GNU_SOURCE. + * This generates a specific sNaN value. + */ +static inline double _snan64(void) +{ + return _bits2dbl(0x7ff4000000000000); +} + +/* Returns true if this is a signalling NAN */ +static inline bool _is_snan64(double d) +{ + /* This detection is universal IEEE */ + return isnan(d) && !(_dbl2bits(d) & 0x0008000000000000); +} + +/* Converts a signalling NAN to a non-signalling NAN */ +static void _quiesce_nan(double *d) +{ + if (isnan(*d)) + *d = NAN; +} + +/* random generation for doubles */ +static inline double _frand(double range) +{ + return ((double)rand()/(double)RAND_MAX) * range; +} + +/* float equality measure, accommodates snan */ +static inline bool _feq(double a, double b) +{ + if (_is_snan64(a) && _is_snan64(b)) + return true; + if (_is_snan64(a) || _is_snan64(b)) + return false; + if (isnan(a) && isnan(b)) + return true; + if (isnan(a) || isnan(b)) + return false; + return (a == b); +} + +/* returns true if a is preferred, false if b is preferred. + * preference is determined by prefer_nan and prefer_min. + * if (a==b), a is preferred. + */ +static inline bool _fcmp(double a, double b, bool prefer_min, bool prefer_nan) +{ + if (prefer_nan) { + /* leftmost snan places first */ + if (_is_snan64(a)) + return false; + /* rightmost snan places second */ + if (_is_snan64(b)) + return true; + /* leftmost nan places third */ + if (isnan(a)) + return false; + /* rightmost nan places last */ + if (isnan(b)) + return true; + } + /* right argument is nan, give preference to left (possibly nan) */ + if (isnan(b)) + return false; + /* left argument is nan and right argument is not, use right */ + if (isnan(a)) + return true; + /* neither argument is nan, return left or right by preference */ + return (a > b) ? 
prefer_min : !prefer_min; +} + +/* Sanity test for the above */ +Test(coll_reduce_ops, fcmp) +{ + cr_assert(!_fcmp(1.0, 2.0, true, true)); + cr_assert( _fcmp(1.0, 2.0, false, true)); + cr_assert(!_fcmp(1.0, 2.0, true, false)); + cr_assert( _fcmp(1.0, 2.0, false, false)); + cr_assert( _fcmp(2.0, NAN, true, true)); + cr_assert( _fcmp(2.0, NAN, false, true)); + cr_assert(!_fcmp(2.0, NAN, true, false)); + cr_assert(!_fcmp(2.0, NAN, false, false)); + cr_assert(!_fcmp(NAN, NAN, true, true)); + cr_assert(!_fcmp(NAN, NAN, false, true)); + cr_assert(!_fcmp(NAN, NAN, true, false)); + cr_assert(!_fcmp(NAN, NAN, false, false)); + cr_assert( _fcmp(2.0, _snan64(), true, true)); + cr_assert( _fcmp(2.0, _snan64(), false, true)); + cr_assert(!_fcmp(2.0, _snan64(), true, false)); + cr_assert(!_fcmp(2.0, _snan64(), false, false)); + cr_assert( _fcmp(NAN, _snan64(), true, true)); + cr_assert( _fcmp(NAN, _snan64(), false, true)); + cr_assert(!_fcmp(NAN, _snan64(), true, false)); + cr_assert(!_fcmp(NAN, _snan64(), false, false)); + cr_assert(!_fcmp(_snan64(), _snan64(), true, true)); + cr_assert(!_fcmp(_snan64(), _snan64(), false, true)); + cr_assert(!_fcmp(_snan64(), _snan64(), true, false)); + cr_assert(!_fcmp(_snan64(), _snan64(), false, false)); +} + +/* finds MIN(a, b) with two NAN models */ +static inline double _fmin(double a, double b, bool prefer_nan) +{ + return (!_fcmp(a, b, true, prefer_nan)) ? a : b; +} + +/* finds MAX(a, b) with two NAN models */ +static inline double _fmax(double a, double b, bool prefer_nan) +{ + return (!_fcmp(a, b, false, prefer_nan)) ? a : b; +} + +/* Prediction of results takes into account the two NAN models and accounts + * for the distinction between NAN and sNAN. After collective processing, the + * sNAN will be quiesced, so after accounting for its effect, we need to + * quiesce it here for comparison. 
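+ *
+ * For example, with prefer_nan forced off (the NETCASSINI-5959 workaround in
+ * the _predict_* helpers below), a lone NaN loses: _fmin(2.0, NAN, false) and
+ * _fmax(2.0, NAN, false) both yield 2.0, and only an all-NaN input produces a
+ * NaN result. A signalling NaN input comes back quiesced to a quiet NaN,
+ * which is why the predictions run _quiesce_nan() over their results.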
+ */ + +/* computes fmin result */ +static void _predict_fmin(int nodes, struct cxip_fltval *data, + struct cxip_fltval *check, bool prefer_nan) +{ + int i, j; + + prefer_nan = false; // NETCASSINI-5959 + memcpy(check, &data[0], sizeof(*check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check->fval[j] = + _fmin(data[i].fval[j], check->fval[j], + prefer_nan); + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + _quiesce_nan(&check->fval[j]); +} + +/* computes fmax result */ +static void _predict_fmax(int nodes, struct cxip_fltval *data, + struct cxip_fltval *check, bool prefer_nan) +{ + int i, j; + + prefer_nan = false; // NETCASSINI-5959 + memcpy(check, &data[0], sizeof(*check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check->fval[j] = + _fmax(data[i].fval[j], check->fval[j], + prefer_nan); + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + _quiesce_nan(&check->fval[j]); +} + +/* computes minmax result */ +static void _predict_fminmax(int nodes, struct cxip_fltminmax *data, + struct cxip_fltminmax *check, bool prefer_nan) +{ + double a, b; + int i; + + prefer_nan = false; // NETCASSINI-5959 + memcpy(check, &data[0], sizeof(*check)); + for (i = 1; i < nodes; i++) { + a = data[i].fminval; + b = check->fminval; + if (_feq(a, b)) { + /* if equal, choose lowest index */ + if (data[i].fminidx < check->fminidx) + check->fminidx = data[i].fminidx; + } else if (!_fcmp(a, b, true, prefer_nan)) { + check->fminval = a; + check->fminidx = i; + } + a = data[i].fmaxval; + b = check->fmaxval; + if (_feq(a, b)) { + /* if equal, choose lowest index */ + if (data[i].fmaxidx < check->fmaxidx) + check->fmaxidx = data[i].fmaxidx; + } else if (!_fcmp(a, b, false, prefer_nan)) { + check->fmaxval = a; + check->fmaxidx = i; + } + } + for (i = 0; i < nodes; i++) { + _quiesce_nan(&check->fminval); + _quiesce_nan(&check->fmaxval); + } +} + +/* Routines to dump error messages on failure */ +static int _dump_ival(int nodes, int i0, int j0, + struct cxip_intval *rslt, + struct cxip_intval *check) +{ + int i, j; + + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + printf("[%2d][%2d] rslt=%016lx expect=%016lx%s\n", + i, j, rslt[i].ival[j], check->ival[j], + (i==i0 && j==j0) ? "<-failed" : ""); + return 1; +} + +static int _dump_fval(int nodes, int i0, int j0, + struct cxip_fltval *rslt, + struct cxip_fltval *check) +{ + int i, j; + + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + printf("[%2d][%2d] rslt=%016g expect=%016g%s\n", + i, j, rslt[i].fval[j], check->fval[j], + (i==i0 && j==j0) ? "<-failed" : ""); + return 1; +} + +static int _dump_iminmax(int nodes, int i0, + struct cxip_iminmax *rslt, + struct cxip_iminmax *check) +{ + int i; + + for (i = 0; i < nodes; i++) { + printf("[%2d] iminval=%16lx expect=%16lx%s\n", + i, rslt[i].iminval, check->iminval, + (i==i0) ? "<-failed" : ""); + printf("[%2d] iminidx=%16ld expect=%16ld%s\n", + i, rslt[i].iminidx, check->iminidx, + (i==i0) ? "<-failed" : ""); + printf("[%2d] imaxval=%16lx expect=%16lx%s\n", + i, rslt[i].imaxval, check->imaxval, + (i==i0) ? "<-failed" : ""); + printf("[%2d] imaxidx=%16ld expect=%16ld%s\n", + i, rslt[i].imaxidx, check->imaxidx, + (i==i0) ? "<-failed" : ""); + } + return 1; +} + +static int _dump_fminmax(int nodes, int i0, + struct cxip_fltminmax *rslt, + struct cxip_fltminmax *check) +{ + int i; + + for (i = 0; i < nodes; i++) { + printf("[%2d] fminval=%16g expect=%16g%s\n", + i, rslt[i].fminval, check->fminval, + (i==i0) ? 
"<-failed" : ""); + printf("[%2d] fminidx=%16ld expect=%16ld%s\n", + i, rslt[i].fminidx, check->fminidx, + (i==i0) ? "<-failed" : ""); + printf("[%2d] fmaxval=%16g expect=%16g%s\n", + i, rslt[i].fmaxval, check->fmaxval, + (i==i0) ? "<-failed" : ""); + printf("[%2d] fmaxidx=%16ld expect=%16ld%s\n", + i, rslt[i].fmaxidx, check->fmaxidx, + (i==i0) ? "<-failed" : ""); + } + return 1; +} + +/* compares collective integer rslt with computed check */ +static int _check_ival(int nodes, struct cxip_intval *rslt, + struct cxip_intval *check) +{ + int i, j, ret; + + ret = 0; + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + if (rslt[i].ival[j] != check->ival[j]) + ret += _dump_ival(nodes, i, j, rslt, check); + return ret; +} + +/* compares collective double rslt with computed check */ +static int _check_fval(int nodes, struct cxip_fltval *rslt, + struct cxip_fltval *check) +{ + int i, j; + + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + if (!_feq(rslt[i].fval[j], check->fval[j])) + return _dump_fval(nodes, i, j, rslt, check); + return 0; +} + +/* compares collective integer minmax rslt with computed check */ +static int _check_iminmax(int nodes, struct cxip_iminmax *rslt, + struct cxip_iminmax *check) +{ + int i; + + for (i = 0; i < nodes; i++) { + if (rslt[i].iminval != check->iminval || + rslt[i].iminidx != check->iminidx || + rslt[i].imaxval != check->imaxval || + rslt[i].imaxidx != check->imaxidx) + return _dump_iminmax(nodes, i, rslt, check); + } + return 0; +} + +/* compares collective double minmax rslt with computed check */ +static int _check_fminmax(int nodes, struct cxip_fltminmax *rslt, + struct cxip_fltminmax *check) +{ + int i; + + for (i = 0; i < nodes; i++) + if (!_feq(rslt[i].fminval, check->fminval) || + !_feq(rslt[i].fmaxval, check->fmaxval) || + rslt[i].fminidx != check->fminidx || + rslt[i].fmaxidx != check->fmaxidx) + return _dump_fminmax(nodes, i, rslt, check); + return 0; +} + +/* compares returned RC code with expected value */ +static int _check_rc(int nodes, struct user_context *context, int rc) +{ + int i, ret; + + ret = 0; + for (i = 0; i < nodes; i++) + if (context[i].hw_rc != rc) { + printf("hw_rc[%d]=%d!=%d\n", i, context[i].hw_rc, rc); + ret = 1; + } + return ret; +} + +/* keeps code easier to read */ +#define STDINTSETUP \ + struct user_context *context; \ + struct cxip_intval *data; \ + struct cxip_intval *rslt; \ + struct cxip_intval check; \ + int i, j, ret, nodes; \ + nodes = cxit_coll_mc_list.count; \ + data = calloc(nodes, sizeof(*data)); \ + rslt = calloc(nodes, sizeof(*rslt)); \ + context = calloc(nodes, sizeof(*context)); \ + +#define STDILOCSETUP \ + struct user_context *context; \ + struct cxip_iminmax *data; \ + struct cxip_iminmax *rslt; \ + struct cxip_iminmax check; \ + int i, ret, nodes; \ + nodes = cxit_coll_mc_list.count; \ + data = calloc(nodes, sizeof(*data)); \ + rslt = calloc(nodes, sizeof(*rslt)); \ + context = calloc(nodes, sizeof(*context)); + +#define STDFLTSETUP \ + struct user_context *context; \ + struct cxip_fltval *data; \ + struct cxip_fltval *rslt; \ + struct cxip_fltval check; \ + int i, ret, nodes; \ + nodes = cxit_coll_mc_list.count; \ + data = calloc(nodes, sizeof(*data)); \ + rslt = calloc(nodes, sizeof(*rslt)); \ + context = calloc(nodes, sizeof(*context)); + +#define STDFLOCSETUP \ + struct user_context *context; \ + struct cxip_fltminmax *data; \ + struct cxip_fltminmax *rslt; \ + struct cxip_fltminmax check; \ + int i, ret, nodes; \ + nodes = cxit_coll_mc_list.count; \ + data = calloc(nodes, sizeof(*data)); 
\ + rslt = calloc(nodes, sizeof(*rslt)); \ + context = calloc(nodes, sizeof(*context)); + +#define STDCLEANUP \ + free(context); \ + free(rslt); \ + free(data); + +/* Test binary OR */ +Test(coll_reduce_ops, bor) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = 1 << i; + data[i].ival[1] = i << 2*i; + data[i].ival[2] = i; + data[i].ival[3] = 2*i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] |= data[i].ival[j]; + + ret = _allreduceop(FI_BOR, FI_UINT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Test binary AND */ +Test(coll_reduce_ops, band) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = ~(1 << i); + data[i].ival[1] = ~(i << 2*i); + data[i].ival[2] = ~i; + data[i].ival[3] = ~(2*i); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] &= data[i].ival[j]; + + ret = _allreduceop(FI_BAND, FI_UINT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed = %d\n", ret); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Test binary XOR */ +Test(coll_reduce_ops, bxor) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = 1 << i; + data[i].ival[1] = ~(i << i); + data[i].ival[2] = i; + data[i].ival[3] = ~i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] ^= data[i].ival[j]; + + ret = _allreduceop(FI_BXOR, FI_UINT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests int64 minimum */ +Test(coll_reduce_ops, imin) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = rand(); + data[i].ival[1] = -rand(); + data[i].ival[2] = rand(); + data[i].ival[3] = -rand(); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] = MIN(check.ival[j], data[i].ival[j]); + + ret = _allreduceop(FI_MIN, FI_INT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests int64 maximum */ +Test(coll_reduce_ops, imax) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = rand(); + data[i].ival[1] = -rand(); + data[i].ival[2] = rand(); + data[i].ival[3] = -rand(); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] = MAX(check.ival[j], data[i].ival[j]); + + ret = _allreduceop(FI_MAX, FI_INT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + 
cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests int64 SUM */ +Test(coll_reduce_ops, isum) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = rand(); + data[i].ival[1] = -rand(); + data[i].ival[2] = rand(); + data[i].ival[3] = -rand(); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] += data[i].ival[j]; + + ret = _allreduceop(FI_SUM, FI_INT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests int64 minmaxloc */ +Test(coll_reduce_ops, iminmaxloc) +{ + STDILOCSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].iminval = rand(); + data[i].iminidx = i; + data[i].imaxval = rand(); + data[i].imaxidx = i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) { + if (check.iminval > data[i].iminval) { + check.iminval = data[i].iminval; + check.iminidx = data[i].iminidx; + } + if (check.imaxval < data[i].imaxval) { + check.imaxval = data[i].imaxval; + check.imaxidx = data[i].imaxidx; + } + } + + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_INT64, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop() failed = %d\n", ret); + ret = _check_iminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests double sum */ +Test(coll_reduce_ops, fsum) +{ + STDFLTSETUP + int j; + + /* max nodes == 32 under NETSIM */ + data[0].fval[0] = 1.0e-53; + data[0].fval[1] = 1.0e-53; + data[0].fval[2] = 1.0e-53; + data[0].fval[3] = 1.0e-53; + for (i = 1; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.fval[j] += data[i].fval[j]; + + ret = _allreduceop(FI_SUM, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INEXACT); + cr_assert(!ret, "rc failed\n"); + + /* Note: inexact computation is guaranteed by the small value included + * in the data set. There is a hidden trick when performing the + * comparison that relies on the prediction and the NETSIM allreduce + * operation both occuring in the same order, due to the nature of the + * simulated endpoints. In a real collective, ordering will be random, + * and the results will vary according to the ordering. 
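+ *
+ * As a concrete illustration (standard IEEE-754 double behavior): with the
+ * 1.0e-53 seed value above, (1.0e-53 + 1.0) rounds to exactly 1.0, since
+ * 1.0e-53 is far below 1.0 * DBL_EPSILON (about 2.2e-16), so the tiny term is
+ * absorbed whenever it is added to an O(1) partial sum; summing the same
+ * terms in a different order can therefore change the low-order bits of the
+ * result.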
+ */ + STDCLEANUP +} + +/* Test double minimum -- this should be exact */ +Test(coll_reduce_ops, fmin) +{ + STDFLTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + + /* normal floating point */ + _predict_fmin(nodes, data, &check, true); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + data[1].fval[1] = NAN; + _predict_fmin(nodes, data, &check, true); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + cr_assert(!ret, "rc failed NAN\n"); + + data[1].fval[1] = _snan64(); + _predict_fmin(nodes, data, &check, true); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double maximum -- this should be exact */ +Test(coll_reduce_ops, fmax) +{ + STDFLTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + + _predict_fmax(nodes, data, &check, true); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + data[1].fval[1] = NAN; + _predict_fmax(nodes, data, &check, true); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + cr_assert(!ret, "rc failed NAN\n"); + + data[1].fval[1] = _snan64(); + _predict_fmax(nodes, data, &check, true); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double minmax with index -- should be exact */ +Test(coll_reduce_ops, fminmaxloc) +{ + STDFLOCSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fminval = _frand(1.0); + data[i].fminidx = i; + data[i].fmaxval = _frand(1.0); + data[i].fmaxidx = i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) { + if (check.fminval > data[i].fminval) { + check.fminval = data[i].fminval; + check.fminidx = data[i].fminidx; + } + if (check.fmaxval < data[i].fmaxval) { + check.fmaxval = data[i].fmaxval; + check.fmaxidx = data[i].fmaxidx; + } + } + + _predict_fminmax(nodes, data, &check, true); + ret 
= _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + /* NAN is given preference over number */ + data[1].fminval = NAN; + data[3].fmaxval = NAN; + _predict_fminmax(nodes, data, &check, true); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed NAN\n"); + + /* SNAN is given preference over NAN */ + data[1].fminval = NAN; + data[2].fminval = _snan64(); + data[3].fmaxval = NAN; + _predict_fminmax(nodes, data, &check, true); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double minimum ignoring NAN -- should be exact */ +Test(coll_reduce_ops, fminnum) +{ + STDFLTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + + _predict_fmin(nodes, data, &check, false); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + /* number is given preference over NAN */ + data[1].fval[1] = NAN; + _predict_fmin(nodes, data, &check, false); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + cr_assert(!ret, "rc failed NAN\n"); + + /* number is given preference over NAN */ + data[1].fval[1] = _snan64(); + _predict_fmin(nodes, data, &check, false); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double maximum ignoring NAN -- should be exact */ +Test(coll_reduce_ops, fmaxnum) +{ + STDFLTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + + _predict_fmax(nodes, data, &check, false); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + /* number is given preference over NAN */ + data[1].fval[1] = NAN; + 
_predict_fmax(nodes, data, &check, false); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + cr_assert(!ret, "rc failed NAN\n"); + + /* SNAN is given preference over number */ + data[1].fval[1] = _snan64(); + _predict_fmax(nodes, data, &check, false); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double minmax with index ignoring NAN -- should be exact */ +Test(coll_reduce_ops, fminmaxnumloc) +{ + STDFLOCSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fminval = _frand(1.0); + data[i].fminidx = i; + data[i].fmaxval = _frand(1.0); + data[i].fmaxidx = i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) { + if (check.fminval > data[i].fminval) { + check.fminval = data[i].fminval; + check.fminidx = data[i].fminidx; + } + if (check.fmaxval < data[i].fmaxval) { + check.fmaxval = data[i].fmaxval; + check.fmaxidx = data[i].fmaxidx; + } + } + + _predict_fminmax(nodes, data, &check, false); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + /* NAN is given preference over number */ + data[1].fminval = NAN; + data[3].fmaxval = NAN; + _predict_fminmax(nodes, data, &check, false); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed NAN\n"); + + /* SNAN is given preference over NAN */ + data[1].fminval = NAN; + data[2].fminval = _snan64(); + data[3].fmaxval = NAN; + _predict_fminmax(nodes, data, &check, false); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +Test(coll_reduce_ops, prereduce) +{ + STDINTSETUP + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + struct cxip_coll_accumulator *accum1, accum2; + struct cxip_intval rawdata; + + mc_obj = calloc(nodes, sizeof(**mc_obj)); + _resolve_group("prereduce", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + + accum1 = calloc(nodes, sizeof(*accum1)); + memset(&check, 0, sizeof(check)); + ret = -1; + for (i = 0; i < nodes; i++) { + /* reset accum2 for next node */ + memset(&accum2, 0, sizeof(accum2)); + /* reduce over 128 threads */ + for (j = 0; j < 128; j++) { + rawdata.ival[0] = rand(); + rawdata.ival[1] = -rand(); + rawdata.ival[2] = rand(); + rawdata.ival[3] = -rand(); + + /* total contributions from all nodes/threads */ + check.ival[0] += rawdata.ival[0]; + check.ival[1] += 
rawdata.ival[1]; + check.ival[2] += rawdata.ival[2]; + check.ival[3] += rawdata.ival[3]; + + /* FI_MORE interleaved into accum1[], accum2 */ + ret = cxip_allreduce(NULL, &rawdata, 4, NULL, + (j & 1) ? &accum2 : &accum1[i], NULL, (fi_addr_t)mc_obj[i], + FI_INT64, FI_SUM, + FI_MORE, NULL); + + } + /* Fold accum2 into accum1[] */ + ret = cxip_allreduce(NULL, &accum2, 4, NULL, &accum1[i], NULL, + (fi_addr_t)mc_obj[i], FI_INT64, FI_SUM, + FI_MORE | FI_CXI_PRE_REDUCED, NULL); + } + /* after all accumulators loaded, reduce them across nodes */ + for (i = 0; i < nodes; i++) { + ret = cxip_allreduce(cxit_ep, &accum1[i], 4, NULL, &rslt[i], + NULL, (fi_addr_t)mc_obj[i], FI_INT64, + FI_SUM, FI_CXI_PRE_REDUCED, &context[i]); + } + /* wait for all reductions to post completion */ + for (i = 0; i < nodes; i++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[i]); + cr_assert(!ret, "_allreduceop() failed\n"); + + /* validate results */ + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + + free(accum1); + free(mc_obj); + STDCLEANUP +} diff --git a/prov/cxi/test/cq.c b/prov/cxi/test/cq.c new file mode 100644 index 00000000000..592190c7bf5 --- /dev/null +++ b/prov/cxi/test/cq.c @@ -0,0 +1,615 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(cq, .init = cxit_setup_cq, .fini = cxit_teardown_cq, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic CQ creation */ +Test(cq, simple) +{ + cxit_create_cqs(); + cr_assert(cxit_tx_cq != NULL); + cr_assert(cxit_rx_cq != NULL); + + cxit_destroy_cqs(); +} + +static void req_populate(struct cxip_req *req, fi_addr_t *addr) +{ + *addr = 0xabcd0; + req->flags = FI_SEND; + req->context = 0xabcd2; + req->data = 0xabcd4; + req->tag = 0xabcd5; + req->buf = 0xabcd6; + req->data_len = 0xabcd7; + req->discard = false; +} + +Test(cq, read_fmt_context) +{ + int ret; + struct cxip_req req; + struct fi_cq_entry entry; + fi_addr_t req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete(&req); + ret = fi_cq_read(cxit_tx_cq, &entry, 1); + cr_assert(ret == 1); + cr_assert((uint64_t)entry.op_context == req.context); + + cxit_destroy_cqs(); +} + +Test(cq, read_fmt_msg) +{ + int ret; + struct cxip_req req; + struct fi_cq_msg_entry entry; + fi_addr_t req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_MSG; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete(&req); + ret = fi_cq_read(cxit_tx_cq, &entry, 1); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + + cxit_destroy_cqs(); +} + +Test(cq, read_fmt_data) +{ + int ret; + struct cxip_req req; + struct fi_cq_data_entry entry; + fi_addr_t req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_DATA; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + 
cxip_cq_req_complete(&req); + ret = fi_cq_read(cxit_tx_cq, &entry, 1); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert((uint64_t)entry.buf == req.buf); + cr_assert(entry.data == req.data); + + cxit_destroy_cqs(); +} + +Test(cq, read_fmt_tagged) +{ + int ret; + struct cxip_req req; + struct fi_cq_tagged_entry entry; + fi_addr_t req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete(&req); + ret = fi_cq_read(cxit_tx_cq, &entry, 1); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert((uint64_t)entry.buf == req.buf); + cr_assert(entry.data == req.data); + cr_assert(entry.tag == req.tag); + + cxit_destroy_cqs(); +} + +Test(cq, readfrom_fmt_context) +{ + int ret; + struct cxip_req req; + struct fi_cq_entry entry; + fi_addr_t addr = 0, req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete_addr(&req, req_addr); + ret = fi_cq_readfrom(cxit_tx_cq, &entry, 1, &addr); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(addr == req_addr); + + cxit_destroy_cqs(); +} + +Test(cq, readfrom_fmt_msg) +{ + int ret; + struct cxip_req req; + struct fi_cq_msg_entry entry; + fi_addr_t addr = 0, req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_MSG; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete_addr(&req, req_addr); + ret = fi_cq_readfrom(cxit_tx_cq, &entry, 1, &addr); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert(addr == req_addr); + + cxit_destroy_cqs(); +} + +Test(cq, readfrom_fmt_data) +{ + int ret; + struct cxip_req req; + struct fi_cq_data_entry entry; + fi_addr_t addr = 0, req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_DATA; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete_addr(&req, req_addr); + ret = fi_cq_readfrom(cxit_tx_cq, &entry, 1, &addr); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert((uint64_t)entry.buf == req.buf); + cr_assert(entry.data == req.data); + cr_assert(addr == req_addr); + + cxit_destroy_cqs(); +} + +Test(cq, readfrom_fmt_tagged) +{ + int ret; + struct cxip_req req; + struct fi_cq_tagged_entry entry; + fi_addr_t addr = 0, req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete_addr(&req, req_addr); + ret = fi_cq_readfrom(cxit_tx_cq, &entry, 1, &addr); + cr_assert(ret == 1); 
+ + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert((uint64_t)entry.buf == req.buf); + cr_assert(entry.data == req.data); + cr_assert(entry.tag == req.tag); + cr_assert(addr == req_addr); + + cxit_destroy_cqs(); +} + +Test(cq, cq_open_null_attr) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct cxip_cq *cxi_cq = NULL; + + /* Open a CQ with a NULL attribute object pointer */ + ret = fi_cq_open(cxit_domain, NULL, &cxi_open_cq, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open with NULL attr"); + cr_assert_not_null(cxi_open_cq); + + /* Validate that the default attributes were set */ + cxi_cq = container_of(cxi_open_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cxi_cq->attr.size, CXIP_CQ_DEF_SZ); + cr_assert_eq(cxi_cq->attr.flags, 0); + cr_assert_eq(cxi_cq->attr.format, FI_CQ_FORMAT_CONTEXT); + cr_assert_eq(cxi_cq->attr.wait_obj, FI_WAIT_NONE); + cr_assert_eq(cxi_cq->attr.signaling_vector, 0); + cr_assert_eq(cxi_cq->attr.wait_cond, FI_CQ_COND_NONE); + cr_assert_null((void *)cxi_cq->attr.wait_set); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert(ret == FI_SUCCESS); + cxi_open_cq = NULL; +} + +struct cq_format_attr_params { + enum fi_cq_format in_format; + enum fi_cq_format out_format; + int status; +}; + +ParameterizedTestParameters(cq, cq_attr_format) +{ + size_t param_sz; + + static struct cq_format_attr_params params[] = { + {.in_format = FI_CQ_FORMAT_CONTEXT, + .out_format = FI_CQ_FORMAT_CONTEXT, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_MSG, + .out_format = FI_CQ_FORMAT_MSG, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_DATA, + .out_format = FI_CQ_FORMAT_DATA, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_TAGGED, + .out_format = FI_CQ_FORMAT_TAGGED, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_UNSPEC, + .out_format = FI_CQ_FORMAT_CONTEXT, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_UNSPEC - 1, + .out_format = -1, /* Unchecked in failure case */ + .status = -FI_ENOSYS} + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct cq_format_attr_params, params, + param_sz); +} + +ParameterizedTest(struct cq_format_attr_params *param, cq, cq_attr_format) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_attr cxit_cq_attr = {0}; + struct cxip_cq *cxi_cq = NULL; + + cxit_cq_attr.format = param->in_format; + cxit_cq_attr.wait_obj = FI_WAIT_NONE; /* default */ + cxit_cq_attr.size = 0; /* default */ + + /* Open a CQ with a NULL attribute object pointer */ + ret = fi_cq_open(cxit_domain, &cxit_cq_attr, &cxi_open_cq, NULL); + cr_assert_eq(ret, param->status, + "fi_cq_open() status mismatch %d != %d with format %d. 
%s", + ret, param->status, cxit_cq_attr.format, + fi_strerror(-ret)); + + if (ret != FI_SUCCESS) { + /* Test Complete */ + return; + } + + /* Validate that the format attribute */ + cr_assert_not_null(cxi_open_cq, + "fi_cq_open() cxi_open_cq is NULL with format %d", + cxit_cq_attr.format); + cxi_cq = container_of(cxi_open_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cxi_cq->attr.format, param->out_format); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert(ret == FI_SUCCESS); +} + +struct cq_wait_attr_params { + enum fi_wait_obj in_wo; + enum fi_wait_obj out_wo; + int status; +}; + +ParameterizedTestParameters(cq, cq_attr_wait) +{ + size_t param_sz; + + static struct cq_wait_attr_params params[] = { + {.in_wo = FI_WAIT_NONE, + .status = FI_SUCCESS}, + {.in_wo = FI_WAIT_FD, + .status = FI_SUCCESS}, + {.in_wo = FI_WAIT_SET, + .status = -FI_ENOSYS}, + {.in_wo = FI_WAIT_MUTEX_COND, + .status = -FI_ENOSYS}, + {.in_wo = FI_WAIT_UNSPEC, + .status = FI_SUCCESS}, + {.in_wo = FI_WAIT_NONE - 1, + .status = -FI_ENOSYS} + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct cq_wait_attr_params, params, + param_sz); +} + +ParameterizedTest(struct cq_wait_attr_params *param, cq, cq_attr_wait) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_attr cxit_cq_attr = {0}; + + cxit_cq_attr.wait_obj = param->in_wo; + cxit_cq_attr.format = FI_CQ_FORMAT_UNSPEC; /* default */ + cxit_cq_attr.size = 0; /* default */ + + /* Open a CQ with a NULL attribute object pointer */ + ret = fi_cq_open(cxit_domain, &cxit_cq_attr, &cxi_open_cq, NULL); + cr_assert_eq(ret, param->status, + "fi_cq_open() status mismatch %d != %d with wait obj %d. %s", + ret, param->status, cxit_cq_attr.wait_obj, + fi_strerror(-ret)); + + if (ret != FI_SUCCESS) { + /* Test Complete */ + return; + } + + ret = fi_close(&cxi_open_cq->fid); + cr_assert(ret == FI_SUCCESS); +} + +struct cq_size_attr_params { + size_t in_sz; + size_t out_sz; +}; + +ParameterizedTestParameters(cq, cq_attr_size) +{ + size_t param_sz; + + static struct cq_size_attr_params params[] = { + {.in_sz = 0, + .out_sz = CXIP_CQ_DEF_SZ}, + {.in_sz = 1 << 9, + .out_sz = 1 << 9}, + {.in_sz = 1 << 6, + .out_sz = 1 << 6} + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct cq_size_attr_params, params, + param_sz); +} + +ParameterizedTest(struct cq_size_attr_params *param, cq, cq_attr_size) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_attr cxit_cq_attr = {0}; + + cxit_cq_attr.format = FI_CQ_FORMAT_UNSPEC; /* default */ + cxit_cq_attr.wait_obj = FI_WAIT_NONE; /* default */ + cxit_cq_attr.size = param->in_sz; + + /* Open a CQ with a NULL attribute object pointer */ + ret = fi_cq_open(cxit_domain, &cxit_cq_attr, &cxi_open_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_cq_open() status mismatch %d != %d with size %ld. %s", + ret, FI_SUCCESS, cxit_cq_attr.size, + fi_strerror(-ret)); + cr_assert_not_null(cxi_open_cq); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert(ret == FI_SUCCESS); +} + +Test(cq, cq_open_null_domain, .signal = SIGSEGV) +{ + struct fid_cq *cxi_open_cq = NULL; + + /* + * Attempt to open a CQ with a NULL domain pointer + * Expect a SIGSEGV since the fi_cq_open implementation attempts to + * use the domain pointer before checking. 
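+	 * Criterion's .signal attribute makes the test pass only if that
+	 * signal is actually raised.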
+ */ + fi_cq_open(NULL, NULL, &cxi_open_cq, NULL); +} + +Test(cq, cq_open_null_cq) +{ + /* Attempt to open a CQ with a NULL cq pointer */ + int ret; + + ret = fi_cq_open(cxit_domain, NULL, NULL, NULL); + cr_assert(ret == -FI_EINVAL, "fi_cq_open with NULL cq"); +} + +Test(cq, cq_readerr_null_cq, .signal = SIGSEGV) +{ + struct fi_cq_err_entry err_entry; + + /* Attempt to read an err with a CQ with a NULL cq pointer */ + fi_cq_readerr(NULL, &err_entry, (uint64_t)0); +} + +Test(cq, cq_readerr_no_errs) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_err_entry err_entry; + + /* Open a CQ */ + ret = fi_cq_open(cxit_domain, NULL, &cxi_open_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open with NULL attr"); + cr_assert_not_null(cxi_open_cq); + + /* Attempt to read an err with a CQ with a NULL buff pointer */ + ret = fi_cq_readerr(cxi_open_cq, &err_entry, (uint64_t)0); + /* Expect no completions to be available */ + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_readerr returned %d", ret); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert_eq(ret, FI_SUCCESS); +} + +void err_entry_comp(struct fi_cq_err_entry *a, + struct fi_cq_err_entry *b, + size_t size) +{ + uint8_t *data_a, *data_b; + + data_a = (uint8_t *)a; + data_b = (uint8_t *)b; + + for (int i = 0; i < size; i++) + if (data_a[i] != data_b[i]) + cr_expect_fail("Mismatch at offset %d. %02X - %02X", + i, data_a[i], data_b[i]); +} + +Test(cq, cq_readerr_err) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_err_entry err_entry, fake_entry; + struct cxip_cq *cxi_cq; + uint8_t *data_fake, *data_err; + + /* initialize the entries with data */ + data_fake = (uint8_t *)&fake_entry; + data_err = (uint8_t *)&err_entry; + for (int i = 0; i < sizeof(fake_entry); i++) { + data_fake[i] = (uint8_t)i; + data_err[i] = (uint8_t)0xa5; + } + fake_entry.prov_errno = 18; + fake_entry.err_data = err_entry.err_data = NULL; + fake_entry.err_data_size = err_entry.err_data_size = 0; + + /* Open a CQ */ + ret = fi_cq_open(cxit_domain, NULL, &cxi_open_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open with NULL attr"); + cr_assert_not_null(cxi_open_cq); + + /* Add a fake error to the CQ's error ringbuffer */ + cxi_cq = container_of(cxi_open_cq, struct cxip_cq, util_cq.cq_fid); + ofi_cq_write_error(&cxi_cq->util_cq, &fake_entry); + + /* Attempt to read an err with a CQ with a NULL buff pointer */ + ret = fi_cq_readerr(cxi_open_cq, &err_entry, (uint64_t)0); + /* Expect 1 completion to be available */ + cr_assert_eq(ret, 1, "fi_cq_readerr returned %d", ret); + /* Expect the data to match the fake entry */ + err_entry_comp(&err_entry, &fake_entry, sizeof(fake_entry)); + printf("prov_errno: %s\n", + fi_cq_strerror(cxi_open_cq, err_entry.prov_errno, + NULL, NULL, 0)); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert_eq(ret, FI_SUCCESS); +} + +Test(cq, cq_readerr_reperr) +{ + int ret; + struct fi_cq_err_entry err_entry = {0}; + struct cxip_req req = {0}; + size_t olen, err_data_size; + int err, prov_errno; + void *err_data; + struct cxip_cq *cxi_cq; + uint8_t err_buff[32] = {0}; + + /* initialize the input data */ + req.flags = 0x12340987abcd5676; + req.context = 0xa5a5a5a5a5a5a5a5; + req.data_len = 0xabcdef0123456789; + req.data = 0xbadcfe1032547698; + req.tag = 0xefcdab0192837465; + olen = 0x4545121290907878; + err = -3; + prov_errno = -2; + err_data = (void *)err_buff; + err_data_size = ARRAY_SIZE(err_buff); + + /* Open a CQ */ + cxit_create_cqs(); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + 
/* Add an error to the CQ's error ringbuffer */ + ret = cxip_cq_req_error(&req, olen, err, prov_errno, + err_data, err_data_size, FI_ADDR_UNSPEC); + cr_assert_eq(ret, 0, "cxip_cq_report_error() error %d", ret); + + /* Attempt to read an err with a CQ with a NULL buff pointer */ + ret = fi_cq_readerr(cxit_tx_cq, &err_entry, (uint64_t)0); + cr_assert_eq(ret, 1, "fi_cq_readerr returned %d", ret); + + /* Expect the data to match the fake entry */ + cr_assert_eq(err_entry.err, err); + cr_assert_eq(err_entry.olen, olen); + cr_assert_eq(err_entry.len, req.data_len); + cr_assert_eq(err_entry.prov_errno, prov_errno); + cr_assert_eq(err_entry.flags, req.flags); + cr_assert_eq(err_entry.data, req.data); + cr_assert_eq(err_entry.tag, req.tag); + cr_assert_eq(err_entry.op_context, (void *)(uintptr_t)req.context); + cr_assert_eq(memcmp(err_entry.err_data, err_data, err_data_size), 0); + cr_assert_leq(err_entry.err_data_size, err_data_size, + "Size mismatch. %zd, %zd", + err_entry.err_data_size, err_data_size); + + cxit_destroy_cqs(); +} diff --git a/prov/cxi/test/criterion.yaml b/prov/cxi/test/criterion.yaml new file mode 100644 index 00000000000..9abd1514ba7 --- /dev/null +++ b/prov/cxi/test/criterion.yaml @@ -0,0 +1,100 @@ +# example input file for run_criterion_tests script + +# Set paths, prompt, and password for node under test +env: + libfabric_dir_on_node: /path/to/libfabric + pycxi_dir_on_node: /path/to/pycxi # required for cxiutil + node_prompt: '#' + node_password: + + +# These parameters apply to all tests +global_runtime_parameters: + - {DMA_FAULT_RATE: .1, + MALLOC_FAULT_RATE: .1, + FI_LOG_LEVEL: warn, + FI_CXI_FC_RECOVERY: 1} + + +# Test definitions +tests: + #------------------------------------------------------------------------------------------------------- + # EXAMPLE: + # - {description: "Meaningful description of test(s) to be included in tap report", + # filter: "tagged/*", # run all tagged tests (null = run all tests) + # runtime_parameters: { # include these params when running the test (null = no params) + # FI_CXI_PARAM_1: val, + # FI_CXI_PARAM_2: val}, + # csrs: [ # set these CSRs prior to running the test (null = no CSRs) + # [, , ], + # [, , ] + # ]} + #------------------------------------------------------------------------------------------------------- + + - {description: "Run with default settings", + filter: null, + runtime_parameters: null, + csrs: null} + + - {description: "Disable caching of FI_HMEM_SYSTEM", + filter: null, + runtime_parameters: { + FI_MR_CACHE_MONITOR: disable}, + csrs: null} + + - {description: "Run with RPut and SW Gets", + filter: "(tagged|msg)/*", + runtime_parameters: null, + csrs: [ + [get_ctrl, get_en, 0] + ]} + + - {description: "Run with constrained LE count", + filter: "tagged/fc*", + runtime_parameters: null, + csrs: [ + ["le_pools[]", max_alloc, 10] + ]} + + - {description: "Verify tag matching with rendezvous", + filter: "tagged_directed/*", + runtime_parameters: { + FI_CXI_DEVICE_NAME: "cxi1,cxi0", + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} + + - {description: "Run with software RX matching mode", + filter: null, + runtime_parameters: { + FI_CXI_RX_MATCH_MODE: '"software"', + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} + + - {description: "Run with FI_CXI_MSG_OFFLOAD disabled", + filter: null, + runtime_parameters: { + FI_CXI_MSG_OFFLOAD: 0, + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} + + - {description: "Verify fc_no_eq_space_expected_multi_recv", + 
filter: "tagged/fc_no_eq_space_expected_multi_recv", + runtime_parameters: { + FI_CXI_DEFAULT_CQ_SIZE: 64, + FI_CXI_DISABLE_CQ_HUGETLB: 1, + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} + + - {description: "Verify fc_no_eq_space_expected_multi_recv and FI_CXI_CQ_FILL_PERCENT", + filter: "tagged/fc_no_eq_space_expected_multi_recv", + runtime_parameters: { + FI_CXI_CQ_FILL_PERCENT: 20, + FI_CXI_DEFAULT_CQ_SIZE: 64, + FI_CXI_DISABLE_CQ_HUGETLB: 1, + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} diff --git a/prov/cxi/test/ctrl.c b/prov/cxi/test/ctrl.c new file mode 100644 index 00000000000..4651a52a94a --- /dev/null +++ b/prov/cxi/test/ctrl.c @@ -0,0 +1,1086 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2022 Hewlett Packard Enterprise Development LP + */ +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_CTRL, fmt, ##__VA_ARGS__) + +TestSuite(ctrl, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/** + * @brief Test reversibility of N <-> (r,c), error conditions + * + * For a range of radix values, select a node number (N), convert to + * a (row,column) pair, and then convert back to node number. These + * should match, unless an invalid column (for a row) is specified, + * in which case we see an error. + */ +Test(ctrl, radix_tree_reversible) +{ + int radix, N, M, row, col, siz, rowold, rowwid; + + for (radix = 1; radix < 8; radix++) { + rowold = -1; + rowwid = 1; + for (N = 0; N < 256; N++) { + /* test reversibility */ + cxip_tree_rowcol(radix, N, &row, &col, &siz); + cxip_tree_nodeidx(radix, row, col, &M); + cr_assert(M == N, "M=%d != N=%d\n", M, N); + if (rowold != row) { + rowold = row; + rowwid *= radix; + } + /* test invalid column */ + col = rowwid + 1; + cxip_tree_nodeidx(radix, row, col, &M); + cr_assert(M == -1, + "radix=%d N=%d row=%d col=%d" + " M=%d != -1\n", + radix, N, row, col, M); + } + } +} + +/** + * @brief Test parent/child mapping. + * + * For a range of radix values, generate the relatives in the tree (one + * parent, multiple children), and confirm that these relatives have the + * expected position in the tree, which guarantees that we have no loops + * in the tree, and that every node has a parent (except the root), and + * is a child of its parent. 
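+ * The rels[] array filled in by cxip_tree_relatives() is expected to
+ * hold the parent index in rels[0] followed by the child indices, with
+ * the returned count covering both.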
+ */ +Test(ctrl, radix_tree_mapping) +{ + int *rels, parent, child; + int radix, nodes, N, M; + int count, i; + + /* Test radix zero case */ + M = cxip_tree_relatives(0, 0, 0, NULL); + cr_assert(M == 0); + + /* Test expected pattern of parent/child indices */ + for (radix = 1; radix < 8; radix++) { + /* only needs radix+1, but for test, provide extra space */ + rels = calloc(radix+2, sizeof(*rels)); + for (nodes = 0; nodes < 256; nodes++) { + count = 0; + parent = -1; + child = 1; + for (N = 0; N < nodes; N++) { + M = cxip_tree_relatives(radix, N, nodes, rels); + cr_assert(M >= 0); + cr_assert(M <= radix+1); + if (M > 0) { + /* test parent node index */ + cr_assert(rels[0] == parent, + "radix=%d nodes=%d index=%d" + " parent=%d != rels[0]=%d\n", + radix, nodes, N, parent, rels[0]); + /* test child node indices */ + for (i = 1; i < M; i++, child++) + cr_assert(rels[i] == child, + "radix=%d nodes=%d" + " index=%d child=%d" + " != rels[%d]=%d\n", + radix, nodes, N, + child, i, rels[i]); + } + count++; + if (N == 0 || count >= radix) { + count = 0; + parent++; + } + } + } + free(rels); + } +} + +/* Utility to show the node relatives */ +__attribute__((unused)) +static void dumpmap(struct cxip_zbcoll_obj *zb) +{ + int i, j; + + printf("MAP=======\n"); + for (i = 0; i < zb->simcount; i++) { + printf("%2d:", i); + for (j = 0; j < zb->state[i].num_relatives; j++) + printf(" %2d", zb->state[i].relatives[j]); + printf("\n"); + } + printf("\n"); +} + +/** + * @brief Test the valid and invalid cxip_zbcoll_obj configurations. + */ +Test(ctrl, zb_config) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + struct cxip_addr *caddrs; + fi_addr_t *fiaddrs; + int i, ret; + + int num_addrs = 5; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + caddrs = calloc(num_addrs, sizeof(*caddrs)); + cr_assert(caddrs); + fiaddrs = calloc(num_addrs, sizeof(*fiaddrs)); + cr_assert(fiaddrs); + + for (i = 0; i < num_addrs; i++) + caddrs[i] = ep_obj->src_addr; + ret = fi_av_insert(&ep_obj->av->av_fid, caddrs, num_addrs, fiaddrs, + 0L, NULL); + cr_assert(ret == num_addrs); + + /* test case, object but no tree */ + TRACE("case: no tree\n"); + ret = cxip_zbcoll_alloc(ep_obj, 0, NULL, ZB_NOSIM, &zb); + cr_assert(ret == 0, + "no tree: ret=%d\n", ret); + cr_assert(zb->simcount == 1, + "no tree: simcnt=%d\n", zb->simcount); + cr_assert(zb->num_caddrs == 1, + "no_tree: num_caddrs=%d\n", zb->num_caddrs); + cr_assert(memcmp(&zb->caddrs[0], &ep_obj->src_addr, sizeof(ep_obj->src_addr)) == 0); + cxip_zbcoll_free(zb); + + /* request simulation */ + TRACE("case: simulated\n"); + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, + "sim tree 4: ret=%d\n", ret); + cr_assert(zb->simcount == num_addrs, + "sim tree 4: cnt=%d\n", zb->simcount); + cxip_zbcoll_free(zb); + + /* exercise real setup, send-to-self-only */ + TRACE("case: real send-only\n"); + ret = cxip_zbcoll_alloc(ep_obj, 0, NULL, ZB_NOSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb != NULL); + cr_assert(zb->simcount == 1); + cr_assert(zb->state != NULL); + cr_assert(CXIP_ADDR_EQUAL(zb->caddrs[0], ep_obj->src_addr)); + + /* exercise real setup success, all caddrs are real */ + TRACE("case: real addresses root 0\n"); + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, fiaddrs, ZB_NOSIM, &zb); + cr_assert(ret == 0, "real tree0: ret=%s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == 1, "real tree0: simcnt=%d\n", 
zb->simcount); + cr_assert(zb->state[0].grp_rank == 0, "real tree0: grp_rank=%d\n", + zb->state[0].grp_rank); + cxip_zbcoll_free(zb); + + /* exercise real setup success, first caddr is not me */ + TRACE("case: real addresses root 1\n"); + caddrs[0].nic += 1; + ret = fi_av_insert(&ep_obj->av->av_fid, caddrs, num_addrs, fiaddrs, + 0L, NULL); + cr_assert(ret == num_addrs); + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, fiaddrs, ZB_NOSIM, &zb); + cr_assert(ret == 0, "real tree1: ret=%s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == 1, "real tree1: simcnt=%d\n", zb->simcount); + cr_assert(zb->state[0].grp_rank == 1, "real tree1: grp_rank=%d\n", + zb->state[0].grp_rank); + cxip_zbcoll_free(zb); + + /* exercise real setup failure, no caddr is me */ + TRACE("case: real addresses root N\n"); + for (i = 0; i < num_addrs; i++) + caddrs[i].nic += i + 1; + ret = fi_av_insert(&ep_obj->av->av_fid, caddrs, num_addrs, fiaddrs, + 0L, NULL); + cr_assert(ret == num_addrs); + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, fiaddrs, ZB_NOSIM, &zb); + cr_assert(ret == -FI_ECONNREFUSED, "real treeN: ret=%s\n", fi_strerror(-ret)); + cxip_zbcoll_free(zb); + + free(fiaddrs); +} + +/** + * @brief Send a single packet using a self to self send-only configuration. + */ +Test(ctrl, zb_send0) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + union cxip_match_bits mb = {.raw = 0}; + uint32_t dsc, err, ack, rcv, cnt; + int ret; + + cr_assert(sizeof(union cxip_match_bits) == 8); + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + /* Set up the send-only zbcoll */ + ret = cxip_zbcoll_alloc(ep_obj, 0, NULL, ZB_NOSIM, &zb); + + /* Test that if disabled, getgroup is no-op */ + ep_obj->zbcoll.disable = true; + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + + /* Legitimate send to self */ + cxip_zbcoll_reset_counters(ep_obj); + cxip_zbcoll_send(zb, 0, 0, mb.raw); + cnt = 0; + do { + usleep(1); + cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + ret = (dsc || err || (ack && rcv)); + cnt++; + } while (!ret && cnt < 1000); + cr_assert(cnt < 1000, "repeat count = %d >= %d\n", cnt, 1000); + cr_assert(dsc == 0, "dsc = %d, != 0\n", dsc); + cr_assert(err == 0, "err = %d, != 0\n", err); + cr_assert(ack == 1, "ack = %d, != 1\n", ack); + cr_assert(rcv == 1, "rcv = %d, != 1\n", rcv); + + /* Invalid send to out-of-range address index */ + cxip_zbcoll_reset_counters(ep_obj); + cxip_zbcoll_send(zb, 0, 1, mb.raw); + cnt = 0; + do { + usleep(1); + cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + ret = (err || dsc || (ack && rcv)); + cnt++; + } while (!ret && cnt < 1000); + cr_assert(cnt < 1000, "repeat count = %d < %d\n", cnt, 1000); + cr_assert(dsc == 0, "dsc = %d, != 0\n", dsc); + cr_assert(err == 1, "err = %d, != 1\n", err); + cr_assert(ack == 0, "ack = %d, != 0\n", ack); + cr_assert(rcv == 0, "rcv = %d, != 0\n", rcv); + + cxip_zbcoll_free(zb); +} + +/* utility to send from src to dst */ +static void _send(struct cxip_zbcoll_obj *zb, int srcidx, int dstidx) +{ + struct cxip_ep_obj *ep_obj; + union cxip_match_bits mb = {.zb_data=0}; + int ret, cnt; + uint32_t dsc, err, ack, rcv; + + /* send to dstidx simulated address */ + ep_obj = zb->ep_obj; + cxip_zbcoll_reset_counters(ep_obj); + cxip_zbcoll_send(zb, srcidx, dstidx, mb.raw); + + /* wait for errors, or completion */ + cnt = 0; + do { + usleep(1); + 
cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + ret = (err || dsc || (ack && rcv)); + cnt++; + } while (!ret && cnt < 1000); + cr_assert(cnt < 1000, "repeat count = %d\n", cnt); + + cr_assert(dsc == 0, "dsc = %d, != 0\n", dsc); + cr_assert(err == 0, "err = %d, != 0\n", err); + cr_assert(ack == 1, "ack = %d, != 1\n", ack); + cr_assert(rcv == 1, "rcv = %d, != 1\n", rcv); +} + +/** + * @brief Send a single packet from each src to dst in NETSIM simulation. + * + * Scales as O(N^2), so keep number of addresses small. + */ +Test(ctrl, zb_sendN) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + int srcidx, dstidx, ret; + + int num_addrs = 5; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb != NULL); + cr_assert(zb->simcount == num_addrs); + cr_assert(zb->state != NULL); + + /* Test that if disabled, getgroup is no-op */ + ep_obj->zbcoll.disable = true; + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + + for (srcidx = 0; srcidx < num_addrs; srcidx++) + for (dstidx = 0; dstidx < num_addrs; dstidx++) + _send(zb, srcidx, dstidx); + cxip_zbcoll_free(zb); +} + +/* Utility to wait until an ALLSIM collective has completed */ +static int _await_complete(struct cxip_zbcoll_obj *zb) +{ + uint32_t rep; + + /* We only wait for 1 sec */ + for (rep = 0; rep < 10000; rep++) { + usleep(100); + cxip_ep_zbcoll_progress(zb->ep_obj); + if (zb->error) + return zb->error; + if (!zb->busy) + break; + } + return (zb->busy) ? -FI_ETIMEDOUT : FI_SUCCESS; +} + +/* Utility to wait until a multi-zb collective has completed */ +static int _await_complete_all(struct cxip_zbcoll_obj **zb, int cnt) +{ + uint32_t i, rep; + + /* We only wait for 1 sec */ + for (rep = 0; rep < 10000; rep++) { + usleep(100); + cxip_ep_zbcoll_progress(zb[0]->ep_obj); + for (i = 0; i < cnt; i++) { + if (zb[i]->error) + return zb[i]->error; + if (zb[i]->busy) + break; + } + if (i == cnt) + break; + } + return (i < cnt) ? -FI_ETIMEDOUT : FI_SUCCESS; +} + +/* shuffle the array */ +void _shuffle_array32(uint32_t *array, size_t size) +{ + uint32_t i, j, t; + + for (i = 0; i < size-1; i++) { + j = i + rand() / (RAND_MAX / (size - i) + 1); + t = array[j]; + array[j] = array[i]; + array[i] = t; + } +} + +/* create a randomized shuffle array */ +void _addr_shuffle(struct cxip_zbcoll_obj *zb, bool shuffle) +{ + struct timespec tv; + int i; + + clock_gettime(CLOCK_MONOTONIC, &tv); + srand((unsigned int)tv.tv_nsec); + free(zb->shuffle); + zb->shuffle = calloc(zb->simcount, sizeof(uint32_t)); + if (!zb->shuffle) + return; + /* create ordered list */ + for (i = 0; i < zb->simcount; i++) + zb->shuffle[i] = i; + /* if requested, randomize */ + if (shuffle) + _shuffle_array32(zb->shuffle, zb->simcount); +} + +/*****************************************************************/ +/** + * @brief Test simulated getgroup. + * + * This exercises the basic getgroup operation, the user callback, and the + * non-concurrency lockout. It tests grpid wrap-around at the limit. + * + * This does not test error returns, which are not robustly simulated. 
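+ * Wrap-around is exercised by requesting twice as many group IDs as the
+ * maximum and freeing the oldest objects as the limit is reached, so
+ * released group IDs are seen to be reused in order.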
+ */ + +struct getgroup_data { + int count; +}; +static void getgroup_func(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct getgroup_data *data = (struct getgroup_data *)usrptr; + data->count++; +} + +/* Test getgroup single-zb simulation */ +Test(ctrl, zb_getgroup) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj **zb; + struct getgroup_data zbd = {}; + int i, j, ret; + uint32_t dsc, err, ack, rcv; + int max_zb = cxip_zbcoll_max_grps(true); + int num_zb = 2*max_zb; + int num_addrs = 9; + int cnt = 0; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + zb = calloc(num_zb, sizeof(struct cxip_zbcoll_obj *)); + cr_assert(zb, "zb out of memory\n"); + + TRACE("%s entry\n", __func__); + for (i = 0; i < num_zb; i++) { + /* Verify multiple allocations */ + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, + ZB_ALLSIM, &zb[i]); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", + fi_strerror(-ret)); + cr_assert(zb[i]->simcount == num_addrs, + "zb->simcount = %d, != %d\n", + zb[i]->simcount, num_addrs); + /* Add callback function */ + cxip_zbcoll_set_user_cb(zb[i], getgroup_func, &zbd); + /* Initialize the address shuffling */ + _addr_shuffle(zb[i], true); + TRACE("created zb[%d]\n", i); + } + for (i = j = 0; i < num_zb; i++) { + /* Free space if necessary */ + while ((i - j) >= max_zb) + cxip_zbcoll_free(zb[j++]); + _addr_shuffle(zb[i], true); + /* Test getgroup operation */ + TRACE("initiate getgroup %d\n", i); + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == FI_SUCCESS, "%d getgroup = %s\n", + i, fi_strerror(-ret)); + /* Test getgroup non-concurrency */ + TRACE("second initiate getgroup %d\n", i); + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == -FI_EAGAIN, "%d getgroup = %s\n", + i, fi_strerror(-ret)); + /* Poll until complete */ + TRACE("await completion %d\n", i); + ret = _await_complete(zb[i]); + cr_assert(ret == FI_SUCCESS, "%d getgroup = %s\n", + i, fi_strerror(-ret)); + /* Check user callback completion count result */ + cr_assert(zbd.count == i+1, "%d zbdcount = %d\n", + i, zbd.count); + /* Confirm expected grpid */ + cr_assert(zb[i]->grpid == (i % max_zb), + "%d grpid = %d, exp %d\n", + i, zb[i]->grpid, i % max_zb); + TRACE("second getgroup after completion\n"); + /* Attempt another getgroup on same zb */ + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == -FI_EINVAL, "%d getgroup = %s\n", + i, fi_strerror(-ret)); + /* Compute expected transfer count */ + cnt += 2 * (num_addrs - 1); + } + + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + cr_assert(dsc == 0 && err == 0, + "FAILED dsc=%d err=%d ack=%d rcv=%d cnt=%d\n", + dsc, err, ack, rcv, cnt); + /* cleanup */ + while (j < num_zb) + cxip_zbcoll_free(zb[j++]); + free(zb); +} + +/*****************************************************************/ +/** + * @brief Test simulated getgroup with multi-zb model. 
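+ * In this model each zb object is allocated with its own simulated rank
+ * and linked to the rank-0 object, so every object stands in for a
+ * single endpoint rather than one object simulating the whole group.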
+ */ + +void _getgroup_multi(int num_addrs, struct cxip_zbcoll_obj **zb, + int expect_grpid) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct getgroup_data zbd = {}; + int i, ret; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + /* allocate multiple zb objects, simrank = i */ + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, i, &zb[i]); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", + fi_strerror(-ret)); + cr_assert(zb[i]->simcount == num_addrs, + "zb->simcount = %d, != %d\n", + zb[i]->simcount, num_addrs); + ret = cxip_zbcoll_simlink(zb[0], zb[i]); + cr_assert(!ret, "link zb[%d] failed\n", i); + } + + for (i = 0; i < num_addrs; i++) + cxip_zbcoll_set_user_cb(zb[i], getgroup_func, &zbd); + + /* initiate getgroup across all of the zb objects */ + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == FI_SUCCESS, "getgroup[%d]=%s, exp success\n", + i, fi_strerror(-ret)); + } + + /* make a second attempt */ + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == -FI_EAGAIN, "getgroup[%d]=%s exp FI_EAGAIN\n", + i, fi_strerror(-ret)); + } + + /* Poll until all are complete */ + ret = _await_complete_all(zb, num_addrs); + cr_assert(ret == FI_SUCCESS, "getgroup = %s\n", + fi_strerror(-ret)); + + /* Ensure all objects have the same group ids */ + ret = 0; + for (i = 0; i < num_addrs; i++) { + if (zb[i]->grpid != expect_grpid) { + TRACE("zb[%d]->grpid = %d, exp %d\n", + i, zb[i]->grpid, expect_grpid); + ret++; + } + } + cr_assert(!ret, "Some zb objects have the wrong group id\n"); + + /* Make sure we can't take a second group id */ + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == -FI_EINVAL, "getgroup[%d]=%s exp FI_EINVAL\n", + i, fi_strerror(-ret)); + } + +} + +void _free_getgroup_multi(int num_addrs, struct cxip_zbcoll_obj **zb) +{ + int i; + + for (i = 0; i < num_addrs; i++) + cxip_zbcoll_free(zb[i]); + free(zb); +} + +/* Test getgroup multi-zb simulation */ +Test(ctrl, zb_getgroup2) +{ + struct cxip_zbcoll_obj **zb1, **zb2; + int num_addrs = 9; // arbitrary + + zb1 = calloc(num_addrs, sizeof(struct cxip_zbcoll_obj *)); + cr_assert(zb1, "zb out of memory\n"); + zb2 = calloc(num_addrs, sizeof(struct cxip_zbcoll_obj *)); + cr_assert(zb2, "zb out of memory\n"); + + _getgroup_multi(num_addrs, zb1, 0); + _getgroup_multi(num_addrs, zb2, 1); + + _free_getgroup_multi(num_addrs, zb2); + _free_getgroup_multi(num_addrs, zb1); +} + +/*****************************************************************/ +/** + * @brief Test simulated barrier. + * + * This exercises the basic barrier operation, the user callback, and the + * non-concurrency lockout. + * + * This is done in a single thread, so it tests only a single barrier across + * multiple addrs. It randomizes the nid processing order, and performs multiple + * barriers to uncover any ordering issues. 
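+ * A second barrier issued while one is still in flight must be rejected
+ * with -FI_EAGAIN, which is how the non-concurrency lockout is checked.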
+ */ +struct barrier_data { + int count; +}; +static void barrier_func(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct barrier_data *data = (struct barrier_data *)usrptr; + + /* increment the user completion count */ + data->count++; +} + +/* Test barrier single-zb simulation */ +Test(ctrl, zb_barrier) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + struct barrier_data zbd; + int rep, ret; + + int num_addrs = 9; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == num_addrs, + "zb->simcount = %d, != %d\n", zb->simcount, num_addrs); + /* Initialize the addresses */ + _addr_shuffle(zb, true); + + /* Acquire a group id */ + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + ret = _await_complete(zb); + cr_assert(ret == 0, "getgroup done = %s\n", fi_strerror(-ret)); + + cxip_zbcoll_set_user_cb(zb, barrier_func, &zbd); + + memset(&zbd, 0, sizeof(zbd)); + for (rep = 0; rep < 20; rep++) { + /* Shuffle the addresses */ + _addr_shuffle(zb, true); + /* Perform a barrier */ + ret = cxip_zbcoll_barrier(zb); + cr_assert(ret == 0, "%d barrier = %s\n", + rep, fi_strerror(-ret)); + /* Try again immediately, should show BUSY */ + ret = cxip_zbcoll_barrier(zb); + cr_assert(ret == -FI_EAGAIN, "%d barrier = %s\n", + rep, fi_strerror(-ret)); + /* Poll until complete */ + ret = _await_complete(zb); + cr_assert(ret == FI_SUCCESS, "%d barrier = %s\n", + rep, fi_strerror(-ret)); + } + /* Confirm completion count */ + cr_assert(zbd.count == rep, "expected zbd.count=%d == rep=%d\n", + zbd.count, rep); + + uint32_t dsc, err, ack, rcv; + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + cr_assert(dsc == 0 && err == 0, + "FAILED dsc=%d err=%d ack=%d rcv=%d\n", + dsc, err, ack, rcv); + + cxip_zbcoll_free(zb); +} + +/* Test barrier multi-zb simulation */ +Test(ctrl, zb_barrier2) +{ + struct cxip_zbcoll_obj **zb1, **zb2; + struct barrier_data zbd1 = {}; + struct barrier_data zbd2 = {}; + int num_addrs = 17; // arbitrary + int i, ret; + + zb1 = calloc(num_addrs, sizeof(*zb1)); + cr_assert(zb1); + zb2 = calloc(num_addrs, sizeof(*zb2)); + cr_assert(zb2); + + _getgroup_multi(num_addrs, zb1, 0); + _getgroup_multi(num_addrs, zb2, 1); + + for (i = 0; i < num_addrs; i++) { + cxip_zbcoll_set_user_cb(zb1[i], barrier_func, &zbd1); + cxip_zbcoll_set_user_cb(zb2[i], barrier_func, &zbd2); + } + + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_barrier(zb1[i]); + cr_assert(!ret, "zb1 barrier[%d]=%s\n", i, fi_strerror(-ret)); + + ret = cxip_zbcoll_barrier(zb2[i]); + cr_assert(!ret, "zb2 barrier[%d]=%s\n", i, fi_strerror(-ret)); + } + + /* Poll until all are complete */ + ret = _await_complete_all(zb1, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb1 barrier = %s\n", + fi_strerror(-ret)); + ret = _await_complete_all(zb2, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb2 barrier = %s\n", + fi_strerror(-ret)); + + /* Validate data */ + cr_assert(zbd1.count == num_addrs, "zb1 count=%d != %d\n", + zbd1.count, num_addrs); + cr_assert(zbd2.count == num_addrs, "zb2 count=%d != %d\n", + zbd2.count, num_addrs); + + _free_getgroup_multi(num_addrs, zb2); + _free_getgroup_multi(num_addrs, zb1); +} + +/*****************************************************************/ +/** + * @brief Perform a simulated broadcast. 
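+ * On completion every simulated endpoint should hold the value supplied
+ * by the endpoint listed first in the shuffled processing order.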
+ * + * This exercises the basic broadcast operation, the user callback, and the + * non-concurrency lockout. The user callback captures all of the results and + * ensures they all match the broadcast value. + * + * This is done in a single thread, so it tests only a single broadcast across + * multiple addrs. It randomizes the nid processing order, and performs multiple + * broadcasts to uncover any ordering issues. + */ +struct bcast_data { + uint64_t *data; + int count; +}; + +static void bcast_func(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct bcast_data *data = (struct bcast_data *)usrptr; + int i; + + if (zb->simrank >= 0) { + data->data[zb->simrank] = *zb->state[zb->simrank].dataptr; + } else { + for (i = 0; i < zb->simcount; i++) + data->data[i] = *zb->state[i].dataptr; + } + data->count++; +} + +/* Test broadcast single-zb simulation */ +Test(ctrl, zb_broadcast) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + struct bcast_data zbd = {}; + int i, n, rep, ret; + uint64_t *data; + + int num_addrs = 25; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == num_addrs, + "zb->simcount = %d, != %d\n", zb->simcount, num_addrs); + _addr_shuffle(zb, true); + + data = calloc(num_addrs, sizeof(uint64_t)); + + /* Acquire a group id */ + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + ret = _await_complete(zb); + cr_assert(ret == 0, "getgroup done = %s\n", fi_strerror(-ret)); + + cxip_zbcoll_set_user_cb(zb, bcast_func, &zbd); + + memset(&zbd, 0, sizeof(zbd)); + zbd.data = calloc(num_addrs, sizeof(uint64_t)); + for (rep = 0; rep < 20; rep++) { + _addr_shuffle(zb, true); + n = zb->shuffle[0]; + memset(zbd.data, -1, num_addrs*sizeof(uint64_t)); + /* Perform a broadcast */ + for (i = 0; i < num_addrs; i++) + data[i] = (rand() & ((1 << 29) - 1)) | (1 << 28); + ret = cxip_zbcoll_broadcast(zb, data); + cr_assert(ret == 0, "%d bcast = %s\n", + rep, fi_strerror(-ret)); + /* Try again immediately, should fail */ + ret = cxip_zbcoll_broadcast(zb, data); + cr_assert(ret == -FI_EAGAIN, "%d bcast = %s\n", + rep, fi_strerror(-ret)); + /* Poll until complete */ + ret = _await_complete(zb); + cr_assert(ret == FI_SUCCESS, "%d bcast = %s\n", + rep, fi_strerror(-ret)); + /* Validate the data */ + for (i = 0; i < num_addrs; i++) + cr_assert(zbd.data[i] == data[n], "[%d] %ld != %ld\n", + i, zbd.data[i], data[n]); + } + cr_assert(zbd.count == rep, "zbd.count=%d rep=%d\n", + zbd.count, rep); + + uint32_t dsc, err, ack, rcv; + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + cr_assert(dsc == 0 && err == 0, + "FAILED dsc=%d err=%d ack=%d rcv=%d\n", + dsc, err, ack, rcv); + + free(zbd.data); + free(data); + cxip_zbcoll_free(zb); +} + +/* Test broadcast multi-zb simulation */ +Test(ctrl, zb_broadcast2) +{ + struct cxip_zbcoll_obj **zb1, **zb2; + uint64_t data1, data2; + struct bcast_data zbd1 = {}; + struct bcast_data zbd2 = {}; + int i, ret; + + int num_addrs = 11; // arbitrary + + zb1 = calloc(num_addrs, sizeof(*zb1)); + cr_assert(zb1); + zb2 = calloc(num_addrs, sizeof(*zb2)); + cr_assert(zb2); + zbd1.data = calloc(num_addrs, sizeof(*zbd1.data)); + cr_assert(zbd1.data); + zbd2.data = calloc(num_addrs, sizeof(*zbd2.data)); + cr_assert(zbd2.data); + + /* Acquire group ids */ + _getgroup_multi(num_addrs, zb1, 0); 
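+	/* the second zb group created on this endpoint expects grpid 1 */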
+ _getgroup_multi(num_addrs, zb2, 1); + + data1 = (rand() & ((1 << 29) - 1)) | (1 << 28); + data2 = (rand() & ((1 << 29) - 1)) | (1 << 28); + + for (i = 0; i < num_addrs; i++) { + cxip_zbcoll_set_user_cb(zb1[i], bcast_func, &zbd1); + cxip_zbcoll_set_user_cb(zb2[i], bcast_func, &zbd2); + } + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_broadcast(zb1[i], &data1); + cr_assert(!ret, "zb1 broadcast[%d]=%s\n", i, fi_strerror(-ret)); + + ret = cxip_zbcoll_broadcast(zb2[i], &data2); + cr_assert(!ret, "zb2 broadcast[%d]=%s\n", i, fi_strerror(-ret)); + } + + /* Poll until all are complete */ + ret = _await_complete_all(zb1, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb1 broadcast = %s\n", + fi_strerror(-ret)); + ret = _await_complete_all(zb2, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb2 broadcast = %s\n", + fi_strerror(-ret)); + + /* Validate data */ + cr_assert(zbd1.count == num_addrs, "count=%d != %d\n", + zbd1.count, num_addrs); + for (i = 0; i < num_addrs; i++) { + cr_assert(data1 == zbd1.data[i], + "data1=%ld != zbd1[%d]=%ld\n", + data1, i, zbd1.data[i]); + } + cr_assert(zbd2.count == num_addrs, "count=%d != %d\n", + zbd2.count, num_addrs); + for (i = 0; i < zbd2.count; i++) { + cr_assert(data2 == zbd2.data[i], + "data2=%ld != zbd2[%d]=%ld\n", + data2, i, zbd2.data[i]); + } + + _free_getgroup_multi(num_addrs, zb2); + _free_getgroup_multi(num_addrs, zb1); +} + +/*****************************************************************/ +/** + * @brief Perform a simulated reduce. + * + * This exercises the basic reduce operation, the user callback, and the + * non-concurrency lockout. The user callback captures all of the results and + * ensures they all match the reduce value. + * + * This is done in a single thread, so it tests only a single reduce across + * multiple addrs. It randomizes the nid processing order, and performs multiple + * reductions to uncover any ordering issues. 
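+ * The reduction checked here is a bitwise AND; every contribution has
+ * its two low bits and bit 28 forced on, so the expected result is
+ * never zero.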
+ */ +struct reduce_data { + uint64_t *data; + int count; +}; + +static void reduce_func(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct reduce_data *data = (struct reduce_data *)usrptr; + int i; + + if (zb->simrank >= 0) { + data->data[zb->simrank] = *zb->state[zb->simrank].dataptr; + } else { + for (i = 0; i < zb->simcount; i++) + data->data[i] = *zb->state[i].dataptr; + } + data->count++; +} + +/* Test reduce single-zb simulation */ +Test(ctrl, zb_reduce) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + struct reduce_data zbd = {}; + int i, rep, ret; + uint64_t *data, rslt; + + int num_addrs = 25; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == num_addrs, + "zb->simcount = %d, != %d\n", zb->simcount, num_addrs); + _addr_shuffle(zb, true); + + data = calloc(num_addrs, sizeof(uint64_t)); + + /* Acquire a group id */ + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + ret = _await_complete(zb); + cr_assert(ret == 0, "getgroup done = %s\n", fi_strerror(-ret)); + + cxip_zbcoll_set_user_cb(zb, reduce_func, &zbd); + + memset(&zbd, 0, sizeof(zbd)); + zbd.data = calloc(num_addrs, sizeof(uint64_t)); + + for (rep = 0; rep < 20; rep++) { + _addr_shuffle(zb, true); + memset(zbd.data, -1, num_addrs*sizeof(uint64_t)); + /* Perform a reduce */ + for (i = 0; i < num_addrs; i++) { + data[i] = (rand() & ((1 << 29) - 1)) | (1 << 28); + data[i] |= 3; + } + rslt = -1L; + for (i = 1; i < num_addrs; i++) { + rslt &= data[i]; + } + ret = cxip_zbcoll_reduce(zb, data); + cr_assert(ret == 0, "%d reduce = %s\n", + rep, fi_strerror(-ret)); + /* Try again immediately, should fail */ + ret = cxip_zbcoll_reduce(zb, data); + cr_assert(ret == -FI_EAGAIN, "%d reduce = %s\n", + rep, fi_strerror(-ret)); + /* Poll until complete */ + ret = _await_complete(zb); + cr_assert(ret == FI_SUCCESS, "%d reduce = %s\n", + rep, fi_strerror(-ret)); + /* Validate the data */ + for (i = 0; i < num_addrs; i++) + cr_assert(zbd.data[i] == rslt, "[%d] %lx != %lx\n", + i, zbd.data[i], rslt); + } + cr_assert(zbd.count == rep, "zbd.count=%d rep=%d\n", + zbd.count, rep); + + uint32_t dsc, err, ack, rcv; + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + cr_assert(dsc == 0 && err == 0, + "FAILED dsc=%d err=%d ack=%d rcv=%d\n", + dsc, err, ack, rcv); + + free(zbd.data); + free(data); + cxip_zbcoll_free(zb); +} + +/* Test reduce multi-zb simulation */ +Test(ctrl, zb_reduce2) +{ + struct cxip_zbcoll_obj **zb1, **zb2; + int num_addrs = 11; // arbitrary + uint64_t data1, data2; + struct reduce_data zbd1 = {}; + struct reduce_data zbd2 = {}; + int i, ret; + + zb1 = calloc(num_addrs, sizeof(*zb1)); + cr_assert(zb1); + zb2 = calloc(num_addrs, sizeof(*zb2)); + cr_assert(zb2); + zbd1.data = calloc(num_addrs, sizeof(*zbd1.data)); + cr_assert(zbd1.data); + zbd2.data = calloc(num_addrs, sizeof(*zbd2.data)); + cr_assert(zbd2.data); + + _getgroup_multi(num_addrs, zb1, 0); + _getgroup_multi(num_addrs, zb2, 1); + + data1 = (rand() & ((1 << 29) - 1)) | (1 << 28); + data2 = (rand() & ((1 << 29) - 1)) | (1 << 28); + + for (i = 0; i < num_addrs; i++) { + cxip_zbcoll_set_user_cb(zb1[i], reduce_func, &zbd1); + cxip_zbcoll_set_user_cb(zb2[i], reduce_func, &zbd2); + } + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_reduce(zb1[i], &data1); + 
cr_assert(!ret, "zb1 reduce[%d]=%s\n", i, fi_strerror(-ret)); + + ret = cxip_zbcoll_reduce(zb2[i], &data2); + cr_assert(!ret, "zb2 reduce[%d]=%s\n", i, fi_strerror(-ret)); + } + + /* Poll until all are complete */ + ret = _await_complete_all(zb1, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb1 reduce = %s\n", + fi_strerror(-ret)); + ret = _await_complete_all(zb2, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb2 reduce = %s\n", + fi_strerror(-ret)); + + /* Validate data */ + cr_assert(zbd1.count == num_addrs, "count=%d != %d\n", + zbd1.count, num_addrs); + for (i = 0; i < num_addrs; i++) { + cr_assert(data1 == zbd1.data[i], + "data1=%ld != zbd1[%d]=%ld\n", + data1, i, zbd1.data[i]); + } + cr_assert(zbd2.count == num_addrs, "count=%d != %d\n", + zbd2.count, num_addrs); + for (i = 0; i < zbd2.count; i++) { + cr_assert(data2 == zbd2.data[i], + "data2=%ld != zbd2[%d]=%ld\n", + data2, i, zbd2.data[i]); + } + + _free_getgroup_multi(num_addrs, zb2); + _free_getgroup_multi(num_addrs, zb1); +} diff --git a/prov/cxi/test/cuda.c b/prov/cxi/test/cuda.c new file mode 100644 index 00000000000..5398dcd98f3 --- /dev/null +++ b/prov/cxi/test/cuda.c @@ -0,0 +1,425 @@ +/* + * (C) Copyright 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +#define MAX_MSG_SIZE 1048576U +#define MAX_BUF_OFFSET 65536U + +unsigned int seed; + +static void cuda_init(void) +{ + enable_cxi_hmem_ops = 0; + seed = time(NULL); + srand(seed); +} + +TestSuite(cuda, .timeout = CXIT_DEFAULT_TIMEOUT, .init = cuda_init); + +static void cuda_message_runner(void *cuda_send_buf, void *cuda_recv_buf, + size_t buf_size, bool device_only_mem, + bool unexpected) +{ + int ret; + char *send_buf; + char *recv_buf; + struct fi_cq_tagged_entry cqe; + int i; + cudaError_t cuda_ret; + int j; + + cxit_setup_msg(); + + /* For device only memcpy, send and recv buffer as used for data + validation. 
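+	 * In that case, host staging buffers are allocated here: the random
+	 * payload is generated in host memory and copied to the device with
+	 * cudaMemcpy() before sending, and the received device buffer is
+	 * copied back to the host afterwards so it can be compared byte by
+	 * byte against what was sent.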
+ */ + if (device_only_mem) { + send_buf = malloc(buf_size); + cr_assert_neq(send_buf, NULL, "Failed to allocate memory"); + + recv_buf = calloc(1, buf_size); + cr_assert_neq(send_buf, NULL, "Failed to allocate memory"); + } else { + send_buf = cuda_send_buf; + recv_buf = cuda_recv_buf; + } + + for (j = 0; j < 2; j++) { + + ret = open("/dev/urandom", O_RDONLY); + cr_assert_neq(ret, -1, "open failed: %d", -errno); + read(ret, send_buf, buf_size); + close(ret); + + if (device_only_mem) { + cuda_ret = cudaMemcpy(cuda_send_buf, send_buf, buf_size, + cudaMemcpyHostToDevice); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMemcpy failed: %d", + cuda_ret); + } + + + if (unexpected) { + ret = fi_send(cxit_ep, cuda_send_buf, buf_size, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(cxit_ep, cuda_recv_buf, buf_size, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + } else { + ret = fi_recv(cxit_ep, cuda_recv_buf, buf_size, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + ret = fi_send(cxit_ep, cuda_send_buf, buf_size, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + } + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + if (device_only_mem) { + cuda_ret = cudaMemcpy(recv_buf, cuda_recv_buf, buf_size, + cudaMemcpyDeviceToHost); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMemcpy failed: %d", + cuda_ret); + } + + for (i = 0; i < buf_size; i++) + cr_assert_eq(send_buf[i], recv_buf[i], + "Data corruption at byte %d seed %u iter %d", i, seed, j); + } + + if (device_only_mem) { + free(recv_buf); + free(send_buf); + } + + cxit_teardown_msg(); +} + +static void cuda_dev_memory_test(size_t buf_size, size_t buf_offset, + bool unexpected, bool hmem_dev_reg) +{ + cudaError_t cuda_ret; + void *cuda_send_buf; + void *cuda_recv_buf; + int ret; + + if (hmem_dev_reg) + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + else + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + /* cuda buffers will be used for RDMA. */ + cuda_ret = cudaMalloc(&cuda_send_buf, buf_size + buf_offset); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + cuda_ret = cudaMalloc(&cuda_recv_buf, buf_size + buf_offset); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + int attr_value = 1; + cuPointerSetAttribute(&attr_value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)cuda_send_buf); + + cuda_message_runner((void *)((char *)cuda_send_buf + buf_offset), + (void *)((char *)cuda_recv_buf + buf_offset), + buf_size, true, unexpected); + + cuda_ret = cudaFree(cuda_recv_buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); + + cuda_ret = cudaFree(cuda_send_buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); + +} + +/* Test messaging using rendezvous, device memory, and HMEM device memory + * registration for load/store access. 
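+ * The transfer size is drawn at random but constrained to exceed 64 KiB,
+ * which is intended to steer the message onto the rendezvous path, and a
+ * random offset of up to MAX_BUF_OFFSET bytes is applied to both device
+ * buffers to exercise unaligned GPU addresses.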
+ */ +Test(cuda, messaging_devMemory_rdvz_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, true); +} + +/* Test messaging using eager, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(cuda, messaging_devMemory_eager_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, true); +} + +/* Test messaging using IDC, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(cuda, messaging_devMemory_idc_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, true); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_rdvz_unexpected_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, true); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_eager_unexpected_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, true); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_idc_unexpected_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, true); +} + +/* Test messaging using rendezvous, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(cuda, messaging_devMemory_rdvz_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, false); +} + +/* Test messaging using eager, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(cuda, messaging_devMemory_eager_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, false); +} + +/* Test messaging using IDC, device memory, and without HMEM device memory + * registration for load/store access. 
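+ * Payloads are kept under 128 bytes, the range this suite uses to target
+ * the IDC send path; the eager and rendezvous variants above cover the
+ * larger sizes.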
+ */ +Test(cuda, messaging_devMemory_idc_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, false); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_rdvz_unexpected_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, false); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_eager_unexpected_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, false); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_idc_unexpected_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, false); +} + +static void verify_dev_reg_handle(bool hmem_dev_reg) +{ + int ret; + void *buf; + cudaError_t cuda_ret; + struct fid_mr *fid_mr; + size_t buf_size = 1024; + struct cxip_mr *mr; + + cxit_setup_msg(); + + cuda_ret = cudaMalloc(&buf, buf_size); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + ret = fi_mr_reg(cxit_domain, buf, buf_size, FI_READ, 0, 0x123, 0, + &fid_mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + mr = container_of(fid_mr, struct cxip_mr, mr_fid); + + cr_assert_eq(mr->md->handle_valid, hmem_dev_reg, + "Bad cxip_md handle_valid"); + cr_assert_eq(mr->md->info.iface, FI_HMEM_CUDA, + "Invalid CXIP MD iface: %d", mr->md->info.iface); + + ret = fi_close(&fid_mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + cuda_ret = cudaFree(buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); + + cxit_teardown_msg(); +} + +/* Verify MD handle is false. */ +Test(cuda, verify_noHmemDevReg) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(false); +} + +/* Verify MD handle is true. 
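+ * That is, with FI_CXI_DISABLE_HMEM_DEV_REGISTER set to "0", the CUDA
+ * buffer is expected to be registered for device load/store access and the
+ * resulting cxip_md should report handle_valid == true.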
*/ +Test(cuda, verify_hmemDevReg) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(true); +} diff --git a/prov/cxi/test/curl.c b/prov/cxi/test/curl.c new file mode 100644 index 00000000000..143a8b1fe92 --- /dev/null +++ b/prov/cxi/test/curl.c @@ -0,0 +1,546 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cxip.h" + +/* Parsed arguments */ +static bool autotest = false; +static bool verbose = false; +static char *cmd = "get"; +static char *data = NULL; +static int parallel = 100; +static char *server = "http://127.0.0.1:5000"; +static char *endpoint = "/test"; +static char serverpath[1024]; + +/* Measure timings */ +static inline void tmark(struct timespec *t0) +{ + clock_gettime(CLOCK_MONOTONIC, t0); +} + +static inline void tmeas(struct timespec *t0) +{ + struct timespec t1; + + clock_gettime(CLOCK_MONOTONIC, &t1); + if (t1.tv_nsec < t0->tv_nsec) { + t1.tv_nsec += 1000000000; + t1.tv_sec -= 1; + } + t0->tv_nsec = t1.tv_nsec - t0->tv_nsec; + t0->tv_sec = t1.tv_sec - t0->tv_sec; +} + +#define failtest(action, fmt, ...) \ + do { fprintf(stderr, fmt, ##__VA_ARGS__); action; } while (0) + +/** + * @brief Exercise the json value parser. + * + * @return int : error code + */ +int auto_test_cxip_json(void) +{ + /* Two test objects to parse */ + char json1[] = + "{" + "'string': 'string'," + "'double': 0.1234," + "'int64': 9000000000," + "'int': 2000000000," + "'bool': true," + "'object': {" + "'one': 1," + "'two': 2," + "}," + "'array': [0, 1, 2, 3]," + "'nestedarr': [" + "[0, 1, 2, 3]," + "[4, 5, 6, 7]" + "]," + "'nestedobj': [" + "{" + "'one': 1," + "'two': 2" + "}," + "{" + "'three': 3," + "'four': 4" + "}" + "]" + "}"; + char json2[] = "[0, 1, 2, 3]"; + + json_object *json_obj; + const char *key; + const char *string_val; + double double_val; + int64_t int64_val; + int int_val; + bool bool_val; + int i; + + /* Change embedded single quotes to double quotes */ + single_to_double_quote(json1); + single_to_double_quote(json2); + + /* Test parsing of json1 */ + if (!(json_obj = json_tokener_parse(json1))) + failtest(return 1, "json1 could not be parsed\n"); + + key = "string"; + if (cxip_json_string(key, json_obj, &string_val)) + failtest(return 1, "'%s' key not found\n", key); + if (strcmp(string_val, "string")) + failtest(return 1, "'%s' returned '%s'\n", key, string_val); + + key = "double"; + if (cxip_json_double(key, json_obj, &double_val)) + failtest(return 1, "'%s' key not found\n", key); + if (double_val != 0.1234) + failtest(return 1, "'%s' returned %f\n", key, double_val); + + key = "int64"; + if (cxip_json_int64(key, json_obj, &int64_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int64_val != 9000000000) + failtest(return 1, "'%s' returned 0x%lx\n", key, int64_val); + + key = "int"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 2000000000) + failtest(return 1, "'%s' returned 0x%x\n", key, int_val); + + key = "bool"; + if (cxip_json_bool(key, json_obj, &bool_val)) + failtest(return 1, "'%s' key not found\n", key); + if (bool_val != true) + failtest(return 1, "'%s' returned %d\n", key, bool_val); + + key = "object.one"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, 
"'%s' key not found\n", key); + if (int_val != 1) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "object.two"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 2) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "nestedobj[0].one"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 1) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "nestedobj[0].two"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 2) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "nestedobj[1].three"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 3) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "nestedobj[1].four"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 4) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + for (i = 0; i < 4; i++) { + char key[256]; + snprintf(key, sizeof(key), "array[%d]", i); + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != i) + failtest(return 1, "'%s' returned %d\n", key, int_val); + } + + for (i = 0; i < 8; i++) { + char key[256]; + snprintf(key, sizeof(key), "nestedarr[%d][%d]", i/4, i%4); + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != i) + failtest(return 1, "'%s' returned %d\n", key, int_val); + } + json_object_put(json_obj); + + /* Test parsing of json2 */ + if (!(json_obj = json_tokener_parse(json2))) + failtest(return 1, "json2 could not be parsed\n"); + for (i = 0; i < 4; i++) { + char key[256]; + snprintf(key, sizeof(key), "[%d]", i); + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != i) + failtest(return 1, "'%s' returned %d\n", key, int_val); + } + json_object_put(json_obj); + + if (verbose) + printf("PASSED JSON tests\n"); + return 0; +} + +/** + * @brief Simple completion callback. + * + * This expects an (int) usrptr to be registered with the CURL initiation, + * and simply increments it every time a CURL operation completes. + * + * @param handle : CURL operation handle + */ +static void complete(struct cxip_curl_handle *handle) +{ + int *counter = (int *)handle->usrptr; + + (*counter)++; +} + +/** + * @brief Exercise the CURL code. + * + * The flask_testsrv.py code must be running to perform this test. It will + * pass with a warning message if the server is not found. 
+ * + * @return int : error code + */ +int auto_test_cxip_curl(void) +{ + struct cxip_curl_handle *handle; + struct timespec t0, t1; + json_object *json_obj; + int op, ret; + char tag[256]; + int counter; + + /* confirm that the server is running : status is 0 if no server */ + ret = cxip_curl_perform(serverpath, NULL, NULL, 0, CURL_GET, false, + complete, &counter); + do { + ret = cxip_curl_progress(&handle); + } while (ret == -FI_EAGAIN); + if (ret) { + fprintf(stderr, "cxip_curl_perform() returned %s\n", + fi_strerror(-ret)); + return ret; + } + if (!handle) { + fprintf(stderr, "cxip_curl_perform() returned no handle\n"); + return -1; + } + if (handle->status == 0) { + fprintf(stderr, "SERVER at %s is not running\n", serverpath); + cxip_curl_free(handle); + return 0; + } + cxip_curl_free(handle); + + /* Walk through all of the test-supported operations */ + for (op = CURL_GET; op < CURL_MAX; op++) { + const char *opname = cxip_curl_opname(op); + bool reordered = false; + int nextseqid = 0; + int seqid; + int i, err = 0; + + /* reset the callback counter to zero on every opcode */ + counter = 0; + + if (verbose) + printf("\nOperation %s\n", cxip_curl_opname(op)); + + /* Run 'parallel' operations concurrently */ + tmark(&t0); + for (i = 0; i < parallel; i++) { + sprintf(tag, "{\"seqid\": %d}", i); + ret = cxip_curl_perform(serverpath, tag, NULL, 0, + op, false, complete, &counter); + if (ret != 0) + fprintf(stderr, "cxip_curl_perform(%d) = %s\n", + i, fi_strerror(ret)); + } + tmeas(&t0); + + /* Wait for all initiated operations to finish */ + tmark(&t1); + while (i-- > 0) { + do { + sched_yield(); + ret = cxip_curl_progress(&handle); + } while (ret == -FI_EAGAIN); + if (ret) { + /* should not happen, as we are counting */ + fprintf(stderr, "cxip_curl_progress() %s\n", + fi_strerror(-ret)); + err++; + continue; + } + if (!handle) { + /* should NEVER happen with good return */ + fprintf(stderr, + "cxip_curl_progress() no handle\n"); + err++; + continue; + } + if (handle->status != 200) { + /* our test server should generate 200 */ + fprintf(stderr, "status=%ld\n", handle->status); + err++; + goto free_handle; + } + if (!handle->response) { + /* CURL should not return a NULL response */ + fprintf(stderr, "NULL response\n"); + err++; + goto free_handle; + } + + /* Test server should return: + * { + * "operation": , + * "data": {"seqid": } + * } + */ + const char *str; + json_obj = json_tokener_parse(handle->response); + if (! 
json_obj) { + fprintf(stderr, "%s: JSON unparseable\n", + opname); + err++; + goto free_handle; + } + + if (cxip_json_string("operation", json_obj, &str)) { + fprintf(stderr, "no 'operation' field\n"); + err++; + goto free_json; + } + + if (strcmp(str, opname)) { + fprintf(stderr, "op=%s exp %s\n", str, opname); + err++; + goto free_json; + } + + /* For GET, seqid is is meaningless */ + if (op == CURL_GET) + goto free_json; + + if (cxip_json_int("data.seqid", json_obj, &seqid)) { + fprintf(stderr, "op=%s no seqid\n", opname); + err++; + goto free_json; + } + + /* This confirms that CURL does not order responses */ + if (seqid != nextseqid) + reordered = true; +free_json: + json_object_put(json_obj); +free_handle: + cxip_curl_free(handle); + nextseqid++; + } + tmeas(&t1); + + /* Should be no strays */ + ret = cxip_curl_progress(&handle); + if (ret != -FI_ENODATA) { + fprintf(stderr, "op=%s stray handles\n", opname); + err++; + } + + /* Callback counter should match number of calls */ + if (counter != parallel) { + fprintf(stderr, "op=%s count=%d, exp %d\n", + opname, counter, parallel); + err++; + } + + if (verbose) { + printf(" iterations(%d)\n", parallel); + printf(" counter (%d)\n", counter); + printf(" reordered (%s)\n", reordered ? "true" : "false"); + printf(" errors (%d)\n", err); + printf(" issue (%ld.%09lds)\n", t0.tv_sec, t0.tv_nsec); + printf(" response (%ld.%09lds)\n", t1.tv_sec, t1.tv_nsec); + } + + if (err) + failtest(return 1, "FAILED CURL tests\n"); + } + if (verbose) + printf("\n"); + + printf("PASSED CURL tests\n"); + return 0; +} + +/** + * @brief Perform a manual (command-line arguments) test + * + * @return int : error code + */ +int do_test(void) +{ + struct cxip_curl_handle *handle; + struct timespec t0; + enum curl_ops op; + int ret; + + if (!strcasecmp(cmd, "get")) + op = CURL_GET; + else if (!strcasecmp(cmd, "put")) + op = CURL_PUT; + else if (!strcasecmp(cmd, "post")) + op = CURL_POST; + else if (!strcasecmp(cmd, "patch")) + op = CURL_PATCH; + else if (!strcasecmp(cmd, "delete")) + op = CURL_DELETE; + else { + fprintf(stderr, "Bad HTTP operation \"%s\"", cmd); + return 1; + } + + tmark(&t0); + ret = cxip_curl_perform(serverpath, data, NULL, 0, op, verbose, 0, 0); + if (ret) { + fprintf(stderr, "cxip_curl_perform() returned %d\n", ret); + return ret; + } + + do { + sched_yield(); + ret = cxip_curl_progress(&handle); + } while (ret == -FI_EAGAIN); + tmeas(&t0); + + if (ret) + failtest(return 1, "cxip_curl_progress() ret %d\n", ret); + if (!handle) + failtest(return 1, "cxip_curl_progress() ret no handle\n"); + if (!handle->status) { + fprintf(stderr, "SERVER at %s is not running\n", serverpath); + return 0; + } + + printf("\n"); + printf("endpoint = %s\n", handle->endpoint); + printf("time = %ld.%09lds\n", t0.tv_sec, t0.tv_nsec); + printf("status = %ld\n", handle->status); + printf("request------------\n%s\n", handle->request); + printf("response-----------\n%s\n", handle->response); + + return 0; +} + +int main(int argc, char **argv) { + static char *opts = "c:d:e:p:r:s:hv"; + static struct option lopts[] = { + {"help", no_argument, NULL, 'h'}, + {"auto", no_argument, NULL, 'a'}, + {"verbose", no_argument, NULL, 'v'}, + {"command", required_argument, NULL, 'c'}, + {"data", required_argument, NULL, 'd'}, + {"parallel", required_argument, NULL, 'p'}, + {"server", required_argument, NULL, 's'}, + {"endpoint", required_argument, NULL, 'e'}, + {0, 0, 0, 0} + }; + static const char *help = + "\nExercise cxip_curl module:\n" + " --auto Perform automated test suite\n" + " 
-c, --command Define HTTP command\n" + " -d, --data Define HTTP payload (json)\n" + " -p, --parallel Define level of auto-test parallism\n" + " -s, --server REST server address" + " (default \"%s\")\n" + " -e, --endpoint REST server endpoint\n" + " -v, --verbose Verbose operation\n" + " -h, --help Display help\n"; + + int ret = 1; + + while (1) { + int idx, c; + + c = getopt_long(argc, argv, opts, lopts, &idx); + if (c == -1) + break; + switch (c) { + case 0: // long option, all map to single characters + break; + case 'a': + autotest = 1; + break; + case 'c': + cmd = strdup(optarg); + break; + case 'd': + data = strdup(optarg); + break; + case 'p': + parallel = atoi(optarg); + break; + case 's': + server = strdup(optarg); + break; + case 'e': + endpoint = strdup(optarg); + break; + case 'v': + verbose = true; + break; + case 'h': + ret = 0; + // fall through + default: + printf(help, server); + return ret; + } + } + + snprintf(serverpath, sizeof(serverpath), "%s%s", server, endpoint); + + if (cxip_curl_init()) { + fprintf(stderr, "CURL could not be initialized\n"); + return ret; + } + + if (autotest) { + ret = auto_test_cxip_json() | + auto_test_cxip_curl(); + } else { + ret = do_test(); + } + + cxip_curl_fini(); + return ret; +} diff --git a/prov/cxi/test/cxip_test_common.c b/prov/cxi/test/cxip_test_common.c new file mode 100644 index 00000000000..999a12627df --- /dev/null +++ b/prov/cxi/test/cxip_test_common.c @@ -0,0 +1,1105 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018,2020-2022 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cxip_test_common.h" + +struct fi_info *cxit_fi_hints; +struct fi_info *cxit_fi; +struct fid_fabric *cxit_fabric; +struct fid_domain *cxit_domain; +struct fi_cxi_dom_ops *dom_ops; +struct fid_ep *cxit_ep; +struct fid_ep *cxit_tx_alias_ep; +struct cxip_addr cxit_ep_addr; +fi_addr_t cxit_ep_fi_addr; +struct fi_eq_attr cxit_eq_attr = {}; +struct fid_eq *cxit_eq; +struct fi_cq_attr cxit_tx_cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + .size = 16384 +}; +struct fi_cq_attr cxit_rx_cq_attr = { .format = FI_CQ_FORMAT_TAGGED }; +uint64_t cxit_eq_bind_flags = 0; +uint64_t cxit_tx_cq_bind_flags = FI_TRANSMIT; +uint64_t cxit_rx_cq_bind_flags = FI_RECV; +struct fid_cq *cxit_tx_cq, *cxit_rx_cq; +struct fi_cntr_attr cxit_cntr_attr = {}; +struct fid_cntr *cxit_send_cntr, *cxit_recv_cntr; +struct fid_cntr *cxit_read_cntr, *cxit_write_cntr; +struct fid_cntr *cxit_rem_cntr; +struct fi_av_attr cxit_av_attr; +struct fid_av *cxit_av; +struct cxit_coll_mc_list cxit_coll_mc_list = { .count = 5 }; +char *cxit_node, *cxit_service; +uint64_t cxit_flags; +int cxit_n_ifs; +struct fid_av_set *cxit_av_set; +struct fid_mc *cxit_mc; +bool cxit_prov_key; +int s_page_size; +bool enable_cxi_hmem_ops = 1; + +/* Get _SC_PAGESIZE */ +static void cxit_set_page_size(void) +{ + if (!s_page_size) + s_page_size = sysconf(_SC_PAGESIZE); +} + +int cxit_dom_read_cntr(unsigned int cntr, uint64_t *value, + struct timespec *ts, bool sync) +{ + int ret; + struct timespec start; + struct timespec delta; + + /* Map counters if not already mapped */ + ret = dom_ops->cntr_read(&cxit_domain->fid, cntr, value, &start); + if (ret || !sync) + goto done; + + /* Wait for an update to occur to read latest counts */ + do { + usleep(100); + ret = dom_ops->cntr_read(&cxit_domain->fid, cntr, value, + &delta); + } while (!ret && delta.tv_sec == start.tv_sec && + delta.tv_nsec == 
start.tv_nsec); + +done: + if (ts && !ret) + *ts = sync ? delta : start; + + return ret; +} + +static ssize_t copy_from_hmem_iov(void *dest, size_t size, + enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset) +{ + size_t cpy_size = MIN(size, hmem_iov->iov_len); + + assert(iface == FI_HMEM_SYSTEM); + assert(hmem_iov_count == 1); + assert(hmem_iov_offset == 0); + + memcpy(dest, hmem_iov->iov_base, cpy_size); + + return cpy_size; +} + +static ssize_t copy_to_hmem_iov(enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset, const void *src, + size_t size) +{ + size_t cpy_size = MIN(size, hmem_iov->iov_len); + + assert(iface == FI_HMEM_SYSTEM); + assert(hmem_iov_count == 1); + assert(hmem_iov_offset == 0); + + memcpy(hmem_iov->iov_base, src, cpy_size); + + return cpy_size; +} + +struct fi_hmem_override_ops cxi_hmem_ops = { + .copy_from_hmem_iov = copy_from_hmem_iov, + .copy_to_hmem_iov = copy_to_hmem_iov, +}; + +void cxit_create_fabric_info(void) +{ + int ret; + + if (cxit_fi) + return; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &cxit_fi); + cr_assert(ret == FI_SUCCESS, "fi_getinfo"); + cxit_fi->ep_attr->tx_ctx_cnt = cxit_fi->domain_attr->tx_ctx_cnt; + cxit_fi->ep_attr->rx_ctx_cnt = cxit_fi->domain_attr->rx_ctx_cnt; + + /* Add in FI_SOURCE and FI_SOURCE_ERR to include all capabilities */ + cxit_fi->caps |= FI_SOURCE | FI_SOURCE_ERR; + cxit_fi->rx_attr->caps |= FI_SOURCE | FI_SOURCE_ERR; +} + +void cxit_destroy_fabric_info(void) +{ + fi_freeinfo(cxit_fi); + cxit_fi = NULL; +} + +void cxit_create_fabric(void) +{ + int ret; + + if (cxit_fabric) + return; + + ret = fi_fabric(cxit_fi->fabric_attr, &cxit_fabric, NULL); + cr_assert(ret == FI_SUCCESS, "fi_fabric"); +} + +void cxit_destroy_fabric(void) +{ + int ret; + + ret = fi_close(&cxit_fabric->fid); + cr_assert(ret == FI_SUCCESS, "fi_close fabric"); + cxit_fabric = NULL; +} + +void cxit_create_domain(void) +{ + int ret; + + if (cxit_domain) + return; + + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + cr_assert(ret == FI_SUCCESS, "fi_domain"); + + /* Should be able to open either v1 - v6 */ + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_1, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v1"); + cr_assert(dom_ops->cntr_read != NULL, "v1 function returned"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_2, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v2"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL, "V2 functions returned"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v3"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL && + dom_ops->enable_hybrid_mr_desc != NULL, + "V3 functions returned"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_6, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v6"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL && + dom_ops->enable_hybrid_mr_desc != NULL && + dom_ops->ep_get_unexp_msgs != NULL && + dom_ops->get_dwq_depth != NULL && + dom_ops->enable_mr_match_events != NULL, + "V3 functions returned"); + + if (enable_cxi_hmem_ops) { + ret = fi_set_ops(&cxit_domain->fid, FI_SET_OPS_HMEM_OVERRIDE, 0, + &cxi_hmem_ops, 
NULL); + cr_assert(ret == FI_SUCCESS, "fi_set_ops"); + } +} + +void cxit_destroy_domain(void) +{ + int ret; + + ret = fi_close(&cxit_domain->fid); + cr_assert(ret == FI_SUCCESS, "fi_close domain. %d", ret); + cxit_domain = NULL; +} + +void cxit_create_ep(void) +{ + int ret; + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + cr_assert_not_null(cxit_ep); +} + +void cxit_destroy_ep(void) +{ + int ret; + + if (cxit_ep != NULL) { + ret = fi_close(&cxit_ep->fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint = %d", ret); + cxit_ep = NULL; + } +} + +void cxit_create_eq(void) +{ + struct fi_eq_attr attr = { + .size = 32, + .flags = FI_WRITE, + .wait_obj = FI_WAIT_NONE + }; + int ret; + + ret = fi_eq_open(cxit_fabric, &attr, &cxit_eq, NULL); + cr_assert(ret == FI_SUCCESS, "fi_eq_open failed %d", ret); + cr_assert_not_null(cxit_eq, "fi_eq_open returned NULL eq"); +} + +void cxit_destroy_eq(void) +{ + int ret; + + ret = fi_close(&cxit_eq->fid); + cr_assert(ret == FI_SUCCESS, "fi_close EQ failed %d", ret); + cxit_eq = NULL; +} + +void cxit_bind_eq(void) +{ + int ret; + + /* NOTE: ofi implementation does not allow any flags */ + ret = fi_ep_bind(cxit_ep, &cxit_eq->fid, cxit_eq_bind_flags); + cr_assert(!ret, "fi_ep_bind EQ"); +} + +void cxit_create_cqs(void) +{ + int ret; + + ret = fi_cq_open(cxit_domain, &cxit_tx_cq_attr, &cxit_tx_cq, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open (TX)"); + + ret = fi_cq_open(cxit_domain, &cxit_rx_cq_attr, &cxit_rx_cq, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open (RX)"); +} + +void cxit_destroy_cqs(void) +{ + int ret; + + ret = fi_close(&cxit_rx_cq->fid); + cr_assert(ret == FI_SUCCESS, "fi_close RX CQ"); + cxit_rx_cq = NULL; + + ret = fi_close(&cxit_tx_cq->fid); + cr_assert(ret == FI_SUCCESS, "fi_close TX CQ"); + cxit_tx_cq = NULL; +} + +void cxit_bind_cqs(void) +{ + int ret; + + ret = fi_ep_bind(cxit_ep, &cxit_tx_cq->fid, cxit_tx_cq_bind_flags); + cr_assert(!ret, "fi_ep_bind TX CQ"); + + ret = fi_ep_bind(cxit_ep, &cxit_rx_cq->fid, cxit_rx_cq_bind_flags); + cr_assert(!ret, "fi_ep_bind RX CQ"); +} + +void cxit_create_rem_cntrs(void) +{ + int ret; + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_rem_cntr, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (rem)"); +} + +void cxit_create_local_cntrs(void) +{ + int ret; + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_send_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (send)"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_recv_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (recv)"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_read_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (read)"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_write_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (write)"); +} + +void cxit_create_cntrs(void) +{ + cxit_create_local_cntrs(); + cxit_create_rem_cntrs(); +} + +void cxit_destroy_cntrs(void) +{ + int ret; + + if (cxit_send_cntr) { + ret = fi_close(&cxit_send_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close send_cntr"); + cxit_send_cntr = NULL; + } + + if (cxit_recv_cntr) { + ret = fi_close(&cxit_recv_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close recv_cntr"); + cxit_recv_cntr = NULL; + } + + if (cxit_read_cntr) { + ret = fi_close(&cxit_read_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close read_cntr"); + cxit_read_cntr = NULL; + } + + if (cxit_write_cntr) { + ret = fi_close(&cxit_write_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close 
write_cntr"); + cxit_write_cntr = NULL; + } + + if (cxit_rem_cntr) { + ret = fi_close(&cxit_rem_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close rem_cntr"); + cxit_rem_cntr = NULL; + } +} + +void cxit_bind_cntrs(void) +{ + int ret; + + if (cxit_send_cntr) { + ret = fi_ep_bind(cxit_ep, &cxit_send_cntr->fid, FI_SEND); + cr_assert(!ret, "fi_ep_bind send_cntr"); + } + + if (cxit_recv_cntr) { + ret = fi_ep_bind(cxit_ep, &cxit_recv_cntr->fid, FI_RECV); + cr_assert(!ret, "fi_ep_bind recv_cntr"); + } + + if (cxit_read_cntr) { + ret = fi_ep_bind(cxit_ep, &cxit_read_cntr->fid, FI_READ); + cr_assert(!ret, "fi_ep_bind read_cntr"); + } + + if (cxit_write_cntr) { + ret = fi_ep_bind(cxit_ep, &cxit_write_cntr->fid, FI_WRITE); + cr_assert(!ret, "fi_ep_bind write_cntr"); + } +} + +void cxit_create_av(void) +{ + int ret; + + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + cr_assert(ret == FI_SUCCESS, "fi_av_open"); +} + +void cxit_destroy_av(void) +{ + int ret; + + ret = fi_close(&cxit_av->fid); + cr_assert(ret == FI_SUCCESS, "fi_close AV. %d", ret); + cxit_av = NULL; +} + +void cxit_bind_av(void) +{ + int ret; + + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + cr_assert(!ret, "fi_ep_bind AV"); +} + +void cxit_init(void) +{ + struct slist_entry *entry, *prev __attribute__((unused)); + int ret; + struct fi_info *hints = cxit_allocinfo(); + struct fi_info *info; + + setlinebuf(stdout); + cxit_set_page_size(); + + /* Force provider init */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, hints, + &info); + cr_assert(ret == FI_SUCCESS); + + slist_foreach(&cxip_if_list, entry, prev) { + cxit_n_ifs++; + } + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +struct fi_info *cxit_allocinfo(void) +{ + struct fi_info *info; + char *odp_env; + char *prov_key_env; + + info = fi_allocinfo(); + cr_assert(info, "fi_allocinfo"); + + /* Always select CXI */ + info->fabric_attr->prov_name = strdup(cxip_prov_name); + + info->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + + /* Test with provider generated keys instead of client */ + prov_key_env = getenv("CXIP_TEST_PROV_KEY"); + if (prov_key_env && strtol(prov_key_env, NULL, 10)) { + cxit_prov_key = 1; + info->domain_attr->mr_mode |= FI_MR_PROV_KEY; + } else { + cxit_prov_key = 0; + } + + /* If remote ODP is enabled then test with ODP */ + odp_env = getenv("CXIP_TEST_ODP"); + if (odp_env && strtol(odp_env, NULL, 10)) + info->domain_attr->mr_mode &= ~FI_MR_ALLOCATED; + + return info; +} + +void cxit_setup_getinfo(void) +{ + cxit_init(); + + if (!cxit_fi_hints) + cxit_fi_hints = cxit_allocinfo(); +} + +void cxit_teardown_getinfo(void) +{ + fi_freeinfo(cxit_fi_hints); + cxit_fi_hints = NULL; +} + +void cxit_setup_fabric(void) +{ + cxit_setup_getinfo(); + cxit_create_fabric_info(); +} + +void cxit_teardown_fabric(void) +{ + cxit_destroy_fabric_info(); + cxit_teardown_getinfo(); +} + +void cxit_setup_domain(void) +{ + cxit_setup_fabric(); + cxit_create_fabric(); +} + +void cxit_teardown_domain(void) +{ + cxit_destroy_fabric(); + cxit_teardown_fabric(); +} + +void cxit_setup_ep(void) +{ + cxit_setup_domain(); + cxit_create_domain(); +} + +void cxit_teardown_ep(void) +{ + cxit_destroy_domain(); + cxit_teardown_domain(); +} + +void cxit_setup_enabled_ep_disable_fi_rma_event(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = 
FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_setup_ep(); + + cxit_fi->caps &= ~FI_RMA_EVENT; + cxit_fi->domain_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->tx_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->rx_attr->caps &= ~FI_RMA_EVENT; + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + + /* No FI_RMA_EVENT, don't create/bind remote counters */ + cxit_create_local_cntrs(); + cxit_bind_cntrs(); + + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_enabled_ep_mr_events(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_setup_ep(); + + /* Enable FI_CXI_MR_MATCH_EVENTS via domain */ + ret = dom_ops->enable_mr_match_events(&cxit_domain->fid, + true); + cr_assert_eq(ret, FI_SUCCESS); + + /* Disable RMA events to make sure MATCH_EVENTS on its own is + * sufficient to disallow atomic with FI_DELIVERY_COMPLETE. + */ + cxit_fi->caps &= ~FI_RMA_EVENT; + cxit_fi->domain_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->tx_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->rx_attr->caps &= ~FI_RMA_EVENT; + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + + /* No FI_RMA_EVENT, so only create local counters */ + cxit_create_local_cntrs(); + cxit_bind_cntrs(); + + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_enabled_ep(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_fi_hints->tx_attr->size = 512; + + cxit_setup_ep(); + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. 
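+	 * fi_getname() is expected to fill exactly sizeof(struct cxip_addr)
+	 * bytes; the addrlen assertion below guards against the provider
+	 * address format silently changing size.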
*/ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_enabled_ep_fd(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_rx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_tx_cq_attr.wait_obj = FI_WAIT_FD; + cxit_rx_cq_attr.wait_obj = FI_WAIT_FD; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_setup_ep(); + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_rma_disable_fi_rma_event(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_ep_disable_fi_rma_event(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + +void cxit_setup_rma_mr_events(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + bool disable = false; + + cxit_setup_enabled_ep_mr_events(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); + + /* Ensure if FI_MR_PROV_KEY cache will not be used */ + fi_control(&cxit_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, &disable); +} + +void cxit_bind_cqs_hybrid_mr_desc(void) +{ + int ret; + + ret = fi_ep_bind(cxit_ep, &cxit_tx_cq->fid, + cxit_tx_cq_bind_flags | FI_SELECTIVE_COMPLETION); + cr_assert(!ret, "fi_ep_bind TX CQ"); + + ret = fi_ep_bind(cxit_ep, &cxit_rx_cq->fid, + cxit_rx_cq_bind_flags | FI_SELECTIVE_COMPLETION); + cr_assert(!ret, "fi_ep_bind RX CQ"); +} + +void cxit_create_domain_hybrid_mr_desc(void) +{ + int ret; + + if (cxit_domain) + return; + + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + cr_assert(ret == FI_SUCCESS, "fi_domain"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v2"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL && + dom_ops->enable_hybrid_mr_desc != NULL, + "V3 functions returned"); + + if (enable_cxi_hmem_ops) { + ret = fi_set_ops(&cxit_domain->fid, FI_SET_OPS_HMEM_OVERRIDE, 0, + &cxi_hmem_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_set_ops"); + } + + ret = dom_ops->enable_hybrid_mr_desc(&cxit_domain->fid, true); + cr_assert(ret == FI_SUCCESS, "enable_hybrid_mr_desc failed"); +} + +void cxit_setup_ep_hybrid_mr_desc(void) +{ + cxit_setup_domain(); + 
cxit_create_domain_hybrid_mr_desc(); +} + +void cxit_setup_enabled_ep_hybrid_mr_desc(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_setup_ep_hybrid_mr_desc(); + + cxit_fi->caps &= ~FI_RMA_EVENT; + cxit_fi->domain_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->tx_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->rx_attr->caps &= ~FI_RMA_EVENT; + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs_hybrid_mr_desc(); + + /* No FI_RMA_EVENT, don't create/bind remote counters */ + cxit_create_local_cntrs(); + cxit_bind_cntrs(); + + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_rma_hybrid_mr_desc(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_ep_hybrid_mr_desc(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + +void cxit_setup_rma(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxip_trace_append = true; + cxip_trace_enable(true); + cxit_setup_enabled_ep(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + +void cxit_teardown_rma(void) +{ + /* Tear down RMA objects */ + cxit_destroy_ep(); /* EP must be destroyed before bound objects */ + + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_destroy_eq(); + cxit_teardown_ep(); +} + +/* Use FI_WAIT_FD CQ wait object */ +void cxit_setup_rma_fd(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_ep_fd(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + +#define CXI0_AMO_REMAP \ + "/sys/class/cxi/cxi0/device/properties/amo_remap_to_pcie_fadd" + +void set_amo_remap_to_pcie_fadd(int amo_remap_to_pcie_fadd) +{ + FILE *fd; + int ret; + + /* Assume open a single CXI device is present. 
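+	 * (i.e., only cxi0). The amo_remap_to_pcie_fadd property is written
+	 * directly through sysfs; tests that change it are expected to call
+	 * reset_amo_remap_to_pcie_fadd() afterwards, which writes -1 as the
+	 * reset value.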
*/ + fd = fopen(CXI0_AMO_REMAP, "w"); + cr_assert(fd != NULL, "Failed to open %s: %d\n", CXI0_AMO_REMAP, + -errno); + + ret = fprintf(fd, "%d", amo_remap_to_pcie_fadd); + cr_assert(ret >= 0, + "Failed to write AMO remap value: errno=%d\n", -errno); + + fclose(fd); +} + +void reset_amo_remap_to_pcie_fadd(void) +{ + set_amo_remap_to_pcie_fadd(-1); +} + +static void cxit_setup_tx_alias_rma_impl(bool delivery_complete) +{ + int ret; + struct cxip_ep *cxi_ep; + struct cxip_ep *cxi_alias_ep = NULL; + uint64_t op_flags; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_ep(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); + + /* Create TX alias EP */ + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + cr_assert(!(cxi_ep->tx_attr.op_flags & FI_RECV), "Bad op flags"); + + op_flags = cxi_ep->tx_attr.op_flags | FI_TRANSMIT; + if (delivery_complete) + op_flags |= FI_DELIVERY_COMPLETE; + ret = fi_ep_alias(cxit_ep, &cxit_tx_alias_ep, op_flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_alias"); + + cxi_alias_ep = container_of(&cxit_tx_alias_ep->fid, + struct cxip_ep, ep.fid); + cr_assert_not_null(cxi_alias_ep->ep_obj); +} + +void cxit_setup_tx_alias_rma(void) +{ + cxit_setup_tx_alias_rma_impl(false); +} + +void cxit_setup_tx_alias_rma_dc(void) +{ + cxit_setup_tx_alias_rma_impl(true); +} + +void cxit_teardown_tx_alias_rma(void) +{ + struct cxip_ep *cxi_ep; + int ret; + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_close(&cxit_tx_alias_ep->fid); + cr_assert(ret == FI_SUCCESS, "fi_close alias endpoint"); + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 0, + "EP reference count"); + + /* Tear down RMA objects */ + cxit_destroy_ep(); /* EP must be destroyed before bound objects */ + + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_destroy_eq(); + cxit_teardown_ep(); +} + +/* Everyone needs to wait sometime */ +int cxit_await_completion(struct fid_cq *cq, struct fi_cq_tagged_entry *cqe) +{ + int ret; + + do { + ret = fi_cq_read(cq, cqe, 1); + } while (ret == -FI_EAGAIN); + + return ret; +} + +void validate_tx_event(struct fi_cq_tagged_entry *cqe, uint64_t flags, + void *context) +{ + cr_assert(cqe->op_context == context, "TX CQE Context mismatch"); + cr_assert(cqe->flags == flags, "TX CQE flags mismatch"); + cr_assert(cqe->len == 0, "Invalid TX CQE length"); + cr_assert(cqe->buf == 0, "Invalid TX CQE address"); + cr_assert(cqe->data == 0, "Invalid TX CQE data"); + cr_assert(cqe->tag == 0, "Invalid TX CQE tag"); +} + +void validate_rx_event(struct fi_cq_tagged_entry *cqe, void *context, + size_t len, uint64_t flags, void *buf, uint64_t data, + uint64_t tag) +{ + cr_assert(cqe->op_context == context, "CQE Context mismatch"); + cr_assert(cqe->len == len, "Invalid CQE length"); + cr_assert(cqe->flags == flags, "CQE flags mismatch"); + cr_assert(cqe->buf == buf, "Invalid CQE address (%p %p)", + cqe->buf, buf); + cr_assert(cqe->data == data, "Invalid CQE data"); + cr_assert(cqe->tag == tag, "Invalid CQE tag"); +} + +void validate_rx_event_mask(struct fi_cq_tagged_entry *cqe, void *context, + size_t len, uint64_t flags, void *buf, + uint64_t data, uint64_t tag, uint64_t ignore) +{ + cr_assert(cqe->op_context == context, "CQE Context 
mismatch"); + cr_assert(cqe->len == len, "Invalid CQE length: (%lu %lu)", + cqe->len, len); + cr_assert(cqe->flags == flags, "CQE flags mismatch"); + cr_assert(cqe->buf == buf, "Invalid CQE address (%p %p)", + cqe->buf, buf); + cr_assert(cqe->data == data, "Invalid CQE data"); + cr_assert(((cqe->tag & ~ignore) == (tag & ~ignore)), "Invalid CQE tag"); +} + +void validate_multi_recv_rx_event(struct fi_cq_tagged_entry *cqe, void + *context, size_t len, uint64_t flags, + uint64_t data, uint64_t tag) +{ + cr_assert(cqe->op_context == context, "CQE Context mismatch"); + cr_assert(cqe->len == len, "Invalid CQE length"); + cr_assert((cqe->flags & ~FI_MULTI_RECV) == flags, + "CQE flags mismatch (%#llx %#lx)", + (cqe->flags & ~FI_MULTI_RECV), flags); + cr_assert(cqe->data == data, "Invalid CQE data"); + cr_assert(cqe->tag == tag, "Invalid CQE tag %#lx %#lx", cqe->tag, tag); +} + +int mr_create_ext(size_t len, uint64_t access, uint8_t seed, uint64_t *key, + struct fid_cntr *cntr, struct mem_region *mr) +{ + int ret; + + cr_assert_not_null(mr); + + if (len) { + mr->mem = calloc(1, len); + cr_assert_not_null(mr->mem, "Error allocating memory window"); + } else { + mr->mem = 0; + } + + for (size_t i = 0; i < len; i++) + mr->mem[i] = i + seed; + + ret = fi_mr_reg(cxit_domain, mr->mem, len, access, 0, *key, 0, &mr->mr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed %d", ret); + ret = fi_mr_bind(mr->mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind(ep) failed %d", ret); + + if (cxit_fi->caps & FI_RMA_EVENT && cntr) { + ret = fi_mr_bind(mr->mr, &cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind(cntr) failed %d", + ret); + } + + ret = fi_mr_enable(mr->mr); + if (!ret) + *key = fi_mr_key(mr->mr); + + return ret; +} + +int mr_create(size_t len, uint64_t access, uint8_t seed, uint64_t *key, + struct mem_region *mr) +{ + return mr_create_ext(len, access, seed, key, cxit_rem_cntr, mr); +} + +void mr_destroy(struct mem_region *mr) +{ + fi_close(&mr->mr->fid); + free(mr->mem); +} diff --git a/prov/cxi/test/cxip_test_common.h b/prov/cxi/test/cxip_test_common.h new file mode 100644 index 00000000000..d04e8fdf56d --- /dev/null +++ b/prov/cxi/test/cxip_test_common.h @@ -0,0 +1,141 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018,2020 Hewlett Packard Enterprise Development LP + */ + +#ifndef _CXIP_TEST_COMMON_H_ +#define _CXIP_TEST_COMMON_H_ + +#include "cxip.h" + +#define CXIT_DEFAULT_TIMEOUT 10 + +extern struct fi_info *cxit_fi_hints; +extern struct fi_info *cxit_fi; +extern struct fid_fabric *cxit_fabric; +extern struct fid_domain *cxit_domain; +extern struct fi_cxi_dom_ops *dom_ops; +extern struct fid_ep *cxit_ep; +extern struct fid_ep *cxit_tx_alias_ep; +extern struct cxip_addr cxit_ep_addr; +extern fi_addr_t cxit_ep_fi_addr; +extern struct fid_eq *cxit_eq; +extern struct fi_cq_attr cxit_tx_cq_attr, cxit_rx_cq_attr; +extern uint64_t cxit_tx_cq_bind_flags; +extern uint64_t cxit_rx_cq_bind_flags; +extern struct fid_cq *cxit_tx_cq, *cxit_rx_cq; +extern struct fi_cntr_attr cxit_cntr_attr; +extern struct fid_cntr *cxit_send_cntr, *cxit_recv_cntr; +extern struct fid_cntr *cxit_read_cntr, *cxit_write_cntr; +extern struct fid_cntr *cxit_rem_cntr; +extern struct fi_av_attr cxit_av_attr; +extern struct fid_av *cxit_av; +extern char *cxit_node, *cxit_service; +extern uint64_t cxit_flags; +extern int cxit_n_ifs; +extern struct fid_av_set *cxit_av_set; +extern struct fid_mc *cxit_mc; +extern FILE *cxit_mc_fifo; +extern bool 
cxit_prov_key; +extern int s_page_size; +extern bool enable_cxi_hmem_ops; + +extern bool cxip_trace_enable(bool enable); +extern void cxip_trace_flush(void); + +void cxit_init(void); +void cxit_create_fabric_info(void); +void cxit_destroy_fabric_info(void); +void cxit_create_fabric(void); +void cxit_destroy_fabric(void); +void cxit_create_domain(void); +void cxit_destroy_domain(void); +void cxit_create_ep(void); +void cxit_destroy_ep(void); +void cxit_create_eq(void); +void cxit_destroy_eq(void); +void cxit_create_cqs(void); +void cxit_destroy_cqs(void); +void cxit_bind_cqs(void); +void cxit_create_local_cntrs(void); +void cxit_create_rem_cntrs(void); +void cxit_create_cntrs(void); +void cxit_destroy_cntrs(void); +void cxit_bind_cntrs(void); +void cxit_create_av(void); +void cxit_destroy_av(void); +void cxit_bind_av(void); + +void cxit_setup_rma_disable_fi_rma_event(void); +struct fi_info *cxit_allocinfo(void); +void cxit_setup_getinfo(void); +void cxit_teardown_getinfo(void); +void cxit_setup_fabric(void); +void cxit_teardown_fabric(void); +void cxit_setup_domain(void); +void cxit_teardown_domain(void); +void cxit_setup_ep(void); +void cxit_teardown_ep(void); +#define cxit_setup_eq cxit_setup_ep +#define cxit_teardown_eq cxit_teardown_ep +#define cxit_setup_cq cxit_setup_ep +#define cxit_teardown_cq cxit_teardown_ep +#define cxit_setup_av cxit_setup_ep +#define cxit_teardown_av cxit_teardown_ep +void cxit_setup_enabled_ep(void); +void cxit_setup_enabled_ep_fd(void); +void cxit_setup_rma(void); +void cxit_setup_rma_fd(void); +void cxit_setup_rma_hybrid_mr_desc(void); +void cxit_setup_rma_mr_events(void); +#define cxit_setup_tagged cxit_setup_rma +#define cxit_setup_msg cxit_setup_rma +void cxit_teardown_rma(void); +#define cxit_teardown_tagged cxit_teardown_rma +#define cxit_teardown_msg cxit_teardown_rma +#define cxit_teardown_enabled_ep cxit_teardown_rma +#define cxit_teardown_rma_fd cxit_teardown_rma +void cxit_setup_tx_alias_rma(void); +void cxit_setup_tx_alias_rma_dc(void); +#define cxit_setup_tx_alias_tagged cxit_setup_tx_alias_rma +void cxit_teardown_tx_alias_rma(void); +#define cxit_teardown_tx_alias_tagged cxit_teardown_tx_alias_rma +int cxit_await_completion(struct fid_cq *cq, struct fi_cq_tagged_entry *cqe); +void validate_tx_event(struct fi_cq_tagged_entry *cqe, uint64_t flags, + void *context); +void validate_rx_event(struct fi_cq_tagged_entry *cqe, void *context, + size_t len, uint64_t flags, void *buf, uint64_t data, + uint64_t tag); +void validate_rx_event_mask(struct fi_cq_tagged_entry *cqe, void *context, + size_t len, uint64_t flags, void *buf, + uint64_t data, uint64_t tag, uint64_t ignore); +void validate_multi_recv_rx_event(struct fi_cq_tagged_entry *cqe, + void *context, size_t len, uint64_t flags, + uint64_t data, uint64_t tag); + +struct mem_region { + uint8_t *mem; + struct fid_mr *mr; +}; + +int mr_create_ext(size_t len, uint64_t access, uint8_t seed, uint64_t *key, + struct fid_cntr *cntr, struct mem_region *mr); +int mr_create(size_t len, uint64_t access, uint8_t seed, uint64_t *key, + struct mem_region *mr); +void mr_destroy(struct mem_region *mr); + +struct cxit_coll_mc_list { + int count; + struct fid_av_set **av_set_fid; + struct fid_mc **mc_fid; +}; +extern struct cxit_coll_mc_list cxit_coll_mc_list; + +void set_amo_remap_to_pcie_fadd(int amo_remap_to_pcie_fadd); +void reset_amo_remap_to_pcie_fadd(void); + +int cxit_dom_read_cntr(unsigned int cntr, uint64_t *value, + struct timespec *ts, bool sync); + +#endif diff --git a/prov/cxi/test/deferred_work.c 
b/prov/cxi/test/deferred_work.c new file mode 100644 index 00000000000..369e276ffad --- /dev/null +++ b/prov/cxi/test/deferred_work.c @@ -0,0 +1,1328 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +static void poll_counter_assert(struct fid_cntr *cntr, uint64_t expected_value, + unsigned int timeout) +{ + int ret; + struct timespec cur = {}; + struct timespec end; + uint64_t value; + + ret = clock_gettime(CLOCK_MONOTONIC, &end); + cr_assert_eq(ret, 0); + + end.tv_sec += timeout; + + while (true) { + ret = clock_gettime(CLOCK_MONOTONIC, &cur); + cr_assert_eq(ret, 0); + + value = fi_cntr_read(cntr); + if (value == expected_value) + break; + + if (cur.tv_sec > end.tv_sec) { + // cr_fail doesn't work so fake it + cr_assert_eq(value, expected_value, + "Counter failed to reach expected value: expected=%lu, got=%lu\n", + expected_value, value); + break; + } + + /* Progress TX side for rendezvous tests */ + fi_cq_read(cxit_tx_cq, NULL, 0); + } +} + +void deferred_msg_op_test(bool comp_event, size_t xfer_size, + uint64_t trig_thresh, bool is_tagged, uint64_t tag) +{ + int i; + int ret; + uint8_t *recv_buf; + uint8_t *send_buf; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + int err = 0; + fi_addr_t from; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_op_tagged tagged = {}; + struct fi_deferred_work work = {}; + uint64_t expected_rx_flags = + is_tagged ? FI_TAGGED | FI_RECV : FI_MSG | FI_RECV; + uint64_t expected_rx_tag = is_tagged ? tag : 0; + uint64_t expected_tx_flags = + is_tagged ? FI_TAGGED | FI_SEND : FI_MSG | FI_SEND; + + recv_buf = calloc(1, xfer_size); + cr_assert(recv_buf); + + send_buf = calloc(1, xfer_size); + cr_assert(send_buf); + + for (i = 0; i < xfer_size; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + if (is_tagged) + ret = fi_trecv(cxit_ep, recv_buf, xfer_size, NULL, + FI_ADDR_UNSPEC, tag, 0, NULL); + else + ret = fi_recv(cxit_ep, recv_buf, xfer_size, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send deferred op to self */ + iov.iov_base = send_buf; + iov.iov_len = xfer_size; + + work.threshold = trig_thresh; + work.triggering_cntr = cxit_send_cntr; + work.completion_cntr = cxit_send_cntr; + + if (is_tagged) { + tagged.ep = cxit_ep; + tagged.msg.msg_iov = &iov; + tagged.msg.iov_count = 1; + tagged.msg.addr = cxit_ep_fi_addr; + tagged.msg.tag = tag; + tagged.flags = comp_event ? FI_COMPLETION : 0; + + work.op_type = FI_OP_TSEND; + work.op.tagged = &tagged; + } else { + msg.ep = cxit_ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = comp_event ? FI_COMPLETION : 0; + + work.op_type = FI_OP_SEND; + work.op.msg = &msg; + } + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + /* Verify no target event has occurred. 
*/ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cntr_add(cxit_send_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, xfer_size, expected_rx_flags, NULL, 0, + expected_rx_tag); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + if (comp_event) { + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, expected_tx_flags, NULL); + } else { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", + ret); + } + + /* Validate sent data */ + for (i = 0; i < xfer_size; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + poll_counter_assert(cxit_send_cntr, work.threshold + 1, 5); + + free(send_buf); + free(recv_buf); +} + + +TestSuite(deferred_work, .init = cxit_setup_msg, .fini = cxit_teardown_msg, + .timeout = CXIT_DEFAULT_TIMEOUT); + + +Test(deferred_work, eager_message_comp_event) +{ + deferred_msg_op_test(true, 1024, 123546, false, 0); +} + +Test(deferred_work, rendezvous_message_comp_event) +{ + deferred_msg_op_test(true, 1024 * 1024, 123546, false, 0); +} + +Test(deferred_work, eager_message_no_comp_event) +{ + deferred_msg_op_test(false, 1024, 123546, false, 0); +} + +Test(deferred_work, rendezvous_message_no_comp_event, .timeout=60) +{ + deferred_msg_op_test(false, 1024 * 1024, 123546, false, 0); +} + +Test(deferred_work, tagged_eager_message_comp_event) +{ + deferred_msg_op_test(true, 1024, 123546, true, 987654321); +} + +Test(deferred_work, tagged_rendezvous_message_comp_event) +{ + deferred_msg_op_test(true, 1024 * 1024, 123546, true, 987654321); +} + +Test(deferred_work, tagged_eager_message_no_comp_event) +{ + deferred_msg_op_test(false, 1024, 123546, true, 987654321); +} + +Test(deferred_work, tagged_rendezvous_message_no_comp_event, .timeout=60) +{ + deferred_msg_op_test(false, 1024 * 1024, 123546, true, 987654321); +} + +Test(deferred_work, flush_work) +{ + int i; + int ret; + uint8_t *recv_buf; + uint8_t *send_buf; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_deferred_work msg_work = {}; + unsigned int trig_thresh; + size_t xfer_size = 1; + uint64_t key = 0xbeef; + struct mem_region mem_window; + struct fi_rma_iov rma_iov = {}; + struct fi_op_rma rma = {}; + struct fi_deferred_work rma_work = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_op_atomic amo = {}; + struct fi_deferred_work amo_work = {}; + struct fi_op_cntr op_cntr = {}; + struct fi_deferred_work cntr_work = {}; + + recv_buf = calloc(1, xfer_size); + cr_assert(recv_buf); + + send_buf = calloc(1, xfer_size); + cr_assert(send_buf); + + ret = mr_create(xfer_size, FI_REMOTE_WRITE | FI_REMOTE_READ, 0xa0, &key, + &mem_window); + cr_assert_eq(ret, FI_SUCCESS, "mr_create failed %d", ret); + + for (i = 0; i < xfer_size; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret 
= fi_recv(cxit_ep, recv_buf, xfer_size, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Deferred send op to self to be cancelled. */ + msg.ep = cxit_ep; + iov.iov_base = send_buf; + iov.iov_len = xfer_size; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = FI_COMPLETION; + + msg_work.triggering_cntr = cxit_send_cntr; + msg_work.completion_cntr = cxit_send_cntr; + msg_work.op_type = FI_OP_SEND; + msg_work.op.msg = &msg; + + /* Deferred RMA op to be cancelled. */ + rma_iov.key = key; + + rma.ep = cxit_ep; + rma.msg.msg_iov = &iov; + rma.msg.iov_count = 1; + rma.msg.addr = cxit_ep_fi_addr; + rma.msg.rma_iov = &rma_iov; + rma.msg.rma_iov_count = 1; + rma.flags = FI_COMPLETION; + + rma_work.triggering_cntr = cxit_send_cntr; + rma_work.completion_cntr = cxit_send_cntr; + rma_work.op_type = FI_OP_READ; + rma_work.op.rma = &rma; + + /* Deferred AMO op to be cancelled. */ + ioc.addr = &send_buf; + ioc.count = 1; + + rma_ioc.key = key; + rma_ioc.count = 1; + + amo.ep = cxit_ep; + + amo.msg.msg_iov = &ioc; + amo.msg.iov_count = 1; + amo.msg.addr = cxit_ep_fi_addr; + amo.msg.rma_iov = &rma_ioc; + amo.msg.rma_iov_count = 1; + amo.msg.datatype = FI_UINT8; + amo.msg.op = FI_SUM; + + amo_work.triggering_cntr = cxit_send_cntr; + amo_work.completion_cntr = cxit_send_cntr; + amo_work.op_type = FI_OP_ATOMIC; + amo_work.op.atomic = &amo; + + /* Deferred counter op. */ + op_cntr.cntr = cxit_send_cntr; + op_cntr.value = 13546; + + cntr_work.op_type = FI_OP_CNTR_SET; + cntr_work.triggering_cntr = cxit_send_cntr; + cntr_work.op.cntr = &op_cntr; + + /* Queue up multiple trigger requests to be cancelled. */ + for (i = 0, trig_thresh = 12345; i < 12; i++, trig_thresh++) { + struct fi_deferred_work *work; + + if (i < 3) + work = &msg_work; + else if (i < 6) + work = &rma_work; + else if (i < 9) + work = &cntr_work; + else + work = &amo_work; + + work->threshold = trig_thresh; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + } + + /* Verify no source or target event has occurred. */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + /* Flush all work requests. */ + ret = fi_control(&cxit_domain->fid, FI_FLUSH_WORK, NULL); + cr_assert_eq(ret, FI_SUCCESS, "FI_FLUSH_WORK failed %d", ret); + + ret = fi_cntr_add(cxit_send_cntr, trig_thresh); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + /* Verify no source or target event has occurred. 
*/ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + poll_counter_assert(cxit_send_cntr, trig_thresh, 5); + + free(send_buf); + free(recv_buf); + mr_destroy(&mem_window); +} + +static void deferred_rma_test(enum fi_op_type op, size_t xfer_size, + uint64_t trig_thresh, uint64_t key, + bool comp_event) +{ + int ret; + struct mem_region mem_window; + struct fi_cq_tagged_entry cqe; + struct iovec iov = {}; + struct fi_rma_iov rma_iov = {}; + struct fi_op_rma rma = {}; + struct fi_deferred_work work = {}; + struct fid_cntr *trig_cntr = cxit_write_cntr; + struct fid_cntr *comp_cntr = cxit_read_cntr; + uint8_t *send_buf; + uint64_t expected_flags = + op == FI_OP_WRITE ? FI_RMA | FI_WRITE : FI_RMA | FI_READ; + + send_buf = calloc(1, xfer_size); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(xfer_size, FI_REMOTE_WRITE | FI_REMOTE_READ, 0xa0, &key, + &mem_window); + + iov.iov_base = send_buf; + iov.iov_len = xfer_size; + + rma_iov.key = key; + + rma.ep = cxit_ep; + rma.msg.msg_iov = &iov; + rma.msg.iov_count = 1; + rma.msg.addr = cxit_ep_fi_addr; + rma.msg.rma_iov = &rma_iov; + rma.msg.rma_iov_count = 1; + rma.flags = comp_event ? FI_COMPLETION : 0; + + work.threshold = trig_thresh; + work.triggering_cntr = trig_cntr; + work.completion_cntr = comp_cntr; + work.op_type = op; + work.op.rma = &rma; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + /* Verify no target event has occurred. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cntr_add(trig_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + if (comp_event) { + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&cqe, expected_flags, NULL); + } else { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", + ret); + } + + poll_counter_assert(trig_cntr, work.threshold, 5); + poll_counter_assert(comp_cntr, 1, 5); + + /* Validate RMA data */ + for (size_t i = 0; i < xfer_size; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%ld) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +Test(deferred_work, rma_write) +{ + deferred_rma_test(FI_OP_WRITE, 12345, 54321, 0xbeef, true); +} + +Test(deferred_work, rma_write_no_event) +{ + deferred_rma_test(FI_OP_WRITE, 12345, 54321, 0xbeef, false); +} + +Test(deferred_work, rma_read) +{ + deferred_rma_test(FI_OP_READ, 12345, 54321, 0xbeef, true); +} + +Test(deferred_work, rma_read_no_event) +{ + deferred_rma_test(FI_OP_READ, 12345, 54321, 0xbeef, false); +} + +static void deferred_amo_test(bool comp_event, bool fetch, bool comp) +{ + int ret; + struct mem_region mem_window; + struct fi_cq_tagged_entry cqe; + struct fi_ioc iov = {}; + struct fi_ioc fetch_iov = {}; + struct fi_ioc comp_iov = {}; + struct fi_rma_ioc rma_iov = {}; + struct fi_op_atomic amo = {}; + struct fi_op_fetch_atomic fetch_amo = {}; + struct fi_op_compare_atomic comp_amo = {}; + struct fi_msg_atomic *amo_msg; + struct fi_deferred_work work = {}; + struct fid_cntr 
*trig_cntr = cxit_write_cntr; + struct fid_cntr *comp_cntr = cxit_read_cntr; + uint64_t expected_flags; + uint64_t source_buf = 1; + uint64_t *target_buf; + uint64_t result; + uint64_t key = 0xbbb; + uint64_t trig_thresh = 12345; + uint64_t init_target_value = 0x7FFFFFFFFFFFFFFF; + uint64_t fetch_result = 0; + uint64_t compare_value = init_target_value; + + ret = mr_create(sizeof(*target_buf), FI_REMOTE_WRITE | FI_REMOTE_READ, + 0, &key, &mem_window); + assert(ret == FI_SUCCESS); + + target_buf = (uint64_t *)mem_window.mem; + *target_buf = init_target_value; + + result = source_buf + *target_buf; + + iov.addr = &source_buf; + iov.count = 1; + + rma_iov.key = key; + rma_iov.count = 1; + + if (fetch) { + amo_msg = &fetch_amo.msg; + fetch_amo.ep = cxit_ep; + fetch_amo.flags = comp_event ? FI_COMPLETION : 0; + work.op_type = FI_OP_FETCH_ATOMIC; + work.op.fetch_atomic = &fetch_amo; + expected_flags = FI_ATOMIC | FI_READ; + + fetch_iov.addr = &fetch_result; + fetch_iov.count = 1; + + fetch_amo.fetch.msg_iov = &fetch_iov; + fetch_amo.fetch.iov_count = 1; + } else if (comp) { + amo_msg = &comp_amo.msg; + comp_amo.ep = cxit_ep; + comp_amo.flags = comp_event ? FI_COMPLETION : 0; + work.op_type = FI_OP_COMPARE_ATOMIC; + work.op.compare_atomic = &comp_amo; + expected_flags = FI_ATOMIC | FI_READ; + + fetch_iov.addr = &fetch_result; + fetch_iov.count = 1; + + comp_iov.addr = &compare_value; + comp_iov.count = 1; + + comp_amo.fetch.msg_iov = &fetch_iov; + comp_amo.fetch.iov_count = 1; + comp_amo.compare.msg_iov = &comp_iov; + comp_amo.compare.iov_count = 1; + } else { + amo_msg = &amo.msg; + amo.ep = cxit_ep; + amo.flags = comp_event ? FI_COMPLETION : 0; + work.op_type = FI_OP_ATOMIC; + work.op.atomic = &amo; + expected_flags = FI_ATOMIC | FI_WRITE; + } + + amo_msg->msg_iov = &iov; + amo_msg->iov_count = 1; + amo_msg->addr = cxit_ep_fi_addr; + amo_msg->rma_iov = &rma_iov; + amo_msg->rma_iov_count = 1; + amo_msg->datatype = FI_UINT64; + amo_msg->op = comp ? FI_CSWAP : FI_SUM; + + work.threshold = trig_thresh; + work.triggering_cntr = trig_cntr; + work.completion_cntr = comp_cntr; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + /* Verify no target event has occurred. 
*/ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cntr_add(trig_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + if (comp_event) { + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&cqe, expected_flags, NULL); + } else { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", + ret); + } + + poll_counter_assert(trig_cntr, work.threshold, 5); + poll_counter_assert(comp_cntr, 1, 5); + + /* Validate AMO data */ + if (comp) + cr_assert_eq(*target_buf, source_buf, "Invalid target result"); + else + cr_assert_eq(*target_buf, result, "Invalid target result"); + + if (fetch || comp) + cr_assert_eq(fetch_result, init_target_value, + "Invalid fetch result expected=%lu got=%lu", + init_target_value, fetch_result); + + mr_destroy(&mem_window); +} + +Test(deferred_work, amo_no_event) +{ + deferred_amo_test(false, false, false); +} + +Test(deferred_work, amo_event) +{ + deferred_amo_test(true, false, false); +} + +Test(deferred_work, fetch_amo_no_event) +{ + deferred_amo_test(false, true, false); +} + +Test(deferred_work, fetch_amo_event) +{ + deferred_amo_test(true, true, false); +} + +Test(deferred_work, compare_amo_no_event) +{ + deferred_amo_test(false, false, true); +} + +Test(deferred_work, compare_amo_event) +{ + deferred_amo_test(true, false, true); +} + +static void deferred_cntr(bool is_inc) +{ + struct fi_cntr_attr attr = {}; + struct fid_cntr *cntr; + struct fid_cntr *trig_cntr = cxit_write_cntr; + int ret; + uint64_t value = 123456; + uint64_t thresh = 1234; + struct fi_op_cntr op_cntr = {}; + struct fi_deferred_work work = {}; + + ret = fi_cntr_open(cxit_domain, &attr, &cntr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_open failed %d", ret); + + /* Ensure success value is non-zero to ensure success and increment + * work. + */ + ret = fi_cntr_add(cntr, 1); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + op_cntr.cntr = cntr; + op_cntr.value = value; + + work.op_type = is_inc ? FI_OP_CNTR_ADD : FI_OP_CNTR_SET; + work.triggering_cntr = trig_cntr; + work.threshold = thresh; + work.op.cntr = &op_cntr; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + /* Trigger the operation. */ + ret = fi_cntr_add(trig_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + poll_counter_assert(trig_cntr, work.threshold, 5); + poll_counter_assert(cntr, is_inc ? 1 + value : value, 5); + + ret = fi_close(&cntr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed %d", ret); +} + +Test(deferred_work, cntr_add) +{ + deferred_cntr(true); +} + +Test(deferred_work, cntr_set) +{ + deferred_cntr(false); +} + +static void deferred_recv_op_test(bool comp_event, size_t xfer_size, + uint64_t trig_thresh, bool is_tagged, + uint64_t tag) +{ + int i; + int ret; + uint8_t *recv_buf; + uint8_t *send_buf; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + int err = 0; + fi_addr_t from; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_op_tagged tagged = {}; + struct fi_deferred_work work = {}; + uint64_t expected_rx_flags = + is_tagged ? FI_TAGGED | FI_RECV : FI_MSG | FI_RECV; + uint64_t expected_rx_tag = is_tagged ? 
tag : 0; + uint64_t expected_tx_flags = + is_tagged ? FI_TAGGED | FI_SEND : FI_MSG | FI_SEND; + struct fi_cntr_attr attr = {}; + struct fid_cntr *recv_cntr; + + ret = fi_cntr_open(cxit_domain, &attr, &recv_cntr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_open failed %d", ret); + + recv_buf = calloc(1, xfer_size); + cr_assert(recv_buf); + + send_buf = calloc(1, xfer_size); + cr_assert(send_buf); + + for (i = 0; i < xfer_size; i++) + send_buf[i] = i + 0xa0; + + /* Recv deferred op */ + iov.iov_base = recv_buf; + iov.iov_len = xfer_size; + + work.threshold = trig_thresh; + work.triggering_cntr = recv_cntr; + work.completion_cntr = recv_cntr; + + if (is_tagged) { + tagged.ep = cxit_ep; + tagged.msg.msg_iov = &iov; + tagged.msg.iov_count = 1; + tagged.msg.tag = tag; + tagged.msg.addr = cxit_ep_fi_addr; + tagged.flags = comp_event ? FI_COMPLETION : 0; + + work.op_type = FI_OP_TRECV; + work.op.tagged = &tagged; + } else { + msg.ep = cxit_ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = comp_event ? FI_COMPLETION : 0; + + work.op_type = FI_OP_RECV; + work.op.msg = &msg; + } + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + if (is_tagged) + ret = fi_tsend(cxit_ep, send_buf, xfer_size, NULL, + cxit_ep_fi_addr, tag, NULL); + else + ret = fi_send(cxit_ep, send_buf, xfer_size, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for the async send event. In software endpoint mode, the RX CQ needs to + * be progressed in order to progress the TX CQ. + */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + } while (ret == -FI_EAGAIN); + + validate_tx_event(&tx_cqe, expected_tx_flags, NULL); + + /* Verify optional receive event. */ + if (comp_event) { + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, xfer_size, expected_rx_flags, + NULL, 0, expected_rx_tag); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + } else { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", + ret); + } + + /* Validate sent data */ + for (i = 0; i < xfer_size; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Need to progress the receive side so the completed transaction increments the counter. 
*/ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + poll_counter_assert(recv_cntr, 1, 5); + + free(send_buf); + free(recv_buf); + fi_close(&recv_cntr->fid); +} + +Test(deferred_work, recv_eager_message_comp_event) +{ + deferred_recv_op_test(true, 1024, 0, false, 0); +} + +Test(deferred_work, recv_rendezvous_message_comp_event) +{ + deferred_recv_op_test(true, 1024 * 1024, 0, false, 0); +} + +Test(deferred_work, recv_eager_message_no_comp_event) +{ + deferred_recv_op_test(false, 1024, 0, false, 0); +} + +Test(deferred_work, recv_rendezvous_message_no_comp_event, .timeout=60) +{ + deferred_recv_op_test(false, 1024 * 1024, 0, false, 0); +} + +Test(deferred_work, recv_tagged_eager_message_comp_event) +{ + deferred_recv_op_test(true, 1024, 0, true, 987654321); +} + +Test(deferred_work, recv_tagged_rendezvous_message_comp_event) +{ + deferred_recv_op_test(true, 1024 * 1024, 0, true, 987654321); +} + +Test(deferred_work, recv_tagged_eager_message_no_comp_event) +{ + deferred_recv_op_test(false, 1024, 0, true, 987654321); +} + +Test(deferred_work, recv_tagged_rendezvous_message_no_comp_event, .timeout=60) +{ + deferred_recv_op_test(false, 1024 * 1024, 0, true, 987654321); +} + +static void deferred_recv_non_zero_thresh(bool is_tagged) +{ + int ret; + uint8_t *recv_buf; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_op_tagged tagged = {}; + struct fi_deferred_work work = {}; + struct fi_cntr_attr attr = {}; + struct fid_cntr *recv_cntr; + + ret = fi_cntr_open(cxit_domain, &attr, &recv_cntr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_open failed %d", ret); + + recv_buf = calloc(1, 5); + cr_assert(recv_buf); + + /* Recv deferred op to self */ + iov.iov_base = recv_buf; + iov.iov_len = 5; + + work.threshold = 5; + work.triggering_cntr = recv_cntr; + work.completion_cntr = recv_cntr; + + if (is_tagged) { + tagged.ep = cxit_ep; + tagged.msg.msg_iov = &iov; + tagged.msg.iov_count = 1; + tagged.msg.tag = 456; + tagged.msg.addr = cxit_ep_fi_addr; + tagged.flags = FI_COMPLETION; + + work.op_type = FI_OP_TRECV; + work.op.tagged = &tagged; + } else { + msg.ep = cxit_ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = FI_COMPLETION; + + work.op_type = FI_OP_RECV; + work.op.msg = &msg; + } + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_neq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + free(recv_buf); + fi_close(&recv_cntr->fid); +} + +Test(deferred_work, recv_non_zero_thresh) +{ + deferred_recv_non_zero_thresh(false); +} + +Test(deferred_work, recv_tagged_non_zero_thresh) +{ + deferred_recv_non_zero_thresh(true); +} + +/* FI_INJECT with deferred work queue processing is not supported. 
*/ +void deferred_msg_inject_test(bool is_tagged) +{ + int ret; + uint8_t *send_buf; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_op_tagged tagged = {}; + struct fi_deferred_work work = {}; + + send_buf = calloc(1, 20); + cr_assert(send_buf); + + /* Send deferred op to self */ + iov.iov_base = send_buf; + iov.iov_len = 20; + + work.threshold = 5; + work.triggering_cntr = cxit_send_cntr; + work.completion_cntr = cxit_send_cntr; + + if (is_tagged) { + tagged.ep = cxit_ep; + tagged.msg.msg_iov = &iov; + tagged.msg.iov_count = 1; + tagged.msg.addr = cxit_ep_fi_addr; + tagged.msg.tag = 0x0123; + tagged.flags = FI_INJECT | FI_COMPLETION; + + work.op_type = FI_OP_TSEND; + work.op.tagged = &tagged; + } else { + msg.ep = cxit_ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = FI_INJECT | FI_COMPLETION; + + work.op_type = FI_OP_SEND; + work.op.msg = &msg; + } + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, -FI_EINVAL, "FI_INJECT did not fail %d", ret); + + free(send_buf); +} + +Test(deferred_work, tsend_inject) +{ + deferred_msg_inject_test(true); +} + +Test(deferred_work, send_inject) +{ + deferred_msg_inject_test(false); +} + +#define TLE_RESERVED 8U + +static int alloc_service(struct cxil_dev *dev, unsigned int tle_count) +{ + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = { + .enable = 1, + .limits = { + .type[CXI_RSRC_TYPE_PTE] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_TXQ] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_TGQ] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_EQ] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_CT] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_LE] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_TLE] = { + .max = tle_count + TLE_RESERVED, + .res = tle_count + TLE_RESERVED, + }, + .type[CXI_RSRC_TYPE_AC] = { + .max = 8, + .res = 8, + }, + }, + }; + int ret; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, + "cxil_alloc_svc(): Failed. Expected Success! rc:%d", ret); + + return ret; +} + +struct deferred_work_resources { + struct fi_info *hints; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_cq *cq; + struct fid_cntr *cntr; + struct fid_av *av; + struct fid_ep *ep; + fi_addr_t loopback; + struct cxil_dev *dev; + int service_id; +}; + +#define test_assert(test, fmt, ...) 
\ + do { \ + if (!(test)) { \ + fprintf(stderr, "%s:%d: " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__); \ + abort(); \ + } \ + } while (0) + +static void +deferred_work_resources_teardown(struct deferred_work_resources *res) +{ + test_assert((fi_close(&res->ep->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->cntr->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->cq->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->av->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->dom->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->fab->fid) == FI_SUCCESS), "fi_close failed"); + fi_freeinfo(res->info); + fi_freeinfo(res->hints); +} + +static bool triggered_ops_limited() +{ + static bool first = true; + static bool limited = false; + + if (!first) + return limited; + + char *s = getenv("FI_CXI_ENABLE_TRIG_OP_LIMIT"); + + if (!s) /* variable not set/found */ + goto not_limited; + + char *endptr; + int i = strtol(s, &endptr, 10); + + if (endptr == s) /* no parsable integers */ + goto not_limited; + if (!i) /* set to 0 */ + goto not_limited; + + /* Some non-zero integer was parsed. + * It still could be 10zebras, but we will count it. + */ + + limited = true; + + not_limited: + + first = false; + + return limited; +} + +static void deferred_work_resources_init(struct deferred_work_resources *res, + int service_id) +{ + int ret; + struct cxi_auth_key auth_key = { + .vni = 1, + }; + struct fi_av_attr av_attr = {}; + + auth_key.svc_id = service_id; + + res->hints = fi_allocinfo(); + test_assert(res->hints, "fi_allocinfo failed"); + + res->hints->fabric_attr->prov_name = strdup("cxi"); + test_assert(res->hints->fabric_attr->prov_name, "strdup failed"); + + res->hints->domain_attr->mr_mode = + FI_MR_ENDPOINT | FI_MR_ALLOCATED | FI_MR_PROV_KEY; + res->hints->tx_attr->op_flags = FI_TRANSMIT_COMPLETE; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + "cxi0", NULL, FI_SOURCE, res->hints, + &res->info); + test_assert(ret == FI_SUCCESS, "fi_getinfo failed: %d\n", ret); + + ret = fi_fabric(res->info->fabric_attr, &res->fab, NULL); + test_assert(ret == FI_SUCCESS, "fi_fabric failed: %d\n", ret); + + res->info->domain_attr->auth_key = (void *)&auth_key; + res->info->domain_attr->auth_key_size = sizeof(auth_key); + + ret = fi_domain(res->fab, res->info, &res->dom, NULL); + test_assert(ret == FI_SUCCESS, "fi_domain failed: %d\n", ret); + + res->info->domain_attr->auth_key = NULL; + res->info->domain_attr->auth_key_size = 0; + + ret = fi_av_open(res->dom, &av_attr, &res->av, NULL); + test_assert(ret == FI_SUCCESS, "fi_av_open failed: %d\n", ret); + + ret = fi_cq_open(res->dom, NULL, &res->cq, NULL); + test_assert(ret == FI_SUCCESS, "fi_cq_open failed: %d\n", ret); + + ret = fi_cntr_open(res->dom, NULL, &res->cntr, NULL); + test_assert(ret == FI_SUCCESS, "fi_cntr_open failed: %d\n", ret); + + ret = fi_endpoint(res->dom, res->info, &res->ep, NULL); + test_assert(ret == FI_SUCCESS, "fi_endpoint failed: %d\n", ret); + + ret = fi_ep_bind(res->ep, &res->cq->fid, + FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION); + test_assert(ret == FI_SUCCESS, "fi_ep_bind failed: %d\n", ret); + + ret = fi_ep_bind(res->ep, &res->cntr->fid, + FI_SEND | FI_RECV | FI_READ | FI_WRITE); + test_assert(ret == FI_SUCCESS, "fi_ep_bind failed: %d\n", ret); + + ret = fi_ep_bind(res->ep, &res->av->fid, 0); + test_assert(ret == FI_SUCCESS, "fi_ep_bind failed: %d\n", ret); + + ret = fi_enable(res->ep); + test_assert(ret == 
FI_SUCCESS, "fi_enable failed: %d\n", ret); + + ret = fi_av_insert(res->av, res->info->src_addr, 1, &res->loopback, 0, + NULL); + test_assert(ret == 1, "fi_av_insert failed: %d\n", ret); +} + +TestSuite(deferred_work_trig_op_limit, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(deferred_work_trig_op_limit, enforce_limit_single_thread) +{ + struct deferred_work_resources res = {}; + unsigned int trig_op_count = 64; + unsigned int threshold = 1000; + char send_buf[256]; + char recv_buf[256]; + int ret; + int i; + struct fi_deferred_work work = {}; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + bool limited = triggered_ops_limited(); + + ret = cxil_open_device(0, &res.dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d\n", ret); + + res.service_id = alloc_service(res.dev, trig_op_count); + cr_assert_gt(res.service_id, 0, "alloc_service() failed: %d\n", + res.service_id); + + deferred_work_resources_init(&res, res.service_id); + + for (i = 0; i < trig_op_count; i++) { + ret = fi_recv(res.ep, recv_buf, sizeof(recv_buf), NULL, + res.loopback, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d\n", ret); + } + + iov.iov_base = send_buf; + iov.iov_len = sizeof(send_buf); + + work.threshold = threshold; + work.triggering_cntr = res.cntr; + work.completion_cntr = res.cntr; + + msg.ep = res.ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = res.loopback; + msg.flags = FI_TRANSMIT_COMPLETE; + + work.op_type = FI_OP_SEND; + work.op.msg = &msg; + + for (i = 0; i < trig_op_count; i++) { + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK iter %d failed %d", i, ret); + } + + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); + if (limited) + cr_assert_eq(ret, -FI_ENOSPC, "FI_QUEUE_WORK failed %d", ret); + else + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + cr_assert((fi_control(&res.dom->fid, FI_FLUSH_WORK, NULL) == FI_SUCCESS)); + + for (i = 0; i < trig_op_count; i++) { + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK iter %d failed %d", i, ret); + } + + cr_assert((fi_control(&res.dom->fid, FI_FLUSH_WORK, NULL) == FI_SUCCESS)); + + deferred_work_resources_teardown(&res); + + cr_assert((cxil_destroy_svc(res.dev, res.service_id) == 0)); + cxil_close_device(res.dev); +} + +static void run_multi_process_dwq_test(int service_id) +{ + struct deferred_work_resources res = {}; + int count = 4; + unsigned int threshold = 1000; + char send_buf[256]; + int ret; + int i; + struct fi_deferred_work work = {}; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + bool limited = triggered_ops_limited(); + + deferred_work_resources_init(&res, service_id); + + iov.iov_base = send_buf; + iov.iov_len = sizeof(send_buf); + + work.threshold = threshold; + work.triggering_cntr = res.cntr; + work.completion_cntr = res.cntr; + + msg.ep = res.ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = res.loopback; + msg.flags = FI_TRANSMIT_COMPLETE; + + work.op_type = FI_OP_SEND; + work.op.msg = &msg; + + /* Continue trying to queue multiple TLEs and free them. 
*/ + for (i = 0; i < count; i++) { + while (true) { + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); + test_assert(((ret == FI_SUCCESS) && limited) || (ret == -FI_ENOSPC), + "FI_QUEUE_WORK failed %d", ret); + + if (ret == -FI_ENOSPC) + break; + } + + test_assert((fi_control(&res.dom->fid, FI_FLUSH_WORK, NULL) == FI_SUCCESS), + "FI_FLUSH_WORK failed"); + } + + deferred_work_resources_teardown(&res); + + exit(EXIT_SUCCESS); +} + +#define TLE_POOLS 4U + +Test(deferred_work_trig_op_limit, enforce_limit_multi_process) +{ + struct deferred_work_resources res = {}; + int trig_op_count = 100; + int ret; + union c_cq_sts_max_tle_in_use max_in_use = {}; + pid_t pid = -1; + int status; + int i; + bool found_max_in_use = false; + int num_forks = 5; + bool limited = triggered_ops_limited(); + + ret = cxil_open_device(0, &res.dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d\n", ret); + + ret = cxil_map_csr(res.dev); + cr_assert_eq(ret, 0, "cxil_map_csr failed: %d\n", ret); + + res.service_id = alloc_service(res.dev, trig_op_count); + cr_assert_gt(res.service_id, 0, "alloc_service() failed: %d\n", + res.service_id); + + for (i = 0; i < TLE_POOLS; i++) { + ret = cxil_write_csr(res.dev, C_CQ_STS_MAX_TLE_IN_USE(i), + &max_in_use, sizeof(max_in_use)); + cr_assert_eq(ret, 0, "cxil_write_csr failed: %d\n", ret); + } + + for (i = 0; i < num_forks; i++) { + pid = fork(); + if (pid == 0) + run_multi_process_dwq_test(res.service_id); + } + + wait(&status); + + for (i = 0; i < TLE_POOLS; i++) { + ret = cxil_read_csr(res.dev, C_CQ_STS_MAX_TLE_IN_USE(i), + &max_in_use, sizeof(max_in_use)); + cr_assert_eq(ret, 0, "cxil_read_csr failed: %d\n", ret); + + fprintf(stderr, "%d max_in_use.max = %d\n", i, max_in_use.max); + + if (max_in_use.max >= trig_op_count && max_in_use.max < (trig_op_count + 8)) { + found_max_in_use = true; + break; + } + } + if (limited) + cr_assert_eq(found_max_in_use, true, "Triggered op limit exceeded\n"); + + while ((ret = cxil_destroy_svc(res.dev, res.service_id)) == -EBUSY) {} + cr_assert(ret == 0, "cxil_destroy_svc failed: %d\n", ret); + + cxil_close_device(res.dev); +} diff --git a/prov/cxi/test/domain.c b/prov/cxi/test/domain.c new file mode 100644 index 00000000000..f0dee9f0b8d --- /dev/null +++ b/prov/cxi/test/domain.c @@ -0,0 +1,421 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(domain, .init = cxit_setup_domain, .fini = cxit_teardown_domain, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic domain creation */ +Test(domain, simple) +{ + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + cxit_destroy_domain(); +} + +/* Test use of topology ops */ +Test(domain, topology) +{ + unsigned int group_num, switch_num, port_num; + int ret; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + ret = dom_ops->topology(&cxit_domain->fid, &group_num, &switch_num, + &port_num); + cr_assert_eq(ret, FI_SUCCESS, "topology failed: %d\n", ret); + + ret = dom_ops->topology(&cxit_domain->fid, NULL, &switch_num, + &port_num); + cr_assert_eq(ret, FI_SUCCESS, "null group topology failed: %d\n", ret); + + ret = dom_ops->topology(&cxit_domain->fid, &group_num, NULL, + &port_num); + cr_assert_eq(ret, FI_SUCCESS, "null switch topology failed: %d\n", ret); + + ret = dom_ops->topology(&cxit_domain->fid, &group_num, &switch_num, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "null port topology failed: 
%d\n", ret); + + cxit_destroy_domain(); +} + +Test(domain, enable_hybrid_mr_desc) +{ + int ret; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + ret = dom_ops->enable_hybrid_mr_desc(&cxit_domain->fid, true); + cr_assert_eq(ret, FI_SUCCESS, "enable_hybrid_mr_desc failed: %d\n", + ret); + + cxit_destroy_domain(); +} + +Test(domain, ep_get_unexp_msgs) +{ + size_t num_ux_ret; + size_t num_ux; + size_t addrlen = sizeof(cxit_ep_addr); + int ret; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); + + num_ux_ret = dom_ops->ep_get_unexp_msgs(cxit_ep, NULL, 0, + NULL, &num_ux); + cr_assert_eq(num_ux_ret, 0, "ep_get_unexp_msgs bad return\n"); + cr_assert_eq(num_ux, 0, "ep_get_unexp_msgs ux_count not 0\n"); + + /* Tear down RMA objects */ + cxit_destroy_ep(); + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_destroy_domain(); +} + +Test(domain, get_dwq_depth) +{ + int ret; + size_t depth; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + ret = dom_ops->get_dwq_depth(&cxit_domain->fid, &depth); + cr_assert_eq(ret, FI_SUCCESS, "get_dwq_depth failed: %d\n", + ret); + + cr_assert(depth > 0); + + cxit_destroy_domain(); +} + +Test(domain, enable_mr_match_events) +{ + int ret; + struct cxip_domain *cxip_dom; + struct cxip_mr *cxip_mr; + uint64_t key = 50; + struct mem_region region; + bool enable; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + cxip_dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(cxip_env.mr_match_events, + cxip_dom->mr_match_events, "Global setting failed"); + + if (!cxip_env.mr_match_events) { + enable = true; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_MR_MATCH_EVENTS, &enable); + cr_assert_eq(ret, FI_SUCCESS, + "enable_mr_match_events failed: %d", ret); + + cr_assert_eq(cxip_dom->mr_match_events, true, + "domain mr_match_events not set"); + } + + /* MR type established, setup RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert_eq(ret, FI_SUCCESS, "EP enable failed %d", ret); + + ret = mr_create(8, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key, ®ion); + cr_assert_eq(ret, FI_SUCCESS, "MR create failed %d", ret); + + cxip_mr = container_of(region.mr, struct cxip_mr, mr_fid); + cr_assert_eq(cxip_mr->count_events, true, + "MR match events not set"); + + mr_destroy(®ion); + + /* Tear down RMA objects */ + cxit_destroy_ep(); + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_destroy_domain(); +} + +Test(domain, enable_optimized_mrs) +{ + int ret; + struct cxip_domain *cxip_dom; + bool optimized; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + cxip_dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(cxip_env.optimized_mrs, + cxip_dom->optimized_mrs, "Global setting failed"); + + /* Disable optimized MRs for the domain */ + 
ret = dom_ops->enable_optimized_mrs(&cxit_domain->fid, false); + optimized = false; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_OPTIMIZED_MRS, &optimized); + if (cxip_dom->is_prov_key) { + cr_assert_eq(ret, FI_SUCCESS, "Unexpected call failure"); + cr_assert_eq(cxip_dom->optimized_mrs, false, "Disable failed"); + } else { + cr_assert_eq(ret, -FI_EINVAL, "Client key check failed"); + cr_assert_eq(cxip_dom->optimized_mrs, cxip_env.optimized_mrs, + "Client key altered domain specific setting"); + } + + /* Enable optimized MRs for the domain */ + optimized = true; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_OPTIMIZED_MRS, &optimized); + if (cxip_dom->is_prov_key) { + cr_assert_eq(ret, FI_SUCCESS, "Unexpected call failure"); + cr_assert_eq(cxip_dom->optimized_mrs, true, "Enable failed"); + } else { + cr_assert_eq(ret, -FI_EINVAL, "Client key check failed"); + cr_assert_eq(cxip_dom->optimized_mrs, cxip_env.optimized_mrs, + "Client key altered domain specific setting"); + } + + cxit_destroy_domain(); +} + +Test(domain, disable_prov_key_cache) +{ + int ret; + struct cxip_domain *cxip_dom; + bool enable = false; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + cxip_dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(cxip_env.prov_key_cache, + cxip_dom->prov_key_cache, "Global setting failed"); + + ret = fi_control(&cxit_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, + &enable); + + if (cxip_dom->is_prov_key) { + cr_assert_eq(ret, FI_SUCCESS, "Unexpected failure %d", ret); + cr_assert_eq(cxip_dom->prov_key_cache, false, "Update failed"); + } else { + cr_assert_eq(ret, -FI_EINVAL, "Unexpected success"); + cr_assert_eq(cxip_env.prov_key_cache, + cxip_dom->prov_key_cache, "Unexpected update"); + } + + cxit_destroy_domain(); +} + +static const char *_fi_coll_to_text(enum fi_collective_op coll) +{ + switch (coll) { + case FI_BARRIER: return "FI_BARRIER"; + case FI_BROADCAST: return "FI_BROADCAST"; + case FI_ALLTOALL: return "FI_ALLTOALL"; + case FI_ALLREDUCE: return "FI_ALLREDUCE"; + case FI_ALLGATHER: return "FI_ALLGATHER"; + case FI_REDUCE_SCATTER: return "FI_REDUCE_SCATTER"; + case FI_REDUCE: return "FI_REDUCE"; + case FI_SCATTER: return "FI_SCATTER"; + case FI_GATHER: return "FI_GATHER"; + default: return "NOCOLL"; + } +} + +static const char *_fi_op_to_text(enum fi_op op) +{ + switch ((int)op) { + case FI_MIN: return "FI_MIN"; + case FI_MAX: return "FI_MAX"; + case FI_SUM: return "FI_SUM"; + case FI_PROD: return "FI_PROD"; + case FI_LOR: return "FI_LOR"; + case FI_LAND: return "FI_LAND"; + case FI_BOR: return "FI_BOR"; + case FI_BAND: return "FI_BAND"; + case FI_LXOR: return "FI_LXOR"; + case FI_BXOR: return "FI_BXOR"; + case FI_ATOMIC_READ: return "FI_ATOMIC_READ"; + case FI_ATOMIC_WRITE: return "FI_ATOMIC_WRITE"; + case FI_CSWAP: return "FI_CSWAP"; + case FI_CSWAP_NE: return "FI_CSWAP_NE"; + case FI_CSWAP_LE: return "FI_CSWAP_LE"; + case FI_CSWAP_LT: return "FI_CSWAP_LT"; + case FI_CSWAP_GE: return "FI_CSWAP_GE"; + case FI_CSWAP_GT: return "FI_CSWAP_GT"; + case FI_MSWAP: return "FI_MSWAP"; + case FI_NOOP: return "FI_NOOP"; + default: return "NOOP"; + } +} + +static const char *_fi_datatype_to_text(enum fi_datatype datatype) +{ + switch ((int)datatype) { + case FI_INT8: return "FI_INT8"; + case FI_UINT8: return "FI_UINT8"; + case FI_INT16: return "FI_INT16"; + case FI_UINT16: return "FI_UINT16"; + case FI_INT32: return "FI_INT32"; + case FI_UINT32: return "FI_UINT32"; + case FI_INT64: return "FI_INT64"; + case FI_UINT64: 
return "FI_UINT64"; + case FI_FLOAT: return "FI_FLOAT"; + case FI_DOUBLE: return "FI_DOUBLE"; + case FI_FLOAT_COMPLEX: return "FI_FLOAT_COMPLEX"; + case FI_DOUBLE_COMPLEX: return "FI_DOUBLE_COMPLEX"; + case FI_LONG_DOUBLE: return "FI_LONG_DOUBLE"; + case FI_LONG_DOUBLE_COMPLEX: return "FI_LONG_DOUBLE_COMPLEX"; + case FI_VOID: return "FI_VOID"; + default: return "NOTYPE"; + } +} + +static void _test_coll_info(enum fi_collective_op coll, + enum fi_op op, + enum fi_datatype dtyp, + size_t count, size_t size, int exp) +{ + struct fi_collective_attr attr, *attrp; + const char *collname = _fi_coll_to_text(coll); + const char *opname = _fi_op_to_text(op); + const char *dtypname = _fi_datatype_to_text(dtyp); + int ret; + + memset(&attr, 0, sizeof(attr)); + attr.op = op; + attr.datatype = dtyp; + attrp = (op == -1) ? NULL : &attr; + ret = fi_query_collective(cxit_domain, coll, attrp, 0L); + cr_assert_eq(ret, exp, + "query(%s attr.op=%s %s)=%s expect=%s\n", + collname, opname, dtypname, + fi_strerror(ret), fi_strerror(exp)); + if (!attrp || ret) + return; + + cr_assert_eq(attr.datatype_attr.count, count, + "query(%s attr.op=%s %s)count=%ld expect=%ld\n", + collname, opname, dtypname, + attr.datatype_attr.count, count); + cr_assert_eq(attr.datatype_attr.size, size, + "query(%s attr.op=%s %s)size=%ld expect=%ld\n", + collname, opname, dtypname, + attr.datatype_attr.size, size); +} + +Test(domain, coll_info) +{ + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + _test_coll_info(FI_BARRIER, -1, -1, 0, 0, FI_SUCCESS); + _test_coll_info(FI_BARRIER, FI_NOOP, FI_VOID, 0, 0, FI_SUCCESS); + + _test_coll_info(FI_BROADCAST, -1, FI_VOID, 0, 0, -FI_EINVAL); + _test_coll_info(FI_BROADCAST, FI_SUM, FI_VOID, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_BROADCAST, FI_ATOMIC_WRITE, FI_UINT8, 32, 1, + FI_SUCCESS); + + _test_coll_info(FI_REDUCE, FI_ATOMIC_WRITE, -1, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_REDUCE, FI_BOR, FI_INT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_INT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_INT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_UINT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_UINT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_UINT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_UINT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_INT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_INT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_INT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_UINT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_UINT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_UINT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_UINT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_INT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_INT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_INT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_UINT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_UINT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_UINT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_UINT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_MIN, 
FI_UINT64, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_REDUCE, FI_MIN, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_MIN, FI_DOUBLE, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_MAX, FI_UINT64, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_REDUCE, FI_MAX, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_MAX, FI_DOUBLE, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_SUM, FI_UINT64, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_REDUCE, FI_SUM, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_SUM, FI_DOUBLE, 4, 8, FI_SUCCESS); + + cxit_destroy_domain(); +} + +TestSuite(domain_cntrs, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic counter read */ +Test(domain_cntrs, cntr_read) +{ + int ret; + uint64_t value; + struct timespec ts; + + ret = dom_ops->cntr_read(&cxit_domain->fid, C_CNTR_LPE_SUCCESS_CNTR, + &value, &ts); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + printf("LPE_SUCCESS_CNTR: %lu\n", value); +} diff --git a/prov/cxi/test/ep.c b/prov/cxi/test/ep.c new file mode 100644 index 00000000000..dab6ed9a37b --- /dev/null +++ b/prov/cxi/test/ep.c @@ -0,0 +1,1914 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(ep, .init = cxit_setup_ep, .fini = cxit_teardown_ep, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic EP creation */ +Test(ep, simple) +{ + cxit_create_ep(); + + cxit_destroy_ep(); +} + +/* Test NULL parameter passed with EP creation */ +Test(ep, ep_null_info) +{ + int ret; + + ret = fi_endpoint(cxit_domain, NULL, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Failure with NULL info. %d", ret); +} + +/* Test NULL parameter passed with EP creation */ +Test(ep, ep_null_ep) +{ + int ret; + + ret = fi_endpoint(cxit_domain, cxit_fi, NULL, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Failure with NULL ep. %d", ret); +} + +struct ep_test_params { + void *context; + enum fi_ep_type type; + int retval; +}; + +static struct ep_test_params ep_ep_params[] = { + {.type = FI_EP_RDM, + .retval = FI_SUCCESS}, + {.type = FI_EP_UNSPEC, + .retval = FI_SUCCESS}, + {.type = FI_EP_MSG, + .retval = -FI_EINVAL}, + {.type = FI_EP_DGRAM, + .retval = -FI_EINVAL}, + {.type = FI_EP_SOCK_STREAM, + .retval = -FI_EINVAL}, + {.type = FI_EP_SOCK_DGRAM, + .retval = -FI_EINVAL}, + {.type = FI_EP_RDM, + .context = (void *)0xabcdef, + .retval = FI_SUCCESS}, +}; + +ParameterizedTestParameters(ep, fi_ep_types) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(ep_ep_params); + return cr_make_param_array(struct ep_test_params, ep_ep_params, + param_sz); +} + +ParameterizedTest(struct ep_test_params *param, ep, fi_ep_types) +{ + int ret; + struct cxip_ep *cep; + + cxit_fi->ep_attr->type = param->type; + cxit_ep = NULL; + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, param->context); + cr_assert_eq(ret, param->retval, + "fi_endpoint() error for type %d. 
%d != %d", + param->type, ret, param->retval); + + if (ret != FI_SUCCESS) + return; + + cr_assert_not_null(cxit_ep); + cr_expect_eq(cxit_ep->fid.fclass, FI_CLASS_EP); + cr_expect_eq(cxit_ep->fid.context, param->context); + cep = container_of(cxit_ep, struct cxip_ep, ep); + cr_expect_not_null(cep->ep_obj); + + cxit_destroy_ep(); +} + +/* Test Passive EP creation is not supported */ +Test(ep, passive_ep) +{ + int ret; + struct fid_pep *pep = NULL; + + ret = fi_passive_ep(cxit_fabric, cxit_fi, &pep, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "Failure with fi_passive_ep. %d", ret); + cr_assert_null(pep); +} + +Test(ep, ep_bind_null_bind_obj) +{ + int ret; + + cxit_create_ep(); + + ret = fi_ep_bind(cxit_ep, NULL, 0); + cr_assert_eq(ret, -FI_EINVAL); + + cxit_destroy_ep(); +} + +Test(ep, ep_bind_invalid_fclass) +{ + int ret; + + cxit_create_ep(); + cxit_create_av(); + + /* try to bind an unsupported class type */ + cxit_ep->fid.fclass = FI_CLASS_PEP; + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + cr_assert_eq(ret, -FI_EINVAL); + cxit_ep->fid.fclass = FI_CLASS_EP; + + cxit_destroy_av(); + cxit_destroy_ep(); +} + +Test(ep, ep_bind_av) +{ + struct cxip_ep *ep; + struct cxip_av *av; + + cxit_create_ep(); + cxit_create_av(); + + cxit_bind_av(); + + av = container_of(cxit_av, struct cxip_av, av_fid.fid); + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + + cr_assert_not_null(ep->ep_obj); + cr_assert_eq(ep->ep_obj->av, av); + + cxit_destroy_ep(); + cxit_destroy_av(); +} + +Test(ep, ep_bind_eq) +{ + int ret; + + /* order is not important */ + cxit_create_eq(); + cxit_create_ep(); + + ret = fi_ep_bind(cxit_ep, &cxit_eq->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_eq_bind EQ failed %d", ret); + + /* order is important */ + cxit_destroy_ep(); + cxit_destroy_eq(); +} + +Test(ep, ep_bind_mr) +{ + int ret; + + /* + * At the time of implementing this test MRs were not supported by the + * CXI provider. Fake attempting to register a MR with a EP using an AV + */ + cxit_create_ep(); + cxit_create_av(); + + cxit_av->fid.fclass = FI_CLASS_MR; + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + cr_assert_eq(ret, -FI_EINVAL, "Bind (fake) MR to EP. 
%d", ret); + cxit_av->fid.fclass = FI_CLASS_AV; + + cxit_destroy_ep(); + cxit_destroy_av(); +} + +Test(ep, ep_bind_cq) +{ + struct cxip_ep *ep; + struct cxip_cq *rx_cq, *tx_cq; + + cxit_create_ep(); + cxit_create_cqs(); + cr_assert_not_null(cxit_tx_cq); + cr_assert_not_null(cxit_rx_cq); + + cxit_bind_cqs(); + + rx_cq = container_of(cxit_rx_cq, struct cxip_cq, util_cq.cq_fid.fid); + tx_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid.fid); + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + + cr_assert_not_null(ep->ep_obj); + cr_assert_eq(ep->ep.fid.fclass, FI_CLASS_EP); + cr_assert_eq(ep->ep_obj->txc.send_cq, tx_cq); + cr_assert_eq(ep->ep_obj->rxc.recv_cq, rx_cq); + + cxit_destroy_ep(); + cxit_destroy_cqs(); +} + +Test(ep, ep_bind_cq_eps) +{ + struct fid_ep *fid_ep2; + struct cxip_ep *ep; + struct cxip_ep *ep2; + int ret; + + cxit_create_ep(); + cxit_create_cqs(); + cr_assert_not_null(cxit_tx_cq); + cr_assert_not_null(cxit_rx_cq); + + cxit_bind_cqs(); + + /* Create second EP */ + ret = fi_endpoint(cxit_domain, cxit_fi, &fid_ep2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + cr_assert_not_null(fid_ep2); + + /* Bind same CQs to second EP */ + ret = fi_ep_bind(fid_ep2, &cxit_tx_cq->fid, cxit_tx_cq_bind_flags); + cr_assert(!ret, "fi_ep_bind TX CQ to 2nd EP"); + + ret = fi_ep_bind(fid_ep2, &cxit_rx_cq->fid, cxit_rx_cq_bind_flags); + cr_assert(!ret, "fi_ep_bind RX CQ to 2nd EP"); + + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + cr_assert_not_null(ep->ep_obj); + ep2 = container_of(fid_ep2, struct cxip_ep, ep.fid); + cr_assert_not_null(ep2->ep_obj); + + cr_assert_eq(ep->ep_obj->txc.send_cq, ep2->ep_obj->txc.send_cq, + "Send CQ mismatch"); + cr_assert_eq(ep->ep_obj->rxc.recv_cq, ep2->ep_obj->rxc.recv_cq, + "Receive CQ mismatch"); + + ret = fi_close(&fid_ep2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint"); + + cxit_destroy_ep(); + cxit_destroy_cqs(); +} + +Test(ep, ep_bind_cntr) +{ + int ret; + + cxit_create_ep(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_av(); + cxit_bind_av(); + + cxit_create_cntrs(); + cxit_bind_cntrs(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS); + + cxit_destroy_ep(); + cxit_destroy_cntrs(); + cxit_destroy_av(); + cxit_destroy_cqs(); +} + +Test(ep, ep_bind_stx_ctx) +{ + int ret; + struct fi_tx_attr *attr = NULL; + void *context = NULL; + + ret = fi_stx_context(cxit_domain, attr, NULL, context); + cr_assert_eq(ret, -FI_ENOSYS, + "TODO Add test for STX CTXs binding to the endpoint when implemented"); +} + +Test(ep, ep_bind_srx_ctx) +{ + int ret; + struct fi_rx_attr *attr = NULL; + void *context = NULL; + + ret = fi_srx_context(cxit_domain, attr, NULL, context); + cr_assert_eq(ret, -FI_ENOSYS, + "TODO Add test for SRX CTXs binding to the endpoint when implemented"); +} + +Test(ep, ep_bind_unhandled) +{ + int ret; + + cxit_create_ep(); + cxit_create_av(); + + /* Emulate a different type of object type */ + cxit_av->fid.fclass = -1; + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_ep_bind unhandled object. 
%d", ret); + cxit_av->fid.fclass = FI_CLASS_AV; + + cxit_destroy_ep(); + cxit_destroy_av(); +} + +Test(ep, cancel_ep) +{ + int ret; + + cxit_create_ep(); + + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_EOPBADSTATE); + + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS); + + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_ENOENT); + + ret = fi_cancel(&cxit_ep->fid, (void *)1); + cr_assert_eq(ret, -FI_ENOENT); + + cxit_destroy_ep(); + cxit_destroy_av(); + cxit_destroy_cqs(); +} + +Test(ep, cancel_unhandled) +{ + int ret; + + cxit_create_ep(); + + /* Emulate a different type of object type */ + cxit_ep->fid.fclass = FI_CLASS_PEP; + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_EINVAL); + cxit_ep->fid.fclass = FI_CLASS_EP; + + cxit_destroy_ep(); +} + +Test(ep, control_unhandled_obj) +{ + int ret; + + cxit_create_ep(); + + /* Emulate a different type of object type */ + cxit_ep->fid.fclass = FI_CLASS_PEP; + ret = fi_control(&cxit_ep->fid, -1, NULL); + cr_assert_eq(ret, -FI_EINVAL); + cxit_ep->fid.fclass = FI_CLASS_EP; + + cxit_destroy_ep(); +} + +Test(ep, control_unhandled_cmd) +{ + int ret; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, -1, NULL); + cr_assert_eq(ret, -FI_EINVAL); + + cxit_destroy_ep(); +} + +Test(ep, control_null_fid_alias) +{ + int ret; + struct fi_alias alias = {0}; + + cxit_create_ep(); + + /* A null alias.fid causes -FI_EINVAL */ + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_ALIAS. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_empty_alias) +{ + int ret; + struct fi_alias alias = {0}; + struct fid *alias_fid; + + cxit_create_ep(); + + /* Empty alias.flags causes -FI_EINVAL */ + alias.fid = &alias_fid; + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_ALIAS. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_bad_flags_alias) +{ + int ret; + struct fi_alias alias = {0}; + + cxit_create_ep(); + + /* Both Tx and Rx flags causes -FI_EINVAL */ + alias.flags = FI_TRANSMIT | FI_RECV; + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_ALIAS. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_tx_flags_alias) +{ + int ret; + struct fi_alias alias = {0}; + struct fid *alias_fid = NULL; + struct cxip_ep *cxi_ep, *alias_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + alias.fid = &alias_fid; + alias.flags = FI_TRANSMIT; + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_ALIAS. 
%d", ret); + cr_assert_not_null(alias_fid); + + /* verify alias vs cxit_ep */ + alias_ep = container_of(alias_fid, struct cxip_ep, ep.fid); + cr_assert_eq(alias_ep->ep_obj, cxi_ep->ep_obj, "EP Attr"); + cr_assert_eq(alias_ep->is_alias, 1, "EP is_alias"); + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 1, "EP refs 1"); + + /* close alias */ + ret = fi_close(alias_fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint"); + alias_fid = NULL; + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 0, "EP refs 0"); + + cxit_destroy_ep(); +} + +Test(ep, control_rx_flags_alias) +{ + int ret; + struct fi_alias alias = {0}; + struct fid *alias_fid = NULL; + struct cxip_ep *cxi_ep, *alias_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + alias.fid = &alias_fid; + alias.flags = FI_RECV; + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_ALIAS. %d", ret); + cr_assert_not_null(alias_fid); + + alias_ep = container_of(alias_fid, struct cxip_ep, ep.fid); + cr_assert_eq(alias_ep->ep_obj, cxi_ep->ep_obj, "EP Attr"); + cr_assert_eq(alias_ep->is_alias, 1, "EP is_alias"); + cr_assert_not_null(cxi_ep->ep_obj, "EP attr NULL"); + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 1, "EP refs 1"); + + /* close alias */ + ret = fi_close(alias_fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint"); + alias_fid = NULL; + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 0, "EP refs 0"); + + cxit_destroy_ep(); +} + +Test(ep, control_getopsflag_both_tx_rx) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_RECV; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_GETOPSFLAG TX/RX. %d", + ret); + + cxit_destroy_ep(); +} + +Test(ep, control_getopsflag_no_flags) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_RECV; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_GETOPSFLAG 0. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_getopsflag_tx) +{ + int ret; + uint64_t flags = FI_TRANSMIT; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_GETOPSFLAG TX. %d", ret); + cr_assert_eq(cxi_ep->tx_attr.op_flags, flags, + "fi_control FI_GETOPSFLAG Flag mismatch. %" PRIx64 " != %" + PRIx64 " ", cxi_ep->tx_attr.op_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_getopsflag_rx) +{ + int ret; + uint64_t flags = FI_RECV; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_GETOPSFLAG RX. %d", ret); + cr_assert_eq(cxi_ep->rx_attr.op_flags, flags, + "fi_control FI_GETOPSFLAG Flag mismatch. %" PRIx64 " != %" + PRIx64 " ", cxi_ep->rx_attr.op_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_both_tx_rx) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_RECV; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_SETOPSFLAG TX/RX. 
%d", + ret); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_no_flags) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_RECV; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_SETOPSFLAG 0. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_tx) +{ + int ret; + uint64_t flags = (FI_TRANSMIT | FI_MSG | FI_TRIGGER | + FI_DELIVERY_COMPLETE); + uint64_t tx_flags; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_SETOPSFLAG TX. %d", ret); + flags &= ~FI_TRANSMIT; + tx_flags = cxi_ep->tx_attr.op_flags; + cr_assert_eq(tx_flags, flags, + "fi_control FI_SETOPSFLAG TX Flag mismatch. %" PRIx64 + " != %" PRIx64, tx_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_tx_complete) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_MSG | FI_TRIGGER | FI_AFFINITY; + uint64_t tx_flags; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_SETOPSFLAG TX. %d", ret); + flags &= ~FI_TRANSMIT; + flags |= FI_TRANSMIT_COMPLETE; + tx_flags = cxi_ep->tx_attr.op_flags; + cr_assert_eq(tx_flags, flags, + "fi_control FI_SETOPSFLAG TXcomp Flag mismatch. %" PRIx64 + " != %" PRIx64, tx_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_rx) +{ + int ret; + uint64_t flags = FI_RECV | FI_TAGGED | FI_NUMERICHOST | FI_EVENT; + uint64_t rx_flags; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_SETOPSFLAG RX. %d", ret); + flags &= ~FI_RECV; + rx_flags = cxi_ep->rx_attr.op_flags; + cr_assert_eq(rx_flags, flags, + "fi_control FI_SETOPSFLAG RX Flag mismatch. %" PRIx64 + " != %" PRIx64, rx_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_enable_nocq) +{ + int ret; + + cxit_create_ep(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert_eq(ret, -FI_ENOCQ, "fi_enable. %d", ret); + + cxit_destroy_ep(); + cxit_destroy_av(); +} + +Test(ep, control_enable_noav) +{ + int ret; + + cxit_create_ep(); + cxit_create_cqs(); + cxit_bind_cqs(); + + ret = fi_enable(cxit_ep); + cr_assert_eq(ret, -FI_ENOAV, "fi_enable. %d", ret); + + cxit_destroy_ep(); + cxit_destroy_cqs(); +} + +Test(ep, control_enable) +{ + int ret; + + cxit_create_ep(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert_eq(ret, FI_SUCCESS, "fi_enable. 
%d", ret); + + cxit_destroy_ep(); + cxit_destroy_av(); + cxit_destroy_cqs(); +} + +struct ep_ctrl_null_params { + int command; + int retval; +}; + +ParameterizedTestParameters(ep, ctrl_null_arg) +{ + size_t param_sz; + + static struct ep_ctrl_null_params ep_null_params[] = { + {.command = -1, + .retval = -FI_EINVAL}, + {.command = FI_SETOPSFLAG, + .retval = -FI_EINVAL}, + {.command = FI_ENABLE, + .retval = -FI_ENOAV}, + }; + + param_sz = ARRAY_SIZE(ep_null_params); + return cr_make_param_array(struct ep_ctrl_null_params, ep_null_params, + param_sz); +} + +ParameterizedTest(struct ep_ctrl_null_params *param, ep, ctrl_null_arg) +{ + int ret; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, param->command, NULL); + cr_assert_eq(ret, param->retval, "fi_control type %d. %d != %d", + param->command, ret, param->retval); + + cxit_destroy_ep(); +} + +struct ep_getopt_args { + int level; + int optname; + size_t *optval; + size_t *optlen; + int retval; +}; + +static size_t optvalue; +static size_t optlength = sizeof(size_t); +static struct ep_getopt_args ep_null_params[] = { + {.level = -1, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_CM_DATA_SIZE, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = -1, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = NULL, + .optlen = NULL, + .retval = -FI_EINVAL}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = &optvalue, + .optlen = NULL, + .retval = -FI_EINVAL}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = &optvalue, + .optlen = &optlength, + .retval = FI_SUCCESS}, +}; + +ParameterizedTestParameters(ep, getopt_args) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(ep_null_params); + return cr_make_param_array(struct ep_getopt_args, ep_null_params, + param_sz); +} + +ParameterizedTest(struct ep_getopt_args *param, ep, getopt_args) +{ + int ret; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_getopt(&cxit_ep->fid, param->level, param->optname, + (void *)param->optval, param->optlen); + cr_assert_eq(ret, param->retval, + "fi_getopt lvl %d name %d val %p len %p. %d != %d", + param->level, param->optname, param->optval, + param->optlen, ret, param->retval); + + if (ret == FI_SUCCESS) { + cr_assert_not_null(cxi_ep->ep_obj); + cr_assert_eq(*param->optval, cxi_ep->ep_obj->rxc.min_multi_recv, + "fi_getopt val mismatch. %zd != %zd", + *param->optval, + cxi_ep->ep_obj->rxc.min_multi_recv); + cr_assert_eq(*param->optlen, sizeof(size_t), + "fi_getopt len mismatch. 
%zd != %zd", + *param->optlen, sizeof(size_t)); + } + + cxit_destroy_ep(); +} + +struct ep_setopt_args { + int level; + int optname; + size_t optval; + size_t optlen; + int retval; +}; + +ParameterizedTestParameters(ep, setopt_args) +{ + size_t param_sz; + + static struct ep_setopt_args ep_null_params[] = { + {.level = -1, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_CM_DATA_SIZE, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = -1, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = 0, + .retval = -FI_EINVAL}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = 26, + .retval = FI_SUCCESS}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = 90001, + .retval = FI_SUCCESS}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = 1<<24, + .retval = -FI_EINVAL}, + }; + + param_sz = ARRAY_SIZE(ep_null_params); + return cr_make_param_array(struct ep_setopt_args, ep_null_params, + param_sz); +} + +ParameterizedTest(struct ep_setopt_args *param, ep, setopt_args) +{ + int ret; + struct cxip_ep *cxi_ep; + void *val = NULL; + + if (param->optval != 0) + val = ¶m->optval; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_setopt(&cxit_ep->fid, param->level, param->optname, + val, param->optlen); + cr_assert_eq(ret, param->retval, + "fi_setopt lvl %d name %d val %zd. %d != %d", + param->level, param->optname, param->optval, + ret, param->retval); + + if (ret == FI_SUCCESS) { + cr_assert_not_null(cxi_ep->ep_obj); + cr_assert_eq(param->optval, cxi_ep->ep_obj->rxc.min_multi_recv, + "fi_setopt val mismatch. %zd != %zd", + param->optval, cxi_ep->ep_obj->rxc.min_multi_recv); + } + + cxit_destroy_ep(); +} + +Test(ep, rx_ctx_ep) +{ + int ret; + + cxit_create_ep(); + + /* RX context doesn't work with anything but scalable eps */ + ret = fi_rx_context(cxit_ep, 0, NULL, NULL, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "fi_rx_context bad ep. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, tx_ctx_ep) +{ + int ret; + + cxit_create_ep(); + + /* RX context doesn't work with anything but scalable eps */ + ret = fi_tx_context(cxit_ep, 0, NULL, NULL, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "fi_tx_context bad ep. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, stx_ctx_null_stx) +{ + int ret; + struct fi_tx_attr *attr = NULL; + void *context = NULL; + + ret = fi_stx_context(cxit_domain, attr, NULL, context); + /* TODO Fix when fi_stx_context is implemented, should be -FI_EINVAL */ + cr_assert_eq(ret, -FI_ENOSYS, "fi_stx_context null stx. %d", ret); +} + +Test(ep, stx_ctx) +{ + int ret; + struct fi_tx_attr *attr = NULL; + struct fid_stx *stx; + struct cxip_ep *ep; + void *context = &ret; + struct cxip_domain *dom; + struct cxip_txc *txc; + int refs; + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + refs = ofi_atomic_get32(&dom->ref); + + ret = fi_stx_context(cxit_domain, attr, &stx, context); + + /* TODO Fix when fi_stx_context is implemented, should be FI_SUCCESS */ + cr_assert_eq(ret, -FI_ENOSYS, "fi_stx_context failed. 
%d", ret); + if (ret == -FI_ENOSYS) + return; + + ep = container_of(stx, struct cxip_ep, ep); + txc = &ep->ep_obj->txc; + + /* Validate stx */ + cr_assert_eq(txc->domain, dom); + cr_assert_eq(ofi_atomic_inc32(&dom->ref), refs + 1); + cr_assert_eq(ep->ep.fid.fclass, FI_CLASS_TX_CTX); + cr_assert_eq(ep->ep.fid.context, context); + + ret = fi_close(&stx->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close stx_ep. %d", ret); +} + +Test(ep, srx_ctx_null_srx) +{ + int ret; + struct fi_rx_attr *attr = NULL; + void *context = NULL; + + ret = fi_srx_context(cxit_domain, attr, NULL, context); + /* TODO Fix when fi_srx_context is implemented, should be -FI_EINVAL */ + cr_assert_eq(ret, -FI_ENOSYS, "fi_srx_context null srx. %d", ret); +} + +Test(ep, srx_ctx) +{ + int ret; + struct fi_rx_attr *attr = NULL; + struct fid_ep *srx; + struct cxip_ep *srx_ep; + void *context = &ret; + struct cxip_domain *dom; + struct cxip_rxc *rxc; + int refs; + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + refs = ofi_atomic_get32(&dom->ref); + + ret = fi_srx_context(cxit_domain, attr, &srx, context); + /* TODO Fix when fi_srx_context is implemented, should be FI_SUCCESS */ + cr_assert_eq(ret, -FI_ENOSYS, "fi_stx_context failed. %d", ret); + if (ret == -FI_ENOSYS) + return; + + srx_ep = container_of(srx, struct cxip_ep, ep); + rxc = &srx_ep->ep_obj->rxc; + + /* Validate stx */ + cr_assert_eq(rxc->domain, dom); + cr_assert_eq(ofi_atomic_inc32(&dom->ref), refs + 1); + cr_assert_eq(srx_ep->ep.fid.fclass, FI_CLASS_RX_CTX); + cr_assert_eq(srx_ep->ep.fid.context, context); + cr_assert_eq(rxc->state, RXC_ENABLED); + cr_assert_eq(rxc->min_multi_recv, CXIP_EP_MIN_MULTI_RECV); + + ret = fi_close(&srx->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close srx_ep. %d", ret); +} + +TestSuite(ep_init, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(ep_init, auth_key) +{ + int ret; + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + + /* Create fabric */ + cxit_setup_domain(); + + /* Try invalid auth key */ + cxit_fi->domain_attr->auth_key_size = 12345; + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + cr_assert_eq(ret, -FI_EINVAL); + + /* Set custom auth key in Domain */ + cxit_fi->domain_attr->auth_key = mem_dup(&auth_key, sizeof(auth_key)); + cxit_fi->domain_attr->auth_key_size = sizeof(auth_key); + + /* Create enabled Domain/EP */ + cxit_setup_rma(); + + cxit_teardown_rma(); + + /*---*/ + + cxit_setup_domain(); + cxit_create_domain(); + + /* Try invalid auth key */ + cxit_fi->ep_attr->auth_key_size = 12345; + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL); /* inconsistent error */ + + /* Set custom auth key in EP */ + auth_key.vni = 200; + + free(cxit_fi->ep_attr->auth_key); + cxit_fi->ep_attr->auth_key = mem_dup(&auth_key, sizeof(auth_key)); + cxit_fi->ep_attr->auth_key_size = sizeof(auth_key); + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL); + + /* Try mis-matched svc_id */ + auth_key.svc_id = 10; + auth_key.vni = 301; + + free(cxit_fi->ep_attr->auth_key); + cxit_fi->ep_attr->auth_key = mem_dup(&auth_key, sizeof(auth_key)); + cxit_fi->ep_attr->auth_key_size = sizeof(auth_key); + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL); + + cxit_destroy_domain(); + cxit_teardown_domain(); +} + +Test(ep_init, tclass) +{ + int ret; + + /* Create fabric */ + cxit_setup_domain(); + + /* Try invalid auth key */ + cxit_fi->domain_attr->tclass = 
FI_TC_DSCP; + + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + cr_assert_eq(ret, -FI_EINVAL, "ret is: %d\n", ret); + + /* Set custom TC in Domain */ + cxit_fi->domain_attr->tclass = FI_TC_LOW_LATENCY; + + /* Create enabled Domain/EP */ + cxit_setup_rma(); + + cxit_teardown_rma(); + + /*---*/ + + cxit_setup_domain(); + cxit_create_domain(); + + /* Try invalid auth key */ + cxit_fi->tx_attr->tclass = FI_TC_DSCP; + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "ret is: %d\n", ret); + + /* Set custom TC in EP */ + cxit_fi->tx_attr->tclass = FI_TC_DEDICATED_ACCESS; + + /* Create enabled Domain/EP */ + cxit_setup_rma(); + + cxit_teardown_rma(); +} + +Test(ep, invalid_tx_attr_size) +{ + struct fid_ep *tmp_ep; + int ret; + + /* Invalid TX attr size. */ + cxit_fi->tx_attr->size = 1234567; + + ret = fi_endpoint(cxit_domain, cxit_fi, &tmp_ep, NULL); + cr_assert(ret != FI_SUCCESS, "fi_endpoint"); +} + +Test(ep, valid_tx_attr_size) +{ + struct fid_ep *tmp_ep; + int ret; + + /* Invalid TX attr size. */ + cxit_fi->tx_attr->size = 16384; + + ret = fi_endpoint(cxit_domain, cxit_fi, &tmp_ep, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + + ret = fi_close(&tmp_ep->fid); + cr_assert(ret == FI_SUCCESS, "fi_close EP"); +} + +Test(ep, valid_tx_attr_size_hints) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + unsigned int tx_size = 1024; + + hints = fi_allocinfo(); + cr_assert(hints != NULL, "fi_allocinfo"); + + hints->tx_attr->size = tx_size; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, hints, + &info); + cr_assert(ret == FI_SUCCESS); + + assert(info->tx_attr->size == tx_size); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +TestSuite(ep_tclass, .init = cxit_setup_tx_alias_rma, + .fini = cxit_teardown_tx_alias_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Add control test for setting of EP tclass. + * + * Test same for alias EP. + * + * Parameterized for all TCLASS values and bad values. + */ +struct ep_tclass_params { + int tclass; + int retval; +}; + +static struct ep_tclass_params tclass_params[] = { + {.tclass = 0, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_UNSPEC, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_DSCP, + .retval = -FI_EINVAL}, + {.tclass = FI_TC_LABEL, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_BEST_EFFORT, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_LOW_LATENCY, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_DEDICATED_ACCESS, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_BULK_DATA, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_SCAVENGER, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_NETWORK_CTRL, /* Not supported */ + .retval = -FI_EINVAL}, + {.tclass = FI_TC_NETWORK_CTRL + 1, /* Illegal */ + .retval = -FI_EINVAL}, +}; + +int set_ep_tclass(struct cxip_ep *ep, uint32_t tclass) +{ + int ret; + + ret = fi_set_val(&ep->ep.fid, FI_OPT_CXI_SET_TCLASS, + (void *)&tclass); + if (ret == FI_SUCCESS) { + if (tclass != FI_TC_UNSPEC) + cr_assert_eq(tclass, ep->tx_attr.tclass, + "update tclass mismatch. 
%d != %d", + tclass, ep->tx_attr.tclass); + else + cr_assert_neq(tclass, ep->tx_attr.tclass, + "FI_TC_UNSPEC tclass not updated"); + } + + return ret; +} + +ParameterizedTestParameters(ep_tclass, alias_set_tclass) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(tclass_params); + return cr_make_param_array(struct ep_tclass_params, + tclass_params, param_sz); +} + +/* Modify EP alias traffic class */ +ParameterizedTest(struct ep_tclass_params *param, ep_tclass, + alias_set_tclass) +{ + int ret; + struct cxip_ep *cxi_ep; + struct cxip_ep *alias_ep = NULL; + uint32_t orig_ep_tclass; + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + orig_ep_tclass = cxi_ep->tx_attr.tclass; + + alias_ep = container_of(&cxit_tx_alias_ep->fid, struct cxip_ep, ep.fid); + cr_assert_not_null(alias_ep->ep_obj); + + ret = set_ep_tclass(alias_ep, param->tclass); + cr_assert_eq(ret, param->retval, + "fi_set_val for TCLASS %d", param->tclass); + + /* make sure only the alias EP tclass changed */ + cr_assert_eq(orig_ep_tclass, cxi_ep->tx_attr.tclass, + "Original EP tclass changed"); +} + +ParameterizedTestParameters(ep_tclass, set_tclass) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(tclass_params); + return cr_make_param_array(struct ep_tclass_params, + tclass_params, param_sz); +} + +/* Modify standard EP traffic class parameters */ +ParameterizedTest(struct ep_tclass_params *param, ep_tclass, set_tclass) +{ + int ret; + struct cxip_ep *cxi_ep; + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = set_ep_tclass(cxi_ep, param->tclass); + cr_assert_eq(ret, param->retval, + "fi_set_val for TCLASS %d", param->tclass); +} + +TestSuite(ep_caps, .timeout = CXIT_DEFAULT_TIMEOUT); + +void verify_ep_msg_cap(uint64_t flags) +{ + struct cxip_ep *ep; + int ret; + + cxit_setup_ep(); + + /* Set info TX/RX attribute appropriately */ + if (!(flags & FI_SEND)) + cxit_fi->tx_attr->caps &= ~(FI_SEND | FI_SEND); + if (!(flags & FI_RECV)) + cxit_fi->rx_attr->caps &= ~(FI_MSG | FI_RECV); + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "EP enable"); + + ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + /* Requires knowledge of implementation */ + if (flags & FI_SEND) { + cr_assert(ep->ep_obj->txc.enabled, "TX Enabled"); + cr_assert(ep->ep_obj->txc.send_cq != NULL, "Send CQ"); + } + + if (flags & FI_RECV) { + cr_assert(ep->ep_obj->rxc.state == RXC_ENABLED || + ep->ep_obj->rxc.state == RXC_ENABLED_SOFTWARE, + "RX Enabled"); + cr_assert(ep->ep_obj->rxc.recv_cq != NULL, "Receive CQ"); + cr_assert(ep->ep_obj->rxc.rx_evtq.eq != NULL, "RX H/W EQ"); + cr_assert(ep->ep_obj->rxc.rx_cmdq != NULL, "RX TGT CMDQ"); + cr_assert(ep->ep_obj->rxc.tx_cmdq != NULL, "RX TX CMDQ"); + } else { + cr_assert(ep->ep_obj->rxc.state == RXC_ENABLED, "R/X enabled"); + cr_assert(ep->ep_obj->rxc.rx_evtq.eq == NULL, "RX H/W EQ"); + cr_assert(ep->ep_obj->rxc.rx_cmdq == NULL, "RX TGT CMDQ"); + cr_assert(ep->ep_obj->rxc.tx_cmdq == NULL, "RX TX CMDQ"); + } + + cxit_teardown_rma(); +} + +static void verify_ep_msg_ops(uint64_t flags) +{ + bool recv; + bool send; + uint8_t *recv_buf; + uint8_t *send_buf; + int recv_len = 512; + int send_len = 512; + struct iovec riovec; + struct iovec siovec; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + int ret; + + recv = !!(flags & FI_RECV); + send = !!(flags & FI_SEND); + + cxit_setup_ep(); + + /* Set info TX/RX 
attribute appropriately */ + if (!send) + cxit_fi->tx_attr->caps &= ~(FI_MSG | FI_SEND); + if (!recv) + cxit_fi->rx_attr->caps &= ~(FI_MSG | FI_RECV); + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert_not_null(recv_buf); + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert_not_null(send_buf); + + /* Verify can not call API functions */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_recv"); + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + ret = fi_recvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_recvv"); + + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_recvmsg"); + + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_send"); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + ret = fi_sendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_sendv"); + + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_sendmsg"); + + ret = fi_inject(cxit_ep, send_buf, 8, cxit_ep_fi_addr); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_inject"); + + ret = fi_senddata(cxit_ep, send_buf, send_len, NULL, 0xa5a5, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_senddata"); + + ret = fi_injectdata(cxit_ep, send_buf, 8, 0xa5a5, cxit_ep_fi_addr); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_injectdata"); + + /* Enable EP */ + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "EP enable"); + + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, recv ? FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_recv"); + + ret = fi_recvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, recv ? FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_recvv"); + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, recv ? FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_recvmsg"); + + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, send ? FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_send"); + + ret = fi_sendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, send ? 
FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_sendv"); + + cxit_teardown_rma(); +} + +Test(ep_caps, msg_tx_rx) +{ + struct fi_info *info; + int ret; + + /* No hints */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, NULL, &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + cr_assert_eq(info->tx_attr->caps & FI_MSG, FI_MSG, + "FI_MSG TX returned"); + cr_assert_eq(info->tx_attr->caps & FI_SEND, FI_SEND, + "FI_SEND TX returned"); + cr_assert_eq(info->rx_attr->caps & FI_MSG, FI_MSG, + "FI_MSG RX returned"); + cr_assert_eq(info->rx_attr->caps & FI_RECV, FI_RECV, + "FI_RECV RX returned"); + verify_ep_msg_cap(FI_SEND | FI_RECV); + fi_freeinfo(info); + + /* hints->caps set to 0 */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = 0; + cxit_fi_hints->tx_attr->caps = 0; + cxit_fi_hints->rx_attr->caps = 0; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + cr_assert_eq(info->tx_attr->caps & FI_MSG, FI_MSG, + "FI_MSG TX returned"); + cr_assert_eq(info->tx_attr->caps & FI_SEND, FI_SEND, + "FI_SEND TX returned"); + cr_assert_eq(info->rx_attr->caps & FI_MSG, FI_MSG, + "FI_MSG RX returned"); + cr_assert_eq(info->rx_attr->caps & FI_RECV, FI_RECV, + "FI_RECV RX returned"); + verify_ep_msg_cap(FI_SEND | FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); + + /* hints->caps set to FI_MSG | FI_SEND | FI_RECV */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_SEND | FI_RECV; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + verify_ep_msg_cap(FI_SEND | FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); + + /* hints->caps set to FI_MSG implies FI_SEND and FI_RECV */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + verify_ep_msg_cap(FI_SEND | FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, msg_tx) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to FI_MSG | FI_SEND is TX message only EP */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_SEND; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, 
0, "FI_RECV not returned"); + verify_ep_msg_cap(FI_SEND); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, msg_rx) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to FI_MSG | FI_RECV is RX message only EP */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_RECV; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, 0, "FI_SEND not returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + verify_ep_msg_cap(FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, msg_rx_only_ops) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to FI_MSG | FI_RECV is RX message only EP */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_RECV; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + verify_ep_msg_ops(FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +/* Verify FI_RMA API ops set */ +extern struct fi_ops_rma cxip_ep_rma_ops; +extern struct fi_ops_rma cxip_ep_rma_no_ops; + +static void verify_ep_rma_ops(uint64_t caps) +{ + int ret; + + cxit_setup_ep(); + + cxit_fi->caps = caps; + cxit_fi->tx_attr->caps = caps; + + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + /* Enable EP */ + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "EP enable"); + + /* Verify correct function table is set */ + if (caps & FI_RMA && ofi_rma_initiate_allowed(caps)) + cr_assert_eq(cxit_ep->rma, &cxip_ep_rma_ops, + "FI_RMA ops not set"); + else + cr_assert_eq(cxit_ep->rma, &cxip_ep_rma_no_ops, + "FI_RMA ops set"); + + cxit_teardown_rma(); +} + +/* Verify FI_ATOMIC API ops enable/disable */ +extern struct fi_ops_atomic cxip_ep_atomic_ops; +extern struct fi_ops_atomic cxip_ep_atomic_no_ops; + +static void verify_ep_amo_ops(uint64_t caps) +{ + int ret; + + cxit_setup_ep(); + + cxit_fi->caps = caps; + cxit_fi->tx_attr->caps = caps; + + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + /* Enable EP */ + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "EP enable"); + + /* Verify correct function table is set */ + if (caps & FI_ATOMIC && ofi_rma_initiate_allowed(caps)) + cr_assert_eq(cxit_ep->atomic, &cxip_ep_atomic_ops, + "FI_ATOMIC ops not set"); + else + cr_assert_eq(cxit_ep->atomic, &cxip_ep_atomic_no_ops, + "FI_ATOMIC ops set"); + + cxit_teardown_rma(); +} + +/* test_cap is the caps that should be set */ +static void verify_caps_only(struct fi_info *info, + uint64_t test_cap) +{ + if (!(test_cap & FI_TAGGED)) + cr_assert_eq(info->caps & FI_TAGGED, 0, "FI_TAGGED set"); + if (!(test_cap & FI_ATOMIC)) + cr_assert_eq(info->caps & FI_ATOMIC, 0, "FI_ATOMIC set"); + if (!(test_cap & FI_RMA)) + cr_assert_eq(info->caps & FI_RMA, 0, "FI_RMA set"); + if (!(test_cap & FI_COLLECTIVE)) + cr_assert_eq(info->caps & FI_COLLECTIVE, 0, + "FI_COLLECTIVE set"); +} + +Test(ep_caps, msg_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_MSG, don't enable other API */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG; + ret = 
fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_MSG); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, tagged_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_TAGGED, don't enable other API */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_TAGGED; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_TAGGED); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, rma_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_RMA, don't enable other API */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_RMA; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_RMA); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, atomic_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_ATOMIC, don't enable other API */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_ATOMIC; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_ATOMIC); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, coll_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_COLLECTIVE enables only FI_MSG */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_COLLECTIVE; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_COLLECTIVE | FI_MSG); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, rma_initiator) +{ + verify_ep_rma_ops(FI_RMA | FI_READ | FI_WRITE); +} + +Test(ep_caps, rma_target_only) +{ + verify_ep_rma_ops(FI_RMA | FI_REMOTE_READ | FI_REMOTE_WRITE); +} + +Test(ep_caps, rma_amo_only) +{ + verify_ep_rma_ops(FI_ATOMIC | FI_READ | FI_WRITE); +} + +Test(ep_caps, rma_none) +{ + verify_ep_rma_ops(FI_MSG | FI_TAGGED); +} + +Test(ep_caps, amo_initiator) +{ + verify_ep_amo_ops(FI_ATOMIC | FI_READ | FI_WRITE); +} + +Test(ep_caps, amo_target_only) +{ + verify_ep_amo_ops(FI_ATOMIC | FI_REMOTE_READ | FI_REMOTE_WRITE); +} + +Test(ep_caps, amo_rma_only) +{ + verify_ep_amo_ops(FI_RMA | FI_READ | FI_WRITE); +} + +Test(ep_caps, amo_none) +{ + verify_ep_amo_ops(FI_MSG | FI_TAGGED); +} + +TestSuite(ep_locking, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(ep_locking, domain) +{ + struct cxip_domain *dom; + struct cxip_ep *ep; + struct cxip_cq *cq; + + cxit_setup_getinfo(); + + cxit_fi_hints->domain_attr->threading = FI_THREAD_DOMAIN; + cxit_setup_rma(); + + cr_assert_eq(cxit_fi->domain_attr->threading, FI_THREAD_DOMAIN, + "Threading"); + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(dom->trig_cmdq_lock.lock_type, OFI_LOCK_NONE, + "Domain trigger command lock"); + + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + cr_assert_eq(ep->ep_obj->lock.lock_type, OFI_LOCK_NONE, + "EP object lock"); + + cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, 
OFI_LOCK_NONE, + "TX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_NOOP, + "TX CQ entry lock"); + + cq = container_of(cxit_rx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_NONE, + "RX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_NOOP, + "RX CQ entry lock"); + + cxit_teardown_rma(); +} + +Test(ep_locking, completion) +{ + struct cxip_domain *dom; + struct cxip_ep *ep; + struct cxip_cq *cq; + + cxit_setup_getinfo(); + + cxit_fi_hints->domain_attr->threading = FI_THREAD_COMPLETION; + cxit_setup_rma(); + + cr_assert_eq(cxit_fi->domain_attr->threading, FI_THREAD_COMPLETION, + "Threading"); + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(dom->trig_cmdq_lock.lock_type, OFI_LOCK_SPINLOCK, + "Domain trigger command lock"); + + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + cr_assert_eq(ep->ep_obj->lock.lock_type, OFI_LOCK_NONE, + "EP object lock"); + + cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_NONE, + "TX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_NOOP, + "TX CQ entry lock"); + + cq = container_of(cxit_rx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_NONE, + "RX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_NOOP, + "RX CQ entry lock"); + + cxit_teardown_rma(); +} + +Test(ep_locking, safe) +{ + struct cxip_domain *dom; + struct cxip_ep *ep; + struct cxip_cq *cq; + + cxit_setup_getinfo(); + + cxit_fi_hints->domain_attr->threading = FI_THREAD_SAFE; + cxit_setup_rma(); + + cr_assert_eq(cxit_fi->domain_attr->threading, FI_THREAD_SAFE, + "Threading"); + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(dom->trig_cmdq_lock.lock_type, OFI_LOCK_SPINLOCK, + "Domain trigger command lock"); + + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + cr_assert_eq(ep->ep_obj->lock.lock_type, OFI_LOCK_SPINLOCK, + "EP object lock"); + + cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_SPINLOCK, + "TX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_SPINLOCK, + "TX CQ entry lock"); + + cq = container_of(cxit_rx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_SPINLOCK, + "RX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_SPINLOCK, + "RX CQ entry lock"); + + cxit_teardown_rma(); +} diff --git a/prov/cxi/test/eq.c b/prov/cxi/test/eq.c new file mode 100644 index 00000000000..00730982b22 --- /dev/null +++ b/prov/cxi/test/eq.c @@ -0,0 +1,37 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + */ + +/* Notes: + * + * This test is perfunctory at present. A fuller set of tests is available: + * + * virtualize.sh fabtests/unit/fi_eq_test + * + * TODO: current implementation does not support wait states. 
+ */ + +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(eq, .init = cxit_setup_eq, .fini = cxit_teardown_eq, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic CQ creation */ +Test(eq, simple) +{ + cxit_create_eq(); + cr_assert(cxit_eq != NULL); + cxit_destroy_eq(); +} + diff --git a/prov/cxi/test/fabric.c b/prov/cxi/test/fabric.c new file mode 100644 index 00000000000..4eb04e4cf0b --- /dev/null +++ b/prov/cxi/test/fabric.c @@ -0,0 +1,647 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2015-2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +static const char cxip_dom_fmt[] = "cxi%d"; + +static char *get_dom_name(int if_idx) +{ + char *dom; + int ret; + + ret = asprintf(&dom, cxip_dom_fmt, if_idx); + cr_assert(ret > 0); + + return dom; +} + +TestSuite(getinfo_env_vars, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(getinfo_env_vars, default_tx_size) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + struct fi_info *iter; + + ret = setenv("FI_CXI_DEFAULT_TX_SIZE", "17", 1); + cr_assert(ret == 0); + + hints = fi_allocinfo(); + cr_assert(hints); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert(hints->fabric_attr->prov_name); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + NULL, NULL, cxit_flags, hints, &info); + cr_assert(ret == FI_SUCCESS); + + iter = info; + while (iter) { + cr_assert(info->tx_attr->size == 17); + iter = iter->next; + } + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(getinfo_env_vars, default_rx_size) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + struct fi_info *iter; + + ret = setenv("FI_CXI_DEFAULT_RX_SIZE", "17", 1); + cr_assert(ret == 0); + + hints = fi_allocinfo(); + cr_assert(hints); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert(hints->fabric_attr->prov_name); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + NULL, NULL, cxit_flags, hints, &info); + cr_assert(ret == FI_SUCCESS); + + iter = info; + while (iter) { + cr_assert(info->rx_attr->size == 17); + iter = iter->next; + } + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +TestSuite(getinfo, .init = cxit_setup_getinfo, + .fini = cxit_teardown_getinfo, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test fabric selection with provider name */ +Test(getinfo, prov_name) +{ + int infos = 0; + + cxit_fi_hints->fabric_attr->prov_name = strdup(cxip_prov_name); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + + /* Make sure we have at least 1 FI for each IF */ + do { + cr_assert(!strcmp(cxit_fi->fabric_attr->prov_name, + cxip_prov_name)); + infos++; + } while ((cxit_fi = cxit_fi->next)); + cr_assert(infos >= cxit_n_ifs); +} + +/* Test fabric selection with domain name */ +Test(getinfo, dom_name) +{ + int infos = 0; + struct cxip_if *if_entry; + struct slist_entry *entry, *prev __attribute__ ((unused)); + + slist_foreach(&cxip_if_list, entry, prev) { + if_entry = container_of(entry, struct cxip_if, if_entry); + infos = 0; + + cxit_node = get_dom_name(if_entry->info->dev_id); + cxit_flags = FI_SOURCE; + printf("searching %s\n", cxit_node); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + + /* Make sure we have at least 1 FI for each IF */ + do { + cr_expect(!strcmp(cxit_fi->domain_attr->name, + cxit_node), 
+ "%s != %s\n", + cxit_fi->domain_attr->name, + cxit_fi_hints->domain_attr->name); + + cr_assert(!strcmp(cxit_fi->fabric_attr->prov_name, + cxip_prov_name)); + + cr_assert(!strcmp(cxit_fi->fabric_attr->name, + cxip_prov_name)); + + infos++; + } while ((cxit_fi = cxit_fi->next)); + cr_assert(infos >= 1); + + cxit_destroy_fabric_info(); + } + cr_assert(infos >= 1); +} + +/* Test fabric selection with fabric name */ +Test(getinfo, fab_name) +{ + int infos = 0; + struct slist_entry *entry, *prev __attribute__ ((unused)); + struct fi_info *fi; + + slist_foreach(&cxip_if_list, entry, prev) { + infos = 0; + + cxit_fi_hints->fabric_attr->name = strdup(cxip_prov_name); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + + fi = cxit_fi; + do { + /* Not all providers can be trusted to filter by fabric + * name */ + if (strcmp(fi->fabric_attr->prov_name, + cxip_prov_name)) + continue; + + cr_assert(!strcmp(fi->fabric_attr->name, + fi->fabric_attr->name)); + + infos++; + } while ((fi = fi->next)); + + cxit_destroy_fabric_info(); + } + cr_assert(infos); +} + +Test(getinfo, prov_version) +{ + cxit_fi_hints->fabric_attr->prov_name = strdup(cxip_prov_name); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + cr_assert(cxit_fi->fabric_attr != NULL); + + cr_assert(FI_MAJOR(cxit_fi->fabric_attr->prov_version) == + CXIP_MAJOR_VERSION, + "Major version wwrong, expected %d, version returned %d", + CXIP_MAJOR_VERSION, + FI_MAJOR(cxit_fi->fabric_attr->prov_version)); + cr_assert(FI_MINOR(cxit_fi->fabric_attr->prov_version) == + CXIP_MINOR_VERSION, + "Minor version wwrong, expected %d, version returned %d", + CXIP_MINOR_VERSION, + FI_MINOR(cxit_fi->fabric_attr->prov_version)); +} + +Test(getinfo, valid_av_auth_key) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps = FI_MSG | FI_TAGGED | FI_REMOTE_COMM; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_av_auth_key_not_null_domain_auth_key) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps = FI_MSG | FI_TAGGED | FI_REMOTE_COMM; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key = (void *)hints; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + hints->domain_attr->auth_key = NULL; + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_av_auth_key_not_null_ep_auth_key) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps = FI_MSG | FI_TAGGED | FI_REMOTE_COMM; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + 
hints->ep_attr->auth_key = (void *)hints; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + hints->ep_attr->auth_key = NULL; + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_av_auth_key_not_zero_ep_auth_key_size) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps = FI_MSG | FI_TAGGED | FI_REMOTE_COMM; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->ep_attr->auth_key_size = 1; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, valid_multiple_auth_keys_per_ep) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->domain_attr->max_ep_auth_key = 2; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->caps = FI_MSG; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->max_ep_auth_key, + hints->domain_attr->max_ep_auth_key); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_multiple_auth_keys_per_ep) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->domain_attr->max_ep_auth_key = (1 << 16); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->caps = FI_MSG; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_fi_directed_recv_with_multiple_auth_keys_per_ep) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->domain_attr->max_ep_auth_key = 2; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->caps = FI_MSG | FI_DIRECTED_RECV; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +TestSuite(getinfo_infos, .timeout = CXIT_DEFAULT_TIMEOUT); + +#define MAX_INFOS 16 +#define FI_ADDR_CXI_COMPAT FI_ADDR_OPX + +struct info_check { + int mr_mode; + uint32_t format; + size_t max_ep_auth_key; +}; + +Test(getinfo_infos, nohints) +{ + int num_info; + int i; + int info_per_if = 0; + struct fi_info *fi_ptr; + char 
*dom_name; + char *odp; + char *compat; + struct info_check infos[MAX_INFOS]; + size_t max_ep_auth_key; + + cxit_init(); + cr_assert(!cxit_fi_hints, "hints not NULL"); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + + for (i = 0; i < MAX_INFOS; i++) { + infos[i].format = 0; + infos[i].mr_mode = -1; + } + + /* By default when no hints are specified, each interface + * should have 4 fi_info. + */ + for (i = 0; i < 2; i++) { + if (i < 1) + max_ep_auth_key = 1; + else + max_ep_auth_key = 4; + + infos[info_per_if].mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED | + FI_MR_PROV_KEY; + infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + info_per_if++; + + infos[info_per_if].mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + info_per_if++; + + /* Add ODP versions if enabled */ + odp = getenv("FI_CXI_ODP"); + if (odp && strtol(odp, NULL, 10)) { + infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].mr_mode = FI_MR_ENDPOINT | FI_MR_PROV_KEY; + infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + info_per_if++; + + infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].mr_mode = FI_MR_ENDPOINT; + infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + info_per_if++; + } + } + + /* If we are supporting compatibility with old constants, + * then fi_info are repeated with compatibility constants. + */ + compat = getenv("FI_CXI_COMPAT"); + if (!compat || strtol(compat, NULL, 10) == 1) { + for (i = 0; i < info_per_if; i++) { + infos[info_per_if + i].mr_mode = + infos[i].mr_mode; + infos[info_per_if + i].format = + FI_ADDR_CXI_COMPAT; + infos[info_per_if + i].max_ep_auth_key = + infos[i].max_ep_auth_key; + } + info_per_if += i; + } + cr_assert(info_per_if <= MAX_INFOS, "Too many infos"); + + fi_ptr = cxit_fi; + + while (fi_ptr) { + /* Only concerned with CXI */ + if (strcmp(fi_ptr->fabric_attr->prov_name, cxip_prov_name)) { + fi_ptr = fi_ptr->next; + continue; + } + + dom_name = fi_ptr->domain_attr->name; + num_info = 0; + + /* Each info for the same NIC as the same domain name */ + while (fi_ptr) { + /* Different interface detected */ + if (strcmp(dom_name, fi_ptr->domain_attr->name)) + break; + + num_info++; + cr_assert(num_info <= MAX_INFOS, + "too many fi_info %d", num_info); + + cr_assert(infos[num_info - 1].mr_mode == + fi_ptr->domain_attr->mr_mode, + "expected MR mode %x got %x", + infos[num_info - 1].mr_mode, + fi_ptr->domain_attr->mr_mode); + + cr_assert(infos[num_info - 1].format == + fi_ptr->addr_format, + "expected addr_fomrat %u got %u", + infos[num_info - 1].format, + fi_ptr->addr_format); + + fi_ptr = fi_ptr->next; + } + + cr_assert(num_info == info_per_if, + "Wrong number of fi_info %d got %d", + num_info, info_per_if); + } + cxit_destroy_fabric_info(); +} + +Test(getinfo_infos, hints) +{ + int num_info; + int i; + int info_per_if = 0; + struct fi_info *fi_ptr; + char *dom_name; + char *compat; + struct info_check infos[2]; + + cxit_setup_fabric(); + cr_assert(cxit_fi != NULL); + cr_assert(cxit_fi_hints != NULL); + + for (i = 0; i < 2; i++) { + infos[i].format = 0; + infos[i].mr_mode = -1; + } + + infos[0].format = FI_ADDR_CXI; + infos[0].mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + if (cxit_prov_key) + infos[0].mr_mode |= FI_MR_PROV_KEY; + info_per_if++; + + compat = getenv("FI_CXI_COMPAT"); + if (!compat || strtol(compat, NULL, 10) == 1) { + infos[1].format = FI_ADDR_CXI_COMPAT; + infos[1].mr_mode = infos[0].mr_mode; + 
info_per_if++; + } + + fi_ptr = cxit_fi; + + while (fi_ptr) { + /* Should only be CXI provider */ + cr_assert(!strcmp(fi_ptr->fabric_attr->prov_name, + cxip_prov_name), "non-cxi provider"); + + dom_name = fi_ptr->domain_attr->name; + num_info = 0; + + /* Each info for the same NIC as the same domain name */ + while (fi_ptr) { + /* Different interface detected */ + if (strcmp(dom_name, fi_ptr->domain_attr->name)) + break; + + num_info++; + cr_assert(num_info <= 2, "too many fi_info %d", + num_info); + + cr_assert(infos[num_info - 1].mr_mode == + fi_ptr->domain_attr->mr_mode, + "expected MR mode %x got %x", + infos[num_info - 1].mr_mode, + fi_ptr->domain_attr->mr_mode); + + cr_assert(infos[num_info - 1].format == + fi_ptr->addr_format, + "expected addr_fomrat %u got %u", + infos[num_info - 1].format, + fi_ptr->addr_format); + + fi_ptr = fi_ptr->next; + } + + cr_assert(num_info == info_per_if, + "Wrong number of fi_info %d got %d", + num_info, info_per_if); + } + cxit_teardown_fabric(); +} + +Test(getinfo_infos, hints_no_rma) +{ + int ret; + + cxit_setup_getinfo(); + cr_assert(cxit_fi == NULL); + cr_assert(cxit_fi_hints != NULL); + + /* Request info with hints capabilities that do not + * include RMA and make sure fi_info is returned + * even if FI_MR_ENDPOINT is not specified. + */ + cxit_fi_hints->domain_attr->mr_mode = 0; + cxit_fi_hints->caps = FI_MSG | FI_TAGGED | FI_SEND | FI_RECV; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &cxit_fi); + cr_assert(ret == FI_SUCCESS, "fi_getinfo()"); + cr_assert(cxit_fi != NULL, "no fi_info"); + + cr_assert(cxit_fi->domain_attr->mr_mode == 0, "MR mode not 0"); + cr_assert(cxit_fi->caps & (FI_MSG | FI_TAGGED | FI_SEND | FI_RECV), + "caps cleared"); + + fi_freeinfo(cxit_fi); + cxit_fi = NULL; + + /* Request info with hints capabilities that do not + * include RMA and but do include mr_mode bits. Make + * sure the mr_mode bits are cleared. + * TODO: When common code is patched to remove FI_MR_ENDPOINT, + * when RMA/ATOMIC is not required, add that mode to the hints. 
+ */ + cxit_fi_hints->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY; + cxit_fi_hints->caps = FI_MSG | FI_TAGGED | FI_SEND | FI_RECV; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &cxit_fi); + cr_assert(ret == FI_SUCCESS, "fi_getinfo()"); + cr_assert(cxit_fi != NULL, "no fi_info"); + + cr_assert(cxit_fi->domain_attr->mr_mode == 0, "MR mode not cleared"); + cr_assert(cxit_fi->caps & (FI_MSG | FI_TAGGED | FI_SEND | FI_RECV), + "caps cleared"); + + fi_freeinfo(cxit_fi); + cxit_fi = NULL; + + cxit_teardown_getinfo(); +} + +TestSuite(fabric, .init = cxit_setup_fabric, .fini = cxit_teardown_fabric, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic fabric creation */ +Test(fabric, simple) +{ + cxit_create_fabric(); + cr_assert(cxit_fabric != NULL); + + cxit_destroy_fabric(); +} diff --git a/prov/cxi/test/fi_info_test.sh b/prov/cxi/test/fi_info_test.sh new file mode 100644 index 00000000000..b0fba97d698 --- /dev/null +++ b/prov/cxi/test/fi_info_test.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# set -x + +SCRIPT=$(basename ${BASH_SOURCE[0]:-$0}) +FI_INFO="../../../util/fi_info" +ENODATA=61 +DEVICE="cxi1" + +usage() { +cat < $tapfile +TAP version 13 +1..1 +$ok - fi_info::test for interface not found +EOF2 +} + +# ################################################################ + +tapfile="" + +while getopts t:-: OPT; do # allow -t and -- with arg + # support long options: https://stackoverflow.com/a/28466267/519360 + if [ "$OPT" = "-" ]; then # long option: reformulate OPT and OPTARG + OPT="${OPTARG%%=*}" # extract long option name + OPTARG="${OPTARG#$OPT}" # extract long option argument (may be empty) + OPTARG="${OPTARG#=}" # if long option argument, remove assigning `=` + fi + case "$OPT" in + t | tap) + tapfile="$OPTARG" + ;; + h) + usage + ;; + \?) + exit 1 # bad short option (error reported by getopts) + ;; + *) + echo "Illegal option --$OPT" # bad long option + exit 1 + ;; + esac +done + +if [ -z "$tapfile" ]; then + usage +fi + +test="FI_CXI_DEVICE_NAME=\"${DEVICE}\" ${FI_INFO} -p cxi" + +echo "Running test: $test" +eval "$test" +ret=$? + +status=1 # bashism: 0 means it passed +if [ $ret -eq $ENODATA ] || [ $ret -eq -$ENODATA ]; then + status=0 +fi + +report $status $tapfile + +exit $status diff --git a/prov/cxi/test/flask_testsrv.py b/prov/cxi/test/flask_testsrv.py new file mode 100644 index 00000000000..f2071efa453 --- /dev/null +++ b/prov/cxi/test/flask_testsrv.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +# Copyright (c) 2021 Hewlett Packard Enterprise Development LP +help = f''' +Standalone REST server for local testing + +TARGET /test + Provides basic targets for GET, PUT, POST, PATCH, and DELETE. + "Content-Type: application/json" header should be specified. + Result is JSON data identifying the operation, and the supplied data. + If the supplied data contains a JSON tag named 'return_code', + the corresponding value will be used as the return code of the + response. 
+ Exercise using ./curltest --auto + +If --host is omitted, host is http://127.0.0.1 local address (if available) +If --host is 0.0.0.0, host is the current IP address of the node +''' + +import argparse +import textwrap +import sys +import json + +from argparse import ArgumentParser, HelpFormatter +from flask import Flask, request +from flask_restful import Api, Resource + +class RawFormatter(HelpFormatter): + def _fill_text(self, text, width, indent): + return "\n".join([textwrap.fill(line, width) for line in textwrap.indent(textwrap.dedent(text), indent).splitlines()]) + +# Test code for CURL regression test +class selftestResource(Resource): + def return_code(self, json): + if json is not None and "return_code" in json: + return json["return_code"] + return 200 + + def get(self): + info = { + 'operation': 'GET', + 'data': '' + } + return info, self.return_code(None) + + def put(self): + info = { + 'operation': 'PUT', + 'data': request.json + } + return info, self.return_code(request.json) + + def post(self): + info = { + 'operation': 'POST', + 'data': request.json + } + return info, self.return_code(request.json) + + def patch(self): + info = { + 'operation': 'PATCH', + 'data': request.json + } + return info, self.return_code(request.json) + + def delete(self): + info = { + 'operation': 'DELETE', + 'data': request.json + } + return info, self.return_code(request.json) + +def main(argv): + parser = argparse.ArgumentParser( + description=help, formatter_class=RawFormatter) + parser.add_argument('--host', default=None) + parser.add_argument('--port', default=None) + args = parser.parse_args() + + app = Flask(__name__) + api = Api(app); + api.add_resource(selftestResource, '/test') + app.run(debug=True, host=args.host, port=args.port) + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/prov/cxi/test/fork.c b/prov/cxi/test/fork.c new file mode 100644 index 00000000000..dc106889b15 --- /dev/null +++ b/prov/cxi/test/fork.c @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +#define SECRET 0xFFU +#define XFER_SIZE 257U +#define INIT_BUF_VALUE 0xAAU +#define INIT_BUF_OFFSET 127U +#define TGT_BUF_VALUE 0xFFU +#define TGT_BUF_OFFSET 3215U +#define RKEY 0x1U +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) + +/* Needs to be marked volatile to prevent hangs due to compiler optimization. 
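+ * The child spins on this flag in a sched_yield() loop until the parent's
+ * SIGUSR1 is delivered and the handler clears it; without volatile the
+ * compiler could cache the load and the child would never see the update.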
*/ +static volatile bool child_process_block = true; + +static void signal_handler(int sig) +{ + child_process_block = false; +} + +static void fork_test_runner(bool odp, bool huge_page, bool fork_safe) +{ + long page_size; + uint8_t *buf; + uint8_t *init_buf; + uint8_t *tgt_buf; + int ret; + struct fid_mr *mr; + int status; + struct fi_cq_tagged_entry cqe; + pid_t pid; + int i = 0; + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + uint64_t rkey; + bool again; + + if (odp) { + ret = setenv("FI_CXI_FORCE_ODP", "1", 1); + cr_assert_eq(ret, 0, "Failed to set FI_CXI_FORCE_ODP %d", + -errno); + } + + if (fork_safe) { + ret = setenv("CXI_FORK_SAFE", "1", 1); + cr_assert_eq(ret, 0, "Failed to set CXI_FORK_SAFE %d", -errno); + + if (huge_page) { + ret = setenv("CXI_FORK_SAFE_HP", "1", 1); + cr_assert_eq(ret, 0, "Failed to set CXI_FORK_SAFE %d", + -errno); + } + } + + cxit_setup_msg(); + + signal(SIGUSR1, signal_handler); + + /* Single map is used for page aliasing with child process and RDMA. */ + if (huge_page) { + page_size = 2 * 1024 * 1024; + flags |= MAP_HUGETLB | MAP_HUGE_2MB; + } else { + page_size = sysconf(_SC_PAGESIZE); + } + + buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE, flags, -1, 0); + cr_assert(buf != MAP_FAILED, "mmap failed"); + + memset(buf, 0, page_size); + + /* This secret is passed to the child process. Child process will verify + * it receives this secret. + */ + buf[0] = SECRET; + init_buf = buf + INIT_BUF_OFFSET; + tgt_buf = buf + TGT_BUF_OFFSET; + + /* Register the buffer. The behavior of the child buffer depends upon + * the following + * - If CXI_FORK_SAFE is set and copy-on-fork kernel support does not + * exist, madvise(MADV_DONTFORK) will be issued against the page. + * This will cause the child to segfault. + * - If CXI_FORK_SAFE is set and copy-on-fork kernel support exists, + * madvise(MADV_DONTFORK) will NOT be issued against the page. The + * child process will get its data and the parent process will + * not have data corruption. + * - If ODP is not used and kernel copy-on-fork is not supported, the + * child process will get its data, and the parent process will have + * data corruption. + * - If ODP is not used and the kernel supports copy-on-fork, the child + * process will get its data, and the parent process will not have + * data corruption. + * - If ODP is used, the child process will get its data, and the parent + * process will not have data corruption. + */ + ret = fi_mr_reg(cxit_domain, tgt_buf, XFER_SIZE, FI_REMOTE_WRITE, 0, + RKEY, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed %d", ret); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed %d", ret); + + ret = fi_mr_enable(mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed %d", ret); + + rkey = fi_mr_key(mr); + + again = true; + do { + pid = fork(); + if (pid >= 0) { + again = false; + break; + } + + cr_assert_eq(errno, EAGAIN, "fork() failed: %d", errno); + } while (again); + + if (pid == 0) { + while (child_process_block) + sched_yield(); + + /* If CXI_FORK_SAFE is set (i.e. fork_safe is true) and + * kernel copy-on-fork does not exist, this will segfault. + */ + if (buf[0] == SECRET) + _exit(EXIT_SUCCESS); + + /* This should never happen. */ + _exit(EXIT_FAILURE); + } + + /* Writing these buffers will trigger COW if copy-on-fork + * kernel support does not exist. If that is the case then unless + * madvise(MADV_DONTFORK) was called, parent process will get a new + * page. 
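+ * The NIC translation set up at registration time still references the
+ * original physical page in that case, so the RDMA below no longer targets
+ * the page the parent sees; that is the corruption the checks look for.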
+ */ + memset(init_buf, INIT_BUF_VALUE, XFER_SIZE); + memset(tgt_buf, TGT_BUF_VALUE, XFER_SIZE); + + ofi_sfence(); + + /* Unblock the child process. */ + kill(pid, SIGUSR1); + + ret = fi_write(cxit_ep, init_buf, XFER_SIZE, NULL, cxit_ep_fi_addr, 0, + rkey, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_write failed %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + if (cxil_is_copy_on_fork() || odp || fork_safe) { + for (i = 0; i < XFER_SIZE; i++) + cr_assert_eq(init_buf[i], tgt_buf[i], "data corruption with fork"); + } else { + for (i = 0; i < XFER_SIZE; i++) + cr_assert_neq(init_buf[i], tgt_buf[i], "Missing data corruption with fork"); + } + + waitpid(pid, &status, 0); + + if (!cxil_is_copy_on_fork() && fork_safe) { + cr_assert_eq(WIFSIGNALED(status), true, "Child was not terminated by signal: is_exit=%d exit=%d is_sig=%d sig=%d", + WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + cr_assert_eq(WTERMSIG(status), SIGSEGV, "Child signal was not SIGSEGV"); + } else { + cr_assert_eq(WIFEXITED(status), true, "Child was not terminated by exit: is_exit=%d exit=%d is_sig=%d sig=%d", + WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + cr_assert_eq(WEXITSTATUS(status), EXIT_SUCCESS, "Child process had data corruption"); + } + + fi_close(&mr->fid); + munmap(buf, page_size); + + cxit_teardown_msg(); +} + +TestSuite(fork, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* No ODP, no fork safe variables, and system page size. On kernels before 5.12, + * parent process should have data corruption. Child process should not have + * data corruption and should not segfault. + */ +Test(fork, page_aliasing_no_odp_no_fork_safe_system_page_size) +{ + fork_test_runner(false, false, false); +} + +/* ODP, no fork safe variables, and system page size. Parent process should not + * have data corruption regardless of kernel version. Child process should not + * have data corruption and should not segfault. + */ +Test(fork, page_aliasing_odp_no_fork_safe_system_page_size) +{ + fork_test_runner(true, false, false); +} + +/* No ODP, no fork safe variables, and system page size. Parent process should + * not have data corruption regardless of kernel version. Child process should + * segfault if copy-on-fork kernel support does not exist (The parent would + * have called madvise MADV_DONTFORK if that is the case). + */ +Test(fork, page_aliasing_no_odp_fork_safe_system_page_size) +{ + fork_test_runner(false, false, true); +} + +/* No ODP, no fork safe variables, and 2MiB page size. On kernels before 5.12, + * parent process should have data corruption. Child process should not have + * data corruption and should not segfault. + */ +Test(fork, page_aliasing_no_odp_no_fork_safe_huge_page) +{ + fork_test_runner(false, true, false); +} + +/* ODP, no fork safe variables, and 2MiB page size. Parent process should not + * have data corruption regardless of kernel version. Child process should not + * have data corruption and should not segfault. + */ +Test(fork, page_aliasing_odp_no_fork_safe_huge_page) +{ + fork_test_runner(true, true, false); +} + +/* No ODP, with fork safe variables, and 2MiB page size. Parent process should + * not have data corruption regardless of kernel version. Child process should + * segfault if the kernel does not support copy-on-fork (since the parent + * would have called MADV_DONTFORK on virtual address range). 
+ */ +Test(fork, page_aliasing_no_odp_fork_safe_huge_page) +{ + fork_test_runner(false, true, true); +} + +static volatile bool block_threads = true; + +static void *child_memory_free_thread_runner(void *context) +{ + bool huge_page = (bool)context; + long page_size; + uint8_t *buf; + int ret; + struct fid_mr *mr; + int status; + pid_t pid; + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + bool again; + + while (block_threads) + sched_yield(); + + /* Single map is used for page aliasing with child process and RDMA. */ + if (huge_page) { + page_size = 2 * 1024 * 1024; + flags |= MAP_HUGETLB | MAP_HUGE_2MB; + } else { + page_size = sysconf(_SC_PAGESIZE); + } + + buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE, flags, -1, 0); + cr_assert(buf != MAP_FAILED, "mmap failed"); + + memset(buf, 0, page_size); + + ret = fi_mr_reg(cxit_domain, buf, XFER_SIZE, FI_REMOTE_WRITE, 0, + gettid(), 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed %d", ret); + + /* MR reg will result in cxil_map() being called. On kernels < 5.12, + * libcxi will call MADV_DONTFORK on the range. For the purposes of this + * test, we want the child to munmap this buffer to see if it deadlocks + * in the MR cache. Thus, we need to undo the MADV_DONTFORK. + */ + if (!cxil_is_copy_on_fork()) { + ret = madvise(buf, page_size, MADV_DOFORK); + cr_assert_eq(ret, 0, "madvise failed %d", ret); + } + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed %d", ret); + + ret = fi_mr_enable(mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed %d", ret); + + again = true; + do { + pid = fork(); + if (pid >= 0) { + again = false; + break; + } + + cr_assert_eq(errno, EAGAIN, "fork() failed: %d", errno); + } while (again); + + if (pid == 0) { + munmap(buf, page_size); + _exit(EXIT_SUCCESS); + } + + waitpid(pid, &status, 0); + + cr_assert_eq(WIFEXITED(status), true, "Child was not terminated by exit: is_exit=%d exit=%d is_sig=%d sig=%d", + WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + cr_assert_eq(WEXITSTATUS(status), EXIT_SUCCESS, "Child process had data corruption"); + + fi_close(&mr->fid); + munmap(buf, page_size); + + return NULL; +} + +#define THREAD_MAX 256U + +static void child_memory_free_runner(bool huge_page, int thread_count) +{ + pthread_t threads[THREAD_MAX]; + int i; + int ret; + + cr_assert(thread_count <= THREAD_MAX); + + /* For kernels < 5.12, CXI_FORK_SAFE needs to be set. If not set, the + * control event queue buffers would be subjected to copy-on-write. This + * may result in the parent threads deadlocking. + */ + ret = setenv("CXI_FORK_SAFE", "1", 1); + cr_assert_eq(ret, 0, "Failed to set CXI_FORK_SAFE %d", -errno); + + if (huge_page) { + ret = setenv("CXI_FORK_SAFE_HP", "1", 1); + cr_assert_eq(ret, 0, "Failed to set CXI_FORK_SAFE %d", -errno); + } + + cxit_setup_msg(); + + for (i = 0; i < thread_count; i++) { + ret = pthread_create(&threads[i], NULL, + child_memory_free_thread_runner, + (void *)huge_page); + cr_assert(ret == 0); + } + + block_threads = false; + + for (i = 0; i < thread_count; i++) + pthread_join(threads[i], NULL); + + cxit_teardown_msg(); +} + +/* The objective of this test is to see if child processes can deadlock on the + * MR cache lock if threads are forking while other threads are doing memory + * registration. 
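+ * If a fork lands while another thread holds the MR cache lock, the child
+ * inherits a locked mutex and its munmap(), intercepted by the memory
+ * monitor, could block forever; a clean child exit shows this is handled.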
+ */ +Test(fork, child_memory_free_system_page_size) +{ + child_memory_free_runner(false, 16); +} + +Test(fork, child_memory_free_huge_page_size) +{ + child_memory_free_runner(true, 16); +} diff --git a/prov/cxi/test/hip/hip_cntr_test.cpp b/prov/cxi/test/hip/hip_cntr_test.cpp new file mode 100644 index 00000000000..486af232e77 --- /dev/null +++ b/prov/cxi/test/hip/hip_cntr_test.cpp @@ -0,0 +1,95 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + */ + +#include "hip/hip_runtime.h" +#include +#include +#include +#include +#include +#include + +/* Example compile instructions. */ +// hipcc --amdgpu-target=gfx908 -I/libfabric/install/include -L/opt/rocm/lib64/ -L/opt/rocm/lib -L/libfabric/install/lib -lfabric -g -c hip_cntr_test.cpp +// hipcc --amdgpu-target=gfx908 -I/libfabric/install/include -L/opt/rocm/lib64/ -L/opt/rocm/lib -L/libfabric/install/lib -lfabric -g hip_cntr_test.o -o hip_cntr_test + +#define GPU_WB_SIZE 8U + +static struct fi_info *hints; +static struct fi_info *info; +static struct fid_fabric *fabric; +static struct fid_domain *domain; +static struct fid_cntr *cntr; +static struct fi_cxi_cntr_ops *cntr_ops; +static void *gpu_wb; + +void resource_init(void) +{ + int ret; + + ret = hipMalloc(&gpu_wb, GPU_WB_SIZE); + assert(ret == hipSuccess); + + hints = fi_allocinfo(); + assert(hints != NULL); + + /* Always select CXI provider */ + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + hints->fabric_attr->prov_name = strdup("cxi"); + assert(hints->fabric_attr->prov_name != NULL); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), NULL, + NULL, 0, hints, &info); + assert(ret == FI_SUCCESS); + + ret = fi_fabric(info->fabric_attr, &fabric, NULL); + assert(ret == FI_SUCCESS); + + ret = fi_domain(fabric, info, &domain, NULL); + assert(ret == FI_SUCCESS); + + ret = fi_cntr_open(domain, NULL, &cntr, NULL); + assert(ret == FI_SUCCESS); + + ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, (void **)&cntr_ops, + NULL); + assert(ret == FI_SUCCESS); + + ret = cntr_ops->set_wb_buffer(&cntr->fid, gpu_wb, GPU_WB_SIZE); + assert(ret == FI_SUCCESS); +} + +void resource_free(void) +{ + fi_close(&cntr->fid); + fi_close(&domain->fid); + fi_close(&fabric->fid); + fi_freeinfo(info); + fi_freeinfo(hints); + hipFree(gpu_wb); +} + +int main(int argc, char *argv[]) +{ + int ret; + + resource_init(); + + ret = fi_cntr_adderr(cntr, 5); + assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cntr) != 5); + + ret = fi_cntr_add(cntr, 123); + assert(ret == FI_SUCCESS); + + while (fi_cntr_read(cntr) != 123); + while (fi_cntr_readerr(cntr) != 5); + + resource_free(); + + return 0; +} diff --git a/prov/cxi/test/lat.c b/prov/cxi/test/lat.c new file mode 100644 index 00000000000..23d324f72d1 --- /dev/null +++ b/prov/cxi/test/lat.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2018-2021 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +void *buf; + +void do_tsend(size_t len) +{ + int ret; + + ret = fi_tsend(cxit_ep, buf, len, NULL, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "ret is %d\n", ret); +} +void do_tsend_0() { do_tsend(0); } +void do_tsend_8() { do_tsend(8); } +void do_tsend_256() { do_tsend(256); } + +void do_trecv(size_t len) +{ + int ret; + + ret = fi_trecv(cxit_ep, buf, len, NULL, FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, 
FI_SUCCESS, "ret is %d\n", ret); +} +void do_trecv_0() { do_trecv(0); } +void do_trecv_8() { do_trecv(8); } +void do_trecv_256() { do_trecv(256); } + +void do_tsend_more(size_t len) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = len, + }; + struct fi_msg_tagged msg = { + .msg_iov = &iov, + .iov_count = 1, + .addr = cxit_ep_fi_addr, + }; + int ret; + + ret = fi_tsendmsg(cxit_ep, &msg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "ret is %d\n", ret); +} +void do_tsend_more_8() { do_tsend_more(8); } +void do_tsend_more_256() { do_tsend_more(256); } + +void do_trecv_more(size_t len) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = len, + }; + struct fi_msg_tagged msg = { + .msg_iov = &iov, + .iov_count = 1, + .addr = FI_ADDR_UNSPEC, + }; + int ret; + + ret = fi_trecvmsg(cxit_ep, &msg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "ret is %d\n", ret); +} +void do_trecv_more_8() { do_trecv_more(8); } +void do_trecv_more_256() { do_trecv_more(256); } + +TestSuite(latency, .init = cxit_setup_tagged, .fini = cxit_teardown_tagged, + .timeout = CXIT_DEFAULT_TIMEOUT); + +struct latency_params { + char *api; + void (*func)(); + bool flush_send; +}; + +ParameterizedTestParameters(latency, basic) +{ + size_t param_sz; + + static struct latency_params params[] = { + { + .api = "tsend (0-byte)", + .func = do_tsend_0, + .flush_send = false, + }, + { + .api = "trecv (0-byte)", + .func = do_trecv_0, + .flush_send = false, + }, + { + .api = "tsend (8-byte)", + .func = do_tsend_8, + .flush_send = false, + }, + { + .api = "trecv (8-byte)", + .func = do_trecv_8, + .flush_send = false, + }, + { + .api = "tsend (256-byte)", + .func = do_tsend_256, + .flush_send = false, + }, + { + .api = "trecv (256-byte)", + .func = do_trecv_256, + .flush_send = false, + }, + { + .api = "tsend_more (8b, no doorbell)", + .func = do_tsend_more_8, + .flush_send = true, + }, + { + .api = "trecv_more (8b, no doorbell)", + .func = do_trecv_more_8, + .flush_send = false, + }, + { + .api = "tsend_more (256b, no doorbell)", + .func = do_tsend_more_256, + .flush_send = true, + }, + { + .api = "trecv_more (256b, no doorbell)", + .func = do_trecv_more_256, + .flush_send = false, + }, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct latency_params, params, param_sz); +} + +/* Test API latency */ +ParameterizedTest(struct latency_params *params, latency, basic) +{ + int warmup = 10; + uint64_t loops = 200; + int i; + uint64_t start; + uint64_t end; + + buf = malloc(0x1000); + cr_assert(buf); + + for (i = 0; i < warmup; i++) + params->func(); + + start = ofi_gettime_ns(); + + for (i = 0; i < loops; i++) + params->func(); + + end = ofi_gettime_ns(); + + printf("%s latency: %lu ns\n", params->api, (end - start) / loops); + + /* Cleanup all outstanding more sends. 
*/ + if (params->flush_send) { + do_tsend_0(); + sleep(1); + fi_cq_read(cxit_tx_cq, NULL, 0); + } + + free(buf); +} diff --git a/prov/cxi/test/mem_reg.c b/prov/cxi/test/mem_reg.c new file mode 100644 index 00000000000..8f85d3651d4 --- /dev/null +++ b/prov/cxi/test/mem_reg.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(memReg, .timeout = CXIT_DEFAULT_TIMEOUT); + +static void hmem_dev_reg_test_runner(bool dev_reg, bool cache_enable) +{ + int ret; + void *buf; + size_t buf_size = 1234; + struct fid_mr *mr; + struct cxip_mr *cxi_mr; + + if (dev_reg) + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + else + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, + "Failed to set FI_CXI_DISABLE_HMEM_DEV_REGISTER %d", + -errno); + + if (cache_enable) + ret = setenv("FI_MR_CACHE_MONITOR", "memhooks", 1); + else + ret = setenv("FI_MR_CACHE_MONITOR", "disabled", 1); + cr_assert_eq(ret, 0, + "Failed to set FI_MR_CACHE_MONITOR %d", + -errno); + + buf = malloc(buf_size); + cr_assert_neq(buf, NULL, "Failed to alloc mem"); + + cxit_setup_msg(); + + ret = fi_mr_reg(cxit_domain, buf, buf_size, FI_READ | FI_WRITE, 0, 0, 0, + &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed: %d", ret); + + ret = fi_mr_enable(mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed: %d", ret); + + /* Have to examine the struct to determine if correct behavior is + * happening. 
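+ * handle_valid tracks whether the buffer was registered with the HMEM
+ * device interface and cached tracks whether the MD came from the MR cache,
+ * so both should follow the environment variables set above.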
+ */ + cxi_mr = container_of(mr, struct cxip_mr, mr_fid); + if (dev_reg) + cr_assert_eq(cxi_mr->md->handle_valid, true, + "Bad cxip_md handle_valid"); + else + cr_assert_eq(cxi_mr->md->handle_valid, false, + "Bad cxip_md host_addr"); + cr_assert_eq(cxi_mr->md->cached, cache_enable, "Bad cxip_md cached"); + + ret = fi_close(&mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed: %d", ret); + + cxit_teardown_msg(); + free(buf); +} + +Test(memReg, disableHmemDevRegisterEnabled_mrCacheEnabled) +{ + hmem_dev_reg_test_runner(true, true); +} + +Test(memReg, disableHmemDevRegisterEnabled_mrCacheDisabled) +{ + hmem_dev_reg_test_runner(true, false); +} + +Test(memReg, disableHmemDevRegisterDisabled_mrCacheEnabled) +{ + hmem_dev_reg_test_runner(false, true); +} + +Test(memReg, disableHmemDevRegisterDisabled_mrCacheDisabled) +{ + hmem_dev_reg_test_runner(false, false); +} + +static void system_mem_dev_reg_test_runner(bool system_mem_cache_enabled, + bool hmem_dev_reg_enabled) +{ + char *send_buf; + char *recv_buf; + size_t buf_size = 1234; + int ret; + struct fi_cq_tagged_entry cqe; + int i; + + if (system_mem_cache_enabled) + ret = setenv("FI_MR_CACHE_MONITOR", "memhooks", 1); + else + ret = setenv("FI_MR_CACHE_MONITOR", "disabled", 1); + cr_assert_eq(ret, 0, + "Failed to set FI_MR_CACHE_MONITOR %d", + -errno); + + if (hmem_dev_reg_enabled) + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + else + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, + "Failed to set FI_CXI_DISABLE_HMEM_DEV_REGISTER %d", + -errno); + + send_buf = calloc(1, buf_size); + cr_assert_neq(send_buf, NULL, "Failed to alloc mem"); + + recv_buf = calloc(1, buf_size); + cr_assert_neq(recv_buf, NULL, "Failed to alloc mem"); + + ret = open("/dev/urandom", O_RDONLY); + cr_assert_neq(ret, -1, "open failed: %d", -errno); + read(ret, send_buf + 1, buf_size - 1); + close(ret); + + cxit_setup_msg(); + + ret = fi_recv(cxit_ep, recv_buf + 1, buf_size - 1, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + ret = fi_send(cxit_ep, send_buf + 1, buf_size - 1, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + for (i = 0; i < buf_size; i++) + cr_assert_eq(send_buf[i], recv_buf[i], + "Data corruption at byte %d", i); + + cxit_teardown_msg(); + + free(send_buf); + free(recv_buf); +} + +Test(memReg, systemMemNoCache_enableHmemDevRegister) +{ + system_mem_dev_reg_test_runner(false, true); +} + +Test(memReg, systemMemCache_enableHmemDevRegister) +{ + system_mem_dev_reg_test_runner(true, true); +} + +Test(memReg, systemMemNoCache_disableHmemDevRegister) +{ + system_mem_dev_reg_test_runner(false, false); +} + +Test(memReg, systemMemCache_disableHmemDevRegister) +{ + system_mem_dev_reg_test_runner(true, false); +} diff --git a/prov/cxi/test/mr.c b/prov/cxi/test/mr.c new file mode 100644 index 00000000000..fab3cbab7d7 --- /dev/null +++ b/prov/cxi/test/mr.c @@ -0,0 +1,974 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(mr, .init = cxit_setup_rma, .fini 
= cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(mr, opt_mrs, .timeout = 60) +{ + int opt_mr_cnt = 200; + struct mem_region opt_mrs[opt_mr_cnt]; + int i; + uint64_t key; + + for (i = 0; i < opt_mr_cnt; i++) { + key = i; + mr_create(0x1000, FI_REMOTE_WRITE, 0, &key, &opt_mrs[i]); + } + + + for (i = 0; i < opt_mr_cnt; i++) + mr_destroy(&opt_mrs[i]); +} + +Test(mr, invalid_fi_directed_recv_flag) +{ + int ret; + struct fi_mr_attr attr = {}; + struct iovec iov = {}; + struct fid_mr *mr; + + iov.iov_len = sizeof(ret); + iov.iov_base = (void *)&ret; + + attr.mr_iov = &iov; + attr.iov_count = 1; + attr.access = FI_REMOTE_READ | FI_REMOTE_WRITE; + attr.requested_key = 0x123; + + ret = fi_mr_regattr(cxit_domain, &attr, FI_DIRECTED_RECV, &mr); + cr_assert_eq(ret, -FI_EINVAL, "fi_mr_regattr failed: %d", ret); +} + +Test(mr, std_mrs, .timeout = 600, .disabled = true) +{ + int std_mr_cnt = 16*1024; + int mrs = 0; + struct mem_region std_mrs[std_mr_cnt]; + int i; + int ret; + uint64_t key; + + for (i = 0; i < std_mr_cnt; i++) { + mrs++; + key = i + 200; + ret = mr_create(8, FI_REMOTE_WRITE, 0, &key, &std_mrs[i]); + if (ret) { + printf("Standard MR limit: %d\n", mrs); + break; + } + } + + /* It's difficult to predict available resources. An idle system + * currently supports at least 13955 total standard MRs. This is + * roughly: + * 16k total LEs - + * 1000 (reserved for services) - + * 1400 (reserved for other pools) = + * 13984 + * + * An EP requires a few other LEs to implement messaging and other + * APIs. + */ + cr_assert(mrs >= 13955); + + /* Note: MR close is very slow in emulation due to + * cxil_invalidate_pte_le(). + */ + for (i = 0; i < mrs; i++) + mr_destroy(&std_mrs[i]); +} + +Test(mr, opt_mr_recycle, .timeout = 600, .disabled = false) +{ + int mr_cnt = 2*1024+1; // more than the total number of PTEs + struct mem_region mr; + int i; + int ret; + uint64_t key; + + for (i = 0; i < mr_cnt; i++) { + key = 0; + ret = mr_create(8, FI_REMOTE_WRITE, 0, &key, &mr); + cr_assert_eq(ret, FI_SUCCESS, "Failed to allocate MR %d\n", i); + + mr_destroy(&mr); + } +} + +/* Perform zero-byte Puts to zero-byte standard and optimized MRs. Validate + * remote counting events. + */ +Test(mr, mr_zero_len) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + int ret; + uint64_t key; + + /* Optimized MR */ + key = 0; + + ret = mr_create(0, FI_REMOTE_WRITE, 0, &key, &mr); + cr_assert(ret == FI_SUCCESS); + + ret = fi_write(cxit_ep, NULL, 0, NULL, + cxit_ep_fi_addr, 0, key, NULL); + cr_assert(ret == FI_SUCCESS, "write failure %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + while (fi_cntr_read(cxit_rem_cntr) != 1) + ; + + mr_destroy(&mr); + + /* Standard MR */ + /* TODO: For FI_MR_PROV_KEY we will need to fully + * allocate optimized + */ + key = 200; + ret = mr_create(0, FI_REMOTE_WRITE, 0, &key, &mr); + cr_assert(ret == FI_SUCCESS); + + ret = fi_write(cxit_ep, NULL, 0, NULL, + cxit_ep_fi_addr, 0, key, NULL); + cr_assert(ret == FI_SUCCESS, "ret: %d\n", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + while (fi_cntr_read(cxit_rem_cntr) != 2) + ; + + mr_destroy(&mr); +} + +/* Validate that unique keys are enforced. */ +Test(mr, mr_unique_key) +{ + char buf[256]; + struct fid_mr *mr1; + struct fid_mr *mr2; + int ret; + + /* MR keys are enforced by the domain. 
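+ * With client-specified keys, registering a second MR with the same key
+ * must fail with -FI_ENOKEY. With FI_MR_PROV_KEY the provider generates the
+ * keys itself, so the uniqueness check is skipped.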
*/ + if (cxit_prov_key) { + assert(1); + return; + } + + ret = fi_mr_reg(cxit_domain, buf, 256, FI_REMOTE_WRITE, 0, 0, 0, &mr1, + NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_reg(cxit_domain, buf, 256, FI_REMOTE_WRITE, 0, 0, 0, &mr2, + NULL); + cr_assert(ret == -FI_ENOKEY); + + ret = fi_close(&mr1->fid); + cr_assert(ret == FI_SUCCESS); +} + +/* Validate not recycling non-cached FI_MR_PROV_KEY */ +Test(mr, mr_recycle) +{ + char buf[256]; + struct fid_mr *mr1; + struct fid_mr *mr2; + struct fid_mr *mr3; + uint64_t rkey1 = 0; + uint64_t rkey2 = 0; + uint64_t rkey3 = 0; + int ret; + + /* Must be non-cached FI_MR_PROV_KEY; we rely on the fact + * rma EP are setup with a remote counter and bind it + * to the EP which forces non-cached for the MR. + */ + if (!cxit_prov_key) { + assert(1); + return; + } + + ret = fi_mr_reg(cxit_domain, buf, 256, + FI_REMOTE_READ | FI_REMOTE_WRITE, 0, rkey1, 0, + &mr1, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr1, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind MR1 failed %d", ret); + + ret = fi_mr_bind(mr1, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_bind MR1 counter failed %d", ret); + + ret = fi_mr_enable(mr1); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable MR1 failed %d", ret); + + rkey1 = fi_mr_key(mr1); + cr_assert_neq(rkey1, FI_KEY_NOTAVAIL, "MR1 KEY invalid %lx", rkey1); + + ret = fi_mr_reg(cxit_domain, buf, 256, + FI_REMOTE_READ | FI_REMOTE_WRITE, 0, rkey2, 0, + &mr2, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr2, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind MR2 failed %d", ret); + + ret = fi_mr_bind(mr2, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_bind MR2 counter failed %d", ret); + + ret = fi_mr_enable(mr2); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable MR2 failed %d", ret); + + rkey2 = fi_mr_key(mr2); + cr_assert_neq(rkey2, FI_KEY_NOTAVAIL, "MR2 KEY invalid %lx", rkey2); + cr_assert_neq(rkey2, rkey1, "MR Keys not unique"); + + ret = fi_close(&mr2->fid); + cr_assert_eq(ret, FI_SUCCESS, "close of MR2 %d", ret); + + ret = fi_mr_reg(cxit_domain, buf, 256, + FI_REMOTE_READ | FI_REMOTE_WRITE, 0, rkey3, 0, + &mr3, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr3, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind MR3 failed %d", ret); + + ret = fi_mr_bind(mr3, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_bind MR3 counter failed %d", ret); + + ret = fi_mr_enable(mr3); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable MR3 failed %d", ret); + + rkey3 = fi_mr_key(mr3); + cr_assert_neq(rkey3, FI_KEY_NOTAVAIL, "MR3 KEY invalid %lx", rkey3); + + cr_assert_neq(rkey3, rkey1, "MR3 Key not unique"); + cr_assert_neq(rkey3, rkey2, "MR2 Key recycled"); + + ret = fi_close(&mr1->fid); + cr_assert_eq(ret, FI_SUCCESS, "close of MR1 %d", ret); + ret = fi_close(&mr3->fid); + cr_assert_eq(ret, FI_SUCCESS, "close of MR3 %d", ret); +} + +/* Validate that RKEY are not required for local MR */ +Test(mr, mr_no_local_rkey) +{ + char buf[256]; + struct fid_mr *mr1; + struct fid_mr *mr2; + uint64_t rkey = 0; + uint64_t no_rkey; + int ret; + + ret = fi_mr_reg(cxit_domain, buf, 256, FI_READ | FI_WRITE, 0, rkey, 0, + &mr1, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr1, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind mr1 failed %d", ret); + + ret = fi_mr_enable(mr1); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable mr1 failed %d", ret); + + no_rkey = 
fi_mr_key(mr1); + cr_assert_eq(no_rkey, FI_KEY_NOTAVAIL, "No RKEY check %ld", no_rkey); + + /* Verify second local MR with same client key value passed works */ + ret = fi_mr_reg(cxit_domain, buf, 256, FI_READ | FI_WRITE, 0, rkey, 0, + &mr2, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr2, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind mr2 failed %d", ret); + + ret = fi_mr_enable(mr2); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable mr2 failed %d", ret); + + no_rkey = fi_mr_key(mr2); + cr_assert_eq(no_rkey, FI_KEY_NOTAVAIL, "No RKEY check %ld", no_rkey); + + ret = fi_close(&mr2->fid); + cr_assert(ret == FI_SUCCESS); + + ret = fi_close(&mr1->fid); + cr_assert(ret == FI_SUCCESS); +} + + +/* Test creating and destroying an MR that is never bound to an EP. */ +Test(mr, no_bind) +{ + int ret; + size_t buf_len = 0x1000; + void *buf; + struct fid_mr *mr; + + buf = malloc(buf_len); + cr_assert(buf); + + /* Optimized MR */ + + ret = fi_mr_reg(cxit_domain, buf, buf_len, FI_REMOTE_WRITE, + 0, 0, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS); + + fi_close(&mr->fid); + + /* Standard MR */ + + ret = fi_mr_reg(cxit_domain, buf, buf_len, FI_REMOTE_WRITE, + 0, 200, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS); + + fi_close(&mr->fid); + + free(buf); +} + +TestSuite(mr_event, .init = cxit_setup_rma_mr_events, + .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(mr_event, counts) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fid_mr *mr; + struct cxip_mr *cxip_mr; + uint8_t *src_buf; + uint8_t *tgt_buf; + int src_len = 8; + int tgt_len = 4096; + uint64_t key_val = 200; + uint64_t orig_cnt; + int matches; + int accesses; + uint64_t operand1; + uint64_t result1; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + + /* Need remote counters */ + cxit_create_rem_cntrs(); + + src_buf = malloc(src_len); + cr_assert_not_null(src_buf, "src_buf alloc failed"); + + tgt_buf = calloc(1, tgt_len); + cr_assert_not_null(tgt_buf, "tgt_buf alloc failed"); + + /* Create MR */ + ret = fi_mr_reg(cxit_domain, tgt_buf, tgt_len, + FI_REMOTE_WRITE | FI_REMOTE_READ, 0, + key_val, 0, &mr, NULL); + cr_assert(ret == FI_SUCCESS); + + cxip_mr = container_of(mr, struct cxip_mr, mr_fid); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert(ret == FI_SUCCESS); + + cr_assert(cxit_rem_cntr != NULL); + ret = fi_mr_bind(mr, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_enable(mr); + cr_assert(ret == FI_SUCCESS); + + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) + key_val = fi_mr_key(mr); + + /* Match counts do not apply to optimized MR */ + if (cxip_generic_is_mr_key_opt(key_val)) + goto done; + + orig_cnt = fi_cntr_read(cxit_rem_cntr); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ret = fi_write(cxit_ep, src_buf, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate remote counter was incremented correctly */ + while (orig_cnt + 1 != fi_cntr_read(cxit_rem_cntr)) + ; + + /* Validate match and access counts incremented */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) >= matches + 1, + "Match count not updated for RMA\n"); + 
cr_assert(ofi_atomic_get32(&cxip_mr->access_events) >= accesses + 1, + "RMA access count not updated\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == + ofi_atomic_get32(&cxip_mr->access_events), + "RMA matches do not equal accesses"); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, + key_val, FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* Validate remote counter was incremented correctly */ + while (orig_cnt + 2 != fi_cntr_read(cxit_rem_cntr)) + ; + + /* Validate match and access counts incremented */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) >= matches + 1, + "Match count not updated for atomic"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) >= accesses + 1, + "Atomic access count not updated"); + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == + ofi_atomic_get32(&cxip_mr->access_events), + "Atomic matches do not equal accesses"); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, NULL, &result1, NULL, + cxit_ep_fi_addr, 0, key_val, FI_UINT64, + FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Validate remote counter was incremented correctly */ + while (orig_cnt + 3 != fi_cntr_read(cxit_rem_cntr)) + ; + + /* Validate match and access counts incremented */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) >= matches + 1, + "Fetch atomic match count not updated for atomic"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) >= accesses + 1, + "Fetch atomic access count not updated"); + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == + ofi_atomic_get32(&cxip_mr->access_events), + "Fetch atomic matches do not equal accesses"); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ioc.addr = &operand1; + ioc.count = 1; + result_ioc.addr = &result1; + result_ioc.count = 1; + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key_val; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Do a fetch with a flush */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Validate remote counter was incremented correctly, + * once for atomic and once for flush. 
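+ * The FI_DELIVERY_COMPLETE flush shows up as an additional zero-byte
+ * operation on the same MR, so two counter updates are expected here
+ * (orig_cnt + 5 after the three earlier operations).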
+ */ + while (orig_cnt + 5 != fi_cntr_read(cxit_rem_cntr)) + ; + + /* Validate match and access counts incremented */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) >= matches + 1, + "Fetch atomic/flush match count not updated for atomic"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) >= accesses + 1, + "Fetch atomic/flush access count not updated"); + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == + ofi_atomic_get32(&cxip_mr->access_events), + "Fetch atomic flush matches do not equal accesses"); + +done: + fi_close(&mr->fid); + + free(tgt_buf); + free(src_buf); +} + +Test(mr_event, not_found_counts) +{ + int ret; + struct fi_cq_err_entry err; + struct fi_cq_tagged_entry cqe; + struct fid_mr *mr; + struct cxip_mr *cxip_mr; + uint8_t *src_buf; + uint8_t *tgt_buf; + int src_len = 8; + int tgt_len = 4096; + uint64_t key_val = 200; + int matches; + int accesses; + uint64_t operand1; + uint64_t result1; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + + src_buf = malloc(src_len); + cr_assert_not_null(src_buf, "src_buf alloc failed"); + + tgt_buf = calloc(1, tgt_len); + cr_assert_not_null(tgt_buf, "tgt_buf alloc failed"); + + /* Create MR */ + ret = fi_mr_reg(cxit_domain, tgt_buf, tgt_len, + FI_REMOTE_WRITE | FI_REMOTE_READ, 0, + key_val, 0, &mr, NULL); + cr_assert(ret == FI_SUCCESS); + + cxip_mr = container_of(mr, struct cxip_mr, mr_fid); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_enable(mr); + cr_assert(ret == FI_SUCCESS); + + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) + key_val = fi_mr_key(mr); + + /* Match counts do not apply to optimized MR */ + if (cxip_generic_is_mr_key_opt(key_val)) + goto done; + + /* Use invalid key so that remote MR is not found */ + key_val++; + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ret = fi_write(cxit_ep, src_buf, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts did not increment */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count updated for RMA\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count updated for RMA\n"); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, + key_val, FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts did not increment */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count updated for atomic\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count updated for atomic\n"); + + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, NULL, &result1, NULL, + 
cxit_ep_fi_addr, 0, key_val, FI_UINT64, + FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic fetch success %d", + ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts did not increment */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count updated for atomic fetch\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count updated for atomic fetch\n"); + + ioc.addr = &operand1; + ioc.count = 1; + result_ioc.addr = &result1; + result_ioc.count = 1; + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key_val; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Do a fetch with a flush */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic flush success %d", + ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts did not increment */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count updated for atomic flush\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count updated for atomic flush\n"); + +done: + fi_close(&mr->fid); + + free(tgt_buf); + free(src_buf); +} + +Test(mr_event, bounds_err_counts) +{ + int ret; + struct fi_cq_err_entry err; + struct fi_cq_tagged_entry cqe; + struct fid_mr *mr; + struct cxip_mr *cxip_mr; + uint8_t *src_buf; + uint8_t *tgt_buf; + int src_len = 16; + int tgt_len = 8; + uint64_t key_val = 200; /* Force client key to be standard MR */ + int matches; + int accesses; + uint64_t operand1; + uint64_t result1; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + struct cxip_ep *cxi_ep; + + src_buf = malloc(src_len); + cr_assert_not_null(src_buf, "src_buf alloc failed"); + + tgt_buf = calloc(1, tgt_len); + cr_assert_not_null(tgt_buf, "tgt_buf alloc failed"); + + /* Create MR */ + ret = fi_mr_reg(cxit_domain, tgt_buf, tgt_len, + FI_REMOTE_WRITE | FI_REMOTE_READ, 0, + key_val, 0, &mr, NULL); + cr_assert(ret == FI_SUCCESS); + + cxip_mr = container_of(mr, struct cxip_mr, mr_fid); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_enable(mr); + cr_assert(ret == FI_SUCCESS); + + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) + key_val = fi_mr_key(mr); + + /* Match counts do not apply to optimized MR */ + if (cxip_generic_is_mr_key_opt(key_val)) + goto done; + + /* Netsim does not generate EVENT_MATCH for bounds, + * while hardware does. TODO: Fix this in netsim. 
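+ * The is_netsim() checks below only bump the expected match/access counts
+ * on real hardware, so this test passes in both environments.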
+ */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + /* src len is greater than remote MR len */ + ret = fi_write(cxit_ep, src_buf, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts increment */ + if (!is_netsim(cxi_ep->ep_obj)) { + matches++; + accesses++; + } + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count mismatch for RMA\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count mismatch for RMA\n"); + + /* Remote offset of 8 is greater than remote MR bounds */ + ret = fi_atomic(cxit_ep, &operand1, 1, NULL, cxit_ep_fi_addr, 8, + key_val, FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts increment */ + if (!is_netsim(cxi_ep->ep_obj)) { + matches++; + accesses++; + } + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count mismatch for atomic\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count mismatch for atomic\n"); + + /* Remote offset of 8 is greater than remote MR bounds */ + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, NULL, &result1, NULL, + cxit_ep_fi_addr, 8, key_val, FI_UINT64, + FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic fetch success %d", + ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts increment */ + if (!is_netsim(cxi_ep->ep_obj)) { + matches++; + accesses++; + } + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count mismatch atomic fetch\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count mismatch for atomic fetch\n"); + + ioc.addr = &operand1; + ioc.count = 1; + result_ioc.addr = &result1; + result_ioc.count = 1; + + /* Remote offset of 8 is greater than remote MR bounds */ + rma_ioc.addr = 8; + rma_ioc.count = 1; + rma_ioc.key = key_val; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Do a fetch with a flush */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, 
"Unexpected atomic flush success %d", + ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* For an atomic flush with FI_DELIVERY_COMPLETE using an + * out-of-bounds offset we expect both the atomic and zero + * by flush to generate events. + */ + if (!is_netsim(cxi_ep->ep_obj)) { + matches++; + accesses++; + } + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches + 1, + "Match count != %d for flush with atomic error", + matches + 1); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses + 1, + "Access count != %d for flush with atomic error", + accesses + 1); + +done: + fi_close(&mr->fid); + + free(tgt_buf); + free(src_buf); +} + +/* + * With FI_MR_PROV_KEY, test if all PID IDX mapping resources required by + * optimized MR are consumed, that falling back to standard MR is done. + * This test should run with and without MR cache disabled. + */ +TestSuite(mr_resources, .init = cxit_setup_domain, .fini = cxit_teardown_domain, + .timeout = 120); + +#define NUM_MR_TEST_EP 15 +#define NUM_MR_PER_EP 86 + +Test(mr_resources, opt_fallback) +{ + struct fid_domain *dom[NUM_MR_TEST_EP]; + struct fid_ep *ep[NUM_MR_TEST_EP]; + struct fid_av *av[NUM_MR_TEST_EP]; + struct fid_cq *cq[NUM_MR_TEST_EP]; + struct fid_mr **mr; + char buf[256]; + int ret; + int num_dom; + int num_mr; + int tot_mr; + + if (!cxit_prov_key) + return; + + mr = calloc(NUM_MR_TEST_EP * NUM_MR_PER_EP, + sizeof(struct fid_mr *)); + cr_assert(mr != NULL, "calloc"); + + for (num_dom = 0, tot_mr = 0; num_dom < NUM_MR_TEST_EP; num_dom++) { + + ret = fi_domain(cxit_fabric, cxit_fi, &dom[num_dom], NULL); + cr_assert(ret == FI_SUCCESS, "fi_domain"); + + ret = fi_endpoint(dom[num_dom], cxit_fi, &ep[num_dom], NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + + ret = fi_av_open(dom[num_dom], &cxit_av_attr, + &av[num_dom], NULL); + cr_assert(ret == FI_SUCCESS, "fi_av_open"); + + ret = fi_ep_bind(ep[num_dom], &av[num_dom]->fid, 0); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind AV"); + + ret = fi_cq_open(dom[num_dom], &cxit_tx_cq_attr, + &cq[num_dom], NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open"); + + ret = fi_ep_bind(ep[num_dom], &cq[num_dom]->fid, + FI_TRANSMIT); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind TX CQ"); + + ret = fi_ep_bind(ep[num_dom], &cq[num_dom]->fid, + FI_RECV); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind RX CQ"); + + ret = fi_enable(ep[num_dom]); + cr_assert(ret == FI_SUCCESS, "fi_enable"); + + /* Create only optimized MR for this EP */ + for (num_mr = 0; num_mr < NUM_MR_PER_EP; num_mr++, tot_mr++) { + + ret = fi_mr_reg(dom[num_dom], buf, 256, + FI_REMOTE_WRITE | FI_REMOTE_READ, + 0, 0, 0, &mr[tot_mr], NULL); + cr_assert(ret == FI_SUCCESS, "fi_mr_reg"); + + ret = fi_mr_bind(mr[tot_mr], &ep[num_dom]->fid, 0); + cr_assert(ret == FI_SUCCESS, "fi_mr_bind"); + + ret = fi_mr_enable(mr[tot_mr]); + cr_assert(ret == FI_SUCCESS, "fi_mr_enable"); + } + } + + /* + * Validate that sufficient MR were created to exhaust the PID IDX + * mappings of 2560. There are two mappings required for each MR + * and 4 PID IDX mappings required by each endpoint created. 
+ */ + cr_assert(4 * num_dom + tot_mr * 2 >= 2560, "Number of MR created"); + + for (num_mr = 0; num_mr < tot_mr; num_mr++) { + ret = fi_close(&mr[num_mr]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close MR"); + } + + for (num_dom = 0; num_dom < NUM_MR_TEST_EP; num_dom++) { + ret = fi_close(&ep[num_dom]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close EP"); + + ret = fi_close(&cq[num_dom]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close CQ"); + + ret = fi_close(&av[num_dom]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close AV"); + + ret = fi_close(&dom[num_dom]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close Domain"); + } + + free(mr); +} diff --git a/prov/cxi/test/msg.c b/prov/cxi/test/msg.c new file mode 100644 index 00000000000..058761b2745 --- /dev/null +++ b/prov/cxi/test/msg.c @@ -0,0 +1,2169 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(msg, .init = cxit_setup_msg, .fini = cxit_teardown_msg, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic send/recv */ +Test(msg, ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic send/recv with data */ +Test(msg, pingdata) 
+{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_senddata(cxit_ep, send_buf, send_len, NULL, data, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV | FI_REMOTE_CQ_DATA), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == data, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic inject send */ +Test(msg, inject_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_inject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_inject failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_MSG | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate sent data */ + for 
(i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +/* Test basic injectdata */ +Test(msg, injectdata_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_injectdata(cxit_ep, send_buf, send_len, data, + cxit_ep_fi_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_inject failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, + FI_MSG | FI_RECV | FI_REMOTE_CQ_DATA, NULL, data, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendv/recvv */ +Test(msg, vping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + ret = fi_recvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + ret = fi_sendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + 
cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendmsg/recvmsg */ +Test(msg, msgping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, 
send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendmsg/recvmsg with two EP bound to same CQ */ +Test(msg, msgping_cq_share) +{ + int i, ret; + uint8_t *recv_buf, + *recv_buf2, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec riovec2; + struct iovec siovec; + struct fid_ep *fid_ep2; + struct cxip_addr ep2_addr; + fi_addr_t ep2_fi_addr; + size_t addrlen = sizeof(cxit_ep_addr); + int num_recv_comps = 0; + + /* Create a second EP bound to the same CQs as original */ + ret = fi_endpoint(cxit_domain, cxit_fi, &fid_ep2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + cr_assert_not_null(fid_ep2); + + ret = fi_ep_bind(fid_ep2, &cxit_tx_cq->fid, cxit_tx_cq_bind_flags); + cr_assert(!ret, "fe_ep_bind TX CQ to 2nd EP"); + ret = fi_ep_bind(fid_ep2, &cxit_rx_cq->fid, cxit_rx_cq_bind_flags); + cr_assert(!ret, "fe_ep_bind RX CQ to 2nd EP"); + + ret = fi_ep_bind(fid_ep2, &cxit_av->fid, 0); + cr_assert(!ret, "fi_ep_bind AV to 2nd EP"); + + ret = fi_enable(fid_ep2); + cr_assert(ret == FI_SUCCESS, "fi_enable of 2nd EP"); + + ret = fi_getname(&fid_ep2->fid, &ep2_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "fi_getname for 2nd EP"); + cr_assert(addrlen == sizeof(ep2_addr), "addr length"); + + ret = fi_av_insert(cxit_av, (void *)&ep2_addr, 1, + &ep2_fi_addr, 0, NULL); + cr_assert(ret == 1); + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + recv_buf2 = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf2); + memset(recv_buf2, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer for first EP */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Post RX buffer for second EP */ + riovec2.iov_base = recv_buf2; + riovec2.iov_len = recv_len; + rmsg.msg_iov = &riovec2; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(fid_ep2, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Send 64 byte message to 2nd EP */ + smsg.addr = ep2_fi_addr; + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send to EP2 failed %d", ret); + + /* Wait for async events from single CQ bound to multiple EP + * to verify receive notification for each EP occurs. 
+ */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, + "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, + "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, + "Invalid source address"); + num_recv_comps++; + } + } while (num_recv_comps < 2); + cr_assert_eq(num_recv_comps, 2, "Not all completions received"); + + /* Wait for async events indicating data has been sent */ + for (i = 0; i < 2; i++) { + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + } + + /* Validate sent data to each receive buffer */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(recv_buf2[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf2[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + ret = fi_close(&fid_ep2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint2"); + + free(send_buf); + free(recv_buf); + free(recv_buf2); +} + +/* Test basic sendmsg/recvmsg with data */ +Test(msg, msgping_wdata) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + smsg.data = data; + + ret = fi_sendmsg(cxit_ep, &smsg, FI_REMOTE_CQ_DATA); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV | FI_REMOTE_CQ_DATA), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX 
CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == data, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic injectmsg */ +Test(msg, inject_msgping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + ret = fi_sendmsg(cxit_ep, &smsg, FI_INJECT); + cr_assert_eq(ret, FI_SUCCESS, "fi_sendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_MSG | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_MSG | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test send/recv sizes small to large */ +Test(msg, sizes) +{ + int i, j, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64*1024; /* 128k fails */ + int send_len = 64*1024; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + bool sent; + bool recved; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, 
recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + for (i = 0; i <= recv_len; i = (i ? i << 1 : 1)) { + recved = sent = false; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, i ? recv_buf : NULL, i, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send to self */ + ret = fi_send(cxit_ep, i ? send_buf : NULL, i, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Gather both events, ensure progress on both sides. */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + cr_assert_eq(recved, false); + recved = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + cr_assert_eq(sent, false); + sent = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + } while (!(sent && recved)); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == i, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (j = 0; j < i; j++) { + cr_expect_eq(recv_buf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, size:%d err=%d\n", + j, send_buf[j], recv_buf[j], i, err++); + } + } + + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test send/recv sizes large to small (this exercises MR caching) */ +Test(msg, sizes_desc) +{ + int i, j, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64*1024; /* 128k fails */ + int send_len = 64*1024; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + bool sent; + bool recved; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + for (i = recv_len; i >= 1; i >>= 1) { + recved = sent = false; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, i, NULL, FI_ADDR_UNSPEC, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_send(cxit_ep, send_buf, i, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Gather both events, ensure progress on both sides. 
*/ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + cr_assert_eq(recved, false); + recved = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + cr_assert_eq(sent, false); + sent = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + } while (!(sent && recved)); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == i, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (j = 0; j < i; j++) { + cr_expect_eq(recv_buf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, size:%d err=%d\n", + j, send_buf[j], recv_buf[j], i, err++); + } + } + + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test software posted receives greater than hardware limits */ +Test(msg, sw_max_recv, .timeout = CXIT_DEFAULT_TIMEOUT) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + char *rx_mode; + + /* Test is only valid in software only matching */ + rx_mode = getenv("FI_CXI_RX_MATCH_MODE"); + if (!rx_mode || strcmp(rx_mode, "software")) { + cr_assert(1); + return; + } + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + /* Only 64K buffer IDs are available */ + for (i = 0; i < 68000; i++) { + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + } + + /* Send 64 bytes to self */ + for (i = 0; i < 68000; i++) { + ret = fi_send(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* 
Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + } +} + +/* Test send/recv interoperability with tagged messaging */ +Test(msg, tagged_interop) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + uint8_t *trecv_buf, + *tsend_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + trecv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(trecv_buf); + memset(trecv_buf, 0, recv_len); + + tsend_buf = aligned_alloc(s_page_size, send_len); + cr_assert(tsend_buf); + + for (i = 0; i < send_len; i++) + tsend_buf[i] = i + 0xc1; + + /* Post tagged RX buffer */ + ret = fi_trecv(cxit_ep, trecv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Send 64 byte tagged message to self */ + ret = fi_tsend(cxit_ep, tsend_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + 
cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_TAGGED | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_TAGGED | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(trecv_buf[i], tsend_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, tsend_buf[i], trecv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(tsend_buf); + free(trecv_buf); + + free(send_buf); + free(recv_buf); +} + +#define RECV_CTX ((void *)0xabc0000000000000) +#define SEND_CTX ((void *)0xdef0000000000000) + +void do_multi_recv(uint8_t *send_buf, size_t send_len, + uint8_t *recv_buf, size_t recv_len, + bool send_first, size_t sends, size_t olen) +{ + int i, j, ret; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + uint64_t rxe_flags; + uint64_t txe_flags; + size_t sent = 0; + size_t recved = 0; + size_t err_recved = 0; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_err_entry err_cqe = {}; + size_t recved_len = 0; + bool dequeued = false; + + if (!sends) + sends = recv_len / send_len; + + memset(recv_buf, 0, recv_len); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = RECV_CTX; + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = SEND_CTX; + + if (send_first) { + for (i = 0; i < sends; i++) { + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_sendmsg failed %d", ret); + } + + /* Progress send to ensure it arrives unexpected */ + i = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + sent = true; + break; + } + cr_assert_eq(ret, -FI_EAGAIN, + "send failed %d", ret); + } while (i++ < 100000); + } + + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_recvmsg failed %d", ret); + + if (!send_first) { + sleep(1); + for (i = 0; i < sends; i++) { + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_sendmsg failed %d", ret); + } + } + + /* Gather both events, ensure progress on both sides. 
*/ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + rxe_flags = FI_MSG | FI_RECV; + + validate_multi_recv_rx_event(&rx_cqe, RECV_CTX, + send_len, rxe_flags, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, + "Invalid source address"); + + if (rx_cqe.flags & FI_MULTI_RECV) { + cr_assert(!dequeued); + dequeued = true; + } + + recved_len = rx_cqe.len; + + /* Validate sent data */ + uint8_t *rbuf = rx_cqe.buf; + + for (j = 0; j < recved_len; j++) { + cr_expect_eq(rbuf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + j, send_buf[j], rbuf[j], + err++); + cr_assert(err < 10); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + recved++; + } else if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + recved_len = err_cqe.len; + uint8_t *rbuf = recv_buf + ((sends-1) * send_len); + + /* The truncated transfer is always the last, which + * dequeued the multi-recv buffer. + */ + rxe_flags = FI_MSG | FI_RECV; + + cr_assert(err_cqe.op_context == RECV_CTX, + "Error RX CQE Context mismatch"); + cr_assert((err_cqe.flags & ~FI_MULTI_RECV) == rxe_flags, + "Error RX CQE flags mismatch"); + cr_assert(err_cqe.len == send_len - olen, + "Invalid Error RX CQE length, got: %ld exp: %ld", + err_cqe.len, recv_len); + cr_assert(err_cqe.buf == rbuf, + "Invalid Error RX CQE address (%p %p)", + err_cqe.buf, rbuf); + cr_assert(err_cqe.data == 0, + "Invalid Error RX CQE data"); + cr_assert(err_cqe.tag == 0, + "Invalid Error RX CQE tag"); + cr_assert(err_cqe.olen == olen, + "Invalid Error RX CQE olen, got: %ld exp: %ld", + err_cqe.olen, olen); + cr_assert(err_cqe.err == FI_ETRUNC, + "Invalid Error RX CQE code\n"); + cr_assert(err_cqe.prov_errno == C_RC_OK, + "Invalid Error RX CQE errno"); + cr_assert(err_cqe.err_data == NULL); + cr_assert(err_cqe.err_data_size == 0); + + if (err_cqe.flags & FI_MULTI_RECV) { + cr_assert(!dequeued); + dequeued = true; + } + + /* Validate sent data */ + for (j = 0; j < recved_len; j++) { + cr_expect_eq(rbuf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + j, send_buf[j], rbuf[j], + err++); + cr_assert(err < 10); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + err_recved++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + txe_flags = FI_MSG | FI_SEND; + sent++; + validate_tx_event(&tx_cqe, txe_flags, SEND_CTX); + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + } while (sent < sends || (recved + err_recved) < sends); +} + +struct msg_multi_recv_params { + size_t send_len; + size_t recv_len; + bool ux; + size_t sends; + size_t olen; +}; + +#define SHORT_SEND_LEN 128 +#define SHORT_SENDS 200 +#define LONG_SEND_LEN 4096 +#define LONG_SENDS 20 +#define SHORT_OLEN (3*1024) +#define LONG_OLEN 1024 + +static struct msg_multi_recv_params params[] = { +#if 1 + /* expected/unexp eager */ + {.send_len = SHORT_SEND_LEN, + .recv_len = SHORT_SENDS * SHORT_SEND_LEN, + .ux = false}, + {.send_len = SHORT_SEND_LEN, + .recv_len = SHORT_SENDS * SHORT_SEND_LEN, + .ux = true}, + + /* exp/unexp long */ + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN, + .ux = false}, + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN, + .ux = true}, +#endif + +#if 1 + /* exp/unexp overflow */ + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - LONG_OLEN), + .ux = false, + .sends 
= LONG_SENDS+1, + .olen = LONG_OLEN}, + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - LONG_OLEN), + .ux = true, + .sends = LONG_SENDS+1, + .olen = LONG_OLEN}, +#endif + +#if 1 + /* exp/unexp overflow */ + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - SHORT_OLEN), + .ux = false, + .sends = LONG_SENDS+1, + .olen = SHORT_OLEN}, + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - SHORT_OLEN), + .ux = true, + .sends = LONG_SENDS+1, + .olen = SHORT_OLEN}, +#endif +}; + +ParameterizedTestParameters(msg, multi_recv) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct msg_multi_recv_params, params, + param_sz); +} + +/* Test multi-recv messaging */ +ParameterizedTest(struct msg_multi_recv_params *param, msg, multi_recv) +{ + void *recv_buf; + void *send_buf; + + recv_buf = aligned_alloc(s_page_size, param->recv_len); + cr_assert(recv_buf); + + send_buf = aligned_alloc(s_page_size, param->send_len); + cr_assert(send_buf); + + do_multi_recv(send_buf, param->send_len, recv_buf, + param->recv_len, param->ux, param->sends, + param->olen); + + free(send_buf); + free(recv_buf); +} + +/* Test multi-recv cancel */ +Test(msg, multi_recv_cancel) +{ + int i, ret; + uint8_t *recv_buf; + int recv_len = 0x1000; + int recvs = 5; + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_err_entry err_cqe; + struct fi_msg rmsg = {}; + struct iovec riovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + for (i = 0; i < recvs; i++) { + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + for (i = 0; i < recvs; i++) { + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cancel failed %d", ret); + } + + for (i = 0; i < recvs; i++) { + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + if (ret == -FI_EAVAIL) + break; + + cr_assert_eq(ret, -FI_EAGAIN, + "unexpected event %d", ret); + } while (1); + + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + cr_assert(err_cqe.op_context == NULL, + "Error RX CQE Context mismatch"); + cr_assert(err_cqe.flags == (FI_MSG | FI_RECV | FI_MULTI_RECV), + "Error RX CQE flags mismatch"); + cr_assert(err_cqe.err == FI_ECANCELED, + "Invalid Error RX CQE code\n"); + cr_assert(err_cqe.prov_errno == 0, + "Invalid Error RX CQE errno"); + } +} + +/* Test out-of-order multi-receive transaction completion */ +Test(msg, multi_recv_ooo) +{ + int i, j, ret; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + uint64_t rxe_flags; + int bytes_sent = 0; + uint8_t *recv_buf; + uint8_t *send_buf; + size_t send_len = 8*1024; + int sends = 10; + size_t recv_len = send_len * 5 + 64 * 5; + int sent = 0; + int recved = 0; + struct fi_cq_tagged_entry tx_cqe[sends]; + struct fi_cq_tagged_entry rx_cqe[sends]; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + 
rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + sleep(1); + for (i = 0; i < sends; i++) { + /* Interleave long and short sends. They will complete in a + * different order than they were sent or received. + */ + if (i % 2) + siovec.iov_len = 64; + else + siovec.iov_len = 8*1024; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", + ret); + } + + for (i = 0; i < sends; i++) { + /* Gather both events, ensure progress on both sides. */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe[recved], 1, + &from); + if (ret == 1) { + recved++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe[sent], 1); + if (ret == 1) { + sent++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + } while (!(sent == sends && recved == sends)); + } + + for (i = 0; i < sends; i++) { + bytes_sent += rx_cqe[i].len; + rxe_flags = FI_MSG | FI_RECV; + if (bytes_sent > (recv_len - CXIP_EP_MIN_MULTI_RECV)) + rxe_flags |= FI_MULTI_RECV; + + cr_assert(rx_cqe[i].flags == rxe_flags, "CQE flags mismatch"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + validate_tx_event(&tx_cqe[i], FI_MSG | FI_SEND, NULL); + + /* Validate sent data */ + uint8_t *rbuf = rx_cqe[i].buf; + + for (j = 0; j < rx_cqe[i].len; j++) { + cr_expect_eq(rbuf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + j, send_buf[j], recv_buf[j], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + } + + free(send_buf); + free(recv_buf); +} + +Test(msg, fc_multi_recv, .timeout = 30) +{ + int i, j, k, ret, tx_ret; + uint8_t *send_bufs; + uint8_t *send_buf; + int send_len = 64; + uint8_t *recv_buf; + int recv_len = 64; + int mrecv_msgs = 10; + struct fi_msg rmsg = {}; + struct iovec riovec; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + int nsends_concurrent = 3; /* must be less than the LE pool min. 
*/ + int nsends = 20; + int sends = 0; + fi_addr_t from; + + cr_assert(!(nsends % mrecv_msgs)); + + send_bufs = aligned_alloc(s_page_size, send_len * nsends_concurrent); + cr_assert(send_bufs); + + recv_buf = aligned_alloc(s_page_size, recv_len * mrecv_msgs); + cr_assert(recv_buf); + + for (i = 0; i < nsends_concurrent - 1; i++) { + send_buf = send_bufs + (i % nsends_concurrent) * send_len; + memset(send_buf, i, send_len); + + tx_ret = fi_send(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, NULL); + } + + for (i = nsends_concurrent - 1; i < nsends; i++) { + send_buf = send_bufs + (i % nsends_concurrent) * send_len; + memset(send_buf, i, send_len); + + do { + tx_ret = fi_send(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, NULL); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + + /* Just progress */ + fi_cq_read(cxit_tx_cq, NULL, 0); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, FI_SUCCESS, "fi_tsend failed %d", tx_ret); + + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_MSG | FI_SEND, NULL); + + if (!(++sends % 1000)) + printf("%u Sends complete.\n", sends); + } + + for (i = 0; i < nsends_concurrent - 1; i++) { + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_MSG | FI_SEND, NULL); + + if (!(++sends % 1000)) + printf("%u Sends complete.\n", sends); + } + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len * mrecv_msgs; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + for (i = 0; i < nsends / mrecv_msgs; i++) { + memset(recv_buf, 0, recv_len * mrecv_msgs); + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", + ret); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + for (k = 0; k < mrecv_msgs; k++) { + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, + &from); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", + ret); + + validate_multi_recv_rx_event(&rx_cqe, NULL, recv_len, + FI_MSG | FI_RECV, 0, 0); + cr_assert(from == cxit_ep_fi_addr, + "Invalid source address"); + bool last_msg = (k == (mrecv_msgs - 1)); + bool dequeued = rx_cqe.flags & FI_MULTI_RECV; + + cr_assert(!(last_msg ^ dequeued)); + + for (j = 0; j < recv_len; j++) { + cr_assert_eq(recv_buf[k * recv_len + j], + (uint8_t)i * mrecv_msgs + k, + "data mismatch, recv: %d,%d element[%d], exp=%d saw=%d\n", + i, k, j, + (uint8_t)i * mrecv_msgs + k, + recv_buf[k * recv_len + j]); + } + } + } + + free(send_bufs); + free(recv_buf); +} + +static void test_fc_multi_recv(size_t xfer_len, bool progress_before_post) +{ + int ret; + char *recv_buf; + char *send_buf; + int i; + struct fi_msg rmsg = {}; + struct 
iovec riovec; + unsigned int send_events = 0; + unsigned int recv_events = 0; + struct fi_cq_tagged_entry cqe; + size_t min_mrecv = 0; + size_t opt_len = sizeof(size_t); + bool unlinked = false; + + /* Needs to exceed available LEs. */ + unsigned int num_xfers = 100; + + ret = fi_setopt(&cxit_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, + &min_mrecv, opt_len); + cr_assert(ret == FI_SUCCESS); + + recv_buf = calloc(num_xfers, xfer_len); + cr_assert(recv_buf); + + send_buf = calloc(num_xfers, xfer_len); + cr_assert(send_buf); + + for (i = 0; i < (num_xfers * xfer_len); i++) + send_buf[i] = (char)(rand() % 256); + + /* Fire off all the unexpected sends expect 1. Last send will be sent + * expectedly to verify that hardware has updates the manage local LE + * start and length fields accordingly. + */ + for (i = 0; i < num_xfers - 1; i++) { + do { + ret = fi_send(cxit_ep, &send_buf[i * xfer_len], + xfer_len, NULL, cxit_ep_fi_addr, NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_rx_cq, &cqe, 0); + fi_cq_read(cxit_tx_cq, &cqe, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + } + + /* Progress before post will cause all ULEs to be onloaded before the + * append occurs. + */ + if (progress_before_post) + fi_cq_read(cxit_rx_cq, &cqe, 0); + + /* Append late multi-recv buffer. */ + riovec.iov_base = recv_buf; + riovec.iov_len = num_xfers * xfer_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = cxit_ep_fi_addr; + rmsg.context = NULL; + + do { + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + + /* Wait for all send events. Since this test can be run with or without + * flow control, progressing the RX CQ may be required. + */ + while (send_events != (num_xfers - 1)) { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN || ret == 1); + if (ret == 1) + send_events++; + + /* Progress RXC. */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + } + + /* Wait for all receive events. */ + while (recv_events != (num_xfers - 1)) { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN || ret == 1); + if (ret == 1 && cqe.flags & FI_RECV) + recv_events++; + } + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Make last send expected. This ensures that hardware and/or software + * has correctly updated the LE start and length fields correctly. + */ + do { + ret = fi_send(cxit_ep, &send_buf[(num_xfers - 1) * xfer_len], + xfer_len, NULL, cxit_ep_fi_addr, NULL); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + + /* Wait for all send events. Since this test can be run with or without + * flow control, progressing the RX CQ may be required. + */ + while (send_events != num_xfers) { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN || ret == 1); + if (ret == 1) + send_events++; + + /* Progress RXC. */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + } + + /* Process the last receive event and the multi-receive event signaling + * the provider is no longer using the buffer. + */ + while (recv_events != num_xfers && !unlinked) { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN || ret == 1); + if (ret == 1) { + if (cqe.flags & FI_RECV) + recv_events++; + if (cqe.flags & FI_MULTI_RECV) + unlinked = true; + } + } + + /* Data integrity check. 
If hardware/software mismanaged the multi-recv + * start and/or length fields on the expected send, data will be + * corrupted. + */ + for (i = 0; i < (num_xfers * xfer_len); i++) + cr_assert_eq(send_buf[i], recv_buf[i], + "Data miscompare: byte=%u", i); + + free(send_buf); + free(recv_buf); +} + +Test(msg, fc_multi_recv_rdzv, .timeout = 10) +{ + /* Transfer size needs to be large enough to trigger rendezvous. */ + test_fc_multi_recv(16384, false); +} + +Test(msg, fc_multi_recv_rdzv_onload_ules, .timeout = 10) +{ + /* Transfer size needs to be large enough to trigger rendezvous. */ + test_fc_multi_recv(16384, true); +} + +Test(msg, fc_no_eq_space_expected_multi_recv, .timeout = 10) +{ + test_fc_multi_recv(1, false); +} + +Test(msg, fc_no_eq_space_expected_multi_recv_onload_ules, .timeout = 10) +{ + test_fc_multi_recv(1, false); +} + +Test(msg, zero_byte_send_recv_iov) +{ + int ret; + struct fi_cq_tagged_entry cqe; + + ret = fi_recvv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recvv failed: %d", ret); + + ret = fi_sendv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_sendv failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +Test(msg, zero_byte_send_recv_msg) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + + rmsg.addr = cxit_ep_fi_addr; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recvmsg failed: %d", ret); + + smsg.addr = cxit_ep_fi_addr; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_sendmsg failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +/* Verify that FI_AV_USER_ID is returned from fi_cq_readfrom(). */ +Test(msg, av_user_id) +{ + int ret; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + fi_addr_t user_id = 0xdeadbeef; + + /* Need to remove loopback address from AV and reinsert with + * FI_AV_USER_ID. + */ + ret = fi_av_remove(cxit_av, &cxit_ep_fi_addr, 1, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_remove failed: %d", ret); + + cxit_ep_fi_addr = user_id; + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + FI_AV_USER_ID, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + + ret = fi_recv(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + ret = fi_send(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + cr_assert_eq(from, user_id, "Invalid user id: expected=%#lx got=%#lx", + user_id, from); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +/* Verify that FI_AV_USER_ID is returned from fi_cq_readfrom(). 
*/ +Test(msg, av_user_id_domain_cap) +{ + int ret; + struct fid_cq *cq; + struct fid_av *av; + struct fid_ep *ep; + struct fi_cq_attr cxit_tx_cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + }; + struct fi_cq_tagged_entry cqe; + fi_addr_t from; + fi_addr_t dest_ep; + fi_addr_t user_id = 0xdeadbeef; + char addr[256]; + size_t addr_size = sizeof(addr); + struct fi_av_attr av_attr = { + .flags = FI_AV_USER_ID, + }; + + ret = fi_cq_open(cxit_domain, &cxit_tx_cq_attr, &cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open failed: %d", ret); + + ret = fi_av_open(cxit_domain, &av_attr, &av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret); + + ret = fi_endpoint(cxit_domain, cxit_fi, &ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_endpoint failed: %d", ret); + + ret = fi_ep_bind(ep, &cq->fid, FI_TRANSMIT | FI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind failed: %d", ret); + + ret = fi_ep_bind(ep, &av->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind failed: %d", ret); + + ret = fi_enable(ep); + cr_assert_eq(ret, FI_SUCCESS, "fi_enable failed: %d", ret); + + ret = fi_getname(&ep->fid, addr, &addr_size); + cr_assert_eq(ret, FI_SUCCESS, "fi_getname failed: %d", ret); + + ret = fi_av_insert(av, addr, 1, &dest_ep, 0, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + + ret = fi_av_set_user_id(av, dest_ep, user_id, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_set_user_id failed: %d", ret); + + ret = fi_recv(ep, NULL, 0, NULL, dest_ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + ret = fi_send(ep, NULL, 0, NULL, dest_ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + do { + ret = fi_cq_readfrom(cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + if (cqe.flags & FI_SEND) { + do { + ret = fi_cq_readfrom(cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + } + + cr_assert_eq(from, user_id, "Invalid user id: expected=%#lx got=%#lx", + user_id, from); + + ret = fi_close(&ep->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed %d", ret); + + ret = fi_close(&av->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed %d", ret); + + ret = fi_close(&cq->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed %d", ret); +} + +TestSuite(hybrid_preemptive, .timeout = CXIT_DEFAULT_TIMEOUT); + +#define RX_SIZE 2U + +Test(hybrid_preemptive, posted_recv_preemptive) +{ + int ret; + int i; + + ret = setenv("FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE", "1", 1); + cr_assert(ret == 0); + + ret = setenv("FI_CXI_RX_MATCH_MODE", "hybrid", 1); + cr_assert(ret == 0); + + cxit_fi_hints = cxit_allocinfo(); + cr_assert(cxit_fi_hints); + + cxit_fi_hints->rx_attr->size = RX_SIZE; + + cxit_setup_msg(); + + /* Posting more receives than RX_SIZE should cause transition to + * SW EP. 
+ */ + for (i = 0; i < RX_SIZE + 1; i++) { + ret = fi_recv(cxit_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + + if (i < RX_SIZE) + cr_assert(ret == FI_SUCCESS); + else + cr_assert(ret == -FI_EAGAIN); + } + + while (ret == -FI_EAGAIN) { + fi_cq_read(cxit_rx_cq, NULL, 0); + ret = fi_recv(cxit_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + } + + cr_assert(ret == FI_SUCCESS); + + cxit_teardown_msg(); +} + +Test(hybrid_preemptive, unexpected_msg_preemptive) +{ + int ret; + int i; + struct cxip_ep *cxip_ep; + + ret = setenv("FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE", "1", 1); + cr_assert(ret == 0); + + ret = setenv("FI_CXI_RX_MATCH_MODE", "hybrid", 1); + cr_assert(ret == 0); + + cxit_fi_hints = cxit_allocinfo(); + cr_assert(cxit_fi_hints); + + cxit_fi_hints->rx_attr->size = RX_SIZE; + + cxit_setup_msg(); + + cxip_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + /* Posting more unexpected messages than RX_SIZE should cause + * transition to SW EP. + */ + for (i = 0; i < RX_SIZE + 1; i++) { + ret = fi_send(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, NULL); + cr_assert(ret == FI_SUCCESS); + } + + while (cxip_ep->ep_obj->rxc.state != RXC_ENABLED_SOFTWARE) + fi_cq_read(cxit_rx_cq, NULL, 0); + + cr_assert(ret == FI_SUCCESS); + + cxit_teardown_msg(); +} diff --git a/prov/cxi/test/multinode/README.md b/prov/cxi/test/multinode/README.md new file mode 100644 index 00000000000..c4b6cc7dae2 --- /dev/null +++ b/prov/cxi/test/multinode/README.md @@ -0,0 +1,126 @@ +*SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP* + +# Multinode Framework + +The multinode_frmwk provides a framework for writing multinode test applications +under a Workload Manager (WLM). + +The framework itself is controlled by a number of environment variables provided +by the WLM, or the user environment: + +- **PMI_SIZE** is supplied by the WLM, and indicates the total number of nodes in the +job. + +- **PMI_RANK** is supplied by the WLM, and indicates the rank of this instance of the +application. + +- **PMI_SHARED_SECRET** is supplied by the WLM, and is a "magic number" (a nsec +timestamp) that is guaranteed to be common to all instances of the application, +and unique to each job. + +- **PMI_NUM_HSNS** is supplied by the user environment, and defaults to 1 if not +specified. It can have a value from 1 to 4, and indicates the number of NICs +(per node) to bring into play. + +- **PMI_HOME** is supplied by the user environment, and defaults to $HOME if not +specified. This indicates the file system directory used for the file system +Allgather operation, and must be readable and writable. + +# APP: test_frmwk + +The **test_frmwk** application is a basic sanity test for the framework itself. + +$ srun -Nn ./test_frmwk [args...] + +# APP: test_zbcoll + +The **test_zbcoll** application is a full regression suite for the zbcoll +implementation, which provides a high-performance zero-buffer implementation of +Barrier, Broadcast, and IOR Reduce used in the process of bootstrapping the +collective join operation. + +$ srun -Nn ./test_zbcoll [args...] + +# APP: test_coll + +The **test_coll** application is a full regression suite for the accelerated +collectives. It requires a multicast configuration service, which presents +itself as a REST API. + +$ srun -Nn ./test_coll [args...] + +## Simulated Multicast ## + +A *simulated* multicast configuration service is provided in the multinode +subdirectory. 
It uses FLASK (Python), and returns a small number of specifically +invalid multicast addresses that are interpreted as a request for a UNICAST +implementation of collectives. This implementation is not performant and should +not be used in production -- it implements the broadcast phase of the +accelerated collective as a series of point-to-point sends from the HWRoot to +each leaf node, and as there is no multicast in-tree reduction, the HWRoot +becomes a target of an incast from all the leaf transmissions. This can be used, +however, to fully test the software paths and behaviors on small collective +groups, without any involvement from the fabric manager software. + +The FLASK simulation is typically started in a window on the WLM job-launch node +as follows: + +$ ./flask_fmgrsrv.py --host *ipaddress* --port *port* + +The *ipaddress* can be obtained on the host where it is run using: + +$ hostname -I | awk '{print $1}' + +The *port* can be any valid, unused port. A value of 5000 typically works. + +A number of environment variables control the libfabric collective behavior: + +- **FI_CXI_COLL_JOB_ID** is an identifier unique to each job. + +- **FI_CXI_COLL_JOB_STEP_ID** is an identifier unique to each job-step. + +- **FI_CXI_COLL_MCAST_TOKEN** is a security token used to authenticate the +application to the fabric manager when using the REST API. + +- **FI_CXI_HWCOLL_ADDRS_PER_JOB** is the maximum number of multicast addresses + available to this job. + +- **FI_CXI_HWCOLL_MIN_NODES** is the minimum number of endpoints required to support accelerated collectives. + +- **FI_CXI_COLL_FABRIC_MGR_URL** is the URL for the fabric manager REST API. + +- **FI_CXI_COLL_RETRY_USEC** is the time spent waiting for reduction + completion before performing a retry. + +- **FI_CXI_COLL_TIMEOUT_USEC** is the length of time hardware reduction engines + will be reserved before timing out and delivering a partial result. + +- **FI_CXI_COLL_USE_DMA_PUT** (experimental) uses Cassini DMA to initiate sends +for reduction packets. + +The framework will set all of the above environment variables to usable +defaults, if they are not already specified in the user environment, with the +exception of **FI_CXI_COLL_FABRIC_MGR_URL**, which must be explicitly defined in +the user environment. + +$ export FI_CXI_COLL_FABRIC_MGR_URL='http://*ipaddress*:*port*' + +The simulated FLASK service can be tested using: + +$ curl $FI_CXI_COLL_FABRIC_MGR_URL + +This should return a JSON object containing help text strings. + +**NOTE**: The simulated service uses http, not https. + +## Production Multicast ## + +Full-scale (performant) test_coll runs can be performed by specifying the real +fabric manager REST API URL. + +This will require that the WLM export a valid **FI_CXI_COLL_MCAST_TOKEN** in the +job environment after acquiring the token for the job from the fabric manager. +This is an opaque session token that persists for the duration of the job. + +**NOTE**: The real service uses https, not http, and is a trusted service. 
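+
+# Writing a new test
+
+The framework API used by the applications above is declared in
+`multinode_frmwk.h`. The sketch below is not a source file in this directory;
+it simply mirrors the structure of `test_barrier.c` with the error handling
+trimmed, to show the calling sequence a framework-based test follows:
+
+```c
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include <rdma/fabric.h>
+
+#include "multinode_frmwk.h"
+
+int main(void)
+{
+        fi_addr_t *fiaddrs = NULL;
+        size_t size = 0;
+        int ret;
+
+        /* Read the PMI_* variables and set collective defaults */
+        frmwk_init(false);
+
+        /* This example requires at least two ranks */
+        if (frmwk_check_env(2))
+                return -1;
+
+        /* Create fabric, domain, endpoint, CQs, EQ and counters */
+        ret = frmwk_init_libfabric();
+
+        /* Allgather HSN addresses and populate the AV (also a barrier) */
+        if (!ret)
+                ret = frmwk_populate_av(&fiaddrs, &size);
+
+        if (!ret) {
+                frmwk_log0("running with %zu endpoints\n", size);
+                /* ... test body: messaging, RMA, or collectives ... */
+                ret = frmwk_barrier();
+        }
+
+        free(fiaddrs);
+        frmwk_free_libfabric();
+        frmwk_term();
+        return frmwk_errmsg(ret, "test failed\n");
+}
+```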
diff --git a/prov/cxi/test/multinode/flask_fmgrsrv.py b/prov/cxi/test/multinode/flask_fmgrsrv.py new file mode 100644 index 00000000000..1916e3dbe84 --- /dev/null +++ b/prov/cxi/test/multinode/flask_fmgrsrv.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +# Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + +help = f''' +Multicast REST server simulation for distributed testing + +http://host:port/ +- GET produces this help as a JSON list + +http://host:port/fabric/collectives/multicast +- POST generates a single multicast address and hwroot node +- GET lists all multicast addresses +- DELETE deletes all multicast addresses + +http://host:port/fabric/collectives/mcastid/ +- DELETE deletes specified multicast address + +Multicast addresses are invalid (>=8192), causing UNICAST behavior +Only addresses 8192-8199 are supported, to test exhaustion +''' +import argparse +import textwrap +import sys +import json + +from argparse import ArgumentParser, HelpFormatter +from flask import Flask, request +from flask_restful import Api, Resource + +# Global storage for addresses/roots +mcastroots = [] +mcastaddrs = [] + +class RawFormatter(HelpFormatter): + def _fill_text(self, text, width, indent): + return "\n".join([textwrap.fill(line, width) for line in textwrap.indent(textwrap.dedent(text), indent).splitlines()]) + +class fabtestInfo(Resource): + def get(self): + return help.splitlines(), 200 + +def delEntry(value): + global mcastroots + global mcastaddrs + + try: + idx = mcastaddrs.index(value) + del mcastroots[idx] + del mcastaddrs[idx] + print("DELETE ", value) + except: + print("multicast", value, "not in use") + pass + +class delete8192(Resource): + def delete(self): + delEntry(8192) + +class delete8193(Resource): + def delete(self): + delEntry(8193) + +class delete8194(Resource): + def delete(self): + delEntry(8194) + +class delete8195(Resource): + def delete(self): + delEntry(8195) + +class delete8196(Resource): + def delete(self): + delEntry(8196) + +class delete8197(Resource): + def delete(self): + delEntry(8197) + +class delete8198(Resource): + def delete(self): + delEntry(8198) + +class delete8199(Resource): + def delete(self): + delEntry(8199) + +class fabtestServer(Resource): + def get(self): + # Lists the existing multicast addresses + global mcastroots + global mcastaddrs + + addrs = [] + for k,v in enumerate(mcastroots): + addrs.append({'root':v, 'mcast':mcastaddrs[k]}) + info = { + 'ADDRLIST': addrs, + } + return info, 200 + + def delete(self): + # Deletes all multicast addresses + global mcastroots + global mcastaddrs + + mcastroots = [] + mcastaddrs = [] + return None, 200 + + def post(self): + # Creates a new multicast address + global mcastroots + global mcastaddrs + + print(request.json) + required = { + 'jobID', 'macs', 'timeout', + } + optional = { + 'jobStepID' + } + info = {} + error = [] + dupmac = [] + + # Test for required fields, append error messages if missing + for key in required: + if key not in request.json: + error.append("no " + key) + else: + info[key] = request.json[key] + # Test macs for empty or duplicate addresses + if not error and not request.json['macs']: + error.append('empty macs') + for mac in request.json['macs']: + if mac not in dupmac: + dupmac.append(mac) + else: + error.append('duplicate mac=' + str(mac)) + + # Test for optional fields, provide defaults if missing + for key in optional: + if key not in request.json: + info[key] = None + else: + info[key] = request.json[key] + 
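+ # On success the reply echoes jobID, jobStepID, macs and timeout, and adds
+ # hwRoot (the first mac not already in use), mcastID (the first unused
+ # simulated multicast address) and a documentSelfLink; any accumulated
+ # error is instead reported as a 400 with an 'error' string.
+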
+ # Find a globally-unused mac address as hwRoot + info['hwRoot'] = None + for mac in request.json['macs']: + if mac not in mcastroots: + info['hwRoot'] = mac + break + if not info['hwRoot']: + error.append('no hwRoot usable') + + # Find a globally unused mcast address + info['mcastID'] = None + for adr in range(8192, 8199): + if adr not in mcastaddrs: + info['mcastID'] = adr + break + if not info['mcastID']: + error.append('no mcast available') + + # Report any accumulated errors + if error: + info = { + 'error' : ', '.join(error) + } + return info, 400 + + # Otherwise, record and return complete record + mcastroots.append(mac) + mcastaddrs.append(adr) + + info['jobID'] = request.json['jobID'] + info['jobStepID'] = request.json['jobStepID'] + info['macs'] = request.json['macs'] + info['timeout'] = request.json['timeout'] + info['documentSelfLink'] = 'fabric/collectives/mcastID/' + adr + + return info, 200 + +def main(argv): + parser = argparse.ArgumentParser( + description=help, formatter_class=RawFormatter) + parser.add_argument('--host', default=None) + parser.add_argument('--port', default=None) + args = parser.parse_args() + + app = Flask(__name__) + api = Api(app); + api.add_resource(fabtestInfo, '/') + api.add_resource(fabtestServer, '/fabric/collectives/multicast') + api.add_resource(delete8192, '/fabric/collectives/mcastid/8192') + api.add_resource(delete8193, '/fabric/collectives/mcastid/8193') + api.add_resource(delete8194, '/fabric/collectives/mcastid/8194') + api.add_resource(delete8195, '/fabric/collectives/mcastid/8195') + api.add_resource(delete8196, '/fabric/collectives/mcastid/8196') + api.add_resource(delete8197, '/fabric/collectives/mcastid/8197') + api.add_resource(delete8198, '/fabric/collectives/mcastid/8198') + api.add_resource(delete8199, '/fabric/collectives/mcastid/8199') + app.run(debug=True, host=args.host, port=args.port) + +if __name__ == "__main__": + main(sys.argv) diff --git a/prov/cxi/test/multinode/multinode_frmwk.c b/prov/cxi/test/multinode/multinode_frmwk.c new file mode 100644 index 00000000000..94a847d3ba4 --- /dev/null +++ b/prov/cxi/test/multinode/multinode_frmwk.c @@ -0,0 +1,890 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + */ + +/* + * libfabric C test framework for multinode testing. + * + * This must be compiled with: + * + * - PLATFORM_CASSINI_HW=1 (or other hardware flag) + * + * Tests are run using srun: $ srun -Nn ./test_frmwk 'n' is the number of nodes + * to use. Some tests may place requirements on 'n'. + * + * frmwk_init_libfabric() sets up + * - generic fabric info for CXI driver + * - one domain (fabric address) + * - one endpoint + * - one of each of the following + * - eq + * - tx cq + * - rx cq + * - send cntr + * - recv cntr + * - read cntr + * - write cntr + * - remote cntr + * + * frmwk_populate_av() uses a sockets-based Allgather operation to collect local + * HSN addresses and distribute them over the entire set of nodes, and then + * creates and binds the fi_av object for the endpoint. This 'populate' function + * has been separated out from initialization, to allow the framework to use + * other means of population (e.g. MPI). 
The following environment variables are + * significant: + * - PMI_SIZE (WLM) number of ranks in job (from WLM) + * - PMI_RANK (WLM) rank of this process (from WLM) + * - PMI_SHARED_SECRET (WLM) unique job identifier (from WLM) + * - PMI_NUM_HSNS (USER) optional, defaults to 1 + * - PMI_HOME (USER) optional, preferred file system directory to use + * - HOME (USER) default file system directory to use + * + * frmwk_enable_libfabric() can be used after the fi_av object has been + * initialized. + * + * frmwk_free_libfabric() terminates the libfabric instance and cleans up. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "multinode_frmwk.h" + +/* If not compiled with DEBUG=1, this is a no-op */ +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) + +#define RETURN_ERROR(ret, txt) \ + if (ret != FI_SUCCESS) { \ + fprintf(stderr, "FAILED %s = %s\n", txt, fi_strerror(-ret)); \ + return ret; \ + } + +#define CLOSE_OBJ(obj) do {if (obj) fi_close(&obj->fid); } while (0) + +/* Taken from SLURM environment variables */ +int frmwk_numranks; /* PMI_SIZE */ +int frmwk_rank; /* PMI_RANK */ +int frmwk_nics_per_rank; /* PMI_NUM_HSNS (defaults to 1) */ +int frmwk_numnics; +const char *frmwk_unique; /* PMI_SHARED_SECRET */ +const char *frmwk_nodename; /* SLURMD_NODENAME */ +const char frmwk_node0[32]; /* SLURMD_NODELIST (first name) */ +union nicaddr *frmwk_nics; /* array of NIC addresses plus rank and hsn */ + +int _frmwk_init; + +char *cxit_node; +char *cxit_service; +uint64_t cxit_flags; +struct fi_info *cxit_fi_hints; +struct fi_info *cxit_fi; + +struct fid_fabric *cxit_fabric; +struct fid_domain *cxit_domain; +struct fi_cxi_dom_ops *cxit_dom_ops; + +struct mem_region { + uint8_t *mem; + struct fid_mr *mr; +}; + +struct fid_ep *cxit_ep; +struct fi_eq_attr cxit_eq_attr = { + .size = 32, + .flags = FI_WRITE, + .wait_obj = FI_WAIT_NONE +}; +uint64_t cxit_eq_bind_flags = 0; + +struct fid_eq *cxit_eq; + +struct fi_cq_attr cxit_rx_cq_attr = { + .format = FI_CQ_FORMAT_TAGGED + +}; +uint64_t cxit_rx_cq_bind_flags = FI_RECV; +struct fid_cq *cxit_rx_cq; + +struct fi_cq_attr cxit_tx_cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + .size = 16384 +}; +uint64_t cxit_tx_cq_bind_flags = FI_TRANSMIT; +struct fid_cq *cxit_tx_cq; + +fi_addr_t cxit_ep_fi_addr; + +struct fi_cntr_attr cxit_cntr_attr = {}; +struct fid_cntr *cxit_send_cntr; +struct fid_cntr *cxit_recv_cntr; +struct fid_cntr *cxit_read_cntr; +struct fid_cntr *cxit_write_cntr; +struct fid_cntr *cxit_rem_cntr; + +struct fi_av_attr cxit_av_attr = { + .type = FI_AV_TABLE, + .rx_ctx_bits = 0 +}; +struct fid_av *cxit_av; + +int cxit_n_ifs; +struct fid_av_set *cxit_av_set; +struct fid_mc *cxit_mc; +fi_addr_t cxit_mc_addr; + +/* HMEM memory functional overlays */ +int mr_create_ext(size_t len, uint64_t access, uint8_t seed, uint64_t key, + struct fid_cntr *cntr, struct mem_region *mr) +{ + int ret; + + if (len) { + mr->mem = calloc(1, len); + ret = (mr->mem != NULL) ? 
FI_SUCCESS : FI_ENOMEM; + RETURN_ERROR(ret, __func__); + } else { + mr->mem = 0; + } + + for (size_t i = 0; i < len; i++) + mr->mem[i] = i + seed; + + ret = fi_mr_reg(cxit_domain, mr->mem, len, access, 0, key, 0, + &mr->mr, NULL); + RETURN_ERROR(ret, "fi_mr_reg"); + + ret = fi_mr_bind(mr->mr, &cxit_ep->fid, 0); + RETURN_ERROR(ret, "fi_mr_bind ep"); + + if (cntr) { + ret = fi_mr_bind(mr->mr, &cntr->fid, FI_REMOTE_WRITE); + RETURN_ERROR(ret, "fi_mr_bind cntr"); + } + + ret = fi_mr_enable(mr->mr); + RETURN_ERROR(ret, "fi_mr_enable"); + + return 0; +} + +static ssize_t copy_from_hmem_iov(void *dest, size_t size, + enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset) +{ + size_t cpy_size = MIN(size, hmem_iov->iov_len); + + assert(iface == FI_HMEM_SYSTEM); + assert(hmem_iov_count == 1); + assert(hmem_iov_offset == 0); + + memcpy(dest, hmem_iov->iov_base, cpy_size); + + return cpy_size; +} + +static ssize_t copy_to_hmem_iov(enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset, const void *src, + size_t size) +{ + size_t cpy_size = MIN(size, hmem_iov->iov_len); + + assert(iface == FI_HMEM_SYSTEM); + assert(hmem_iov_count == 1); + assert(hmem_iov_offset == 0); + + memcpy(hmem_iov->iov_base, src, cpy_size); + + return cpy_size; +} + +struct fi_hmem_override_ops cxit_hmem_ops = { + .copy_from_hmem_iov = copy_from_hmem_iov, + .copy_to_hmem_iov = copy_to_hmem_iov, +}; + +/* A minimal generic context for use with asynchronous operations */ +struct mycontext { + int rx_err; + int rx_prov_err; + int tx_err; + int tx_prov_err; +}; + +/* display message on stdout from rank 0 */ +int frmwk_log0(const char *fmt, ...) +{ + va_list args; + int len = 0; + + if (_frmwk_init && frmwk_rank != 0) + return 0; + + va_start(args, fmt); + len = vfprintf(stdout, fmt, args); + va_end(args); + fflush(stdout); + return len; +} + +/* display message with rank designation */ +int frmwk_log(const char *fmt, ...) +{ + va_list args; + int len = 0; + + if (_frmwk_init) + len += fprintf(stdout, "[%2d] ", frmwk_rank); + va_start(args, fmt); + len += vfprintf(stdout, fmt, args); + va_end(args); + fflush(stdout); + return len; +} + +/* Implement a simple sockets-based allgather for testing. + * + * This selects one node across all of the nodes to serve as a local root, and + * then uses sockets to transfer information. 
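+ *
+ * The first node listed in SLURM_NODELIST acts as the root: it accepts one
+ * connection per remaining rank, gathers each rank's contribution (its own
+ * placed first, the rest in connection-arrival order), then writes the
+ * concatenated result back to every connection. Callers that need the data
+ * in rank order must sort it afterwards, as frmwk_gather_nics() does.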
+ */ +#define FAIL(cond, msg, label) \ + if (cond) { \ + printf("FAIL socket %s=%d\n", msg, cond); \ + goto label; \ + } + +/* Sockets can chop up large reads */ +static ssize_t _fullread(int fd, char *ptr, ssize_t size) +{ + ssize_t rem = size; + ssize_t len; + + while (rem > 0) { + len = read(fd, ptr, rem); + if (len < 0) + return len; + ptr += len; + rem -= len; + } + return size; +} + +/* Sockets can chop up large writes */ +static ssize_t _fullwrite(int fd, char *ptr, ssize_t size) +{ + ssize_t rem = size; + ssize_t len; + + while (rem > 0) { + len = write(fd, ptr, rem); + if (len < 0) + return len; + ptr += len; + rem -= len; + } + return size; +} + +/* frmwk_node0 (first in list) serves as root */ +int _accept(int portno, size_t size, void *data, void *rslt) +{ + int listenfd = 0; + int *connfd, conncnt, connidx; + struct sockaddr_in serv_addr = { 0 }; + char *rsltp; + size_t siz; + ssize_t len; + int error, ret; + + // any early exit reports failure + error = -1; + + // create the socket + listenfd = socket(AF_INET, SOCK_STREAM, 0); + FAIL(listenfd < 0, "socket", lablisten); + + // release the socket immediately after termination + ret = setsockopt(listenfd, SOL_SOCKET, SO_REUSEADDR, + &(int){1}, sizeof(int)); + FAIL(ret < 0, "reuseaddr", lablisten); + + // bind the socket to accept any incoming connections + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(portno); + ret = bind(listenfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)); + FAIL(ret < 0, "bind", lablisten); + + // limit the number of connections + conncnt = frmwk_numranks - 1; + ret = listen(listenfd, conncnt); + FAIL(ret < 0, "listen", lablisten); + + // create the connection array + connfd = calloc(conncnt, sizeof(*connfd)); + FAIL(!connfd, "connfd", lablisten); + + // initialize to invalid file descriptors + for (connidx = 0; connidx < conncnt; connidx++) + connfd[connidx] = -1; + + // add our contribution to the result + rsltp = rslt; + memcpy(rsltp, data, size); + rsltp += size; + + // accept connections and start the root protocol + for (connidx = 0; connidx < conncnt; connidx++) { + int fd; + + fd = accept(listenfd, (struct sockaddr *)NULL, NULL); + FAIL(fd < 0, "accept", labclose); + + // record this for later send + connfd[connidx] = fd; + + // read from the connection + siz = size; + len = _fullread(fd, rsltp, siz); + FAIL(len < siz, "read", labclose); + + // advance the result pointer + rsltp += siz; + } + + // all contributions complete, send the result + for (connidx = 0; connidx < conncnt; connidx++) + { + int fd; + + fd = connfd[connidx]; + siz = frmwk_numranks * size; + len = _fullwrite(fd, rslt, siz); + FAIL(len < siz, "write", labclose); + } + + // report success + error = 0; + +labclose: + for (connidx = 0; connidx < conncnt; connidx++) + close(connfd[connidx]); + free(connfd); +lablisten: + close(listenfd); + return error; +} + +/* nodes other than frmwk_node0 serve as leaves */ +int _connect(int portno, size_t size, void *data, void *rslt) +{ + int connfd = 0; + struct sockaddr_in serv_addr = { 0 }; + struct hostent *he; + struct in_addr **addr_list; + size_t siz; + ssize_t len; + int error, ret; + + // any early exit returns error + error = -1; + + // create the socket + connfd = socket(AF_INET, SOCK_STREAM, 0); + FAIL(connfd < 0, "socket", labclose); + + // release the socket immediately after termination + ret = setsockopt(connfd, SOL_SOCKET, SO_REUSEADDR, + &(int){1}, sizeof(int)); + FAIL(ret < 0, "reuseaddr", labclose); + + // 
get network address of frmwk_node0 and connect socket + he = gethostbyname(frmwk_node0); + FAIL(!he, "gethostbyname", labclose); + + addr_list = (struct in_addr **)he->h_addr_list; + FAIL(!addr_list, "gethostbyname empty", labclose); + + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(portno); + serv_addr.sin_addr = *addr_list[0]; + do { + usleep(1000); + ret = connect(connfd, (struct sockaddr *)&serv_addr, + sizeof(serv_addr)); + } while (ret < 0); + + // write our data + siz = size; + len = _fullwrite(connfd, data, siz); + FAIL(len < siz, "write", labclose); + + // wait for full data response + siz = frmwk_numranks * size; + len = _fullread(connfd, rslt, siz); + FAIL(len < siz, "read", labclose); + + // report success + error = 0; + +labclose: + close(connfd); + return error; +} + +int frmwk_allgather(size_t size, void *data, void *rslt) +{ + int portno = 5000; + + return (!strcmp(frmwk_node0, frmwk_nodename)) ? + _accept(portno, size, data, rslt) : + _connect(portno, size, data, rslt); +} + +int frmwk_barrier(void) +{ + ssize_t size = sizeof(char); + char data = 0; + char *rslt; + int ret; + + rslt = calloc(frmwk_numranks, sizeof(char)); + ret = frmwk_allgather(size, &data, rslt); + free(rslt); + + return ret; +} + +/** + * @brief Check for minimum number of ranks + * + * @param minranks required minimum number of ranks + * @return int error code, 0 on success + */ +int frmwk_check_env(int minranks) +{ + if (!_frmwk_init) { + fprintf(stderr, "Framework not initialized\n"); + return -1; + } + if (frmwk_numranks < minranks) { + /* only one rank makes noise */ + if (!frmwk_rank) + fprintf(stderr, "Requires >= %d ranks\n", minranks); + return -1; + } + return 0; +} + +/** + * @brief Shut down the libfabric test framework. + */ +void frmwk_free_libfabric(void) +{ + /* must close EP before closing anything bound to it */ + CLOSE_OBJ(cxit_ep); + CLOSE_OBJ(cxit_av); + CLOSE_OBJ(cxit_rem_cntr); + CLOSE_OBJ(cxit_write_cntr); + CLOSE_OBJ(cxit_read_cntr); + CLOSE_OBJ(cxit_recv_cntr); + CLOSE_OBJ(cxit_send_cntr); + CLOSE_OBJ(cxit_eq); + CLOSE_OBJ(cxit_tx_cq); + CLOSE_OBJ(cxit_rx_cq); + CLOSE_OBJ(cxit_domain); + CLOSE_OBJ(cxit_fabric); + fi_freeinfo(cxit_fi); + fi_freeinfo(cxit_fi_hints); +} + +/** + * @brief Initialize the libfabric test framework. + * + * The ep_obj->src_addr has a PID value of 511 (PID_ANY) until the EP is + * enabled, at which point the actual PID is assigned. Nothing works if the PIDs + * are mismatched between ranks. + * + * @return int error code, 0 on success + */ +int frmwk_init_libfabric(void) +{ + int ret; + + if (!_frmwk_init) { + fprintf(stderr, "Framework not initialized\n"); + return -1; + } + + cxit_fi_hints = fi_allocinfo(); + ret = (cxit_fi_hints != NULL) ? 
FI_SUCCESS : FI_ENOMEM; + + cxit_fi_hints->fabric_attr->prov_name = strdup("cxi"); + cxit_fi_hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &cxit_fi); + RETURN_ERROR(ret, "fi_getinfo"); + + ret = fi_fabric(cxit_fi->fabric_attr, &cxit_fabric, NULL); + RETURN_ERROR(ret, "fi_fabric"); + + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + RETURN_ERROR(ret, "fi_domain"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_1, 0, + (void **)&cxit_dom_ops, NULL); + RETURN_ERROR(ret, "fi_open_ops 1"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_2, 0, + (void **)&cxit_dom_ops, NULL); + RETURN_ERROR(ret, "fi_open_ops 2"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&cxit_dom_ops, NULL); + RETURN_ERROR(ret, "fi_open_ops 3"); + + ret = fi_set_ops(&cxit_domain->fid, FI_SET_OPS_HMEM_OVERRIDE, 0, + &cxit_hmem_ops, NULL); + RETURN_ERROR(ret, "fi_set_ops"); + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + RETURN_ERROR(ret, "fi_endpoint"); + + ret = fi_cq_open(cxit_domain, &cxit_rx_cq_attr, &cxit_rx_cq, NULL); + RETURN_ERROR(ret, "fi_cq_open RX"); + + ret = fi_ep_bind(cxit_ep, &cxit_rx_cq->fid, cxit_rx_cq_bind_flags); + RETURN_ERROR(ret, "fi_ep_bind RX_CQ"); + + ret = fi_cq_open(cxit_domain, &cxit_tx_cq_attr, &cxit_tx_cq, NULL); + RETURN_ERROR(ret, "fi_cq_open TX"); + ret = fi_ep_bind(cxit_ep, &cxit_tx_cq->fid, cxit_tx_cq_bind_flags); + RETURN_ERROR(ret, "fi_ep_bind TX_CQ"); + + ret = fi_eq_open(cxit_fabric, &cxit_eq_attr, &cxit_eq, NULL); + RETURN_ERROR(ret, "fi_eq_open"); + ret = fi_ep_bind(cxit_ep, &cxit_eq->fid, cxit_eq_bind_flags); + RETURN_ERROR(ret, "fi_ep_bind EQ"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_send_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open SEND"); + ret = fi_ep_bind(cxit_ep, &cxit_send_cntr->fid, FI_SEND); + RETURN_ERROR(ret, "fi_ep_bind SEND CNTR"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_recv_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open RECV"); + ret = fi_ep_bind(cxit_ep, &cxit_recv_cntr->fid, FI_RECV); + RETURN_ERROR(ret, "fi_ep_bind RECV CNTR"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_read_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open READ"); + ret = fi_ep_bind(cxit_ep, &cxit_read_cntr->fid, FI_READ); + RETURN_ERROR(ret, "fi_ep_bind READ CNTR"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_write_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open WRITE"); + ret = fi_ep_bind(cxit_ep, &cxit_write_cntr->fid, FI_WRITE); + RETURN_ERROR(ret, "fi_ep_bind WRITE CNTR"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_rem_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open REMOTE"); + + cxit_av_attr.count = 1024; + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + RETURN_ERROR(ret, "fi_av_open"); + + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + RETURN_ERROR(ret, "fi_ep_bind AV"); + + ret = fi_enable(cxit_ep); + RETURN_ERROR(ret, "fi_enable"); + + return 0; +} + +/** + * @brief One way of populating the address vector. + * + * This uses frmwk_allgather() to perform the allgather of addresses across all + * nodes in the job. To work properly, the libfabric endpoint must already be + * enabled. + * + * This also serves as a barrier that ensures that all ranks have reached this + * call, i.e. all ranks have enabled their respective endpoint. 
If an endpoint + * is not enabled when another endpoint sends a packet, the sender will receive + * an ACK, but the target will drop the packet. + * + * This routine can be replaced with anything that provides an accurate AV + * across all nodes in the job, e.g. MPI, symmetric AVs distributed by some + * other out-of-band means to all nodes, or logical (rank) addressing of the + * Cassini chips. + * + * @param fiaddr : returns array of fi_addr_t in rank order + * @param size : returns size of fiaddr array + * @return int error code, 0 on success. + */ +int frmwk_populate_av(fi_addr_t **fiaddrp, size_t *sizep) +{ + struct cxip_addr *alladdrs = NULL; + fi_addr_t *fiaddrs = NULL; + int i, ret; + + if (!fiaddrp || !sizep) + return -FI_EINVAL; + + ret = -FI_EFAULT; + ret = frmwk_gather_nics(); + if (ret < 0) + goto fail; + + ret = -FI_ENOMEM; + alladdrs = calloc(frmwk_numnics, sizeof(*alladdrs)); + fiaddrs = calloc(frmwk_numnics, sizeof(*fiaddrs)); + if (!fiaddrs || !alladdrs) + goto fail; + + for (i = 0; i < frmwk_numnics; i++) + alladdrs[i].nic = frmwk_nics[i].nic; + ret = fi_av_insert(cxit_av, alladdrs, frmwk_numnics, + fiaddrs, 0, NULL); + if (ret != frmwk_numnics) + goto fail; + + *sizep = frmwk_numnics; + *fiaddrp = fiaddrs; + return FI_SUCCESS; + +fail: + free(fiaddrs); + free(alladdrs); + return ret; +} + +/** + * @brief Display an error message to stderr and return error code. + * + * This prints to stderr only if ret != 0. It includes rank of the failing + * process and the size of the job. These values are meaningful only after + * frmwk_populate_av() has successfully completed. + * + * @param ret : error code + * @param fmt : printf format + * @param ... : printf parameters + * @return int value of ret + */ +int frmwk_errmsg(int ret, const char *fmt, ...) 
+{ + va_list args; + char host[256]; + char *str; + int len; + + if (!ret) + return 0; + + if (gethostname(host, sizeof(host))) + strcpy(host, "unknown"); + + va_start(args, fmt); + len = vasprintf(&str, fmt, args); + va_end(args); + if (len < 0) + str = "(no errmsg)"; + fprintf(stderr, "%s rank %2d of %2d FAILED %d: %s", + host, frmwk_rank, frmwk_numranks, ret, str); + if (len >= 0) + free(str); + + return ret; +} + +/* Read /sys files to get the HSN nic addresses */ +static void get_local_nic(int hsn, union nicaddr *nic) +{ + char fname[256]; + char text[256]; + char *ptr; + FILE *fid; + int i, n; + + /* default */ + strcpy(text, "FF:FF:FF:FF:FF:FF\n"); + /* read from file, if possible */ + snprintf(fname, sizeof(fname), "/sys/class/net/hsn%d/address", hsn); + if ((fid = fopen(fname, "r"))) { + n = fread(text, 1, sizeof(text), fid); + fclose(fid); + text[n] = 0; + } + TRACE("HSN address: %s", text); + + /* parse "XX:XX:XX:XX:XX:XX\n" into 48-bit integer value */ + nic->value = 0L; + ptr = text; + for (i = 0; i < 6; i++) { + nic->value <<= 8; + nic->value |= strtol(ptr, &ptr, 16); + ptr++; + } + nic->hsn = hsn; + nic->rank = frmwk_rank; + TRACE("rank=%2d hsn=%d nic=%05x\n", nic->rank, nic->hsn, nic->nic); +} + +/* Sort comparator */ +static int _compare(const void *v1, const void *v2) +{ + uint64_t *a1 = (uint64_t *)v1; + uint64_t *a2 = (uint64_t *)v2; + + if (*a1 < *a2) + return -1; + if (*a1 > *a2) + return 1; + return 0; +} + +/* Allgather on NIC addresses across collective */ +int frmwk_gather_nics(void) +{ + union nicaddr *mynics = NULL; + int i, ret, localsize; + + if (frmwk_nics) + return 0; + + localsize = frmwk_nics_per_rank * NICSIZE; + mynics = calloc(1, localsize); + frmwk_nics = calloc(frmwk_numranks, localsize); + if (!mynics || !frmwk_nics) + goto fail; + + for (i = 0; i < frmwk_nics_per_rank; i++) + get_local_nic(i, &mynics[i]); + + ret = frmwk_allgather(localsize, mynics, frmwk_nics); + if (ret) + goto fail; + + frmwk_numnics = frmwk_numranks * frmwk_nics_per_rank; + qsort(frmwk_nics, frmwk_numnics, NICSIZE, _compare); + TRACE("---\n"); + for (i = 0; i < frmwk_numnics; i++) + TRACE("rank=%2d hsn=%d nic=%05x\n", + frmwk_nics[i].rank, + frmwk_nics[i].hsn, + frmwk_nics[i].nic); + return 0; + +fail: + frmwk_numnics = 0; + free(frmwk_nics); + free(mynics); + return -1; +} + +/* User call for the address of rank, hsn */ +int frmwk_nic_addr(int rank, int hsn) +{ + if (!frmwk_nics || + rank < 0 || rank >= frmwk_numranks || + hsn < 0 || hsn >= frmwk_nics_per_rank) + return -1; + return (long)frmwk_nics[rank * frmwk_nics_per_rank + hsn].nic; +} + +/* Get environment variable as string representation of int */ +static int getenv_int(const char *name) +{ + char *env; + int value; + + value = -1; + env = getenv(name); + if (env) + sscanf(env, "%d", &value); + return value; +} + +/* Initialize the framework */ +void frmwk_init(bool quiet) +{ + char *s, *d; + int ret = -1; + + /* Values are provided by the WLM */ + s = getenv("SLURM_NODELIST"); + d = (char *)frmwk_node0; + while (s && *s && *s != '-' && *s != ',') { + if (*s == '[') + s++; + else + *d++ = *s++; + } + *d = 0; + frmwk_nodename = getenv("SLURMD_NODENAME"); + frmwk_numranks = getenv_int("PMI_SIZE"); + frmwk_rank = getenv_int("PMI_RANK"); + frmwk_unique = getenv("PMI_SHARED_SECRET"); + if (frmwk_numranks < 1 || frmwk_rank < 0 || !frmwk_unique) { + if (quiet) + goto fail; + fprintf(stderr, "invalid PMI_SIZE=%d\n", frmwk_numranks); + fprintf(stderr, "invalid PMI_RANK=%d\n", frmwk_rank); + fprintf(stderr, "invalid 
PMI_SHARED_SECRET=%s\n", frmwk_unique); + fprintf(stderr, "Must be run under compatible WLM\n"); + goto fail; + } + + /* Optional for multiple HSNs, defaults to hsn0 */ + frmwk_nics_per_rank = getenv_int("PMI_NUM_HSNS"); + if (frmwk_nics_per_rank < 1) + frmwk_nics_per_rank = 1; + + /* Re-export these as libfabric equivalents */ + setenv("FI_CXI_COLL_JOB_ID", frmwk_unique, 1); + setenv("FI_CXI_COLL_JOB_STEP_ID", "0", 1); + setenv("FI_CXI_COLL_MCAST_TOKEN", "aaaaaa", 1); + setenv("FI_CXI_HWCOLL_MIN_NODES", "4", 1); + setenv("FI_CXI_HWCOLL_ADDRS_PER_JOB", "4", 1); + setenv("FI_CXI_COLL_FABRIC_MGR_URL", "what?", 1); + + ret = 0; +fail: + _frmwk_init = (!ret); +} + +void frmwk_term(void) +{ + free(frmwk_nics); + frmwk_nics = NULL; + frmwk_unique = NULL; + frmwk_nics_per_rank = 0; + frmwk_numranks = 0; + frmwk_rank = 0; + _frmwk_init = 0; +} diff --git a/prov/cxi/test/multinode/multinode_frmwk.h b/prov/cxi/test/multinode/multinode_frmwk.h new file mode 100644 index 00000000000..f20c89dd9dd --- /dev/null +++ b/prov/cxi/test/multinode/multinode_frmwk.h @@ -0,0 +1,88 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * (c) Copyright 2021-2023 Hewlett Packard Enterprise Development LP + */ + +#ifndef FRMWK_HEADER +#define FRMWK_HEADER + +union nicaddr { + uint64_t value; + struct { + uint64_t nic:20; // 20-bit CXI NIC address + uint64_t net:28; // 28-bit network route + uint64_t hsn:2; // up to 4 CXI chips per node + uint64_t rank:14; // up to 16k ranks + } __attribute__((__packed__)); +}; +#define NICSIZE (sizeof(union nicaddr)) + +/* These are initialized by frmwk_init() */ +extern int frmwk_nics_per_rank; +extern int frmwk_numranks; +extern int frmwk_numnics; +extern int frmwk_rank; + +/* This is initialized by frmwk_populate_av() */ +extern union nicaddr *frmwk_nics; + +extern char *cxit_node; +extern char *cxit_service; +extern uint64_t cxit_flags; +extern struct fi_info *cxit_fi_hints; +extern struct fi_info *cxit_fi; + +extern struct fid_fabric *cxit_fabric; +extern struct fid_domain *cxit_domain; +extern struct fi_cxi_dom_ops *dom_ops; + +extern struct fid_ep *cxit_ep; +extern struct fi_eq_attr cxit_eq_attr; +extern uint64_t cxit_eq_bind_flags; +extern struct fid_eq *cxit_eq; + +extern struct fi_cq_attr cxit_rx_cq_attr; +extern uint64_t cxit_rx_cq_bind_flags; +extern struct fid_cq *cxit_rx_cq; + +extern struct fi_cq_attr cxit_tx_cq_attr; +extern uint64_t cxit_tx_cq_bind_flags; +extern struct fid_cq *cxit_tx_cq; + +extern fi_addr_t cxit_ep_fi_addr; + +extern struct fi_cntr_attr cxit_cntr_attr; +extern struct fid_cntr *cxit_send_cntr; +extern struct fid_cntr *cxit_recv_cntr; +extern struct fid_cntr *cxit_read_cntr; +extern struct fid_cntr *cxit_write_cntr; +extern struct fid_cntr *cxit_rem_cntr; + +extern struct fi_av_attr cxit_av_attr; +extern struct fid_av *cxit_av; + +extern int cxit_n_ifs; +extern struct fid_av_set *cxit_av_set; +extern struct fid_mc *cxit_mc; +extern fi_addr_t cxit_mc_addr; + +int frmwk_allgather(size_t size, void *data, void *rslt); +int frmwk_barrier(void); +int frmwk_gather_nics(void); +int frmwk_nic_addr(int rank, int hsn); + +void frmwk_init(bool quiet); +void frmwk_term(void); +int frmwk_init_libfabric(void); +void frmwk_free_libfabric(void); +int frmwk_check_env(int minranks); +int frmwk_populate_av(fi_addr_t **fiaddr, size_t *size); +int frmwk_errmsg(int ret, const char *fmt, ...) + __attribute__((format(__printf__, 2, 3))); +int frmwk_log0(const char *fmt, ...) + __attribute__((format(__printf__, 1, 2))); +int frmwk_log(const char *fmt, ...) 
+ __attribute__((format(__printf__, 1, 2))); + +#endif /* FRMWK_HEADER */ diff --git a/prov/cxi/test/multinode/perf_align.c b/prov/cxi/test/multinode/perf_align.c new file mode 100644 index 00000000000..4c2093fa7bc --- /dev/null +++ b/prov/cxi/test/multinode/perf_align.c @@ -0,0 +1,58 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP + * + * Generic ad-hoc CPU performance tests. + */ + +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + struct timespec ts1, ts2; + uint8_t arr[16]; + uint64_t *a; + double *d; + uint64_t count; + int i; + + /* Test alignment consequences on integer sum */ + for (i = 0; i < 8; i++) { + count = 1000000000; + a = (uint64_t *)&arr[i]; + clock_gettime(CLOCK_MONOTONIC, &ts1); + while (count--) + (*a) += 1; + clock_gettime(CLOCK_MONOTONIC, &ts2); + if (ts2.tv_nsec < ts1.tv_nsec) { + ts2.tv_nsec += 1000000000; + ts2.tv_sec -= 1; + } + ts2.tv_nsec -= ts1.tv_nsec; + ts2.tv_sec -= ts1.tv_sec; + printf("a[%d] = %3ld.%09ld\n", i, ts2.tv_sec, ts2.tv_nsec); + } + + /* Test alignment consequences on double sum */ + for (i = 0; i < 8; i++) { + count = 1000000000; + d = (double *)&arr[i]; + clock_gettime(CLOCK_MONOTONIC, &ts1); + while (count--) + (*d) += 1.0; + clock_gettime(CLOCK_MONOTONIC, &ts2); + if (ts2.tv_nsec < ts1.tv_nsec) { + ts2.tv_nsec += 1000000000; + ts2.tv_sec -= 1; + } + ts2.tv_nsec -= ts1.tv_nsec; + ts2.tv_sec -= ts1.tv_sec; + printf("d[%d] = %3ld.%09ld\n", i, ts2.tv_sec, ts2.tv_nsec); + } + + return 0; +} diff --git a/prov/cxi/test/multinode/perf_getip.c b/prov/cxi/test/multinode/perf_getip.c new file mode 100644 index 00000000000..6acbe1c157f --- /dev/null +++ b/prov/cxi/test/multinode/perf_getip.c @@ -0,0 +1,139 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP + */ + +/* Compile: cc -o getip getip.c */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int get_mac_ioctls(char **macs, int count) +{ + struct ifreq ifr, *it, *end; + struct ifconf ifc; + char buf[1024]; + int success = 0; + int sock; + int i, idx, ret; + char *mptr; + unsigned char *sptr; + + // acquire socket identifier + sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sock < 0) + return sock; + // prepare the ifc structure + ifc.ifc_len = sizeof(buf); + ifc.ifc_buf = buf; + // populate ifc structure from kernel + ret = ioctl(sock, SIOCGIFCONF, &ifc); + if (ret < 0) + return ret; + // walk through the interfaces + it = ifc.ifc_req; + end = it + (ifc.ifc_len / sizeof(struct ifreq)); + idx = 0; + for (; it != end && idx < count; it++) { + // find only hsn* interfaces + strcpy(ifr.ifr_name, it->ifr_name); + if (strncmp(ifr.ifr_name, "hsn", 3)) + continue; + // acquire flags + if (ioctl(sock, SIOCGIFFLAGS, &ifr)) + continue; + // acquire hardware address + if (ioctl(sock, SIOCGIFHWADDR, &ifr)) + continue; + // copy hardware address into return pointer + mptr = macs[idx++]; + sptr = ifr.ifr_hwaddr.sa_data; + for (i = 0; i < 6; i++) { + if (i) + *mptr++ = ':'; + mptr += sprintf(mptr, "%02x", *sptr++); + } + *mptr = 0; + } + close(sock); + return idx; +} + +int get_mac_sysfile(char **macs, int count) +{ + DIR *dir; + FILE *fid; + struct dirent *dent; + char path[1024]; + int n, idx; + + // open the network sysfs directory + dir = opendir("/sys/class/net"); + if (!dir) + return 1; + // read the directory contents + idx = 
0; + while ((dent = readdir(dir)) && idx < count) { + // find only hsn* interfaces + if (strncmp("hsn", dent->d_name, 3)) + continue; + // open and read the address file + sprintf(path, "/sys/class/net/%s/address", dent->d_name); + fid = fopen(path, "r"); + n = fread(macs[idx++], 32, 1, fid); + fclose(fid); + } + closedir(dir); + return 0; +} + +int main(int argc, char **argv) +{ + struct timespec t0, t1; + long int count; + char **macs; + int i, num; + int secs = 2; + + macs = calloc(4, sizeof(char *)); + for (i = 0; i < 4; i++) + macs[i] = malloc(32); + + get_mac_ioctls(macs, 4); + clock_gettime(CLOCK_MONOTONIC, &t0); + t0.tv_sec += secs; + count = 0; + do { + get_mac_ioctls(macs, 4); + count++; + clock_gettime(CLOCK_MONOTONIC, &t1); + } while (t1.tv_sec < t0.tv_sec || + (t1.tv_sec == t0.tv_sec && + t1.tv_nsec < t0.tv_nsec)); + printf("direct: %9ld\n", count); + + get_mac_sysfile(macs, 4); + clock_gettime(CLOCK_MONOTONIC, &t0); + t0.tv_sec += secs; + count = 0; + do { + get_mac_sysfile(macs, 4); + count++; + clock_gettime(CLOCK_MONOTONIC, &t1); + } while (t1.tv_sec < t0.tv_sec || + (t1.tv_sec == t0.tv_sec && + t1.tv_nsec < t0.tv_nsec)); + printf("sysfs : %9ld\n", count); + + return 0; +} diff --git a/prov/cxi/test/multinode/test_barrier.c b/prov/cxi/test/multinode/test_barrier.c new file mode 100644 index 00000000000..285e32e73ce --- /dev/null +++ b/prov/cxi/test/multinode/test_barrier.c @@ -0,0 +1,454 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + */ + +/** + * Standalone BARRIER test to illustrate how to set up collectives. + * + * This uses the multinode_frmwk.c code to prepare a generic multinode + * environment for libfabric collectives testing, and provides some common + * tools for: + * + * - evaluating SLURM environment variables + * - configuring a vanilla libfabric, including HMEM overlays + * - distributing HSN addresses among the nodes + * + * The distribution of HSN addresses uses a linux socket-based method to + * share the HSN addresses among the nodes, and as such, presumes the + * existence of a standard Ethernet network linking the nodes (which is also + * presumed by SLURM). + * + * This code creates a single av_set consisting of the HSN0 addresses among + * the full set of nodes (i.e. MPI_COMM_WORLD), and then performs + * fi_join_collective() to obtain a "multicast" address to be used in the + * barrier operation. Note that the create_av_set() call checks for the + * environment variable FI_COLL_FABRIC_MGR_URL. If this is set, the join will + * attempt to use the specified fabric manager URL to set up a valid + * multicast address in the fabric. If this environment variable is not set, + * the join will use a "unicast" model in which all leaf nodes communicate + * with the root, and the root communicates with the leaves. + * + * The unicast model is not intended to be performant; it is intended to be + * simple, since the primary purpose of the unicast model is debugging and + * instruction. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "multinode_frmwk.h" + +/** + * Create av_set. + * + * fiaddrs must be in the same order across all nodes. + * + * rootidx is the index of the collective root for this group. 
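+ *
+ * The comm_key built below maps rootidx to ucast.hwroot_idx and selects
+ * COMM_KEY_NONE (fabric-manager multicast) when FI_CXI_COLL_FABRIC_MGR_URL
+ * is set, falling back to COMM_KEY_UNICAST otherwise.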
+ */ +int create_av_set(fi_addr_t *fiaddrs, size_t size, int rootidx, + struct fid_av_set **avsetp) +{ + struct cxip_comm_key comm_key = { + .keytype = (cxip_env.coll_fabric_mgr_url) ? + COMM_KEY_NONE : COMM_KEY_UNICAST, + .ucast.hwroot_idx = rootidx + }; + struct fi_av_set_attr attr = { + .count = 0, + .start_addr = FI_ADDR_NOTAVAIL, + .end_addr = FI_ADDR_NOTAVAIL, + .stride = 1, + .comm_key_size = sizeof(comm_key), + .comm_key = (void *)&comm_key, + .flags = 0, + }; + struct fid_av_set *avset; + int i, ret; + + *avsetp = NULL; + + if (rootidx < 0 || rootidx >= size) { + printf("%s invalid rootidx value=%d\n", __func__, rootidx); + return -1; + } + + // create empty av_set (alloc and initialize to empty) + ret = fi_av_set(cxit_av, &attr, &avset, NULL); + if (ret) { + printf("%s fi_av_set failed %d\n", __func__, ret); + goto quit; + } + // append all addresses (in rank order) to av_set + for (i = 0; i < size; i++) { + ret = fi_av_set_insert(avset, fiaddrs[i]); + if (ret) { + printf("%s fi_av_set_insert failed %d\n", __func__, ret); + goto quit; + } + } + *avsetp = avset; + return 0; + +quit: + printf("%s FAILED %d\n", __func__, ret); + if (avset) + fi_close(&avset->fid); + return ret; +} + +/** + * Poll the cqs once, and fill out a cqd structure. + * + * Note that the cqd is the largest supported cqd structure, so it can serve + * as both the success (smaller) or failure (larger) return structure. + * + * The rx_cq has the discard flag set for all operational modes other than + * the COMM_KEY_RANK simulation, so rx_cq events are not generated in this + * test code. Should this be expanded to use the COMM_KEY_RANK simulation + * (e.g. for automated single-node regression testing), the rx_cq should be + * read and the data discarded. + */ +static ssize_t _poll_cqs(struct fi_cq_err_entry *pcqd) +{ + ssize_t size; + +#if 1 + /* read/discard rx_cq -- needed in COMM_KEY_RANK simulation only */ + size = fi_cq_read(cxit_rx_cq, pcqd, 1); + if (size == -FI_EAVAIL) + size = fi_cq_readerr(cxit_rx_cq, pcqd, 1); +#endif + + /* tx_cq indicates barrier status */ + size = fi_cq_read(cxit_tx_cq, pcqd, 1); + if (size == -FI_EAVAIL) { + size = fi_cq_readerr(cxit_tx_cq, pcqd, 1); + if (size >= 0) + size = -FI_EAVAIL; + } + + return size; +} + +static ssize_t _wait_cqs(struct fi_cq_err_entry *pcqd) +{ + ssize_t size; + + do { + size = _poll_cqs(pcqd); + } while (size == -FI_EAGAIN); + return size; +} + +/** + * Poll the endpoint EQ once, and fill out an eqd data structure. + */ +static ssize_t _poll_eq(uint32_t *pevent, struct fi_eq_err_entry *peqd) { + struct cxip_ep *ep; + struct fid_eq *eq; + ssize_t size; + + ep = container_of(cxit_ep, struct cxip_ep, ep); + eq = &ep->ep_obj->coll.eq->util_eq.eq_fid; + size = fi_eq_read(eq, pevent, peqd, sizeof(*peqd), 0); + if (size == -FI_EAVAIL) { + size = fi_eq_readerr(eq, peqd, 0); + if (size >= 0) + size = -FI_EAVAIL; + } + return size; +} + +static ssize_t _wait_eq(uint32_t *pevent, struct fi_eq_err_entry *peqd) { + ssize_t size; + + do { + size = _poll_eq(pevent, peqd); + } while (size == -FI_EAGAIN); + return size; +} + +/** + * Join the specified avset to create a multicast reference pointer. + * + * This is implemented as a blocking call, for simplicity. In practice, + * multiple join operations can be initiated, and then the wait can be called + * until all joins have completed. Note that completion can occur in any + * order, and the resulting eqd.context value must be checked to see which of + * the joins completed. 
We are simply using the avset pointer itself as the + * context, but in production, this could be any kind of unique pointer or + * index. + */ +int join_collective(struct fid_av_set *avset, struct fid_mc **mcp) +{ + struct fi_cq_err_entry cqd; + struct fi_eq_err_entry eqd; + uint32_t event; + ssize_t ret; + + do { + _poll_cqs(&cqd); + ret = fi_join_collective(cxit_ep, FI_ADDR_NOTAVAIL, + avset, 0L, mcp, avset); + } while (ret == -FI_EAGAIN); + if (ret) { + frmwk_log("join initiation error = %ld\n", ret); + return ret; + } + + ret = _wait_eq(&event, &eqd); + if (ret < 0) { + frmwk_log("join wait error = %ld\n", ret); + return ret; + } + if (event != FI_JOIN_COMPLETE) { + frmwk_log("join event = %d != %d\n", event, FI_JOIN_COMPLETE); + return -FI_EADDRNOTAVAIL; + } + + return FI_SUCCESS; +} + +/** + * Perform N barriers in sequence. + * + * delay is a limit for a random delay inserted before each barrier is + * initiated. The srand() function has initialized rand() to different seeds + * based on the rank, so this serves to ensure that the ranks initiate + * barriers out-of-sync in different orders, i.e. sometimes the root will go + * first, sometimes a leaf will go first. + */ +int barrier(struct fid_mc *mc, int N, int delay) +{ + struct timespec t0, t1; + uint64_t icontext; + uint64_t wcontext; + struct fi_cq_err_entry cqd; + int i, ret; + + srand(100*frmwk_rank); + icontext = 0x1000; + clock_gettime(CLOCK_MONOTONIC, &t0); + for (i = 0; i < N; i++) { + do { + if (delay) + usleep(rand() % delay); + ret = fi_barrier(cxit_ep, (fi_addr_t)mc, + (void *)icontext); + } while (ret == -FI_EAGAIN); + frmwk_log("fi_barrier(%08lx) = %d\n", icontext, ret); + if (ret != FI_SUCCESS) + break; + + ret = _wait_cqs(&cqd); + wcontext = (ret > 0) ? (uint64_t)cqd.op_context : -1L; + frmwk_log("wait_cqs(%08lx) = %d\n", wcontext, ret); + icontext++; + } + clock_gettime(CLOCK_MONOTONIC, &t1); + if (t1.tv_nsec < t0.tv_nsec) { + t1.tv_nsec += 1000000000; + t1.tv_sec--; + } + t1.tv_nsec -= t0.tv_nsec; + t1.tv_sec -= t0.tv_sec; + if (i < N) { + frmwk_log0("failed after %d barriers\n", i); + return -1; + } + frmwk_log0("%d barriers completed in %ld.%09ld sec\n", N, + t1.tv_sec, t1.tv_nsec); + return FI_SUCCESS; +} + +int barrier2(struct fid_mc *mc, int N) +{ + struct timespec t0, t1; + uint64_t icontext; + uint64_t wcontext; + struct fi_cq_err_entry cqd; + int started, pending, blocked; + int ret; + + clock_gettime(CLOCK_MONOTONIC, &t0); + started = 0; + pending = 0; + blocked = 0; + icontext = 0x1000; + while (started < N) { + /* start barriers until blocked by -FI_EAGAIN */ + ret = fi_barrier(cxit_ep, (fi_addr_t)mc, (void *)icontext); + if (ret == FI_SUCCESS) { + started++; + pending++; + blocked = 0; + frmwk_log("fi_barrier[%08lx] started=%d pending=%d\n", + icontext, started, pending); + icontext++; + continue; + } + if (ret != -FI_EAGAIN) { + frmwk_log("fi_barrier[%08lx] = %d (failed)\n", icontext, ret); + break; + } + if (!blocked++) + frmwk_log("fi_barrier[%08lx] blocked\n", icontext); + /* poll for one barrier */ + ret = _poll_cqs(&cqd); + wcontext = (ret > 0) ? 
(uint64_t)cqd.op_context : -1L; + if (ret > 0) { + if (ret > 1) + frmwk_log("poll returned %d unexpected\n", ret); + pending -= ret; + blocked = 0; + frmwk_log("_poll_cqs[%08lx], pending = %d\n", + wcontext, pending); + continue; + } + if (ret != -FI_EAGAIN) { + frmwk_log("_poll_cqs = %d (failed)\n", ret); + break; + } + } + frmwk_log("started %d of %d, pending %d\n", started, N, pending); + if (started < N) { + frmwk_log("failed\n"); + return -1; + } + while (pending > 0) { + ret = _poll_cqs(&cqd); + wcontext = (ret > 0) ? (uint64_t)cqd.op_context : -1L; + if (ret > 0) { + pending -= ret; + frmwk_log("wait_cqs[%08lx], pending = %d\n", + wcontext, pending); + continue; + } + if (ret != -FI_EAGAIN) { + frmwk_log("_poll_cqs = %d\n", ret); + break; + } + } + frmwk_log("completed %d\n", started); + clock_gettime(CLOCK_MONOTONIC, &t1); + if (t1.tv_nsec < t0.tv_nsec) { + t1.tv_nsec += 1000000000; + t1.tv_sec--; + } + t1.tv_nsec -= t0.tv_nsec; + t1.tv_sec -= t0.tv_sec; + if (started < N) { + frmwk_log("_wait_cqs() = %d\n", ret); + frmwk_log("failed after %d barriers\n", started); + return -1; + } + frmwk_log("%d barriers completed in %ld.%09ld sec\n", N, + t1.tv_sec, t1.tv_nsec); + return FI_SUCCESS; +} + +const char *helpstr = + "\n" + "-N specifies the number of barriers to perform, default=1\n" + "-R specifies the rank to be used as the root, default=0\n" + "-D specifies a random max delay in usec, default=0\n" + "-p parallel barriers\n" + "\n"; + +int main(int argc, char **argv) +{ + fi_addr_t *fiaddrs = NULL; + size_t size = 0; + int rootidx = 0; + struct fid_av_set *avset = NULL; + struct fid_mc *mc = NULL; + int N = 1; + bool parallel = false; + int delay = 0; + int help = 0; + int opt, ret; + + while ((opt = getopt(argc, argv, "hpN:R:D:")) != -1) { + switch (opt) { + case 'p': + parallel = true; + break; + case 'N': + N = atoi(optarg); + break; + case 'R': + rootidx = atoi(optarg); + break; + case 'D': + delay = atoi(optarg); + break; + case 'h': + default: + help = 1; + ret = (opt == 'h') ? 0 : 1; + break; + } + } + + /* Read environment variables and initialize frmwk memory */ + frmwk_init(help); + if (help) { + frmwk_log0("Usage: %s [-h] [-N iterations]\n" + " [-R root_rank] [-D usec] [-p]\n", + basename(argv[0])); + frmwk_log0("%s", helpstr); + return ret; + } + + /* Test requires a minimum of two nodes */ + if (frmwk_check_env(2)) + return -1; + + /* Must be done before populting AV */ + ret = frmwk_init_libfabric(); + if (ret) + goto quit; + + /* Acquire HSN0 addresses and distribute across job */ + ret = frmwk_populate_av(&fiaddrs, &size); + if (ret) + goto quit; + + /* Create the MPI_COMM_WORLD group */ + ret = create_av_set(fiaddrs, size, rootidx, &avset); + if (ret) + goto quit; + + /* Create the collective multicast identifier */ + ret = join_collective(avset, &mc); + if (ret) + goto quit; + + /* Perform N barriers */ + ret = (parallel) ? barrier2(mc, N) : barrier(mc, N, delay); + if (ret) + goto quit; + +quit: + if (mc) + fi_close(&mc->fid); + if (avset) + fi_close(&avset->fid); + free(fiaddrs); + frmwk_free_libfabric(); + frmwk_term(); + return ret; +} diff --git a/prov/cxi/test/multinode/test_coll.c b/prov/cxi/test/multinode/test_coll.c new file mode 100644 index 00000000000..974f8c54bb6 --- /dev/null +++ b/prov/cxi/test/multinode/test_coll.c @@ -0,0 +1,1400 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + */ + +/* + * Validation test for the multinode coll implementation. 
+ * + * Launch using: srun -N4 ./test_coll [args] + * Note that -N4 is the minimum. There is no maximum. + */ + +/** + * Test the coll functions in a real environment. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "multinode_frmwk.h" + +/* If not compiled with DEBUG=1, this is a no-op */ +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) + +/* convert delays to nsecs */ +#define nUSEC(n) (n * 1000L) +#define nMSEC(n) (n * 1000000L) +#define nSEC(n) (n * 1000000000L) + +int verbose = 0; + +/* Signaling NaN generation, for testing. + * Linux feature requires GNU_SOURCE. + * This generates a specific sNaN value. + */ +static inline double cxip_snan64(void) +{ + return _bits2dbl(0x7ff4000000000000); +} + +/* initialize nsecs timer structure */ +static inline void _init_nsecs(struct timespec *tsp) +{ + clock_gettime(CLOCK_MONOTONIC, tsp); +} + +/* return elapsed nsecs since initialized tsp */ +static inline long _measure_nsecs(struct timespec *tsp) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + ts.tv_nsec -= tsp->tv_nsec; + ts.tv_sec -= tsp->tv_sec; + if (ts.tv_nsec < 0) { + ts.tv_nsec += 1000000000L; + ts.tv_sec -= 1; + } + return 1000000000L*ts.tv_sec + ts.tv_nsec; +} + +static inline void _nsecs_from_now(struct timespec *tsp, long nsecs) +{ + long secs = (nsecs/1000000000L); + + nsecs %= 1000000000L; + clock_gettime(CLOCK_MONOTONIC, tsp); + tsp->tv_nsec += nsecs; + tsp->tv_sec += secs; + if (tsp->tv_nsec > 1000000000L) { + tsp->tv_nsec -= 1000000000L; + tsp->tv_sec += 1; + } +} + +static inline bool _nsecs_expired(const struct timespec *tsp) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + if (ts.tv_sec < tsp->tv_sec) + return false; + if (ts.tv_sec == tsp->tv_sec && + ts.tv_nsec < tsp->tv_nsec) + return false; + return true; +} + +/* poll rx and tx cqs once to drive I/O and return completion context */ +static void *_poll_cqs(void) +{ + struct fi_cq_err_entry cqd; + ssize_t size; + + size = fi_cq_read(cxit_rx_cq, &cqd, 1); + if (size == -FI_EAVAIL) + size = fi_cq_readerr(cxit_rx_cq, &cqd, 1); + if (size > 0) { + TRACE("rx event seen\n"); + TRACE(" size %ld\n",size); + TRACE(" buf %p\n",cqd.buf); + TRACE(" data %016lx\n",cqd.data); + TRACE(" err %d\n",cqd.err); + TRACE(" err_data %p\n",cqd.err_data); + TRACE(" err_data_size %ld\n",cqd.err_data_size); + TRACE(" flags %016lx\n",cqd.flags); + TRACE(" len %ld\n",cqd.len); + TRACE(" olen %ld\n",cqd.olen); + TRACE(" op_context %p\n",cqd.op_context); + TRACE(" prov_errno %d\n",cqd.prov_errno); + TRACE(" tag %016lx\n",cqd.tag); + } else if (size != -FI_EAGAIN) + TRACE("rx ERROR seen = %ld\n", size); + + size = fi_cq_read(cxit_tx_cq, &cqd, 1); + if (size == -FI_EAVAIL) + size = fi_cq_readerr(cxit_tx_cq, &cqd, 1); + if (size > 0) { + TRACE("tx event seen\n"); + TRACE(" size %ld\n",size); + TRACE(" buf %p\n",cqd.buf); + TRACE(" data %016lx\n",cqd.data); + TRACE(" err %d\n",cqd.err); + TRACE(" err_data %p\n",cqd.err_data); + TRACE(" err_data_size %ld\n",cqd.err_data_size); + TRACE(" flags %016lx\n",cqd.flags); + TRACE(" len %ld\n",cqd.len); + TRACE(" olen %ld\n",cqd.olen); + TRACE(" op_context %p\n",cqd.op_context); + TRACE(" prov_errno %d\n",cqd.prov_errno); + TRACE(" tag %016lx\n",cqd.tag); + return cqd.op_context; + } + if (size != -FI_EAGAIN) + TRACE("tx ERROR seen = %ld\n", size); + TRACE("%s return NULL\n", __func__); + return NULL; +} + +/* blocking wait for single collective op 
completion */ +static void _wait_cqs(void *pcontext) +{ + TRACE("Wait for context %p\n", pcontext); + do { + if (pcontext == _poll_cqs()) + break; + } while (true); +} + +/** + * @brief Manage multiple av_sets. + * + * The avset_ary is an ordered list of different av_set objects, each of which + * represents a specific collective group. + * + * In this test framework, the multi join operation will create an + * independent join (and mc object) for EACH av_set in the avset_ary, to be + * initiated concurrently. Joins will be initiated in the list order, and + * completed in an arbitrary order. + * + * Hint: fid_av_set consists of just a constant self-size value, and a list + * of function pointers. It is contained inside cxip_av_set, which contains + * the addresses, address counts, etc. You need to take container_of() on the + * fid_av_set pointer to get the containing cxip_av_set. Real (non-test) + * users will not need this extra information. + */ +struct avset_ary { + struct fid_av_set **avset; + int avset_cnt; + int avset_siz; +}; + +void avset_ary_init(struct avset_ary *setary) +{ + setary->avset = NULL; + setary->avset_cnt = 0; + setary->avset_siz = 0; +} + +void avset_ary_destroy(struct avset_ary *setary) +{ + int i; + + if (setary->avset) { + for (i = 0; i < setary->avset_cnt; i++) + fi_close(&setary->avset[i]->fid); + free(setary->avset); + } + avset_ary_init(setary); +} + +/* create a single avset using fiaddrs, size, and append it to the setary */ +int avset_ary_append(fi_addr_t *fiaddrs, size_t size, + int mcast_addr, int root_idx, + struct avset_ary *setary) +{ + struct cxip_comm_key comm_key = { + .keytype = (cxip_env.coll_fabric_mgr_url) ? + COMM_KEY_NONE : COMM_KEY_UNICAST, + .ucast.mcast_addr = mcast_addr, + .ucast.hwroot_idx = root_idx + }; + struct fi_av_set_attr attr = { + .count = 0, + .start_addr = FI_ADDR_NOTAVAIL, + .end_addr = FI_ADDR_NOTAVAIL, + .stride = 1, + .comm_key_size = sizeof(comm_key), + .comm_key = (void *)&comm_key, + .flags = 0, + }; + struct fid_av_set *setp; + int i, ret; + + // expand accumulator list as necessary + TRACE("%s cnt=%d siz=%d\n", __func__, setary->avset_cnt, + setary->avset_siz); + if (setary->avset_siz <= setary->avset_cnt) { + void *ptr; + int siz; + + TRACE("%s expand setary\n", __func__); + siz = setary->avset_siz + 4; + ptr = realloc(setary->avset, siz * sizeof(void *)); + if (!ptr) { + TRACE("%s realloc failed\n", __func__); + ret = -FI_ENOMEM; + goto quit; + } + setary->avset_siz = siz; + setary->avset = ptr; + } + // create empty av_set (alloc and initialize to empty) + ret = fi_av_set(cxit_av, &attr, &setp, NULL); + if (ret) { + TRACE("%s fi_av_set failed %d\n", __func__, ret); + goto quit; + } + // append addresses to av_set + for (i = 0; i < size; i++) { + ret = fi_av_set_insert(setp, fiaddrs[i]); + if (ret) { + TRACE("%s fi_av_set_insert failed %d\n", __func__, ret); + goto quit; + } + } + // add to expanded list + setary->avset[setary->avset_cnt++] = setp; + return 0; + +quit: + TRACE("%s: FAILED %d\n", __func__, ret); + if (setp) { + fi_close(&setp->fid); + free(setp); + } + return ret; +} + +/** + * @brief Perform concurrent joins over avset_ary objects. + * + * A single multi-join will initiate concurrent join operations over each of + * the av_set objects in the avset_ary. + * + * Each join is represented by a join_item, which contains a pointer to the + * generating av_set, and the resulting mc object. It also records a + * completion result and a provider error (if any). 
The join_items are linked + * to a dlist called the joinlist. + * + * A multi-join can be called multiple times for the same joinlist, and will + * continue add join_items to the joinlist. + * + * If the av_set objects are all disjoint, joins should proceed in parallel. + * If the av_set objects overlap, the first join will proceed, and subsequent + * joins will return -FI_EAGAIN until the blocking zbcoll getgroup operation + * completes, after which they will proceed in parallel. If the maximum + * zbcoll groupid value is acquired, all join operations will be blocked + * until at least one join operation completes, freeing a zbcoll groupid. + * + * Proper behavior is dependent on initiating all joins in the same relative + * order on every participating endpoint, which is a general MPI requirement + * for all collective operations. + * + * This returns when all joins specified in the setary have been initiated. + * + * Note that fi_join_collective() can be called from an endpoint that is not + * a valid endpoint in the collective group. These tests, in fact, will call + * fi_join_collective() for every endpoint in the WLM job, even if the av_set + * represents some subset of this. The call will return the value + * -FI_ECONNREFUSED for endpoints that do not belong to the collective + * group, and this causes the join structure to be discarded without adding + * it to the result joinlist. This means that when doing a multijoin, + * different endpoints may have different joinlist lengths. + * + * A join failure on an endpoint that is part of the collective group will + * result in an error propagated to all members of that group through zbcoll, + * so all endpoints will fail the join operation with the same error code. + */ +struct join_item { + struct dlist_entry entry; + struct fid_av_set *avset; + struct fid_mc *mc; + int prov_errno; + int retval; + int trace_no; +}; + +/* poll the collective eq once, count of completions (0 or 1) */ +static int _poll_eq(void) +{ + struct cxip_ep *ep; + struct fid_eq *eq; + struct fi_eq_err_entry eqd = {}; + struct join_item *jctx; + uint32_t event; + int ret; + + ep = container_of(cxit_ep, struct cxip_ep, ep); + eq = &ep->ep_obj->coll.eq->util_eq.eq_fid; + + jctx = NULL; + ret = fi_eq_read(eq, &event, &eqd, sizeof(eqd), 0); + if (ret >= 0) { + TRACE("read EQ = %d\n", ret); + if (ret < sizeof(struct fi_eq_entry)) { + TRACE("fi_eq_read()=%d, exp=%ld\n", + ret, sizeof(struct fi_eq_entry)); + return -FI_EINVAL; + } + TRACE("=== EQ SUCCESS\n"); + TRACE(" size = %d\n", ret); + TRACE(" event = %d\n", event); + TRACE(" fid = %p\n", eqd.fid); + TRACE(" context = %p\n", eqd.context); + TRACE(" data = %lx\n", eqd.data); + if (eqd.context && event == FI_JOIN_COMPLETE) { + jctx = eqd.context; + jctx->retval = 0; + jctx->prov_errno = 0; + return 1; + } + } + if (ret == -FI_EAVAIL) { + TRACE("read EQ = %d\n", ret); + ret = fi_eq_readerr(eq, &eqd, 0); + if (ret < sizeof(struct fi_eq_err_entry)) { + TRACE("fi_eq_readerr()=%d, exp=%ld\n", + ret, sizeof(struct fi_eq_err_entry)); + return -FI_EINVAL; + } + TRACE("=== EQ error available\n"); + TRACE(" size = %d\n", ret); + TRACE(" event = %d\n", event); + TRACE(" fid = %p\n", eqd.fid); + TRACE(" context = %p\n", eqd.context); + TRACE(" data = %lx\n", eqd.data); + TRACE(" err = %s (%d)\n", + fi_strerror(-eqd.err), eqd.err); + TRACE(" prov_err= %d\n", eqd.prov_errno); + TRACE(" err_data= %p\n", eqd.err_data); + TRACE(" err_size= %ld\n", eqd.err_data_size); + if (eqd.context) { + jctx = eqd.context; + jctx->retval = eqd.err; + 
jctx->prov_errno = eqd.prov_errno; + return 1; + } + } + if (ret != -FI_EAGAIN) { + TRACE("read EQ = %d\n", ret); + TRACE("=== EQ other\n"); + TRACE(" size = %d\n", ret); + TRACE(" event = %d\n", event); + } + return 0; +} + +/* close a list of collectives */ +void coll_multi_release(struct dlist_entry *joinlist) +{ + struct join_item *jctx; + + TRACE("coll_multi_release\n"); + while (!dlist_empty(joinlist)) { + dlist_pop_front(joinlist, struct join_item, jctx, entry); + TRACE("close mc, empty = %d\n", dlist_empty(joinlist)); + if (jctx->mc) + fi_close(&jctx->mc->fid); + TRACE("free jctx\n"); + free(jctx); + } + TRACE("return\n"); +} + +/* initiate join on all sets in setary, and append to joinlist */ +int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) +{ + struct join_item *jctx; + int i, ret, total, count; + + TRACE("ENTRY %s\n", __func__); + + // perform collective joins from setary + total = setary->avset_cnt; + count = 0; + for (i = 0; i < total; i++) { + jctx = calloc(1, sizeof(*jctx)); + jctx->trace_no = i; + jctx->avset = setary->avset[i]; + dlist_init(&jctx->entry); + TRACE("join %d of %d initiating\n", i, total); + ret = fi_join_collective(cxit_ep, FI_ADDR_NOTAVAIL, + setary->avset[i], 0L, + &jctx->mc, jctx); + /* node is not participating in this join */ + if (ret == -FI_ECONNREFUSED) { + free(jctx); + continue; + } + TRACE("join %d continuing ret=%d\n", i, ret); + if (ret != FI_SUCCESS) { + TRACE("join %d FAILED\n", ret); + goto fail; + } + /* wait for join to complete */ + do { + _poll_cqs(); + ret = _poll_eq(); + } while (ret == 0); + dlist_insert_tail(&jctx->entry, joinlist); + count++; + } + TRACE("DONE %s completed %d joins\n", __func__, count); + return FI_SUCCESS; + +fail: + TRACE("TEST failed\n"); + coll_multi_release(joinlist); + return ret; +} + +/* Perform cleanup on a multijoin */ +void coll_join_cleanup(struct avset_ary *setary, struct dlist_entry *joinlist) +{ + coll_multi_release(joinlist); + avset_ary_destroy(setary); +} + +struct join_item *coll_join_item(struct dlist_entry *joinlist, int index) +{ + struct join_item *jctx; + + dlist_foreach_container(joinlist, struct join_item, jctx, entry) { + if (!index--) + return jctx; + } + return NULL; +} + + +/* Utility function to create a single join with no errors */ +struct join_item *coll_single_join(fi_addr_t *fiaddrs, size_t size, + int mcast_addr, int root_idx, + int exp_retval, int exp_prov_errno, + struct avset_ary *setary, + struct dlist_entry *joinlist, + const char *msg) +{ + struct join_item *jctx = NULL; + int ret; + + avset_ary_init(setary); + ret = avset_ary_append(fiaddrs, size, mcast_addr, root_idx, setary); + if (ret) { + TRACE("%s JOIN avset_ary_append()=%d\n", msg, ret); + goto quit; + } + + dlist_init(joinlist); + ret = coll_multi_join(setary, joinlist); + if (ret) { + TRACE("%s JOIN coll_multi_join()=%d\n", msg, ret); + goto quit; + } + + jctx = dlist_first_entry_or_null(joinlist, struct join_item, entry); + if (!jctx) { + TRACE("%s JOIN produced NULL result\n", msg); + goto quit; + } + + if (jctx->retval != exp_retval || jctx->prov_errno != exp_prov_errno) { + TRACE("%s JOIN ret=%d,exp=%d prov_errno=%d,exp=%d\n", msg, + jctx->retval, exp_retval, + jctx->prov_errno, exp_prov_errno); + goto quit; + } + TRACE("%s JOIN SUCCESS\n", msg); + return jctx; + +quit: + TRACE("%s JOIN FAILED\n", msg); + coll_join_cleanup(setary, joinlist); + return NULL; +} + +#if 0 +int _test_multi_barrier(struct avset_ary *setary, struct dlist_entry *joinlist, + int N, long *nsec_delay, int 
total_secs) +{ + struct timespec *nsec_times, nsec_start; + int i, ret; + + nsec_times = calloc(sizeof(struct timespec), N); + ret = coll_init_multi_join(setary, joinlist); + if (ret) { + TRACE("multicast_join init error = %d\n", ret); + goto quit; + } + ret = coll_wait_multi_join(joinlist); + if (ret) { + TRACE("multicast_join wait error = %d\n", ret); + goto quit; + } + + _nsecs_from_now(&nsec_start, 0L); + nsec_start.tv_sec += total_secs; + + for (i = 0; i < N; i++) + _nsecs_from_now(&nsec_times[i], nsec_delay[i]); + while (!_nsecs_expired(&nsec_start)) { + for (i = 0; i < N; i++) { + if (!_nsecs_expired(&nsec_times[i])) + continue; + for (j = 0; j < ) + } + + } +quit: + free(nsec_times); + coll_multi_releasejoinlist); + avset_ary_destroy(setary); + return ret; +} +#endif + +/** + * @brief Simple test of join, returns a count of errors. + * + * This creates a single avset_ary from the supplied addresses, with hwroot + * of zero, and performs a single join, tests errors, and cleans up. Used to + * probe the basic error conditions. + */ +int _test_join(fi_addr_t *fiaddrs, size_t size, int exp_ret, + int exp_prov_errno) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + struct join_item *jctx; + int ret, errcnt; + + errcnt = 0; + avset_ary_init(&setary); + ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); + errcnt += !!ret; + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + errcnt += !!ret; + + dlist_foreach_container(&joinlist, struct join_item, jctx, entry) { + if (jctx->retval != exp_ret || + jctx->prov_errno != exp_prov_errno) { + TRACE("exp_ret=%d retval=%d\n", + exp_ret, jctx->retval); + TRACE("exp_prov_errno=%d prov_errno=%d\n", + exp_prov_errno, jctx->prov_errno); + errcnt++; + } + } + + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + + return errcnt; +} + +/* Simple test of barrier, returns a count of errors. */ +int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + struct join_item *jctx; + uint64_t context; + int i, ret, total, errcnt; + + errcnt = 0; + total = 0; + avset_ary_init(&setary); + ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); + errcnt += !!ret; + if (ret) { + TRACE("BARRIER avset not created\n"); + goto quit; + } + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + errcnt += !!ret; + if (ret) { + TRACE("BARRIER JOIN not initiated\n"); + goto quit; + } + TRACE("BARRIER JOIN COMPLETE\n"); + + jctx = dlist_first_entry_or_null(&joinlist, struct join_item, entry); + TRACE("Barrier join complete, jctx = %p\n", jctx); + for (i = 0; i < count; i++) { + do { + usleep(rand() % 100); + ret = fi_barrier(cxit_ep, (fi_addr_t )jctx->mc, + &context); + TRACE("barrier = %d\n", ret); + } while (ret == -FI_EAGAIN); + if (ret == FI_SUCCESS) { + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("BARRIER COMPLETE #%d\n", i); + total++; + } else { + TRACE("BARRIER FAILED #%d, ret=%d\n", i, ret); + errcnt++; + } + } + +quit: + frmwk_log0("Barrier errcnt=%d total=%d\n", errcnt, total); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + return errcnt; +} + +/* Simple test of broadcast, returns a count of errors. 
*/ +int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int rootidx) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + struct join_item *jctx; + uint64_t data[4], rslt[4]; + uint64_t context; + int i, ret, errcnt; + + errcnt = 0; + jctx = coll_single_join(fiaddrs, size, 0, rootidx, 0, 0, + &setary, &joinlist, "BROADCAST"); + if (!jctx) { + TRACE("BROADCAST JOIN returned NULL\n"); + goto quit; + } + + data[0] = 0x12345678; + data[1] = 0x2468ace0; + data[2] = 0x13579bdf; + data[3] = 0x10101010; + memset(rslt, 0, sizeof(rslt)); + if (frmwk_rank == rootidx) + memcpy(rslt, data, sizeof(rslt)); + do { + _poll_cqs(); + ret = fi_broadcast(cxit_ep, rslt, 4, NULL, + (fi_addr_t )jctx->mc, fiaddrs[rootidx], + FI_UINT64, 0L, &context); + } while (ret == -FI_EAGAIN); + errcnt += !!ret; + if (ret == FI_SUCCESS) { + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("BROADCAST COMPLETE\n"); + if (memcmp(rslt, data, sizeof(rslt))) { + for (i = 0; i < 4; i++) + TRACE("[%d] %016lx exp %016lx\n", + i, rslt[i], data[i]); + errcnt++; + } + } else { + TRACE("ret = %d\n", ret); + TRACE("BROADCAST FAILED\n"); + errcnt++; + } + +quit: + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + return errcnt; +} + +/* simple test of allreduce, returns a count of errors. */ +int _test_allreduce(fi_addr_t *fiaddrs, size_t size) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + struct join_item *jctx; + int64_t *data, *rslt, *comp; + uint64_t context; + int i, j, ret, errcnt; + + errcnt = 0; + avset_ary_init(&setary); + ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); + errcnt += !!ret; + if (ret) { + TRACE("ALLREDUCE avset not created\n"); + goto quit; + } + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + errcnt += !!ret; + if (ret) { + TRACE("ALLREDUCE JOIN not initiated\n"); + goto quit; + } + + jctx = dlist_first_entry_or_null(&joinlist, struct join_item, entry); + TRACE("jctx = %p\n", jctx); + TRACE("mc = %p\n", jctx->mc); + + data = calloc(frmwk_numranks*4, sizeof(int64_t)); + comp = calloc(4, sizeof(int64_t)); + rslt = calloc(4, sizeof(int64_t)); + for (i = 0; i < frmwk_numranks; i++) { + for (j = 0; j < 4; j++) { + data[4*i + j] = ((int64_t)(rand() - RAND_MAX/2) << 32); + data[4*i + j] |= rand(); + comp[j] += data[4*i + j]; + } + } + do { + _poll_cqs(); + ret = fi_allreduce(cxit_ep, &data[frmwk_rank*4], 4, NULL, + rslt, NULL, (fi_addr_t )jctx->mc, FI_INT64, + FI_SUM, 0L, &context); + } while (ret == -FI_EAGAIN); + errcnt += !!ret; + if (ret == FI_SUCCESS) { + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("ALLREDUCE COMPLETE\n"); + if (memcmp(rslt, comp, 4*sizeof(int64_t))) { + for (i = 0; i < 4; i++) + TRACE("[%d] %016lx exp %016lx\n", + i, rslt[i], comp[i]); + errcnt++; + } + } else { + TRACE("ret = %d\n", ret); + TRACE("ALLREDUCE FAILED\n"); + errcnt++; + } + free(rslt); + free(comp); + free(data); + +quit: + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + return errcnt; +} + +/** + * Main application. + * + * ./test_coll -h or srun -Nx ./test_coll -h displays syntax and a list of + * tests. + * + * Each test is bracketed by do {...} while(0) and will be evaluated against + * the test mask created by the -t argument. If the test isn't in the -t + * selection, then the test is silently skipped. Tests can be easily + * rearranged or modified by adding new do {...} while(0) test cases. Each + * should begin with PREAMBLE(), which manages the argument handling. 
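+ *
+ * A minimal skeleton for one test case (a sketch only; it mirrors the
+ * disabled template near the end of main()):
+ *
+ *   do {
+ *           PREAMBLE(0, tstnum, "title of test");
+ *           ret = 0;                     // body of the test goes here
+ *           errcnt += !!ret;
+ *           tstcnt += 1;
+ *           frmwk_log0("%4s\n", STDMSG(ret));
+ *           frmwk_barrier();
+ *   } while (0);
+ *   tstnum++;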
+ * + * Each test case should end with frmwk_barrier(), which uses the framework() + * sockets-based barrier to ensure separation of the test cases. + */ + +static uint64_t testmask = 0L; + +#define TAG(skip) (skip ? "SKIP " : "----") +#define TEST(n) (1 << n) +#define STDMSG(ret) ((ret > 0) ? "SKIP" : ((ret) ? "FAIL" : "good")) +#define PREAMBLE(skip, num, nam) \ + ret = 1; \ + testname = nam; \ + if (help) { \ + frmwk_log0("%2d: %s\n", num, testname); break; \ + }; \ + if (!(testmask & TEST(num))) break; \ + frmwk_log0("%4s %2d:%s\n", TAG(skip), num, testname); \ + TRACE("%4s %2d:%s\n", TAG(skip), num, testname); \ + if (skip) break; \ + ret = 0 + +int main(int argc, char **argv) +{ + bool trace_enabled = true; + fi_addr_t *fiaddrs = NULL; + fi_addr_t myaddr; + struct cxip_addr mycaddr; + size_t mycaddr_len; + size_t size = 0; + int errcnt = 0; + int tstcnt = 0; + int tstnum = 0; + int ret = 0; + int N = 0; + bool help = false; + struct join_item *jctx; + struct avset_ary setary; + struct dlist_entry joinlist; + + + const char *testname; + char opt; + int i, j; + + /* by default, perform all tests */ + testmask = -1L; + testname = NULL; + + TRACE("enter main\n"); + while ((opt = getopt(argc, argv, "hvVt:N:")) != -1) { + char *str, *s, *p; + + switch (opt) { + case 't': + /* perform only selected tests */ + str = optarg; + i = j = 0; + testmask = 0L; + while (*str) { + while (*str == ' ') + str++; + s = str; + while (*str && *str != ',') + str++; + if (*str) + *str++ = 0; + p = s; + while (*p && *p != '-') + p++; + i = atoi(s); + j = (*p) ? atoi(++p) : i; + if (j > 63) + j = 63; + while (i <= j) + testmask |= (1L << i++); + } + break; + case 'N': + N = atoi(optarg); + break; + case 'V': + trace_enabled = true; + break; + case 'v': + verbose = true; + break; + case 'h': + help = true; + break; + default: + help = true; + frmwk_log0("Syntax error\n"); + break; + } + } + + /* initialize framework, silently if running help */ + frmwk_init(help); + srand(frmwk_rank); + + /* Collect env variable information from WLM */ + do { + if (help) { + frmwk_log0( + "Usage: test_coll [-hvV] -Ncount[-t testno[-testno][,...]]\n"); + frmwk_log0("\nTests:\n"); + break; + } + + /* Test requires a minimum of four nodes */ + if (frmwk_check_env(4)) + return -1; + + /* Initialize libfabric on this node */ + ret = frmwk_init_libfabric(); + errcnt += !!ret; + if (frmwk_errmsg(ret, "frmwk_init_libfabric()\n")) + goto done; + + cxip_trace_enable(trace_enabled); + TRACE("==== tracing enabled offset %d\n", frmwk_rank); + + /* always start with FI_UNIVERSE */ + ret = frmwk_populate_av(&fiaddrs, &size); + errcnt += !!ret; + if (frmwk_errmsg(ret, "frmwk_populate_av()\n")) + goto done; + + myaddr = fiaddrs[frmwk_rank]; + ret = fi_av_lookup(cxit_av, myaddr, &mycaddr, &mycaddr_len); + errcnt += !!ret; + if (frmwk_errmsg(ret, "fi_av_lookup(%d)\n", frmwk_rank)) + goto done; + + TRACE("numranks=%2d rank=%2d fiaddr=%ld caddr=%05x\n", + frmwk_numranks, frmwk_rank, myaddr, mycaddr.nic); + } while (0); + if (errcnt) + goto done; + + /* TEST CASES*/ + + /* Sanity test of framework. + */ + do { + PREAMBLE(0, tstnum, "test framework"); + ret = 0; + tstcnt += 1; + errcnt += !!ret; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Sanity test of the avset_ary_append() utility function. 
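+	 * Appends a single av_set (multicast address 0, hwroot index 0) built
+	 * from all fiaddrs, then destroys the list; any nonzero return counts
+	 * as an error.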
+ */ + do { + PREAMBLE(0, tstnum, "create av_set list 1"); + // Test multijoins over one array list + TRACE("======= %s\n", testname); + avset_ary_init(&setary); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + errcnt += !!ret; + + avset_ary_destroy(&setary); + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Exercise the avset_ary_append() utility function. + */ + do { + PREAMBLE(0, tstnum, "create av_set list 10"); + // Exercise creating av_set lists + avset_ary_init(&setary); + dlist_init(&joinlist); + + ret = 0; + for (i = 0; !ret && i < 10; i++) + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("ret=%d cnt=%d siz=%d\n", ret, + setary.avset_cnt, setary.avset_siz); + errcnt += !!ret; + errcnt += !!(setary.avset_cnt != 10); + errcnt += !!(setary.avset_siz < 10); + + avset_ary_destroy(&setary); + errcnt += !!(setary.avset_cnt != 0); + errcnt += !!(setary.avset_siz != 0); + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Sanity test for _test_join() utility function. + */ + do { + PREAMBLE(0, tstnum, "test join (simple)"); + // Test single join over one array list + TRACE("======= %s\n", testname); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "simple"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Test join operation with a 1-second delay on the root. + */ + do { + PREAMBLE(0, tstnum, "test join (slow root)"); + // cause slow root rank + if (frmwk_rank == 0) + usleep(1000000); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "slow root"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Test join operation with a 1-second delay on a leaf. 
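+	 * The highest-numbered rank sleeps for one second before initiating
+	 * the join, so the collective join must tolerate a late leaf.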
+ */ + do { + PREAMBLE(0, tstnum, "test join (slow leaf)"); + // cause slow leaf rank + if (frmwk_rank == (frmwk_numranks - 1)) + usleep(1000000); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "slow leaf"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "force -FI_EAGAIN on root getgroup"); + // cause zbcoll root (rank 0) to reject getgroup requests once + cxip_trap_set(0, CXIP_TRAP_GETGRP, -FI_EAGAIN); + // cause non-root ranks attempt zbcoll getgroup first + if (frmwk_rank == 0) + usleep(10000); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "FI_EAGAIN root"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "force -FI_EAGAIN on root broadcast"); + // cause zbcoll root (rank 0) to reject broadcast requests once + cxip_trap_set(0, CXIP_TRAP_BCAST, -FI_EAGAIN); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, + "FI_EAGAIN root bcast"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "force -FI_EAGAIN on root reduce"); + // cause zbcoll root (rank 0) to reject join reduce once + cxip_trap_set(0, CXIP_TRAP_REDUCE, -FI_EAGAIN); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, + "FI_EAGAIN root reduce"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "force -FI_EFAULT on PTE alloc"); + // cause zbcoll root (rank 0) to simulate PTE alloc failure + cxip_trap_set(0, CXIP_TRAP_INITPTE, -FI_EFAULT); + ret = _test_join(fiaddrs, size, -FI_EAVAIL, + CXIP_PROV_ERRNO_PTE); + tstcnt += 1; + errcnt += !!ret; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "perform barrier"); + ret = _test_barrier(fiaddrs, size, 1); + errcnt += !!ret; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "perform broadcast"); + for (i = 0; i < frmwk_numranks; i++) { + ret = _test_broadcast(fiaddrs, size, i); + errcnt += !!ret; + } + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "perform allreduce sum"); + ret = _test_allreduce(fiaddrs, size); + TRACE("allreduce ret = %d\n", ret); + errcnt += !!ret; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "perform barrier x N"); + ret = _test_barrier(fiaddrs, size, N); + errcnt += !!ret; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "test mcast dup"); + avset_ary_init(&setary); + TRACE("avset initialized\n"); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("avset append 1 = %d\n", ret); + ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); + TRACE("avset append 2 = %d\n", ret); + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("join = %d\n", ret); + + jctx = coll_join_item(&joinlist, 0); + 
TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval || jctx->prov_errno) { + TRACE("unexpected result on coll 0\n"); + errcnt++; + } + jctx = coll_join_item(&joinlist, 1); + TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval != -FI_EAVAIL || + jctx->prov_errno != CXIP_PROV_ERRNO_MCAST_INUSE) { + TRACE("unexpected result on coll 1\n"); + errcnt++; + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "test hwroot dup"); + avset_ary_init(&setary); + TRACE("avset initialized\n"); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("avset append 1 = %d\n", ret); + ret = avset_ary_append(fiaddrs, size, 1, 0, &setary); + TRACE("avset append 2 = %d\n", ret); + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("join = %d\n", ret); + + jctx = coll_join_item(&joinlist, 0); + TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval || jctx->prov_errno) { + TRACE("unexpected result on coll 0\n"); + errcnt++; + } + jctx = coll_join_item(&joinlist, 1); + TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval != -FI_EAVAIL || + jctx->prov_errno != CXIP_PROV_ERRNO_HWROOT_INUSE) { + TRACE("unexpected result on coll 1\n"); + errcnt++; + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "test hwroot and mcast dup"); + avset_ary_init(&setary); + TRACE("avset initialized\n"); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("avset append 1 = %d\n", ret); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("avset append 2 = %d\n", ret); + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("join = %d\n", ret); + + jctx = coll_join_item(&joinlist, 0); + TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval || jctx->prov_errno) { + TRACE("unexpected result on coll 0\n"); + errcnt++; + } + jctx = coll_join_item(&joinlist, 1); + TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval != -FI_EAVAIL || + jctx->prov_errno != CXIP_PROV_ERRNO_HWROOT_INUSE) { + TRACE("unexpected result on coll 1\n"); + errcnt++; + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "test multiple join"); + avset_ary_init(&setary); + TRACE("avset initialized\n"); + + for (i = 0; i < N; i++) { + ret = avset_ary_append(fiaddrs, size, i, i, &setary); + TRACE("avset append %d = %d\n", i, ret); + } + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("multijoin = %d\n", ret); + + for (i = 0; i < N; i++) { + int exp_ret = (i < size) ? 0 : -FI_EAVAIL; + int exp_errno = (i < size) ? 0 : CXIP_PROV_ERRNO_HWROOT_INUSE; + int good; + + jctx = coll_join_item(&joinlist, i); + if (!jctx) { + TRACE("no join item\n"); + continue; + } + good = (jctx->retval == exp_ret && + jctx->prov_errno == exp_errno); + errcnt += !good; + TRACE("item %d mc=%p retval=%d prov_errno=%d %s\n", + i, jctx->mc, jctx->retval, jctx->prov_errno, + good ? 
"ok" : "bad"); + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + + + do { + PREAMBLE(0, tstnum, "test multiple broadcast"); + + uint64_t **datary, *ctxary, *ctxptr; + int in_progress, tree, root, i, j; + + /* set up maximum number of trees possible */ + avset_ary_init(&setary); + for (tree = 0; tree < size; tree++) { + ret = avset_ary_append(fiaddrs, size, tree, tree, &setary); + TRACE("avset append group %d = %d\n", tree, ret); + } + TRACE("avset initialized\n"); + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("multijoin = %d\n", ret); + + /* context and data for each collective tree */ + ctxary = calloc(size, sizeof(uint64_t)); + datary = calloc(size, sizeof(void *)); + for (tree = 0; tree < size; tree++) { + datary[tree] = calloc(4, sizeof(uint64_t)); + ctxary[tree] = tree; + } + + /* repeat the collective N times as requested*/ + for (i = 0; i < N; i++) { + in_progress = 0; + + /* rotate root every time */ + root = i%size; + + /* start a broadcast on every tree */ + for (tree = 0; tree < size; tree++) { + uint64_t id = (uint64_t)tree << 32; + + /* prepare the data */ + memset(datary[tree], 0, 4*sizeof(uint64_t)); + if (frmwk_rank == root) { + for (j = 0; j < 4; j++) + datary[tree][j] = id|root; + } + TRACE("strt=%d tree=%d\n", i, tree); + for (j = 0; j < 4; j++) + TRACE(" %016lx\n", datary[tree][j]); + + } + for (tree = 0; tree < size; tree++) { + int tree2 = (tree + frmwk_rank)%size; + + usleep(rand() % 100); + jctx = coll_join_item(&joinlist, tree2); + ret = fi_broadcast(cxit_ep, datary[tree2], 4, NULL, + (fi_addr_t )jctx->mc, + fiaddrs[root], FI_UINT64, + 0L, &ctxary[tree2]); + in_progress++; + TRACE("in_progress=%d\n", in_progress); + if ((ctxptr = _poll_cqs())) { + in_progress--; + TRACE("ctxptr=%ld in_progress=%d\n", + *ctxptr, in_progress); + } + } + while (in_progress > 0) { + if ((ctxptr = _poll_cqs())) { + in_progress--; + TRACE("ctxptr=%ld in_progress=%d\n", + *ctxptr, in_progress); + } + } + for (tree = 0; tree < size; tree++) { + TRACE("rslt=%d tree=%d\n", i, tree); + for (j = 0; j < 4; j++) + TRACE(" %016lx\n", datary[tree][j]); + + } + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + +#if 0 + do { + PREAMBLE(0, tstnum, "title of test"); + ret = 0; // some test + errcnt += !!ret; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; +#endif + + if (help) + return (errcnt); + +done: + frmwk_log0("%2d tests run, %d failures\n", tstcnt, errcnt); + frmwk_log0(!!errcnt ? "ERRORS SEEN\n" : "SUCCESS\n"); + free(fiaddrs); + frmwk_free_libfabric(); + frmwk_term(); + return !!errcnt; +} diff --git a/prov/cxi/test/multinode/test_frmwk.c b/prov/cxi/test/multinode/test_frmwk.c new file mode 100644 index 00000000000..6f8fdd7a850 --- /dev/null +++ b/prov/cxi/test/multinode/test_frmwk.c @@ -0,0 +1,80 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP + */ + +/* + * Validation test for the pmi_frmwk implementation. + * + * Launch using: srun -N4 ./test_frmwk + * + * This can be used as a prototype for test applications. + * + * This activates libfabric, populates the AV, and then frees the libfabric + * instance. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "multinode_frmwk.h" + +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) + +int main(int argc, char **argv) +{ + fi_addr_t *fiaddr = NULL; + size_t size = 0; + int i, j, ret; + + frmwk_init(false); + printf("[%d|%d] initialized\n", frmwk_rank, frmwk_numranks); + + ret = frmwk_gather_nics(); + for (i = 0; i < frmwk_numranks; i++) { + printf("[%d|%d] rank %d HSNS [", frmwk_rank, frmwk_numranks, i); + for (j = 0; j < frmwk_nics_per_rank; j++) + printf(" %05x", frmwk_nic_addr(i, j)); + printf("]\n"); + } + + frmwk_barrier(); + + ret = frmwk_init_libfabric(); + if (frmwk_errmsg(ret, "frmwk_init_libfabric()\n")) + return ret; + + ret = frmwk_populate_av(&fiaddr, &size); + if (frmwk_errmsg(ret, "frmwk_populate_av()\n")) + return ret; + + printf("[%d|%d] fiaddrs\n", frmwk_rank, frmwk_numranks); + for (i = 0; i < size; i++) { + printf("[%d|%d] %ld\n", frmwk_rank, frmwk_numranks, + fiaddr[i]); + } + + cxip_trace_enable(true); + TRACE("Trace message test %d\n", 0); + TRACE("Trace message test %d\n", 1); + cxip_trace_enable(false); + TRACE("This message should not appear\n"); + cxip_trace_enable(true); + TRACE("This message should appear\n"); + + frmwk_free_libfabric(); + free(fiaddr); + + frmwk_term(); + return ret; +} diff --git a/prov/cxi/test/multinode/test_zbcoll.c b/prov/cxi/test/multinode/test_zbcoll.c new file mode 100644 index 00000000000..d19ff3aabd8 --- /dev/null +++ b/prov/cxi/test/multinode/test_zbcoll.c @@ -0,0 +1,1414 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + */ + +/* + * Validation test for the multinode zbcoll implementation. + * + * Launch using: srun -N4 ./test_zbcoll [args] + */ + +/** + * Test the zbcoll functions in a real environment. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "multinode_frmwk.h" + +#define TRACE(fmt, ...) 
CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) + +/* convert delays to nsecs */ +#define nUSEC(n) (n * 1000L) +#define nMSEC(n) (n * 1000000L) +#define nSEC(n) (n * 1000000000L) + +int verbose = false; + +/* initialize nsecs timer structure */ +static inline void _init_nsecs(struct timespec *tsp) +{ + clock_gettime(CLOCK_MONOTONIC, tsp); +} + +/* return elapsed nsecs since initialized tsp */ +static inline long _measure_nsecs(struct timespec *tsp) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + ts.tv_nsec -= tsp->tv_nsec; + ts.tv_sec -= tsp->tv_sec; + if (ts.tv_nsec < 0) { + ts.tv_nsec += 1000000000L; + ts.tv_sec -= 1; + } + return 1000000000L*ts.tv_sec + ts.tv_nsec; +} + +/* introduce random jitter delay into operations per rank */ +void _jitter(int usec) +{ + static unsigned int seed = 0; + if (!seed) + seed = rand() + frmwk_rank + 1; + if (usec) { + usec = rand_r(&seed) % usec; + TRACE("_jitter delay = %d usec\n", usec); + usleep(usec); + } +} + +/* utility to poll and capture trailing errors/completions */ +static void _idle_wait(struct cxip_ep_obj *ep_obj, int msec) +{ + uint32_t dsc0, err0, ack0, rcv0; + uint32_t dsc, err, ack, rcv; + struct timespec ts; + long nsecs = 0L; + + cxip_zbcoll_get_counters(ep_obj, &dsc0, &err0, &ack0, &rcv0); + _init_nsecs(&ts); + do { + cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + nsecs = _measure_nsecs(&ts); + if (dsc==dsc0 && err==err0 && ack==ack0 && rcv==rcv0) + continue; + TRACE("ns=%ld dsc=%d err=%d ack=%d rcv=%d\n", + nsecs, dsc, err, ack, rcv); + cxip_trace_flush(); + dsc0 = dsc; + err0 = err; + ack0 = ack; + rcv0 = rcv; + } while (msec < 0 || nsecs < nMSEC(msec)); +} + +/* utility to do a primitive wait for send completion based on counters */ +static int _send_wait(struct cxip_zbcoll_obj *zb, int sndcnt, int rcvcnt) +{ + struct cxip_ep_obj *ep_obj = zb->ep_obj; + uint32_t dsc, err, ack, rcv; + struct timespec ts; + long nsecs = 0L; + + _init_nsecs(&ts); + do { + cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + if (err || dsc) + break; + if (ack >= sndcnt && rcv >= rcvcnt) + break; + nsecs = _measure_nsecs(&ts); + } while (nsecs < nMSEC(100)); + TRACE("ns=%ld dsc=%d err=%d ack=%d rcv=%d rc=%d\n", + nsecs, dsc, err, ack, rcv, zb->error); + if (nsecs >= nMSEC(100)) { + TRACE("TIMEOUT\n"); + return 1; + } + if (err || dsc || ack < sndcnt || rcv < rcvcnt) { + TRACE("TRANSPORT FAILURE\n"); + return 1; + } + if (zb->error) { + TRACE("STATE FAILURE\n"); + return 1; + } + cxip_trace_flush(); + return 0; +} + +/* send a single packet from node to node, and wait for completion */ +static struct cxip_addr bad_cxip_addr; +static int bad_cxip_index = -1; + +int _test_send_to_dest(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + int src, int dst, uint64_t payload) +{ + struct cxip_zbcoll_obj *zb; + int grp_rank; + int sndcnt, rcvcnt; + int i, ret; + + ret = cxip_zbcoll_alloc(ep_obj, size, fiaddrs, ZB_NOSIM, &zb); + if (frmwk_errmsg(ret, "%s: cxip_zbcoll_alloc()\n", __func__)) + return ret; + + /* replace an address with a different address */ + if (bad_cxip_index >= 0 && bad_cxip_index < size) { + TRACE("bad id being used\n"); + zb->caddrs[bad_cxip_index] = bad_cxip_addr; + } + + grp_rank = zb->state[0].grp_rank; + + ep_obj->zbcoll.disable = true; + zb->grpid = 0; + cxip_zbcoll_reset_counters(ep_obj); + if (src < 0 && dst < 0) { + /* every source to every destination */ + sndcnt = size; + rcvcnt = size; + for (i = 0; i < 
size; i++) + cxip_zbcoll_send(zb, grp_rank, i, payload); + } else if (src < 0) { + /* every source sends to one destination */ + sndcnt = 1; + rcvcnt = (dst == grp_rank) ? size : 0; + cxip_zbcoll_send(zb, grp_rank, dst, payload); + } else if (dst < 0 && src == grp_rank) { + /* this source sends to every destination */ + sndcnt = size; + rcvcnt = 1; + for (i = 0; i < size; i++) + cxip_zbcoll_send(zb, grp_rank, i, payload); + } else if (dst < 0) { + /* some other src to every destination */ + sndcnt = 0; + rcvcnt = 1; + } else if (grp_rank == src) { + /* this source to a destination */ + sndcnt = 1; + rcvcnt = (grp_rank == dst) ? 1 : 0; + cxip_zbcoll_send(zb, grp_rank, dst, payload); + } else if (grp_rank == dst) { + /* some other source to this destination */ + sndcnt = 0; + rcvcnt = 1; + } else { + /* not participating */ + sndcnt = 0; + rcvcnt = 0; + } + ret = _send_wait(zb, sndcnt, rcvcnt); + ep_obj->zbcoll.disable = false; + cxip_zbcoll_free(zb); + + return ret; +} + +/* normal utility to wait for collective completion, returns coll error */ +static int _coll_wait(struct cxip_zbcoll_obj *zb, long nsec_wait) +{ + uint32_t dsc, err, ack, rcv; + struct timespec ts; + long nsecs = 0L; + + if (!zb) { + TRACE("%s: NULL zb passed\n", __func__); + return -FI_EINVAL; + } + _init_nsecs(&ts); + do { + cxip_ep_zbcoll_progress(zb->ep_obj); + cxip_zbcoll_get_counters(zb->ep_obj, &dsc, &err, &ack, &rcv); + /* this waits for a software completion */ + if (zb->error || !zb->busy) + break; + nsecs = _measure_nsecs(&ts); + } while (nsecs < nsec_wait); + TRACE("ns=%ld dsc=%d err=%d ack=%d rcv=%d\n", + nsecs, dsc, err, ack, rcv); + if (nsecs >= nsec_wait) { + TRACE("TIMEOUT\n"); + return -FI_ETIMEDOUT; + } + /* return the software error code -- may be -FI_EAGAIN */ + TRACE("return code = %d\n", zb->error); + return zb->error; +} + +/** + * @brief Internal workhorse to create zb object and get group id. + * + * If the endpoint is not in the group, this will return FI_SUCCESS, delete the + * zb object (if any), and do nothing. + * + * This creates a zb object as necessary. + * + * This destroys the zb object on any error. + * + * This call blocks for up to 100 msec waiting for completion. 
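+ *
+ * Typical usage, as a sketch (fiaddrs and size come from
+ * frmwk_populate_av(), and error handling is abbreviated):
+ *
+ *   struct cxip_zbcoll_obj *zb = NULL;
+ *
+ *   ret = _getgroup(ep_obj, size, fiaddrs, &zb);
+ *   if (!ret && zb)
+ *           ... use zb->grpid ...
+ *   cxip_zbcoll_free(zb);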
+ * + * @param ep_obj : endpoint + * @param size : number of NIDs in group + * @param fiaddrs: fiaddrs in group + * @param zbp : return pointer to zb object (may be non-NULL) + * @return int : libfabric error code + */ +int _getgroup(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + struct cxip_zbcoll_obj **zbp) +{ + int ret; + + /* need a zbcoll object for this */ + if (!zbp) { + TRACE("%s: NULL zbp passed\n", __func__); + return -FI_EINVAL; + } + if (!*zbp) { + ret = cxip_zbcoll_alloc(ep_obj, size, fiaddrs, ZB_NOSIM, zbp); + if (ret == -FI_ECONNREFUSED) { + TRACE("=== COMPLETED SKIP\n"); + return FI_SUCCESS; + } + if (frmwk_errmsg(ret, "%s: cxip_zbcoll_alloc()\n", __func__)) + goto out; + } + + /* getgroup collective */ + do { + TRACE("microsleep\n"); + usleep(10); + ret = cxip_zbcoll_getgroup(*zbp); + if (ret == -FI_EAGAIN) + continue; + if (frmwk_errmsg(ret, "%s: cxip_zbcoll_getgroup()\n", __func__)) + break; + /* Returns a collective completion error */ + ret = _coll_wait(*zbp, nMSEC(100)); + if (ret == -FI_EAGAIN) + continue; + break; + } while (true); + + /* clean up after error */ + if (ret) + goto out; + + TRACE("=== COMPLETED GETGROUP grpid=%d ret=%s\n", (*zbp)->grpid, + fi_strerror(-ret)); + return FI_SUCCESS; + +out: + TRACE("%s: failed\n", __func__); + cxip_zbcoll_free(*zbp); + *zbp = NULL; + return ret; +} + +/* detect overt getgroup errors */ +int _check_getgroup_errs(struct cxip_zbcoll_obj *zb, int exp_grpid) +{ + return (frmwk_errmsg(!zb, "zb == NULL") || + frmwk_errmsg(zb->error, "zb->error == %d\n", zb->error) || + frmwk_errmsg(zb->grpid != exp_grpid, "zb->grpid=%d exp=%d\n", + zb->grpid, exp_grpid)); +} + +/* rotate array[size] by rot positions */ +void _rotate_array32(uint32_t *array, size_t size, int rot) +{ + uint32_t *copy; + uint32_t i, j; + + copy = calloc(size, sizeof(uint32_t)); + memcpy(copy, array, size*sizeof(uint32_t)); + for (i = 0; i < size; i++) { + j = (i + rot) % size; + array[i] = copy[j]; + } + free(copy); +} + +/* shuffle array[size] randomly */ +void _shuffle_array32(uint32_t *array, size_t size) +{ + uint32_t i, j, t; + + for (i = 0; i < size-1; i++) { + j = i + (rand() / ((RAND_MAX / (size - i)) + 1)); + t = array[j]; + array[j] = array[i]; + array[i] = t; + } +} + +/** + * @brief Perform multiple concurrent getgroup operations. + * + * Parametrized test to thoroughly exercise getgroup edge conditions. + * + * This sets up to acquire 'nruns' group IDs. + * + * On each run it will only use 'naddrs' of the 'size' endpoints. If the default + * value of -1 is used, each run will use a random number between 1 and 'size'. + * + * Prior to each run, the list of addresses is rotated. If 'rot' is -1, the list + * is randomly shuffled. The purpose of rotation is to guarantee disjoint sets + * of NIDs can be created. For instance, if you have 16 addresses (size=16), and + * you set nruns=naddrs=rot=4, then all of the groups will be disjoint. + * + * This imposes a random jitter of up to 'usec' microseconds on each node, to + * break up synchronous behavior among the nodes, and exaggerate race + * conditions. + * + * This presumes a shared file system across all of the nodes under srun, and + * writes results to files named using the rank number, overwriting old files + * from prior runs. The rank 0 node will complete the test by reading back all + * of the files and processing them to ensure correct behavior. 
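+ *
+ * Each rank writes its results to a file named "grpid<rank>" in the
+ * working directory. The format (matching the fprintf()/fscanf() calls
+ * below) is one line of counters followed by one line per run:
+ *
+ *   dsc err ack rcv
+ *   grpid idx idx ... idx     (-1 marks unused address slots)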
+ * + * @param ep_obj : endpoint object + * @param size : total number of NID addresses + * @param fiaddrs: all NID addresses + * @param nruns : nruns of concurrency + * @param naddrs : number of NIDs to use (-1 implies random) + * @param rot : nid rotations per run (-1 implies shuffle) + * @param usec : usec jitter to impose randomly + * @return int : 0 on success, or error code + */ +int _multigroup(struct cxip_ep_obj *ep_obj, size_t size, fi_addr_t *fiaddrs, + int nruns, int naddrs, int rot, int usec) +{ + char fnam[256]; + FILE *fd; + struct cxip_zbcoll_obj **zb; + fi_addr_t *addrs; + uint32_t *index; + uint32_t **rows; + uint32_t *length; + int *grps; + bool shuffle = false; + uint32_t dsc, err, ack, rcv; + int i, j, ret; + + cxip_zbcoll_reset_counters(ep_obj); + + ret = 0; + if (nruns < 0) + nruns = size; + if (nruns > cxip_zbcoll_max_grps(false)) + nruns = cxip_zbcoll_max_grps(false); + if (naddrs > size) + naddrs = size; + + addrs = calloc(size, sizeof(fi_addr_t));// indices converted to addrs + index = calloc(size, sizeof(uint32_t)); // nid indices (easier to read) + for (j = 0; j < size; j++) + index[j] = j; + + /* rows : getgroup requests, list of nids involved + * length : number of addrs in each getgroup request, is <= size + * grps : resulting group ID for each getgroup request + * zb : zb_coll object for each getgroup request + */ + rows = calloc(nruns, sizeof(void *)); + length = calloc(nruns, sizeof(uint32_t)); + grps = calloc(nruns, sizeof(int)); + zb = calloc(nruns, sizeof(void *)); + for (i = 0; i < nruns; i++) { + /* -1 means random sizes */ + if (naddrs < 0) { + length[i] = 1 + (rand() % (size - 1)); + } else { + length[i] = naddrs; + } + /* -1 means shuffle targets */ + if (rot < 0) { + rot = 1; + shuffle = true; + } + /* copy shuffled indices into row */ + rows[i] = calloc(length[i], sizeof(uint32_t)); + _rotate_array32(index, size, rot); + if (shuffle) + _shuffle_array32(index, size); + memcpy(rows[i], index, length[i]*sizeof(uint32_t)); + } + + /* create zb with grpid, in same group order across nodes */ + for (i = 0; i < nruns; i++) { + for (j = 0; j < length[i]; j++) + addrs[j] = fiaddrs[rows[i][j]]; + _jitter(usec); + ret = _getgroup(ep_obj, length[i], addrs, &zb[i]); + if (frmwk_errmsg(ret, "FAILURE getgroup %d\n", i)) { + TRACE("FAILURE getgroup %d\n", i); + goto done; + } + grps[i] = (zb[i]) ? zb[i]->grpid : -1; + } + + /* need to compare each node result with other, write to file */ + sprintf(fnam, "grpid%d", frmwk_rank); + fd = fopen(fnam, "w"); + + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + fprintf(fd, "%d %d %d %d\n", dsc, err, ack, rcv); + for (i = 0; i < nruns; i++) { + fprintf(fd, " %2d", grps[i]); + for (j = 0; j < size; j++) + fprintf(fd, " %2d", (j < length[i]) ? 
rows[i][j] : -1); + fprintf(fd, "\n"); + } + fclose(fd); + + + /* clean up */ +done: + for (i = 0; i < nruns; i++) { + cxip_zbcoll_free(zb[i]); + free(rows[i]); + } + free(grps); + free(length); + free(rows); + free(index); + free(addrs); + return ret; +} + +/* display the accumulated data for the full test run */ +void _printrun(size_t size, int irun, int ***data) +{ + int irank, inid; + + printf("Test run #%d\n", irun); + for (irank = 0; irank < frmwk_numranks; irank++) { + printf("rank %2d: ", irank); + if (data[irank][irun][0] < 0) { + printf("SKIP\n"); + continue; + } + printf("GRP %2d:", data[irank][irun][0]); + for (inid = 1; inid < size+1; inid++) + printf(" %2d", data[irank][irun][inid]); + printf("\n"); + } +} + +/** + * @brief Check _multigroup results across all nodes. + * + * This is run only on the rank 0 process, and verifies the prior test run. + * + * @param size : total number of NID addresses + * @param nruns : nruns of concurrency in test + * @return int : 0 on success, non-zero on failure + */ +int _multicheck(size_t size, int nruns) +{ + char fnam[256]; + FILE *fd; + uint32_t *dsc, *err, *ack, *rcv; + int ***data; + uint64_t bitv, *mask; + int grp, nid; + int irank, irank2, irun, inid, ret; + + ret = 0; + /* data[irank][irun][inid], inid==0 is grpid */ + data = calloc(frmwk_numranks, sizeof(void *)); + for (irank = 0; irank < frmwk_numranks; irank++) { + data[irank] = calloc(nruns, sizeof(void *)); + for (irun = 0; irun < nruns; irun++) { + data[irank][irun] = calloc(size + 1, sizeof(int)); + } + } + /* one bit for each nid, max is 64 */ + mask = calloc(size, sizeof(uint64_t)); + dsc = calloc(frmwk_numranks, sizeof(uint32_t)); + err = calloc(frmwk_numranks, sizeof(uint32_t)); + ack = calloc(frmwk_numranks, sizeof(uint32_t)); + rcv = calloc(frmwk_numranks, sizeof(uint32_t)); + + /* read in the per-rank file data from the last test run */ + for (irank = 0; irank < frmwk_numranks; irank++) { + /* read file contents into data array */ + sprintf(fnam, "grpid%d", irank); + fd = fopen(fnam, "r"); + if (! 
fd) { + printf("Could not open %s\n", fnam); + ret = 1; + goto cleanup; + } + if (fscanf(fd, " %d %d %d %d", + &dsc[irank], + &err[irank], + &ack[irank], + &rcv[irank]) < 4) { + printf("bad read (errs)\n"); + ret = 1; + goto cleanup; + } + for (irun = 0; irun < nruns; irun++) { + for (inid = 0; inid < size + 1; inid++) { + int *ptr = &data[irank][irun][inid]; + if (fscanf(fd, " %d", ptr) < 1) { + printf("bad read[%d,%d]\n", irun, inid); + ret = 1; + goto cleanup; + } + } + } + fclose(fd); + } + + /* All ranks in any test run must use the same grpid, ranks */ + for (irun = 0; irun < nruns; irun++) { + irank2 = -1; + for (irank = 1; irank < frmwk_numranks; irank++) { + /* grpid < 0: rank not involved */ + if (data[irank][irun][0] < 0) + continue; + /* remember first involved rank */ + if (irank2 < 0) + irank2 = irank; + /* compare entire row with first involved */ + for (inid = 0; inid < size+1; inid++) + if (data[irank][irun][inid] != + data[irank2][irun][inid]) + break; + /* miscompare is a failure */ + if (inid < size+1) { + printf("ERROR in run #%d @ %d\n", irun, inid); + printf("reductions do not match\n"); + _printrun(size, irun, data); + ret = 1; + goto cleanup; + } + } + } + /* validated that all ranks in each run are identical */ + + /* No nid should reuse the same grpid, only check rank 0 */ + irank = 0; + for (irun = 0; irun < nruns; irun++) { + /* grpid < 0: rank not involved */ + if (data[irank][irun][0] < 0) + continue; + grp = data[irank][irun][0]; + for (inid = 1; inid < size+1; inid++) { + /* ignore unused fiaddrs */ + if (data[irank][irun][inid] < 0) + continue; + nid = data[irank][irun][inid]; + bitv = 1L << grp; + /* failure if grpid already used */ + if (mask[nid] & bitv) { + printf("ERROR in run #%d @ %d\n", + irun, inid); + printf("reuse of grpid %d by %d\n", + grp, nid); + _printrun(size, irun, data); + goto cleanup; + } + mask[nid] |= bitv; + } + } + + /* We don't expect discard or ack errors */ + for (irank = 0; irank < frmwk_numranks; irank++) + if (dsc[irank] || err[irank]) + break; + if (irank < frmwk_numranks) { + printf("ERROR transmission errors\n"); + for (irank = 0; irank < frmwk_numranks; irank++) { + printf("rank %2d: dsc=%d err=%d ack=%d rcv=%d\n", + irank, dsc[irank], err[irank], + ack[irank], rcv[irank]); + } + goto cleanup; + } + +cleanup: + if (verbose) { + printf("==================\n"); + printf("Dump all test runs\n"); + for (irun = 0; irun < nruns; irun++) + _printrun(size, irun, data); + printf("getgroup test %s\n", !ret ? 
"passed" : "FAILED"); + } + fflush(stdout); + + free(dsc); + free(err); + free(ack); + free(rcv); + free(mask); + for (irank = 0; irank < frmwk_numranks; irank++) { + for (irun = 0; irun < nruns; irun++) + free(data[irank][irun]); + free(data[irank]); + } + free(data); + return ret; +} + +/* use up all group IDs, then free zb objects and add more */ +int _exhaustgroup(struct cxip_ep_obj *ep_obj, size_t size, fi_addr_t *fiaddrs, + int nruns, int usec) +{ + struct cxip_zbcoll_obj **zb; + int maxgrps; + int i, n, ret = 0; + + maxgrps = cxip_zbcoll_max_grps(false); + if (nruns < 0) + nruns = maxgrps + 10; + zb = calloc(nruns, sizeof(void *)); + n = 1; + for (i = 0; i < nruns; i++) { + _jitter(usec); + ret = _getgroup(ep_obj, size, fiaddrs, &zb[i]); + if (ret == -FI_EBUSY) { + /* free an old zb, and try again */ + cxip_zbcoll_free(zb[n]); + zb[n] = NULL; + ret = _getgroup(ep_obj, size, fiaddrs, &zb[i]); + if (frmwk_errmsg(ret, "FAILURE\n")) { + TRACE("FAILURE\n"); + break; + } + if (zb[i]->grpid != n) { + TRACE("FAILURE\n"); + break; + } + n = (n + 3) % maxgrps; + } + } + for (i = 0; i < nruns; i++) + cxip_zbcoll_free(zb[i]); + + return 0; +} + +/* Wait for completion, log errors, free zb object */ +int _test_wait_free(struct cxip_zbcoll_obj *zb, + uint64_t *result, uint64_t expect) +{ + int ret; + + /* wait for completion */ + ret = _coll_wait(zb, nMSEC(100)); + if (frmwk_errmsg(ret, "reduce wait failed\n")) + goto done; + + if (!result) + goto done; + + TRACE("expect=%08lx result=%08lx, ret=%s\n", + expect, *result, fi_strerror(-ret)); + if (*result != expect) { + ret = 1; + frmwk_errmsg(ret, "expect=%08lx result=%08lx\n", + expect, *result); + } +done: + cxip_zbcoll_free(zb); + return ret; +} + +/* barrier across all NIDs, return zb object */ +int _test_barr(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + struct cxip_zbcoll_obj **zbp) +{ + struct cxip_zbcoll_obj *zb = NULL; + int ret; + + /* need a zbcoll context for this */ + ret = _getgroup(ep_obj, size, fiaddrs, &zb); + if (ret) + goto out; + + /* reset counters */ + cxip_zbcoll_reset_counters(ep_obj); + + /* if this fails, do not continue */ + ret = cxip_zbcoll_barrier(zb); + if (frmwk_errmsg(ret, "barr0 return=%s, exp=%d\n", fi_strerror(-ret), 0)) + goto out; + + /* try this again, should fail with -FI_EAGAIN */ + ret = cxip_zbcoll_barrier(zb); + if (frmwk_errmsg((ret != -FI_EAGAIN), "barr1 return=%d, exp=%d\n", + ret, -FI_EAGAIN)) + goto out; + + *zbp = zb; + return 0; +out: + cxip_zbcoll_free(zb); + return 1; +} + +/* broadcast the payload from rank 0 to all other ranks, return zb object */ +int _test_bcast(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + uint64_t *result, struct cxip_zbcoll_obj *zb) +{ + int ret; + + TRACE("%s: entry\n", __func__); + /* reset counters */ + cxip_zbcoll_reset_counters(ep_obj); + + /* if this fails, do not continue */ + TRACE("%s: initiate broadcast\n", __func__); + ret = cxip_zbcoll_broadcast(zb, result); + TRACE("bcast payload=%08lx, ret=%s\n", *result, fi_strerror(-ret)); + if (frmwk_errmsg(ret, "bcast0 return=%s, exp=%d\n", fi_strerror(-ret), 0)) + goto out; + + /* try this again, should fail with -FI_EAGAIN */ + ret = cxip_zbcoll_broadcast(zb, result); + TRACE("bcast payload=%08lx, ret=%s\n", *result, fi_strerror(-ret)); + if (frmwk_errmsg((ret != -FI_EAGAIN), "bcast1 return=%d, exp=%d\n", + ret, -FI_EAGAIN)) + goto out; + return 0; +out: + TRACE("%s: failed\n", __func__); + return 1; +} + +/* Generate a random number with some constant bits, limited to 53 bits. 
+ * rand() sequence is deterministic. + */ +static inline uint64_t _reduce_val(void) +{ + uint64_t val = rand(); + val = (val << 32) | rand(); + return (val | 0x10010002) % (1L << 54); +} + +int _test_reduce(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + uint64_t *payload, struct cxip_zbcoll_obj *zb) +{ + int ret; + + /* reset counters */ + cxip_zbcoll_reset_counters(ep_obj); + + /* if this fails, do not continue */ + ret = cxip_zbcoll_reduce(zb, payload); + TRACE("reduce payload=%08lx, ret=%s\n", *payload, fi_strerror(-ret)); + if (frmwk_errmsg(ret, "reduce0 return=%s, exp=%d\n", + fi_strerror(-ret), 0)) + goto out; + + /* try this again, should fail with -FI_EAGAIN */ + ret = cxip_zbcoll_reduce(zb, payload); + TRACE("reduce payload=%08lx, ret=%s\n", *payload, fi_strerror(-ret)); + if (frmwk_errmsg((ret != -FI_EAGAIN), "reduce1 return=%d, exp=%d\n", + ret, -FI_EAGAIN)) + goto out; + + return 0; +out: + TRACE("%s: failed\n", __func__); + return 1; +} + +const char *testnames[] = { + "test 0: send one packet 0 -> 0", + "test 1: send one packet 0 -> 1", + "test 2: send one packet 1 -> 0", + "test 3: send one packet 0 -> N", + "test 4: send one packet N -> 0", + "test 5: send one packet N -> N", + "test 6: single getgroup", + "test 7: double getgroup full overlap", + "test 8: double getgroup partial overlap", + "test 9: getgroup regression [-NMRD]", + "test 10: getgroup exahustion [-ND]", + "test 11: barrier", + "test 12: broadcast (single)", + "test 13: broadcast (concurrent)", + "test 14: reduce (single)", + "test 15: reduce (concurrent)", + "test 16: getgroup perf", + "test 17: barrier perf", + "test 18: broadcast perf", + "test 19: reduce perf", + "test 20: send bad dest [-B required]", + "test 21: recv bad dest [-B required]", + NULL +}; +const char *testname; + +int usage(int ret) +{ + int i; + + frmwk_log0("Usage: test_zbcoll [-hvV] [-s seed]\n" + " [-N nruns] [-M sublen] [-R rotate]\n" + " [-D usec_delay] [-B bad_NIC]\n" + " [-t testno[,testno...]]\n" + "\n" + " -h displays this help\n" + " -v provides verbose output\n" + " -V provides per-node tracing\n" + " -s specifies a random seed for randomized tests\n" + " -t specifies tests e.g. 
(1,2,3) or (1-3) or (1-3,11-12)" + "\n"); + for (i = 0; testnames[i]; i++) + frmwk_log0("%s\n", testnames[i]); + + return ret; +} + +/* scan for integers in -t option */ +static inline char *scanint(char *ptr, int *val) +{ + char *p = ptr; + while (*ptr >= '0' && *ptr <= '9') + ptr++; + *val = atoi(p); + return ptr; +} + +#define TEST(n) (1 << n) +static inline bool _istest(uint64_t mask, int test) +{ + return (mask & (1 << test)) && (testname = testnames[test]); +} + +int main(int argc, char **argv) +{ + bool trace_enabled = false; + char hostname[256]; + fi_addr_t *fiaddrs = NULL; + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb1 = NULL; + struct cxip_zbcoll_obj *zb2 = NULL; + size_t size = 0; + unsigned int seed; + uint64_t testmask; + uint64_t result1, result2; + uint64_t payload1, payload2; + uint64_t expect1, expect2; + int opt, nruns, naddrs, rot, usec, badnic, ret; + + int errcnt = 0; + int i; + + seed = 123; + usec = 0; // as fast as possible + nruns = -1; // run maximum number groups + naddrs = -1; // random selection of fiaddrs + rot = -1; // random shuffle of fiaddrs + testmask = -1; // run all tests + badnic = -1; // do not use an address override + + while ((opt = getopt(argc, argv, "hvVt:s:N:M:R:D:B:")) != -1) { + char *str, *s, *p; + int i, j; + + switch (opt) { + case 't': + testmask = 0; + str = optarg; + i = j = 0; + while (*str) { + s = str; + while (*str && *str != ',') + str++; + if (*str) + *str++ = 0; + p = s; + while (*p && *p != '-') + p++; + i = atoi(s); + j = (*p) ? atoi(++p) : i; + while (i <= j) + testmask |= 1 << i++; + } + break; + case 's': + seed = atoi(optarg); + break; + case 'N': + nruns = atoi(optarg); + break; + case 'M': + naddrs = atoi(optarg); + break; + case 'R': + rot = atoi(optarg); + break; + case 'D': + usec = atoi(optarg); + break; + case 'T': + frmwk_rank = atoi(optarg); + break; + case 'B': + badnic = strtol(optarg, NULL, 16); + break; + case 'V': + trace_enabled = true; + break; + case 'v': + verbose = true; + break; + case 'h': + return usage(0); + default: + return usage(1); + } + } + + frmwk_init(false); + if (frmwk_check_env(4)) + return -1; + + ret = frmwk_init_libfabric(); + if (frmwk_errmsg(ret, "frmwk_init_libfabric()\n")) + return ret; + + cxip_trace_rank = frmwk_rank; + cxip_trace_numranks = frmwk_numranks; + cxip_trace_enable(trace_enabled); + TRACE("==== tracing enabled offset %d\n", frmwk_rank); + + srand(seed); + if (naddrs < 0) + naddrs = frmwk_numranks; + if (nruns < 0) + nruns = frmwk_numranks; + if (nruns > cxip_zbcoll_max_grps(false)) + nruns = cxip_zbcoll_max_grps(false); + + frmwk_log0("Using random seed = %d\n", seed); + if (verbose) { + frmwk_log0("verbose = true\n"); + frmwk_log0("nruns = %d\n", nruns); + frmwk_log0("naddrs = %d\n", naddrs); + frmwk_log0("rotate = %d\n", rot); + frmwk_log0("delay = %d usec\n", usec); + } + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + /* always start with FI_UNIVERSE */ + ret = frmwk_populate_av(&fiaddrs, &size); + if (frmwk_errmsg(ret, "frmwk_populate_av()\n")) + return 1; + frmwk_log0("libfabric populated\n"); + + gethostname(hostname, sizeof(hostname)); + TRACE("%s NIC=%04x PID=%d\n", hostname, ep_obj->src_addr.nic, + ep_obj->ptable->pid); + + if (_istest(testmask, 0)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, 0, 0, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + 
frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 1)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, 0, 1, frmwk_rank); + errcnt += !!ret; + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 2)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, 1, 0, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 3)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, 0, -1, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 4)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, -1, 0, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 5)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, -1, -1, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 6)) { + TRACE("======= %s\n", testname); + zb1 = NULL; + ret = 0; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_check_getgroup_errs(zb1, 0); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + cxip_zbcoll_free(zb1); + frmwk_barrier(); + } + + if (_istest(testmask, 7)) { + TRACE("======= %s\n", testname); + zb1 = NULL; + ret = 0; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb2); + ret += !!_check_getgroup_errs(zb1, 0); + ret += !!_check_getgroup_errs(zb2, 1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + cxip_zbcoll_free(zb2); + cxip_zbcoll_free(zb1); + frmwk_barrier(); + } + + if (_istest(testmask, 8)) { + TRACE("======= %s\n", testname); + zb1 = zb2 = NULL; + ret = 0; + TRACE("test one\n"); + if (frmwk_rank != frmwk_numranks-1) { + ret += !!_getgroup(ep_obj, size-1, &fiaddrs[0], &zb2); + ret += !!_check_getgroup_errs(zb2, 0); + } else { + TRACE("SKIP\n"); + } + TRACE("test two\n"); + if (frmwk_rank != 0) { + ret += !!_getgroup(ep_obj, size-1, &fiaddrs[1], &zb1); + ret += !!_check_getgroup_errs(zb1, 1); + } else { + TRACE("SKIP\n"); + } + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + cxip_zbcoll_free(zb2); + cxip_zbcoll_free(zb1); + frmwk_barrier(); + } + + if (_istest(testmask, 9)) { + TRACE("======= %s\n", testname); + ret = 0; + ret += !!_multigroup(ep_obj, size, fiaddrs, nruns, naddrs, + rot, usec); + frmwk_barrier(); + + if (!ret && frmwk_rank == 0) + ret += !!_multicheck(size, nruns); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? 
"FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 10)) { + TRACE("======= %s\n", testname); + ret = 0; + ret += !!_exhaustgroup(ep_obj, size, fiaddrs, nruns, usec); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + ret += !!_test_barr(ep_obj, size, fiaddrs, &zb1); + ret += !!_test_wait_free(zb1, NULL, 0); + frmwk_barrier(); + } + + if (_istest(testmask, 11)) { + TRACE("======= %s\n", testname); + zb1 = NULL; + ret = 0; + ret += !!_test_barr(ep_obj, size, fiaddrs, &zb1); + ret += !!_test_wait_free(zb1, NULL, 0); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 12)) { + TRACE("======= %s\n", testname); + zb1 = NULL; + ret = 0; + result1 = (frmwk_rank) ? frmwk_rank : 0x123; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_test_bcast(ep_obj, size, fiaddrs, &result1, zb1); + ret += !!_test_wait_free(zb1, &result1, 0x123); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 13)) { + TRACE("======= %s\n", testname); + zb1 = zb2 = NULL; + ret = 0; + result1 = (frmwk_rank) ? frmwk_rank : 0x123; + result2 = (frmwk_rank) ? frmwk_rank : 0x456; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb2); + ret += !!_test_bcast(ep_obj, size, fiaddrs, &result1, zb1); + ret += !!_test_bcast(ep_obj, size, fiaddrs, &result2, zb2); + TRACE("waiting for bcast 1\n"); + ret += !!_test_wait_free(zb1, &result1, 0x123); + TRACE("waiting for bcast 2\n"); + ret += !!_test_wait_free(zb2, &result2, 0x456); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 14)) { + + TRACE("======= %s\n", testname); + expect1 = -1L % (1L << 54); + for (i = 0; i < size; i++) { + uint64_t val = _reduce_val(); + if (i == frmwk_rank) + payload1 = val; + expect1 &= val; + } + zb1 = NULL; + ret = 0; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_test_reduce(ep_obj, size, fiaddrs, + &payload1, zb1); + ret += !!_test_wait_free(zb1, &payload1, expect1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 15)) { + TRACE("======= %s\n", testname); + expect1 = -1L % (1L << 54); + expect2 = -1L % (1L << 54); + for (i = 0; i < size; i++) { + uint64_t val = _reduce_val(); + if (i == frmwk_rank) + payload1 = val; + expect1 &= val; + } + for (i = 0; i < size; i++) { + uint64_t val = _reduce_val(); + if (i == frmwk_rank) + payload2 = val; + expect2 &= val; + } + zb1 = zb2 = NULL; + ret = 0; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb2); + ret += !!_test_reduce(ep_obj, size, fiaddrs, + &payload1, zb1); + ret += !!_test_reduce(ep_obj, size, fiaddrs, + &payload2, zb2); + ret += !!_test_wait_free(zb1, &payload1, expect1); + ret += !!_test_wait_free(zb2, &payload2, expect2); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? 
"FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 16)) { + struct timespec t0; + long count = 0; + double time; + + TRACE("======= %s\n", testname); + trace_enabled = cxip_trace_enable(false); + zb1 = NULL; + ret = cxip_zbcoll_alloc(ep_obj, size, fiaddrs, ZB_NOSIM, &zb1); + clock_gettime(CLOCK_MONOTONIC, &t0); + while (!ret && count < 100000) { + int ret2; + do { + ret += !!cxip_zbcoll_getgroup(zb1); + ret2 = _coll_wait(zb1, nMSEC(100)); + } while (!ret && ret2 == -FI_EAGAIN); + ret += !!ret2; + cxip_zbcoll_rlsgroup(zb1); + count++; + } + time = _measure_nsecs(&t0); + time /= 1.0*count; + time /= 1000.0; + cxip_trace_enable(trace_enabled); + cxip_zbcoll_free(zb1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + ret ? "FAIL" : "ok", testname, count, time); + frmwk_barrier(); + } + + if (_istest(testmask, 17)) { + struct timespec t0; + long count = 0; + double time; + + TRACE("======= %s\n", testname); + trace_enabled = cxip_trace_enable(false); + zb1 = NULL; + ret = _getgroup(ep_obj, size, fiaddrs, &zb1); + clock_gettime(CLOCK_MONOTONIC, &t0); + while (!ret && count < 100000) { + ret += !!cxip_zbcoll_barrier(zb1); + ret += !!_coll_wait(zb1, nMSEC(100)); + count++; + } + time = _measure_nsecs(&t0); + time /= 1.0*count; + time /= 1000.0; + cxip_trace_enable(trace_enabled); + cxip_zbcoll_free(zb1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + ret ? "FAIL" : "ok", testname, count, time); + frmwk_barrier(); + } + + if (_istest(testmask, 18)) { + struct timespec t0; + uint64_t result = 0x1234; + long count = 0; + double time; + + TRACE("======= %s\n", testname); + trace_enabled = cxip_trace_enable(false); + zb1 = NULL; + ret = _getgroup(ep_obj, size, fiaddrs, &zb1); + clock_gettime(CLOCK_MONOTONIC, &t0); + while (!ret && count < 100000) { + ret += !!cxip_zbcoll_broadcast(zb1, &result); + ret += !!_coll_wait(zb1, nMSEC(100)); + count++; + } + time = _measure_nsecs(&t0); + time /= 1.0*count; + time /= 1000.0; + cxip_trace_enable(trace_enabled); + cxip_zbcoll_free(zb1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + ret ? "FAIL" : "ok", testname, count, time); + frmwk_barrier(); + } + + if (_istest(testmask, 19)) { + struct timespec t0; + uint64_t result = 0x1234; + long count = 0; + double time; + + TRACE("======= %s\n", testname); + trace_enabled = cxip_trace_enable(false); + zb1 = NULL; + ret = _getgroup(ep_obj, size, fiaddrs, &zb1); + clock_gettime(CLOCK_MONOTONIC, &t0); + while (!ret && count < 100000) { + ret += !!cxip_zbcoll_reduce(zb1, &result); + ret += !!_coll_wait(zb1, nMSEC(100)); + count++; + } + time = _measure_nsecs(&t0); + time /= 1.0*count; + time /= 1000.0; + cxip_trace_enable(trace_enabled); + cxip_zbcoll_free(zb1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + ret ? "FAIL" : "ok", testname, count, time); + frmwk_barrier(); + } + + if (_istest(testmask, 20)) { + if (badnic >= 0) { + TRACE("======= %s\n", testname); + bad_cxip_addr.nic = badnic; + bad_cxip_addr.pid = 0; + bad_cxip_index = 1; + ret = _test_send_to_dest(ep_obj, size, fiaddrs, + 0, 1, frmwk_rank); + bad_cxip_index = -1; + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %d\n", frmwk_rank, ret); + frmwk_log0("%4s %s\n", ret ? 
"FAIL" : "ok", testname); + } else { + frmwk_log0("%4s %s\n", "SKIP", testname); + } + frmwk_barrier(); + } + + if (_istest(testmask, 21)) { + if (badnic >= 0) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, + 0, 1, frmwk_rank); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, + 1, 0, frmwk_rank); + //ret = _getgroup(ep_obj, size, fiaddrs, &zb1); + TRACE("listening forever....\n"); + cxip_trace_flush(); + _idle_wait(ep_obj, -1); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + } else { + frmwk_log0("%4s %s\n", "SKIP", testname); + } + frmwk_barrier(); + } + + TRACE("Finished test run, cleaning up\n"); + free(fiaddrs); + frmwk_free_libfabric(); + frmwk_log0(!!errcnt ? "ERRORS SEEN\n" : "SUCCESS\n"); + frmwk_term(); + return !!errcnt; +} diff --git a/prov/cxi/test/nic.c b/prov/cxi/test/nic.c new file mode 100644 index 00000000000..583d3950ea1 --- /dev/null +++ b/prov/cxi/test/nic.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(fid_nic, .timeout = 5); + +Test(fid_nic, validate_nic_attr) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + uint16_t valid_vni = 0x120; + struct fi_info *info; + struct cxip_nic_attr *nic_attr; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = valid_vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = (struct cxip_nic_attr *)info->nic->prov_attr; + cr_assert_eq(nic_attr->version, 1); + cr_assert_eq(nic_attr->addr, dev->info.nid); + cr_assert_eq(nic_attr->default_rgroup_id, svc_desc.svc_id); + cr_assert_eq(nic_attr->default_vni, valid_vni); + + fi_freeinfo(info); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} diff --git a/prov/cxi/test/repsum.c b/prov/cxi/test/repsum.c new file mode 100644 index 00000000000..c544a97e9ab --- /dev/null +++ b/prov/cxi/test/repsum.c @@ -0,0 +1,587 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + */ + +/* Notes: + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +bool verbose = false; + +typedef void (*GenFunc)(void); +typedef double (*SumFunc)(size_t, double*); + +struct sum_dist { + const char *name; // distribution name + GenFunc func; // distribution generator +}; + +struct sum_test { + const char* name; // test name + SumFunc func; // test function + double min, max; // cumulative results + bool contrib; // contribute to global min/max +}; + +struct sum_test_suite { + double gmin, gmax; // cumulative global bounds +}; + +/** + * @brief Data generation models. 
+ * + * These functions generate arrays of doubles using different models that create + * different distributions of numbers. + */ + +unsigned int seed = 3; +size_t numvals = 0; +double *values = NULL; + +/* Data generators for the dataset */ +void init_dataset(size_t size) +{ + free(values); + numvals = size; + values = calloc(size, sizeof(double)); +} + +void gen_const_data(void) +{ + /* constant data */ + int i; + + for (i = 0; i < numvals; i++) + values[i] = 1.0; +} + +void gen_random_data(void) +{ + /* randomized data */ + int i; + + if (seed) { + srand(seed); + seed = 0; + } + for (i = 0; i < numvals; i++) { + int rnd, e; + + rnd = random(); + e = -32*(rnd & 0x7); + rnd >>= 3; + values[i] = scalbn(((rnd * 2.0)/RAND_MAX) - 1.0, e); + } +} + +void gen_series_data(void) +{ + /* converging series */ + double s = 1.0; + int i; + + for (i = 0; i < numvals; i++) { + values[i] = s / (i+1); + s = -s; + } +} + +void gen_sine_data(void) +{ + /* sine wave, particularly hard on reproducibility */ + double s = 2.0*M_PI/numvals; + int i; + + for (i = 0; i < numvals; i++) { + values[i] = sin(s*i); + } +} + +void gen_range_data(void) +{ + int i, e, s, v; + + /* oscillating between -inf and +inf */ + v = 0; + s = 1; + for (i = 0; i < numvals; i++) { + if (!(i % 2048)) { + v += 1; + s = -s; + } + e = (i % 2048) - 1023; + values[i] = s*scalbn(1.0*v, e - 1023); + } +} + +/** + * @brief Data ordering models. + * + * These functions reorder generated data to test associativity. + * + */ + +void nosort_data(void) +{ +} + +int _sortfunc(const void *p1, const void *p2) +{ + double *v1 = (double *)p1; + double *v2 = (double *)p2; + + if (*v1 == *v2) + return 0; + return (*v1 < *v2) ? -1 : 1; +} + +void sort_data(void) +{ + qsort(values, numvals, sizeof(double), _sortfunc); +} + +void scramble_data(void) +{ + int i, j; + double t; + + for (i = numvals-1; i > 0; i--) { + j = random() %(i+1); + t = values[i]; + values[i] = values[j]; + values[j] = t; + } +} + +void reverse_data(void) +{ + int i, j, half; + double t; + + half = numvals/2; + for (i = 0; i < half; i++) { + j = numvals-1-i; + t = values[i]; + values[i] = values[j]; + values[j] = t; + } +} + +/** + * @brief Summation algoritihms. + * + * These function perform the double summation using different algorithms. + */ + +double simple_sum(size_t n, double *v) +{ + double s = 0.0; + int i; + + for (i = 0; i < n; i++) + s += v[i]; + + return s; +} + +#define RADIX 32 +double tree_sum(size_t n, double *v) +{ + double s = 0.0; + int i, k; + + if (n > RADIX) { + k = n/RADIX; + for (i = 0; i < RADIX - 1; i++, n -= k) + s += tree_sum(k, &v[k*i]); + s += tree_sum(n, &v[k*i]); + } else { + for (i = 0; i < n; i++) + s += v[i]; + } + + return s; +} + +double Kahans_sum(size_t n, double *v) +{ + double s = 0.0; + double c = 0.0; + int i; + + for (i = 0; i < n; i++) { + double y = v[i] - c; + double t = s + y; + + c = (t - s) - y; + s = t; + } + + return s; +} + +void print_repsum(struct cxip_repsum *x) +{ + printf("M=%3d T=[%016lx, %016lx, %016lx, %016lx] oflow=%d inexact=%d\n", + x->M, x->T[0], x->T[1], x->T[2], x->T[3], + x->overflow, x->inexact); +} + +/** + * @brief Static structures to make the above models accessible to the test + * code. 
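Note that the rep_sum entry is marked .contrib=false, so it is excluded from the global min/max bounds accumulated for the other summation methods.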
+ * + */ + +struct sum_dist test_dists[] = { + {.name="const", .func=&gen_const_data}, + {.name="random", .func=&gen_random_data}, + {.name="series", .func=&gen_series_data}, + {.name="sin", .func=&gen_sine_data}, + {.name="range", .func=&gen_range_data} +}; +#define NUM_DISTS (sizeof(test_dists)/sizeof(struct sum_dist)) + +struct sum_dist test_perms[] = { + {.name="nosort", .func=&nosort_data}, + {.name="sort", .func=&sort_data}, + {.name="scramble", .func=&scramble_data}, + {.name="reverse", .func=&reverse_data}, +}; +#define NUM_PERMS (sizeof(test_perms)/sizeof(struct sum_dist)) +#define PERM_NOSORT 0 +#define PERM_SORT 1 +#define PERM_SCRAMBLE 2 +#define PERM_REVERSE 3 + +struct sum_test test_cases[] = { + {.name="simple_sum", .func=&simple_sum, .contrib=true}, + {.name="tree_sum", .func=&tree_sum, .contrib=true}, + {.name="Kahans_sum", .func=&Kahans_sum, .contrib=true}, + {.name="rep_sum", .func=&cxip_rep_sum, .contrib=false}, +}; +#define NUM_CASES (sizeof(test_cases)/sizeof(struct sum_test)) +#define TEST_SIMPLE 0 +#define TEST_TREE 1 +#define TEST_KAHAN 2 +#define TEST_REPSUM 3 + +struct sum_test_suite test_suite; + +/** + * @brief Main test code. + * + * The basic model is to take a particular distribution of doubles, then perform + * multiple summations of that distribution with different orderings of the + * values, retaining the result as a (min, max) pair. + * + * For a perfectly-reproducible summation method, the final result for each + * distribution will show min == max. + */ + +void _show_results(void) +{ + struct sum_test *test; + double dif, mid, err; + int n; + + for (n = 0; n < NUM_CASES; n++) { + test = &test_cases[n]; + dif = (test->max - test->min); + mid = (test->max + test->min)/2.0; + err = fabs(mid ? dif/mid : dif); + + if (verbose) + printf("%12s %29.20g %29.20g %g\n", + test->name, test->min, test->max, err); + } +} + +void _reset_results(void) +{ + int n; + + test_suite.gmax = -HUGE_VAL; + test_suite.gmin = HUGE_VAL; + for (n = 0; n < NUM_CASES; n++) { + test_cases[n].max = -HUGE_VAL; + test_cases[n].min = HUGE_VAL; + } +} + +/* Perform a single summation and record min/max */ +void _runtest(struct sum_test *test) +{ + double sum; + + sum = test->func(numvals, values); + if (test->min > sum) + test->min = sum; + if (test->max < sum) + test->max = sum; + if (test->contrib) { + if (test_suite.gmin > sum) + test_suite.gmin = sum; + if (test_suite.gmax < sum) + test_suite.gmax = sum; + } +} + +/* Perform a summations */ +void _run_tests(uint64_t tstmask) +{ + int n; + + for (n = 0; n < NUM_CASES; n++) { + if (!(tstmask & (1 << n))) + continue; + if (verbose) + printf(" ... 
%s\n", test_cases[n].name); + _runtest(&test_cases[n]); + } +} + +/* reorder the data, and perform summations using different methods */ +void run_permutations(uint64_t tstmask) +{ + int sequence[] = { + PERM_NOSORT, + PERM_REVERSE, + PERM_SORT, + PERM_REVERSE, + PERM_SCRAMBLE, + PERM_REVERSE, + }; + int seqcnt = sizeof(sequence)/sizeof(int); + int n, p; + + _reset_results(); + for (n = 0; n < seqcnt; n++) { + p = sequence[n]; + if (verbose) + printf(" ----- %s\n", test_perms[p].name); + test_perms[p].func(); + _run_tests(tstmask); + } + _show_results(); +} + +/* generate a distribution of values, and run permutations */ +void run_dists(uint64_t dstmask, uint64_t tstmask) +{ + int n; + + for (n = 0; n < NUM_DISTS; n++) { + if (!(dstmask & (1 << n))) + continue; + if (verbose) + printf("======= %s\n", test_dists[n].name); + test_dists[n].func(); + run_permutations(tstmask); + } +} + +static inline bool _equal(double a, double b) +{ + return (isnan(a) && isnan(b)) || a == b; +} + +TestSuite(repsum, .init = cxit_setup_ep, .fini =cxit_teardown_ep, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* + * Convert double->repsum and repsum->double, and compare for equality. + */ +Test(repsum, convert) +{ + struct cxip_repsum x; + double s[] = {1.0, -1.0}; + double d1, d2; + int i, j, k; + + /* note that this exponent spans subnormals and +inf/-inf */ + for (i = -1100; i < 1100; i++) { + for (j = 0; j < 53; j++) { + for (k = 0; k < 2; k++) { + d1 = scalbn(s[k]*((1 << j) - 1), i); + cxip_dbl_to_rep(&x, d1); + cxip_rep_to_dbl(&d2, &x); + cr_assert(_equal(d1, d2), + "%d, %d: %.13e != %.13e\n", + i, j, d1, d2); + } + } + } + /* explicit -inf */ + d1 = -INFINITY; + cxip_dbl_to_rep(&x, d1); + cxip_rep_to_dbl(&d2, &x); + cr_assert(d1 == d2, "%d, %d, %.13e != %.13e\n", i, j, d1, d2); + /* explicit +inf */ + d1 = +INFINITY; + cxip_dbl_to_rep(&x, d1); + cxip_rep_to_dbl(&d2, &x); + cr_assert(d1 == d2, "%d, %d: %.13e != %.13e\n", i, j, d1, d2); + /* explicit NaN */ + d1 = NAN; + cxip_dbl_to_rep(&x, d1); + cxip_rep_to_dbl(&d2, &x); + cr_assert(isnan(d2), "%d, %d: %.13e != %.13e %016lx != %016lx\n", + i, j, d1, d2, _dbl2bits(d1), _dbl2bits(d2)); +} + +/* + * Add two values using double and using repsum, and compare for equality. + */ +Test(repsum, add) +{ + double s1[] = {1.0, 1.0, -1.0, -1.0}; + double s2[] = {1.0, -1.0, 1.0, -1.0}; + double d1, d2, d3, d4; + int i, j, k; + + /* note that this exponent spans subnormals and +inf/-inf */ + for (i = -1100; i < 1100; i++) { + for (j = 0; j < 53; j++) { + for (k = 0; k < 4; k++) { + d1 = scalbn(s1[k]*((1 << j) - 1), i); + d2 = scalbn(s2[k]*((1 << j) - 1), i+1); + d3 = d1 + d2; + d4 = cxip_rep_add_dbl(d1, d2); + cr_assert(_equal(d3, d4), + "%d, %d, %d: %.13e != %.13e" + " %016lx %016lx %016lx %016lx\n", + i, j, k, d3, d4, + _dbl2bits(d1), _dbl2bits(d2), + _dbl2bits(d3), _dbl2bits(d4)); + } + } + } +} + +/* + * Add combinations of NAN and INFINITY, compare for correct result. + */ +Test(repsum, inf) +{ + double a[] = {1.0, +INFINITY, -INFINITY, NAN}; + double d1, d2, d3, d4, exp; + int i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + d1 = a[i]; + d2 = a[j]; + d3 = d1 + d2; + d4 = cxip_rep_add_dbl(d1, d2); + if (isnan(d1) || isnan(d2)) + exp = NAN; + else if (isinf(d1) && isinf(d2)) + exp = (d1 == d2) ? 
d1 : NAN; + else if (isinf(d1)) + exp = d1; + else if (isinf(d2)) + exp = d2; + else + exp = d3; + cr_assert(_equal(d3, exp), + "dbl %d, %d: (%e + %e) = %e, expected %e\n", + i, j, d1, d2, d3, exp); + cr_assert(_equal(d4, exp), + "rep %d, %d: (%e + %e) = %e, expected %e\n", + i, j, d1, d2, d4, exp); + } + } +} + +/* + * Test for overflow by performing too many sums. + * 0.5 places MSBit in bit 39 of a bin. + * 1LL << 24 additions of 0.5 will fill overflow area. + * One more addition should trigger overflow. + */ +Test(repsum, overflow) +{ + struct cxip_repsum x, y; + long int i, n; + + cxip_dbl_to_rep(&x, 0.0); + cxip_dbl_to_rep(&y, 0.5); + n = 1LL << 24; + for (i = 0L; i < n-1; i++) { + cxip_rep_add(&x, &y); + if (x.overflow) + break; + } + cr_assert(!x.overflow, "overflow at %lx not expected\n", i++); + cxip_rep_add(&x, &y); + cr_assert(x.overflow, "overflow at %ld expected\n", i); + cxip_dbl_to_rep(&y, 0.0); + cxip_rep_add(&y, &x); + cr_assert(y.overflow, "overflow not propagated\n"); +} + +/* + * Test for expected loss of precision. + * Adding 1.0*2^i for i=(0,39) will fill a bin. + * Doing this four times will fill the T[] array. + * Doing this one more time will drop the LSBin. + */ +Test(repsum, inexact) +{ + struct cxip_repsum x, y; + int i, n; + + cxip_dbl_to_rep(&x, 0.0); + n = 4*40; + for (i = 0; i < n; i++) { + cxip_dbl_to_rep(&y, scalbn(1.0, i)); + cxip_rep_add(&x, &y); + if (x.inexact) + break; + } + cr_assert(!x.inexact, "inexact at %x not expected\n", i++); + cxip_dbl_to_rep(&y, scalbn(1.0, i)); + cxip_rep_add(&x, &y); + cr_assert(x.inexact, "inexact at %x expected\n", i); + cxip_dbl_to_rep(&y, 0.0); + cxip_rep_add(&y, &x); + cr_assert(y.inexact, "inexact not propagated\n"); +} + +/* + * Test comparison of different methods over datasets + * In all cases, repsum should be reproducible, err = 0. + */ +Test(repsum, comparison) +{ + struct sum_test *test; + double dif, mid, err; + + init_dataset(100000); + run_dists(-1L, -1L); + + test = &test_cases[TEST_REPSUM]; + dif = (test->max - test->min); + mid = (test->max + test->min)/2.0; + err = fabs(mid ? 
dif/mid : dif); + if (err) + printf("%12s %29.20g %29.20g %g\n", + test->name, test->min, test->max, err); + cr_assert(!err, "repsum is not reproducible\n"); +} diff --git a/prov/cxi/test/rma.c b/prov/cxi/test/rma.c new file mode 100644 index 00000000000..6a53fee3b07 --- /dev/null +++ b/prov/cxi/test/rma.c @@ -0,0 +1,2236 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +#define RMA_WIN_KEY 0x1f + +TestSuite(rma, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rma, zero_byte_writev) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + mr_create(0, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key_val, + &mem_window); + + ret = fi_writev(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, 0, key_val, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_writev failed: %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + mr_destroy(&mem_window); +} + +Test(rma, zero_byte_writemsg) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct fi_rma_iov rma[1] = {}; + + mr_create(0, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key_val, + &mem_window); + + rma[0].key = key_val; + + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed: %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + mr_destroy(&mem_window); +} + +Test(rma, zero_byte_readv) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + mr_create(0, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key_val, + &mem_window); + + ret = fi_readv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, 0, key_val, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_readv failed: %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + mr_destroy(&mem_window); +} + +Test(rma, zero_byte_readmsg) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct fi_rma_iov rma[1] = {}; + + mr_create(0, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key_val, + &mem_window); + + rma[0].key = key_val; + + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + ret = fi_readmsg(cxit_ep, &msg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_readmsg failed: %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + mr_destroy(&mem_window); +} + +/* Test fi_write simple case. Test IDC sizes to multi-packe sizes. 
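The transfer length is doubled from a single byte up to the full 16 KiB window, so both small IDC payloads and larger multi-packet transfers are exercised.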
*/ +Test(rma, simple_write) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test compatibility of client/provider keys */ +Test(rma, key_compatibility) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fid_domain *domain2; + struct fid_ep *ep2; + struct fid_cq *tx_cq2; + struct fid_cq *rx_cq2; + struct fid_av *av2; + struct cxip_addr ep2_addr; + size_t addrlen = sizeof(ep2_addr); + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + struct cxip_domain *dom; + struct cxip_mr_key cxip_key; + bool first_domain_prov_key; + + /* Create second RMA endpoint in the opposite client/provider + * mr_mode as the test default EP. When CXIP_TEST_PROV_KEY=true is + * set, the second EP is started in client key mode; if it is not + * set, the second EP is started in provider key mode. 
+ */ + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) { + first_domain_prov_key = true; + cxit_fi->domain_attr->mr_mode &= ~FI_MR_PROV_KEY; + cxit_fi->domain_attr->mr_key_size = sizeof(uint32_t); + } else { + first_domain_prov_key = false; + cxit_fi->domain_attr->mr_mode |= FI_MR_PROV_KEY; + cxit_fi->domain_attr->mr_key_size = sizeof(uint64_t); + } + ret = fi_domain(cxit_fabric, cxit_fi, &domain2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_domain 2nd domain"); + dom = container_of(domain2, struct cxip_domain, + util_domain.domain_fid); + if (first_domain_prov_key) + cr_assert(!dom->is_prov_key, "2nd domain not client key"); + else + cr_assert(dom->is_prov_key, "2nd domain not provider key"); + + ret = fi_endpoint(domain2, cxit_fi, &ep2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint 2nd endpoint"); + + ret = fi_av_open(domain2, &cxit_av_attr, &av2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_av_open 2nd AV"); + ret = fi_ep_bind(ep2, &av2->fid, 0); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind 2nd AV"); + + ret = fi_cq_open(domain2, &cxit_tx_cq_attr, &tx_cq2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open 2nd TX CQ"); + ret = fi_ep_bind(ep2, &tx_cq2->fid, FI_TRANSMIT); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind 2nd TX CQ"); + + ret = fi_cq_open(domain2, &cxit_rx_cq_attr, &rx_cq2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open 2nd RX CQ"); + ret = fi_ep_bind(ep2, &rx_cq2->fid, FI_RECV); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind 2nd RX CQ"); + + ret = fi_enable(ep2); + cr_assert(ret == FI_SUCCESS, "fi_enable 2nd EP"); + + ret = fi_getname(&ep2->fid, &ep2_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "fi_getname 2nd EP"); + + /* Set up the AV, adding the fake address, then the first EP, then the second EP */ + ret = fi_av_insert(av2, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + ret = fi_av_insert(av2, (void *)&cxit_ep_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1, "fi_av_insert 1st EP into AV2"); + ret = fi_av_insert(av2, (void *)&ep2_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1, "fi_av_insert 2nd EP into AV2"); + + /* Add second EP to default EP's AV */ + ret = fi_av_insert(cxit_av, (void *)&ep2_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1, "fi_av_insert 2nd EP into cxit_av"); + + /* First EP creates an MR with a key of the type specified in + * the associated domain. The second EP will use this key + * to initiate a transfer. 
+ */ + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cxip_key.raw = key_val; + if (first_domain_prov_key) + cr_assert(cxip_key.is_prov, "Key is not provider key"); + else + cr_assert(!cxip_key.is_prov, "Key is not client key"); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(ep2, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(tx_cq2, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + ret = fi_close(&ep2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close EP2"); + ret = fi_close(&tx_cq2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close TX CQ2"); + ret = fi_close(&rx_cq2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close RX CQ2"); + ret = fi_close(&av2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close AV2"); + ret = fi_close(&domain2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close domain2"); + + mr_destroy(&mem_window); + free(send_buf); +} + +void cxit_setup_rma_opt(void) +{ + cxit_setup_getinfo(); + + /* Explicitly request unordered RMA */ + cxit_fi_hints->caps = FI_RMA; + cxit_fi_hints->tx_attr->msg_order = 0; + + cxit_setup_rma(); +} + +TestSuite(rma_opt, .init = cxit_setup_rma_opt, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test an optimal fi_write. */ +Test(rma_opt, opt_write) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + uint64_t res_start; + uint64_t res_end; + uint64_t hits_start; + uint64_t hits_end; + struct cxip_ep *cxi_ep; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &res_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + ret = cxit_dom_read_cntr(C_CNTR_LPE_PLEC_HITS, + &hits_start, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create_ext(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, NULL, + &mem_window); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + mr_destroy(&mem_window); + free(send_buf); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &res_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_expect(res_end > res_start); + + ret = cxit_dom_read_cntr(C_CNTR_LPE_PLEC_HITS, + &hits_end, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + cxi_ep = 
container_of(cxit_ep, struct cxip_ep, ep); + if (!is_netsim(cxi_ep->ep_obj)) { + cr_assert(hits_end > hits_start); + } else { + if (hits_end == hits_start) + printf("PLEC Hits not registered (unsupported on netsim)\n"); + } +} + +/* Test simple writes to a standard MR. */ +Test(rma, simple_write_std_mr) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = 0xdef; + struct fi_cq_tagged_entry cqe; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test fi_writev simple case */ +Test(rma, simple_writev) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct iovec iov[1]; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + /* Send 8 bytes from send buffer data to RMA window 0 */ + ret = fi_writev(cxit_ep, iov, NULL, 1, cxit_ep_fi_addr, 0, key_val, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_writev failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +void do_writemsg(uint64_t flags) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | 
FI_WRITE, NULL); + + if (flags & FI_CXI_HRP) + usleep(1000); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test fi_writemsg with flags */ +Test(rma, writemsg) +{ + do_writemsg(0); + do_writemsg(FI_FENCE); +} + +void cxit_rma_setup_nofence(void) +{ + cxit_setup_getinfo(); + cxit_fi_hints->caps = CXIP_EP_PRI_CAPS; + cxit_setup_rma(); +} + +/* Test RMA without FI_FENCE */ +Test(rma_nofence, nofence, + .init = cxit_rma_setup_nofence, + .fini = cxit_teardown_rma) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + ret = fi_writemsg(cxit_ep, &msg, FI_FENCE); + cr_assert(ret == -FI_EINVAL); + + ret = fi_readmsg(cxit_ep, &msg, FI_FENCE); + cr_assert(ret == -FI_EINVAL); + + mr_destroy(&mem_window); + free(send_buf); +} + +void cxit_rma_setup_no_rma_events(void) +{ + cxit_setup_getinfo(); + + cxit_fi_hints->caps = FI_RMA | FI_ATOMIC; + cxit_setup_rma(); +} + +/* Test HRP Put */ +Test(rma_opt, hrp, + .init = cxit_rma_setup_no_rma_events, + .fini = cxit_teardown_rma) +{ + int ret; + uint64_t hrp_acks_start; + uint64_t hrp_acks_end; + struct cxip_ep *cxi_ep; + + /* HRP not supported in netsim */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + if (is_netsim(cxi_ep->ep_obj)) + return; + + ret = cxit_dom_read_cntr(C_CNTR_HNI_HRP_ACK, + &hrp_acks_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + do_writemsg(0); + do_writemsg(FI_CXI_HRP); + do_writemsg(0); + + for (int i = 0; i < 10; i++) + do_writemsg(FI_CXI_HRP); + + ret = cxit_dom_read_cntr(C_CNTR_HNI_HRP_ACK, + &hrp_acks_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + cr_assert_eq(hrp_acks_end - hrp_acks_start, 11, + "unexpected hrp_acks count: %lu\n", + hrp_acks_end - hrp_acks_start); +} + +/* Perform a write that uses a flushing ZBR at the target. 
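The write is posted with FI_DELIVERY_COMPLETE, and the target-side flush is verified indirectly by confirming that the C_CNTR_IXE_DMAWR_FLUSH_REQS counter advances across the transfer.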
*/ +Test(rma, flush) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = FI_DELIVERY_COMPLETE; + uint64_t flushes_start; + uint64_t flushes_end; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_assert(flushes_end > flushes_start); +} + +/* Test fi_writemsg with FI_INJECT flag */ +Test(rma, simple_writemsg_inject) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = FI_INJECT; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + + /* Try using standard MR */ + + key_val = 1000; + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + rma[0].key = key_val; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", 
ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + + free(send_buf); +} + +/* Test fi_inject_write simple case */ +Test(rma, simple_inject_write) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + /* Test invalid inject length */ + ret = fi_inject_write(cxit_ep, send_buf, + cxit_fi->tx_attr->inject_size + 100, + cxit_ep_fi_addr, 0, key_val); + cr_assert(ret == -FI_EMSGSIZE); + + /* Send 8 bytes from send buffer data to RMA window 0 */ + ret = fi_inject_write(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0, + key_val); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_read(cxit_write_cntr) != 1) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test fi_read simple case */ +Test(rma, simple_read) +{ + int ret; + uint8_t *local; + int remote_len = 0x1000; + int local_len = 8; + uint64_t key_val = 0xa; + struct fi_cq_tagged_entry cqe; + struct mem_region remote; + + local = calloc(1, local_len); + cr_assert_not_null(local, "local alloc failed"); + + mr_create(remote_len, FI_REMOTE_READ, 0xc0, &key_val, &remote); + + cr_assert(!fi_cntr_read(cxit_read_cntr)); + + /* Get 8 bytes from the source buffer to the receive buffer */ + ret = fi_read(cxit_ep, local, local_len, NULL, cxit_ep_fi_addr, 0, + key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret); + + while (fi_cntr_read(cxit_read_cntr) != 1) + ; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < local_len; i++) + cr_expect_eq(local[i], remote.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + local[i], remote.mem[i]); + + mr_destroy(&remote); + free(local); +} + +/* Test fi_readv simple case */ +Test(rma, simple_readv) +{ + int ret; + uint8_t *local; + int remote_len = 0x1000; + int local_len = 8; + uint64_t key_val = 0x2a; + struct fi_cq_tagged_entry cqe; + struct mem_region remote; + struct iovec iov[1]; + + local = calloc(1, local_len); + cr_assert_not_null(local, "local alloc failed"); + + mr_create(remote_len, FI_REMOTE_READ, 0x3c, &key_val, &remote); + + iov[0].iov_base = local; + iov[0].iov_len = local_len; + + /* Get 8 bytes from the source buffer to the receive buffer */ + ret = fi_readv(cxit_ep, iov, NULL, 1, cxit_ep_fi_addr, 0, key_val, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_readv() failed (%d)", ret); + + /* Wait for async 
event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < local_len; i++) + cr_expect_eq(local[i], remote.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + local[i], remote.mem[i]); + + mr_destroy(&remote); + free(local); +} + +/* Test fi_readmsg simple case */ +Test(rma, simple_readmsg) +{ + int ret; + uint8_t *local; + int remote_len = 0x1000; + int local_len = 8; + uint64_t key_val = 0x2a; + struct fi_cq_tagged_entry cqe; + struct mem_region remote; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = 0; + + local = calloc(1, local_len); + cr_assert_not_null(local, "local alloc failed"); + + mr_create(remote_len, FI_REMOTE_READ, 0xd9, &key_val, &remote); + + iov[0].iov_base = local; + iov[0].iov_len = local_len; + + rma[0].addr = 0; + rma[0].len = local_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Get 8 bytes from the source buffer to the receive buffer */ + ret = fi_readmsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_readv() failed (%d)", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < local_len; i++) + cr_expect_eq(local[i], remote.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + local[i], remote.mem[i]); + + mr_destroy(&remote); + free(local); +} + +/* Test fi_readmsg failure cases */ +Test(rma, readmsg_failures) +{ + int ret; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + struct fi_msg_rma msg = { + .msg_iov = iov, + .rma_iov = rma, + .iov_count = 1, + .rma_iov_count = 1, + }; + uint64_t flags = 0; + + /* Invalid msg value */ + ret = fi_readmsg(cxit_ep, NULL, flags); + cr_assert_eq(ret, -FI_EINVAL, "NULL msg return %d", ret); + + msg.iov_count = cxit_fi->tx_attr->rma_iov_limit + 1; + ret = fi_readmsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, -FI_EINVAL, "Invalid iov_count return %d", ret); + + msg.iov_count = cxit_fi->tx_attr->rma_iov_limit; + flags = FI_DIRECTED_RECV; /* Invalid flag value */ + ret = fi_readmsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, -FI_EBADFLAGS, "Invalid flag unexpected return %d", + ret); +} + +/* Test fi_writemsg failure cases */ +Test(rma, writemsg_failures) +{ + int ret; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + struct fi_msg_rma msg = { + .msg_iov = iov, + .rma_iov = rma, + .iov_count = 1, + .rma_iov_count = 1, + }; + uint64_t flags = 0; + size_t send_len = 10; + char send_buf[send_len]; + + /* Invalid msg value */ + ret = fi_writemsg(cxit_ep, NULL, flags); + cr_assert_eq(ret, -FI_EINVAL, "NULL msg return %d", ret); + + msg.iov_count = cxit_fi->tx_attr->rma_iov_limit + 1; + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, -FI_EINVAL, "Invalid iov_count return %d", ret); + + msg.iov_count = cxit_fi->tx_attr->rma_iov_limit; + flags = FI_DIRECTED_RECV; /* Invalid flag value */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, -FI_EBADFLAGS, "Invalid flag return %d", ret); + + /* Invalid length */ + iov[0].iov_base = send_buf; + iov[0].iov_len = cxit_fi->ep_attr->max_msg_size + 1; + + rma[0].addr = 0; + 
rma[0].len = send_len; + rma[0].key = 0xa; + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert_eq(ret, -FI_EMSGSIZE, "Invalid flag return %d", ret); + + /* Invalid inject length */ + iov[0].iov_len = C_MAX_IDC_PAYLOAD_RES+1; + + ret = fi_writemsg(cxit_ep, &msg, FI_INJECT); + cr_assert_eq(ret, -FI_EMSGSIZE, "Invalid flag return %d", ret); +} + +void rmamsg_bounds(bool write, bool opt_mr) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = opt_mr ? RMA_WIN_KEY : 200; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry err; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + size_t good_len = 4096; + char *src_buf; + + /* Create over-sized send buffer for bounds checking */ + src_buf = calloc(1, good_len * 2); + cr_assert_not_null(src_buf, "send_buf alloc failed"); + mr_create(good_len, + write ? FI_REMOTE_WRITE : FI_REMOTE_READ, 0xa0, + &key_val, &mem_window); + memset(mem_window.mem, 0x33, good_len); + + /* Good length to verify operation */ + iov[0].iov_base = src_buf; + iov[0].iov_len = good_len; + + rma[0].addr = 0; + rma[0].len = good_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + if (write) + ret = fi_writemsg(cxit_ep, &msg, FI_COMPLETION); + else + ret = fi_readmsg(cxit_ep, &msg, FI_COMPLETION); + + cr_assert_eq(ret, FI_SUCCESS, "Bad RMA API status %d", ret); + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "Unexpected RMA failure"); + + /* Use a bad length to cause a bounds violation and + * verify failure is detected. + */ + iov[0].iov_len = good_len * 2; + rma[0].len = good_len * 2; + + if (write) + ret = fi_writemsg(cxit_ep, &msg, FI_COMPLETION); + else + ret = fi_readmsg(cxit_ep, &msg, FI_COMPLETION); + + cr_assert_eq(ret, FI_SUCCESS, "Bad RMA return status %d", ret); + + /* There should be a source error entry. 
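The oversized transfer must not complete successfully: fi_cq_read returns -FI_EAVAIL, and the failure is retrieved with fi_cq_readerr, which is expected to report FI_EIO.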
*/ + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success"); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + cr_assert_eq(err.err, FI_EIO, "Error return %d", err.err); + + mr_destroy(&mem_window); + free(src_buf); +} + +Test(rma, writemsg_bounds_opt) +{ + rmamsg_bounds(true, true); +} + +Test(rma, writemsg_bounds_std) +{ + rmamsg_bounds(true, false); +} + +Test(rma, readmsg_bounds_opt) +{ + rmamsg_bounds(false, true); +} + +Test(rma, readmsg_bounds_std) +{ + rmamsg_bounds(false, false); +} + +/* Test fi_readv failure cases */ +Test(rma, readv_failures) +{ + int ret; + struct iovec iov = {}; + + /* Invalid count value */ + ret = fi_readv(cxit_ep, &iov, NULL, + cxit_fi->tx_attr->rma_iov_limit + 1, + cxit_ep_fi_addr, 0, 0, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Invalid count return %d", ret); +} + +/* Test fi_writev failure cases */ +Test(rma, writev_failures) +{ + int ret; + struct iovec iov = {}; + + /* Invalid count value */ + ret = fi_writev(cxit_ep, &iov, NULL, + cxit_fi->tx_attr->rma_iov_limit + 1, + cxit_ep_fi_addr, 0, 0, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Invalid count return %d", ret); +} + +/* Perform an RMA write spanning a page */ +Test(rma, write_spanning_page) +{ + int ret; + uint8_t *send_buf; + uint8_t *send_addr; + int win_len = s_page_size * 2; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + send_addr = (uint8_t *)FLOOR(send_buf + s_page_size, s_page_size) - 4; + memset(send_addr, 0xcc, send_len); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + memset(mem_window.mem, 0x33, win_len); + + /* Send 8 bytes from send buffer data to RMA window 0 */ + ret = fi_write(cxit_ep, send_addr, send_len, NULL, cxit_ep_fi_addr, 0, + key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_addr[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_addr[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +Test(rma, rma_cleanup) +{ + int ret; + long i; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + int writes = 50; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + for (i = 0; i < win_len; i++) + send_buf[i] = 0xb1 * i; + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + /* Send 8 bytes from send buffer data to RMA window 0 */ + for (i = 0; i < writes; i++) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, (void *)i); + cr_assert(ret == FI_SUCCESS); + } + + mr_destroy(&mem_window); + + /* Exit without gathering events. */ +} + +void cxit_setup_rma_selective_completion(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = FI_COMPLETION; + cxit_setup_rma(); +} + +/* Test selective completion behavior with RMA. 
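+ * The TX CQ is bound with FI_SELECTIVE_COMPLETION while the default
+ * op_flags include FI_COMPLETION, so operations report completions unless
+ * they are explicitly suppressed per call.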
*/ +Test(rma_sel, selective_completion, + .init = cxit_setup_rma_selective_completion, + .fini = cxit_teardown_rma) +{ + int ret; + uint8_t *loc_buf; + int win_len = 0x1000; + int loc_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov; + struct fi_rma_iov rma; + int count = 0; + + loc_buf = calloc(1, win_len); + cr_assert_not_null(loc_buf, "loc_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE | FI_REMOTE_READ, 0xa0, &key_val, + &mem_window); + + iov.iov_base = loc_buf; + iov.iov_len = loc_len; + + rma.addr = 0; + rma.key = key_val; + + msg.msg_iov = &iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Puts */ + + /* Completion requested by default. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + ret = fi_write(cxit_ep, loc_buf, loc_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + } + + /* Completion explicitly requested. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + iov.iov_len = loc_len; + ret = fi_writemsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS); + count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + } + + /* Suppress completion. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + iov.iov_len = loc_len; + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + while (mem_window.mem[i] != loc_buf[i]) + sched_yield(); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Inject never generates an event */ + loc_len = 8; + ret = fi_inject_write(cxit_ep, loc_buf, loc_len, cxit_ep_fi_addr, 0, + key_val); + cr_assert(ret == FI_SUCCESS); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + while (mem_window.mem[i] != loc_buf[i]) + sched_yield(); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Gets */ + memset(loc_buf, 0, win_len); + count = 0; + + /* Completion requested by default. 
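+	 * (op_flags include FI_COMPLETION, so fi_read() generates a CQ event
+	 * without an explicit per-call flag.)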
*/ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + memset(loc_buf, 0, loc_len); + ret = fi_read(cxit_ep, loc_buf, loc_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + } + + /* Completion explicitly requested. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + memset(loc_buf, 0, loc_len); + iov.iov_len = loc_len; + ret = fi_readmsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS); + count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + } + + /* Suppress completion. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + memset(loc_buf, 0, loc_len); + iov.iov_len = loc_len; + ret = fi_readmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + mr_destroy(&mem_window); + free(loc_buf); +} + +void cxit_setup_rma_selective_completion_suppress(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = 0; + cxit_setup_rma(); +} + +/* Test selective completion behavior with RMA. 
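+ * Here op_flags are cleared, so completions are suppressed by default and
+ * only generated when FI_COMPLETION is passed explicitly; write counters
+ * are used to confirm delivery instead.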
*/ +Test(rma_sel, selective_completion_suppress, + .init = cxit_setup_rma_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov; + struct fi_rma_iov rma; + int write_count = 0; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + iov.iov_base = send_buf; + iov.iov_len = send_len; + + rma.addr = 0; + rma.key = key_val; + + msg.msg_iov = &iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Normal writes do not generate completions */ + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + memset(mem_window.mem, 0, send_len); + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + write_count++; + + while (fi_cntr_read(cxit_write_cntr) != write_count) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + while (mem_window.mem[i] != send_buf[i]) + sched_yield(); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Request completions from fi_writemsg */ + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + memset(mem_window.mem, 0, send_len); + iov.iov_len = send_len; + ret = fi_writemsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS); + write_count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + /* Suppress completions using fi_writemsg */ + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + memset(mem_window.mem, 0, send_len); + iov.iov_len = send_len; + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + write_count++; + + while (fi_cntr_read(cxit_write_cntr) != write_count) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + while (mem_window.mem[i] != send_buf[i]) + sched_yield(); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Inject never generates an event */ + send_len = 8; + memset(mem_window.mem, 0, send_len); + ret = fi_inject_write(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0, + key_val); + cr_assert(ret == FI_SUCCESS); + write_count++; + + while (fi_cntr_read(cxit_write_cntr) != write_count) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + while (mem_window.mem[i] != send_buf[i]) + sched_yield(); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test remote counter events with RMA */ +Test(rma, rem_cntr) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + int count = 0; + + send_buf = calloc(1, win_len); + 
cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for remote counter event, then check data */ + count++; + + while (fi_cntr_read(cxit_rem_cntr) != count) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + /* Gather source completion after data */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + } + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test RMA FI_MORE */ +Test(rma, more) +{ + int ret; + uint8_t *send_buf; + int win_len = 16; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + int i; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + for (i = 0; i < win_len; i++) + send_buf[i] = 0xa + i; + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + ret = fi_writemsg(cxit_ep, &msg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + iov[0].iov_base = send_buf + send_len; + rma[0].addr += send_len; + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for two events. */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +Test(rma, std_mr_inject) +{ + int ret; + uint8_t *send_buf; + int iters = 10; + int send_len = 8; + int win_len = send_len * iters; + struct mem_region mem_window; + uint64_t key_val = CXIP_PTL_IDX_MR_OPT_CNT; + struct fi_cq_tagged_entry cqe; + int i; + + send_buf = calloc(1, send_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + for (i = 0; i < iters; i++) { + /* Send 8 bytes from send buffer data to RMA window 0 */ + ret = fi_inject_write(cxit_ep, send_buf, send_len, + cxit_ep_fi_addr, i * send_len, key_val); + cr_assert(ret == FI_SUCCESS); + } + + /* Corrupt the user buffer to make sure the NIC is not using it for an + * inject. 
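+	 * Inject semantics allow the source buffer to be reused as soon as
+	 * the call returns, so the target window should still receive the
+	 * original zeroed data.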
+ */ + memset(send_buf, 0xff, send_len); + + while (fi_cntr_read(cxit_write_cntr) != iters) + ; + + /* Validate sent data */ + for (int i = 0; i < win_len; i++) + cr_assert_eq(mem_window.mem[i], 0, + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&mem_window); + free(send_buf); +} + +static void rma_invalid_target_mr_key(uint64_t rkey) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry err; + + /* Zero byte write to invalid MR key. */ + ret = fi_inject_write(cxit_ep, NULL, 0, cxit_ep_fi_addr, 0, rkey); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cxit_write_cntr) != 1) + ; + + /* No target event should be generated. */ + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* There should be an source error entry. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAVAIL); + + /* Expect a source error. */ + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + + /* Expect no other events. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); +} + +Test(rma, invalid_target_std_mr_key) +{ + rma_invalid_target_mr_key(0x1234); +} + +Test(rma, invalid_target_opt_mr_key) +{ + rma_invalid_target_mr_key(0x10); +} + +Test(rma, invalid_source_mr_key) +{ + int ret; + + ret = fi_inject_write(cxit_ep, NULL, 0, cxit_ep_fi_addr, 0, + 0x100000001); + cr_assert(ret == -FI_EKEYREJECTED); +} + +static void rma_invalid_read_target_mr_key(uint64_t rkey) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry err; + + /* Zero byte read to invalid MR key. */ + ret = fi_read(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, 0, rkey, NULL); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cxit_read_cntr) != 1) + ; + + /* No target event should be generated. */ + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* There should be an source error entry. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAVAIL); + + /* Expect a source error. */ + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + + /* Expect no other events. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); +} + +Test(rma, invalid_read_target_std_mr_key) +{ + rma_invalid_read_target_mr_key(0x1234); +} + +Test(rma, invalid_read_target_opt_mr_key) +{ + rma_invalid_read_target_mr_key(0x10); +} + +static void rma_hybrid_mr_desc_test_runner(bool write, bool cq_events) +{ + struct mem_region source_window; + struct mem_region remote_window; + int iters = 10; + int send_len = 1024; + int win_len = send_len * iters; + uint64_t source_key = 0x2; + uint64_t remote_key = 0x1; + int ret; + int i; + struct iovec msg_iov = {}; + struct fi_rma_iov rma_iov = {}; + struct fi_msg_rma msg_rma = {}; + void *desc[1]; + struct fi_cq_tagged_entry cqe; + uint64_t rma_flags = cq_events ? FI_TRANSMIT_COMPLETE | FI_COMPLETION : + FI_TRANSMIT_COMPLETE; + uint64_t cqe_flags = write ? FI_RMA | FI_WRITE : FI_RMA | FI_READ; + struct fid_cntr *cntr = write ? 
cxit_write_cntr : cxit_read_cntr; + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &source_key, + &source_window); + cr_assert(ret == FI_SUCCESS); + + desc[0] = fi_mr_desc(source_window.mr); + cr_assert(desc[0] != NULL); + + ret = mr_create(win_len, FI_REMOTE_READ | FI_REMOTE_WRITE, 0x3, + &remote_key, &remote_window); + cr_assert(ret == FI_SUCCESS); + + msg_rma.msg_iov = &msg_iov; + msg_rma.desc = desc; + msg_rma.iov_count = 1; + msg_rma.addr = cxit_ep_fi_addr; + msg_rma.rma_iov = &rma_iov; + msg_rma.rma_iov_count = 1; + + for (i = 0; i < iters; i++) { + msg_iov.iov_base = source_window.mem + (i * send_len); + msg_iov.iov_len = send_len; + + rma_iov.addr = i * send_len; + rma_iov.key = remote_key; + rma_iov.len = send_len; + + if (write) + ret = fi_writemsg(cxit_ep, &msg_rma, rma_flags); + else + ret = fi_readmsg(cxit_ep, &msg_rma, rma_flags); + cr_assert_eq(ret, FI_SUCCESS, "Bad rc=%d\n", ret); + } + + ret = fi_cntr_wait(cntr, iters, 1000); + cr_assert(ret == FI_SUCCESS); + + if (cq_events) { + for (i = 0; i < iters; i++) { + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, cqe_flags, NULL); + } + } + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + for (i = 0; i < win_len; i++) + cr_assert_eq(source_window.mem[i], remote_window.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + source_window.mem[i], remote_window.mem[i]); + + mr_destroy(&source_window); + mr_destroy(&remote_window); +} + +TestSuite(rma_hybrid_mr_desc, .init = cxit_setup_rma_hybrid_mr_desc, + .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rma_hybrid_mr_desc, non_inject_selective_completion_write) +{ + rma_hybrid_mr_desc_test_runner(true, false); +} + +Test(rma_hybrid_mr_desc, selective_completion_read) +{ + rma_hybrid_mr_desc_test_runner(false, false); +} + +Test(rma_hybrid_mr_desc, non_inject_completion_write) +{ + rma_hybrid_mr_desc_test_runner(true, true); +} + +Test(rma_hybrid_mr_desc, completion_read) +{ + rma_hybrid_mr_desc_test_runner(false, true); +} + +static void rma_hybrid_invalid_addr_mr_desc_test_runner(bool write, + bool cq_events) +{ + struct mem_region source_window; + struct mem_region remote_window; + int send_len = 1024; + uint64_t source_key = 0x2; + uint64_t remote_key = 0x1; + int ret; + struct iovec msg_iov = {}; + struct fi_rma_iov rma_iov = {}; + struct fi_msg_rma msg_rma = {}; + void *desc[1]; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry err; + uint64_t rma_flags = cq_events ? FI_TRANSMIT_COMPLETE | FI_COMPLETION : + FI_TRANSMIT_COMPLETE; + struct fid_cntr *cntr = write ? cxit_write_cntr : cxit_read_cntr; + + ret = mr_create(send_len, FI_READ | FI_WRITE, 0xa, &source_key, + &source_window); + cr_assert(ret == FI_SUCCESS); + + desc[0] = fi_mr_desc(source_window.mr); + cr_assert(desc[0] != NULL); + + ret = mr_create(send_len, FI_REMOTE_READ | FI_REMOTE_WRITE, 0x3, + &remote_key, &remote_window); + cr_assert(ret == FI_SUCCESS); + + msg_rma.msg_iov = &msg_iov; + msg_rma.desc = desc; + msg_rma.iov_count = 1; + msg_rma.addr = cxit_ep_fi_addr; + msg_rma.rma_iov = &rma_iov; + msg_rma.rma_iov_count = 1; + + /* Generate invalid memory address. 
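+	 * The offset pushes iov_base well outside the registered source
+	 * window, so the operation is expected to complete with a
+	 * source-side error rather than a target event.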
*/ + msg_iov.iov_base = source_window.mem + 0xfffffffff; + msg_iov.iov_len = send_len; + + rma_iov.key = remote_key; + rma_iov.len = send_len; + + if (write) + ret = fi_writemsg(cxit_ep, &msg_rma, rma_flags); + else + ret = fi_readmsg(cxit_ep, &msg_rma, rma_flags); + cr_assert_eq(ret, FI_SUCCESS, "Bad rc=%d\n", ret); + + while (fi_cntr_readerr(cntr) != 1) + ; + + /* No target event should be generated. */ + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* There should be an source error entry. */ + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == -FI_EAVAIL); + + /* Expect a source error. */ + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + + /* Expect no other events. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&source_window); + mr_destroy(&remote_window); +} + +Test(rma_hybrid_mr_desc, invalid_addr_non_inject_selective_completion_write) +{ + rma_hybrid_invalid_addr_mr_desc_test_runner(true, false); +} + +Test(rma_hybrid_mr_desc, invalid_addr_selective_completion_read) +{ + rma_hybrid_invalid_addr_mr_desc_test_runner(false, false); +} + +Test(rma_hybrid_mr_desc, invalid_addr_non_inject_completion_write) +{ + rma_hybrid_invalid_addr_mr_desc_test_runner(true, true); +} + +Test(rma_hybrid_mr_desc, invalid_addr_completion_read) +{ + rma_hybrid_invalid_addr_mr_desc_test_runner(false, true); +} + +void cxit_rma_setup_tx_alias_no_fence(void) +{ + int ret; + uint64_t order = FI_ORDER_RMA_WAW; + + cxit_setup_getinfo(); + cxit_fi_hints->caps = CXIP_EP_PRI_CAPS; + cxit_setup_tx_alias_rma_dc(); + + /* Set WAW ordering */ + ret = fi_set_val(&cxit_tx_alias_ep->fid, FI_OPT_CXI_SET_MSG_ORDER, + (void *)&order); + cr_assert_eq(ret, FI_SUCCESS, "fi_set_val(FI_OPT_SET_MSG_ORDER)"); +} + +/* RMA TX Alias capability */ +TestSuite(rma_tx_alias, .init = cxit_rma_setup_tx_alias_no_fence, + .fini = cxit_teardown_tx_alias_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rma_tx_alias, flush) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = FI_DELIVERY_COMPLETE; + uint64_t flushes_start; + uint64_t flushes_end; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_tx_alias_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + 
mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_assert(flushes_end > flushes_start); +} + +Test(rma_tx_alias, weak_fence) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + int i; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = FI_DELIVERY_COMPLETE; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + for (i = 0; i < send_len*2; i++) + send_buf[i] = i; + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Verify FI_FENCE can not be done with original EP */ + ret = fi_writemsg(cxit_ep, &msg, flags | FI_FENCE); + cr_assert_eq(ret, -FI_EINVAL, "fi_writemsg FI_FENCE ret %d", ret); + + /* Verify FI_CXI_WEAK_FENCE can be done with original EP */ + ret = fi_writemsg(cxit_ep, &msg, flags | FI_CXI_WEAK_FENCE); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg FI_WEAK_FENCE ret %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + /* Verifiy FI_CXI_WEAK_FENCE can be done with alias EP */ + rma[0].addr = send_len; + iov[0].iov_base = send_buf + send_len; + ret = fi_writemsg(cxit_tx_alias_ep, &msg, flags | FI_CXI_WEAK_FENCE); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg FI_WEAK_FENCE ret %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len * 2; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +TestSuite(rma_mr_event, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test that use of stale MR keys cannot access cached memory */ +Test(rma_mr_event, stale_key) +{ + int ret; + long i; + struct fi_cq_err_entry err; + struct fi_cq_tagged_entry cqe; + struct fid_mr *mr; + struct cxip_mr *cxip_mr; + uint8_t *src_buf; + uint8_t *src_buf2; + uint8_t *tgt_buf; + int src_len = 8; + int tgt_len = 4096; + uint64_t key_val = 200; + + src_buf = malloc(src_len); + cr_assert_not_null(src_buf, "src_buf alloc failed"); + src_buf2 = malloc(src_len); + cr_assert_not_null(src_buf2, "src_buf2 alloc failed"); + tgt_buf = calloc(1, tgt_len); + cr_assert_not_null(tgt_buf, "tgt_buf alloc failed"); + + for (i = 0; i < src_len; i++) { + src_buf[i] = 0xb1 * i; + src_buf2[i] = 0xa1 * i; + } + + /* Create MR */ + ret = fi_mr_reg(cxit_domain, tgt_buf, tgt_len, FI_REMOTE_WRITE, 0, + key_val, 0, &mr, NULL); + cr_assert(ret == FI_SUCCESS); + + /* We known cached FI_MR_PROV_KEY cannot support this + * level of robustness, so just skip FI_MR_PROV_KEY + * unless FI_CXI_MR_MATCH_EVENTS is enabled. 
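+	 * The check below uses cxip_mr->count_events to detect whether MR
+	 * match events are active for this MR.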
+ */ + cxip_mr = container_of(mr, struct cxip_mr, mr_fid); + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY && + !cxip_mr->count_events) { + fi_close(&mr->fid); + goto done; + } + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_enable(mr); + cr_assert(ret == FI_SUCCESS); + + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) + key_val = fi_mr_key(mr); + + ret = fi_write(cxit_ep, src_buf, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < src_len; i++) + cr_assert_eq(tgt_buf[i], src_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + tgt_buf[i], src_buf[i]); + + /* Close MR but leave memory backing it allocated/cached */ + fi_close(&mr->fid); + + /* Try to access using stale key */ + ret = fi_write(cxit_ep, src_buf2, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1); + cr_assert_eq(err.err, FI_EIO, "Error return %d", err.err); + + /* Verfiy data was not modified with src_buf2 data */ + for (int i = 0; i < src_len; i++) + cr_assert_eq(tgt_buf[i], src_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + tgt_buf[i], src_buf[i]); + +done: + free(tgt_buf); + free(src_buf); + free(src_buf2); +} diff --git a/prov/cxi/test/rocr.c b/prov/cxi/test/rocr.c new file mode 100644 index 00000000000..3d9567e133e --- /dev/null +++ b/prov/cxi/test/rocr.c @@ -0,0 +1,763 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +#define MAX_MSG_SIZE 1048576U +#define MAX_BUF_OFFSET 65536U +#define REGION_MAX 255 + +static unsigned int seed; +static hsa_agent_t agent; +static hsa_region_t regions[REGION_MAX]; +static int num_regions; +static hsa_region_t coarse_grain; +bool coarse_grain_valid; +static hsa_region_t fine_grain; +bool fine_grain_valid; + +static hsa_status_t get_gpu_agent(hsa_agent_t agent, void *data) { + hsa_status_t status; + hsa_device_type_t device_type; + + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + if (HSA_STATUS_SUCCESS == status && HSA_DEVICE_TYPE_GPU == device_type) { + hsa_agent_t* ret = (hsa_agent_t*)data; + *ret = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t callback_get_num_regions(hsa_region_t region, void* data) { + int *num_regions = (int *)data; + (*num_regions)++; + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t callback_get_regions(hsa_region_t region, void* data) { + hsa_region_t **region_list = (hsa_region_t **)data; + **region_list = region; + (*region_list)++; + return HSA_STATUS_SUCCESS; +} + +static void hsa_test_init(void) +{ + hsa_status_t hsa_ret; + hsa_region_t *ptr_reg = regions; + int i; + size_t size_r; + + enable_cxi_hmem_ops = 0; + seed = time(NULL); + srand(seed); + + hsa_ret = hsa_init(); + 
cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+
+	hsa_ret = hsa_iterate_agents(get_gpu_agent, &agent);
+	cr_assert_eq(hsa_ret, HSA_STATUS_INFO_BREAK);
+
+	hsa_ret = hsa_agent_iterate_regions(agent, callback_get_num_regions,
+					    &num_regions);
+	cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+	cr_assert(num_regions <= REGION_MAX);
+
+	hsa_ret = hsa_agent_iterate_regions(agent, callback_get_regions,
+					    &ptr_reg);
+	cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+
+	for (i = 0; i < num_regions; i++) {
+		hsa_ret = hsa_region_get_info(regions[i],
+					      HSA_REGION_INFO_GLOBAL_FLAGS,
+					      &size_r);
+		cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+
+		if (size_r & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED &&
+		    !fine_grain_valid) {
+			fine_grain = regions[i];
+			fine_grain_valid = true;
+		}
+
+		if (size_r & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED &&
+		    !coarse_grain_valid) {
+			coarse_grain = regions[i];
+			coarse_grain_valid = true;
+		}
+
+		if (fine_grain_valid && coarse_grain_valid)
+			break;
+	}
+
+	cr_assert_eq(coarse_grain_valid, true,
+		     "Failed to find coarse grain memory");
+	cr_assert_eq(fine_grain_valid, true,
+		     "Failed to find fine grain memory");
+}
+
+static void hsa_test_fini(void)
+{
+	hsa_status_t hsa_ret;
+
+	hsa_ret = hsa_shut_down();
+	cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+}
+
+TestSuite(hsa, .timeout = CXIT_DEFAULT_TIMEOUT, .init = hsa_test_init,
+	  .fini = hsa_test_fini);
+
+static void hsa_message_runner(void *hsa_send_buf, void *hsa_recv_buf,
+			       size_t buf_size, bool device_only_mem,
+			       bool unexpected)
+{
+	int ret;
+	char *send_buf;
+	char *recv_buf;
+	struct fi_cq_tagged_entry cqe;
+	int i;
+	hsa_status_t hsa_ret;
+	int j;
+
+	cxit_setup_msg();
+
+	/* For device-only memory, separate host send and recv buffers are
+	 * used for data validation.
+	 */
+	if (device_only_mem) {
+		send_buf = malloc(buf_size);
+		cr_assert_neq(send_buf, NULL, "Failed to allocate memory");
+
+		recv_buf = calloc(1, buf_size);
+		cr_assert_neq(recv_buf, NULL, "Failed to allocate memory");
+	} else {
+		send_buf = hsa_send_buf;
+		recv_buf = hsa_recv_buf;
+	}
+
+	for (j = 0; j < 2; j++) {
+
+		ret = open("/dev/urandom", O_RDONLY);
+		cr_assert_neq(ret, -1, "open failed: %d", -errno);
+		read(ret, send_buf, buf_size);
+		close(ret);
+
+		if (device_only_mem) {
+			hsa_ret = hsa_memory_copy(hsa_send_buf, send_buf,
+						  buf_size);
+			cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS,
+				     "hsaMemcpy failed: %d", hsa_ret);
+		}
+
+		if (unexpected) {
+			ret = fi_send(cxit_ep, hsa_send_buf, buf_size, NULL, cxit_ep_fi_addr,
+				      NULL);
+			cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret);
+
+			ret = fi_recv(cxit_ep, hsa_recv_buf, buf_size, NULL, cxit_ep_fi_addr,
+				      NULL);
+			cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret);
+		} else {
+			ret = fi_recv(cxit_ep, hsa_recv_buf, buf_size, NULL, cxit_ep_fi_addr,
+				      NULL);
+			cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret);
+
+			ret = fi_send(cxit_ep, hsa_send_buf, buf_size, NULL, cxit_ep_fi_addr,
+				      NULL);
+			cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret);
+		}
+
+		do {
+			ret = fi_cq_read(cxit_rx_cq, &cqe, 1);
+		} while (ret == -FI_EAGAIN);
+		cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret);
+
+		do {
+			ret = fi_cq_read(cxit_tx_cq, &cqe, 1);
+		} while (ret == -FI_EAGAIN);
+		cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret);
+
+		if (device_only_mem) {
+			hsa_ret = hsa_memory_copy(recv_buf, hsa_recv_buf,
+						  buf_size);
+			cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS,
+				     "hsaMemcpy failed: %d", hsa_ret);
+		}
+
+		for (i = 0; i < buf_size; i++)
+			cr_assert_eq(send_buf[i], recv_buf[i],
+				     "Data corruption at byte %d seed 
%u iter %d", i, seed, j); + } + + if (device_only_mem) { + free(recv_buf); + free(send_buf); + } + + cxit_teardown_msg(); +} + +enum mem_type { + COARSE, + FINE, +}; + +static void hsa_dev_memory_test(size_t buf_size, size_t buf_offset, + bool unexpected, bool hmem_dev_reg, + enum mem_type type) +{ + hsa_status_t hsa_ret; + void *hsa_send_buf; + void *hsa_recv_buf; + int ret; + hsa_region_t region; + + if (hmem_dev_reg) + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + else + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + if (type == COARSE) + region = coarse_grain; + else + region = fine_grain; + + /* hsa buffers will be used for RDMA. */ + hsa_ret = hsa_memory_allocate(region, buf_size + buf_offset, + &hsa_send_buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", hsa_ret); + + hsa_ret = hsa_memory_allocate(region, buf_size + buf_offset, + &hsa_recv_buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", hsa_ret); + + hsa_message_runner((void *)((char *)hsa_send_buf + buf_offset), + (void *)((char *)hsa_recv_buf + buf_offset), + buf_size, true, unexpected); + + hsa_ret = hsa_memory_free(hsa_recv_buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); + + hsa_ret = hsa_memory_free(hsa_send_buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); + +} + +/* Test messaging using rendezvous, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, COARSE); +} + +/* Test messaging using eager, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, COARSE); +} + +/* Test messaging using IDC, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, COARSE); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_unexpected_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, COARSE); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. 
+ */ +Test(hsa, messaging_devMemory_eager_unexpected_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, COARSE); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_unexpected_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, COARSE); +} + +/* Test messaging using rendezvous, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, COARSE); +} + +/* Test messaging using eager, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, COARSE); +} + +/* Test messaging using IDC, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, COARSE); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_unexpected_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, COARSE); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_unexpected_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, COARSE); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_unexpected_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, COARSE); +} + +/* Test messaging using rendezvous, device memory, and HMEM device memory + * registration for load/store access. 
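+ * The while loop below retries rand() until the size exceeds 64 KiB so
+ * that the large-message (rendezvous) path is exercised.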
+ */ +Test(hsa, messaging_devMemory_rdvz_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, FINE); +} + +/* Test messaging using eager, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, FINE); +} + +/* Test messaging using IDC, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, FINE); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_unexpected_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, FINE); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_unexpected_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, FINE); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_unexpected_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, FINE); +} + +/* Test messaging using rendezvous, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, FINE); +} + +/* Test messaging using eager, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, FINE); +} + +/* Test messaging using IDC, device memory, and without HMEM device memory + * registration for load/store access. 
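+ * (hsa_dev_memory_test sets FI_CXI_DISABLE_HMEM_DEV_REGISTER=1 for the
+ * noHmemDevReg variants.)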
+ */ +Test(hsa, messaging_devMemory_idc_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, FINE); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_unexpected_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, FINE); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_unexpected_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, FINE); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_unexpected_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, FINE); +} + +static void verify_dev_reg_handle(bool hmem_dev_reg, enum mem_type type) +{ + int ret; + void *buf; + hsa_status_t hsa_ret; + struct fid_mr *fid_mr; + size_t buf_size = 1024; + struct cxip_mr *mr; + hsa_region_t region; + + cxit_setup_msg(); + + if (type == COARSE) + region = coarse_grain; + else + region = fine_grain; + + hsa_ret = hsa_memory_allocate(region, buf_size, &buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", + hsa_ret); + + ret = fi_mr_reg(cxit_domain, buf, buf_size, FI_READ, 0, 0x123, 0, + &fid_mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + mr = container_of(fid_mr, struct cxip_mr, mr_fid); + + cr_assert_eq(mr->md->handle_valid, hmem_dev_reg, + "Bad cxip_md handle_valid"); + cr_assert_eq(mr->md->info.iface, FI_HMEM_ROCR, + "Invalid CXIP MD iface: %d", mr->md->info.iface); + + ret = fi_close(&fid_mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + hsa_ret = hsa_memory_free(buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); + + cxit_teardown_msg(); +} + +/* Verify MD handle is false. */ +Test(hsa, verify_noHmemDevReg_coarse) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(false, COARSE); +} + +/* Verify MD handle is true. */ +Test(hsa, verify_hmemDevReg_coarse) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(true, COARSE); +} + +/* Verify MD handle is false. */ +Test(hsa, verify_noHmemDevReg_fine) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(false, FINE); +} + +/* Verify MD handle is true. 
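+ * (FI_CXI_DISABLE_HMEM_DEV_REGISTER=0 keeps device registration enabled,
+ * so md->handle_valid is expected to be true.)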
*/ +Test(hsa, verify_hmemDevReg_fine) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(true, FINE); +} diff --git a/prov/cxi/test/run.sh b/prov/cxi/test/run.sh new file mode 100644 index 00000000000..053df6ee304 --- /dev/null +++ b/prov/cxi/test/run.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# +# Run a command in a VM. Start a new VM if necessary. + +RUNCMD=$@ +DIR=`dirname $0` + +if ! [ -c /dev/cxi0 ]; then + echo "Cassini device not present; attempting to launch netsim VM" + RUNCMD="$RUNCMD" $DIR/startvm.sh +else + if [ -z "$RUNCMD" ]; then + RUNCMD=${SHELL} + fi + ${RUNCMD} +fi diff --git a/prov/cxi/test/run_criterion_tests.py b/prov/cxi/test/run_criterion_tests.py new file mode 100644 index 00000000000..093f0fee2c2 --- /dev/null +++ b/prov/cxi/test/run_criterion_tests.py @@ -0,0 +1,551 @@ +#!/usr/bin/env python3 + +""" +Executes criterion tests using parameters defined in an +input yaml file and outputs results to a file in TAP format +""" + +import pexpect +import sys +import yaml +import re + +from enum import Enum +from argparse import ArgumentParser +from contextlib import ExitStack + + +class Result(Enum): + """ + Enum for test results + """ + PASSED = 1 + FAILED = 2 + SKIPPED = 3 + + +class Node: + """ + Class for managing a node's SSH connection + """ + def __init__(self, node_name): + """ + Args: + node_name: the name of the node + """ + self.name = node_name + self.ssh = open_ssh(self.name, prompt=PROMPT) + self.ssh.logfile = sys.stdout + + +class TestSet: + """ + Set of tests that use common runtime parameters and CSR settings + """ + def __init__(self, desc, test_filter=None, runtime_params=None, csr_list=None): + """ + Args: + desc: description of the test set + test_filter: tests to run + runtime_params: runtime parameters to use with this test set + csr_list: csrs to set prior to running the tests + """ + self.description = desc + self.filter = test_filter + self.runtime_params = runtime_params + self.csr_list = csr_list + self.csr_list_initial_vals = [] + self.test_list = [] + + # generate the list of tests + self.generate_testlist() + + def set_csrs_for_test_set(self): + """ + capture original csr values and set csrs to new values + + """ + if self.csr_list is not None: + send(node, "cd {}".format(PYCXI_DIR)) + + for c in self.csr_list: + csr = c[0] + field = c[1] + new_value = c[2] + + # capture initial values + orig_val = get_csr_value(csr, field) + self.csr_list_initial_vals.append([csr, field, orig_val]) + + # set new values + set_csr_value(csr, field, new_value) + + def restore_csrs(self): + """ + Restore csrs to their original values + + """ + if self.csr_list is not None: + print("Restoring CSRs...") + send(node, "cd {}".format(PYCXI_DIR)) + + for c in self.csr_list_initial_vals: + field = c[1] + if isinstance(c[2], dict): + # csr is an array, so set each member of the array + for csr, orig_value in c[2].items(): + set_csr_value(csr, field, orig_value) + else: + csr = c[0] + orig_value = c[2] + set_csr_value(csr, field, orig_value) + + def generate_testlist(self): + """ + generate a list of tests to run based on the provided filter + + """ + send(node, "cd {}".format(TEST_DIR)) + + # create criterion test list + send(node, './cxitest -l > testlist 2>&1 && sleep 1 && echo "DONE"', resp_1="DONE") + f_name = "{}/testlist".format(TEST_DIR) + with open(f_name) as file: + all_lines = [line.rstrip() for line in file.readlines()] + + # create regex instance for filter 
(if needed) + regex_filter = None + if self.filter is not None: + regex_filter = re.compile(self.filter, re.IGNORECASE) + + # parse testlist and create test objects for this test set + area = None + for line in all_lines: + if ':' in line: + area = line.split()[0].replace(":", "") + else: + tst_name = line.split()[1] + + # indicates "disabled" flag was set in Criterion test + skip_test = "skipped" in line + + # create test objects for this test set based on the filter (if provided) + if self.filter is None or ( + regex_filter is not None and regex_filter.match("{}/{}".format(area, tst_name))): + tst = Test(area, tst_name, self.description, self.runtime_params, skip=skip_test) + self.test_list.append(tst) + + def execute_tests(self): + """ + Executes the tests in the test set and capture the output + """ + with ExitStack() as cleanup: + + # restore CSRs on exit + cleanup.callback(self.restore_csrs) + + # set CSRs for test set + self.set_csrs_for_test_set() + + send(node, "cd {}".format(TEST_DIR)) + + # execute tests in test list + for te in self.test_list: + sys.stdout.flush() + cmd = '{} > tmp_result 2>&1 && echo "DONE"'.format(te.test_cmd) + + # execute test + send(node, cmd, resp_1="DONE", timeout=60) + + results_index = 0 + enable_logging = False + + # process raw results file + with open("{}/tmp_result".format(TEST_DIR)) as file: + all_lines = [line.strip() for line in file.readlines()] + + # capture all output related to this test + for ln in all_lines: + line = ansi_escape.sub('', ln).rstrip() + test_str = " {}::{}".format(te.test_area, te.test_name) + if test_str in line: + if "RUN" in line and line.endswith(test_str): + # start capturing output for this test + enable_logging = True + + # create a TestResult instance for this test + te.results.append(TestResult(results_index)) + + # if CSRs were modified, include that in the log: + if self.csr_list is not None: + te.results[results_index].log.append("Modified CSRs: {}".format(self.csr_list)) + + # log the actual Criterion test command + te.results[results_index].log.append("Test cmd: {}".format(te.test_cmd)) + + # log the "RUN" output + te.results[results_index].log.append(line) + elif "{}:".format(test_str) in line: + + # set the test result + te.results[results_index].result = get_result(line) + + # capture the entire result line + te.results[results_index].log.append(line) + + # the test is finished, so stop capturing output for this test + enable_logging = False + + # increment index (multiple results for Test instance indicates a parameterized tests) + results_index += 1 + + elif enable_logging: + # test is in process, so capture all console output that occurs + te.results[results_index].log.append(line) + + # display all logged output belonging to this particular test + for res in te.results: + print("\n-------------------------------------------------------") + for s in res.log: + print(s) + print("-------------------------------------------------------\n") + + +class Test: + """ + An individual test, which may contain multiple TestResult objects if the test is parameterized + """ + def __init__(self, test_area, test_name, desc, t_params=None, skip=False): + """ + Args: + test_area: the test area + test_name: the test name + desc: description of the test + t_params: runtime parameters for this test + skip: flag to indicate if the test should be skipped + """ + self.test_area = test_area + self.test_name = test_name + self.desc = desc + self.skip = skip + self.results = [] + + # create the runtime parameters string for this 
test + param_str = "" + if t_params is not None: + for pa, v in t_params.items(): + param_str += "{}={} ".format(pa, v) + + self.test_params = param_str + + # create the test cmd + self.test_cmd = \ + '{} ./cxitest --filter="{}/{}" --verbose=1 -j1 --ascii'.format(param_str, test_area, test_name) + + # create TestResult for skipped test + if self.skip: + st = TestResult() + st.result = Result.SKIPPED + self.results.append(st) + + def create_tap_results(self): + """ + Parse results log and create TAP results for this test + """ + for res in self.results: + # get test number for this test + test_num = get_current_test_count_and_inc() + + # determine TAP result based on test result + tap_result = "ok {}".format(test_num) if res.result != Result.FAILED else "not ok {}".format(test_num) + + # construct the TAP test name + t_name = "{}::{}".format(self.test_area, self.test_name) + + # if we have a parameterized test, append index to the test name + if len(self.results) > 1: + t_name = "{}::{}".format(t_name, res.index) + + # append the description + t_name = "{} - {}".format(t_name, self.desc) + + # if test was skipped, include skip comment + if res.result == Result.SKIPPED: + t_name = "{} # skip".format(t_name) + + # include additional comment for disabled tests + if self.skip: + t_name += " Disabled flag set in criterion test " + + # append the tap result and test name to the tap report + tap_report.append("{} {}".format(tap_result, t_name)) + + # include all logged output during this test in the tap report + for m in res.log: + tap_report.append("# {}".format(m)) + + +class TestResult: + """ + Result and log for a particular test + """ + def __init__(self, index=0): + """ + + Args: + index: test index - used with parameterized tests + """ + self.index = index + self.result = Result.FAILED + self.log = [] + + +def get_result(the_line): + """ + Determine the test result from the given line + + Args: + the_line: the line to check + + Returns: the result + + """ + if "PASS" in the_line: + return Result.PASSED + elif "SKIP" in the_line: + return Result.SKIPPED + else: + return Result.FAILED + + +def set_csr_value(csr, field, value): + """ + Sets a CSR field to the given value + + Args: + csr: the CSR + field: the field + value: the value + + """ + # use cxiutil to set the value + send(node, "cd {}".format(PYCXI_DIR)) + cmd = 'cxiutil store csr {} {}={} && sleep 1 && echo "DONE"'.format(csr, field, value) + send(node, cmd, resp_1="DONE") + sys.stdout.flush() + + # verify the new value is set as expected + new_val = get_csr_value(csr, field) + if isinstance(new_val, dict): + # we have a CSR array, so verify each member of the array + for v in new_val.values(): + if int(v) != int(value): + raise RuntimeError("Unable to set CSR with cmd: {}. " + "Actual value of {} = {}".format(cmd, field, v)) + else: + if int(new_val) != int(value): + raise RuntimeError("Unable to set CSR with cmd: {}. " + "Actual value of {} = {}".format(cmd, field, new_val)) + + +def get_csr_value(csr, field): + """ + Returns the value of the CSR field. 
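+    The value is read by dumping the CSR with cxiutil and parsing its output.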
If the CSR is an array, returns a dict containing each CSR index and value + + Args: + csr: the CSR + field: the field + + Returns: the value, or a dict containing each CSR index and value + + """ + + # use cxiutil to get the value + send(node, "cd {}".format(PYCXI_DIR)) + sys.stdout.flush() + send(node, 'cxiutil dump csr {} > tmp && sleep 1 && echo "DONE"'.format(csr, field), resp_1="DONE") + + with open("{}/tmp".format(PYCXI_DIR)) as file: + all_lines = [line.rstrip() for line in file.readlines()] + + # parse the cxiutil output + response = {} + for line in all_lines: + if "hex" in line: + csr = line.split()[0] + + if field in line and "0x" in line: + response[csr] = line.split()[2] + + # csr array, so return a dict containing each value in the csr array + if len(response) > 1: + return response + # not a csr array, so just return the value + elif len(response) == 1: + return response[csr] + else: + raise RuntimeError("Unable to read CSR {} {}".format(csr, field)) + + +def generate_tap_file(): + """ + generate the TAP results file + """ + total_test_count = 0 + + # capture the total number of tests + for ts in test_set_list: + for tst in ts.test_list: + total_test_count += len(tst.results) + + # add TAP header line + tap_header = "1..{}".format(total_test_count) + tap_report.append(tap_header) + + # capture TAP results of each test + for ts in test_set_list: + for element in ts.test_list: + element.create_tap_results() + + # create TAP file + with open(RESULTS_FILE, 'w') as file_handler: + for tap_line in tap_report: + file_handler.write("{}\n".format(tap_line)) + print(tap_line) + + +def get_current_test_count_and_inc(): + """ + returns the current test count prior to incrementing it + + Returns: the current test count + + """ + global current_test_count + tmp_count = current_test_count + current_test_count += 1 + return tmp_count + + +def open_ssh(node_addr, prompt): + """ + Create ssh connection to the given ip address + + Args: + node_addr: the node name / ip address + prompt: the prompt to expect + + Returns: SSH connection / process + + """ + s = pexpect.spawn("ssh {}".format(node_addr), encoding='utf-8') + try: + rc = s.expect([prompt, "Password:"], timeout=30) + if rc == 1: + s.sendline(PASSWORD) + s.expect(prompt, timeout=10) + except pexpect.TIMEOUT: + print("Unable to ssh to {}".format(node_addr)) + raise pexpect.TIMEOUT + return s + + +def send(the_node, cmd, resp_1=None, resp_2=None, expect_prompt=True, timeout=30): + """ + send a command to the given node and verify expected response(s) + + Args: + the_node: the node + cmd: the command to send + resp_1: the first expected response (if not None) + resp_2: the second expected response (if not None) + expect_prompt: flag to indicate if a prompt is expected + timeout: the maximum time to wait for a response before throwing an exception + """ + ssh_sesh = the_node.ssh + ssh_sesh.sendline(cmd) + + if resp_1: + ssh_sesh.expect(resp_1, timeout=timeout) + + if resp_2: + ssh_sesh.expect(resp_2, timeout=timeout) + + if expect_prompt: + ssh_sesh.expect(PROMPT, timeout=timeout) + + +if __name__ == "__main__": + + # used to filter ansi escape chars + ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]') + + p = ArgumentParser("run_criterion_tests") + p.add_argument('-n', + dest="node", + nargs='?', + type=str, + required=True, + help="Name of node where test is to be run") + + p.add_argument('-y', + dest="yaml_file", + nargs='?', + type=str, + required=True, + help="Path to the test YAML file") + + args = p.parse_args() 
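+
+    # Illustrative sketch of the YAML this script expects. Values are
+    # hypothetical; the key names are simply the ones read below
+    # (env, global_runtime_parameters, tests[].description/filter/
+    # runtime_parameters/csrs):
+    #
+    #   env:
+    #     libfabric_dir_on_node: /root/libfabric
+    #     pycxi_dir_on_node: /root/pycxi
+    #     node_prompt: "# "
+    #     node_password: changeme
+    #   global_runtime_parameters:
+    #     - FI_LOG_LEVEL: warn
+    #   tests:
+    #     - description: "tagged message tests"
+    #       filter: "tagged/.*"
+    #       runtime_parameters:
+    #         FI_CXI_RDZV_PROTO: alt_read
+    #       csrs: null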
+ + # parse the input yaml file + try: + with open(args.yaml_file, 'r') as stream: + f = yaml.safe_load(stream) + + except FileNotFoundError: + print("YAML file not found: {}".format(args.yaml_file)) + + LIBFABRIC_DIR = f["env"]["libfabric_dir_on_node"] + TEST_DIR = "{}/prov/cxi/test".format(LIBFABRIC_DIR) + PYCXI_DIR = f["env"]["pycxi_dir_on_node"] + PROMPT = f["env"]["node_prompt"] + PASSWORD = f["env"]["node_password"] + RESULTS_FILE = "{}/results.tap".format(TEST_DIR) + + # holds all TAP results + tap_report = [] + + # instantiate node object + node = Node(args.node) + + # activate pycxi venv for cxiutil and remove old tap files + send(node, "cd {}".format(PYCXI_DIR)) + send(node, ". .venv/bin/activate") + send(node, "cd {}".format(TEST_DIR)) + send(node, "rm *.tap") + + # set global runtime parameters prior to running tests + default_runtime_parameters = f["global_runtime_parameters"] + for params in default_runtime_parameters: + for param, val in params.items(): + send(node, "export {}={}".format(param, val)) + + current_test_count = 1 + + # create test sets + test_set_list = [] + for test in f["tests"]: + test_set_list.append(TestSet( + desc=test["description"], + test_filter=test["filter"], + runtime_params=test["runtime_parameters"], + csr_list=test["csrs"]) + ) + + # execute the tests in each test set + for test_set in test_set_list: + test_set.execute_tests() + + # generate the tap file + generate_tap_file() + diff --git a/prov/cxi/test/run_tests_vm.sh b/prov/cxi/test/run_tests_vm.sh new file mode 100644 index 00000000000..760f5162ecb --- /dev/null +++ b/prov/cxi/test/run_tests_vm.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# +# Run unit tests in a VM. + +DIR=`dirname $0` +cd $DIR + +./run.sh ./test.sh $1 diff --git a/prov/cxi/test/startvm-setup.sh b/prov/cxi/test/startvm-setup.sh new file mode 100644 index 00000000000..06f19c8c958 --- /dev/null +++ b/prov/cxi/test/startvm-setup.sh @@ -0,0 +1,54 @@ +#!/bin/sh +# +# Initialize a VM for CXI testing and run a command. + +DBS_DIR=$(realpath "../../../..") + +if [[ -z $RUNCMD ]]; then + RUNCMD="$@" +fi + +export LC_ALL=en_US.UTF-8 + +ulimit -s unlimited +ulimit -l unlimited + +modprobe ptp +modprobe iommu_v2 || modprobe amd_iommu_v2 +insmod $DBS_DIR/slingshot_base_link/sbl.ko +insmod $DBS_DIR/sl-driver/knl/cxi-sl.ko +insmod $DBS_DIR/cxi-driver/cxi/cxi-core.ko disable_default_svc=0 +insmod $DBS_DIR/cxi-driver/cxi/cxi-user.ko +insmod $DBS_DIR/cxi-driver/cxi/cxi-eth.ko +insmod $DBS_DIR/kdreg2/kdreg2.ko + +# Sleep to wait for Ethernet interface to come up +sleep 3 + +# Locate the first down Ethernet interface and configure it. +regex="eth([0-9]{1}).+DOWN" +eth_id=-1 +interfaces="$(ip addr)" +if [[ $interfaces =~ $regex ]]; then + eth_id=${BASH_REMATCH[1]} +fi + +if [ $eth_id -eq -1 ]; then + echo "Failed to find Ethernet interface" + exit 1 +fi + +AMA=`cat /sys/class/net/eth$eth_id/address | awk -F':' '{print "02:00:" $3 ":" $4 ":" $5 ":" $6}'` + +ip link set eth$eth_id addr $AMA +ip link set dev eth$eth_id up + +# Add pycxi utilities to path +export PATH=$DBS_DIR/pycxi/utils:$PATH + +# Initialize pycxi environment +. $DBS_DIR/pycxi/.venv/bin/activate + +if [[ ! -z $RUNCMD ]]; then + $RUNCMD +fi diff --git a/prov/cxi/test/startvm.sh b/prov/cxi/test/startvm.sh new file mode 100644 index 00000000000..933bd082fed --- /dev/null +++ b/prov/cxi/test/startvm.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +# Start a VM, optionally load the test driver, and exit. + +# The parameters are given to netsim. See netsim -h. 
+# ./startvm.sh -> run one instance with 1 NIC +# ./startvm.sh -N 3 -> run one instance with 3 NICs +# ./startvm.sh -n 2 -> launch 2 VMs each with 1 NIC +# +# +# Note: When using multiple VMs, it is recommended to set the USE_XTERM +# variable. Each VM will be opened in a new xterm window. +# +# USE_XTERM=1 ./startvm.sh -n 2 + +cd `dirname $0` + +DBS_DIR=$(pwd)/../../../.. +VIRTME_DIR=$DBS_DIR/virtme +QEMU_DIR=$DBS_DIR/cassini-qemu/x86_64-softmmu/ + +# If the emulator is not running, start it. This script must run under +# its control, so qemu can connect to it. Simply relaunch self under +# netsim's control. +if [[ ! -v NETSIM_ID ]]; then + exec $DBS_DIR/nic-emu/netsim $@ $(basename $0) +fi + +# Check whether this script is already in a VM or not (ie. running +# under a hypervisor.) If not, we'll need a different setup for nested +# VMs. +HYP=$(grep -c "^flags.*\ hypervisor" /proc/cpuinfo) + +if [[ $NETSIM_NICS -eq 1 ]]; then + CCN_OPTS="-device ccn,addr=8" +elif [[ $NETSIM_NICS -eq 2 ]]; then + CCN_OPTS="-device ccn,addr=8 -device ccn,addr=13" +elif [[ $NETSIM_NICS -eq 4 ]]; then + CCN_OPTS="-device ccn,addr=8 -device ccn,addr=0xd -device ccn,addr=0x12 -device ccn,addr=0x17" +fi + +# -M q35 = Standard PC (Q35 + ICH9, 2009) (alias of pc-q35-2.10) +# MSI-X needs interrupt remapping enabled to fully work. +# w/ Intel IOMMU. Intremap on requires kernel-irqchip=off OR kernel-irqchip=split +QEMU_OPTS="--qemu-opts -machine q35,kernel-irqchip=split -machine q35 -global q35-pcihost.pci-hole64-size=64G -device intel-iommu,intremap=on,caching-mode=on -smp 4 $CCN_OPTS" +KERN_OPTS="--kopt iommu=pt --kopt intel_iommu=on --kopt iomem=relaxed" +KERN_OPTS="$KERN_OPTS --kopt transparent_hugepage=never --kopt hugepagesz=1g --kopt default_hugepagesz=1g --kopt hugepages=1 --kopt pci=realloc" +KERN_OPTS="$KERN_OPTS --kopt hugepagesz=2M --kopt hugepages=256" + +if [[ $HYP -eq 0 ]]; then + # First VM needs more memory to launch nested VMs + # Only the first VM will have the CCN qemu device. Nested VMs will + # have VFs exported to them + QEMU_OPTS="$QEMU_OPTS -m 8192" + + if [[ -n $QEMU_MOPTS ]]; then + QEMU_OPTS="$QEMU_OPTS $QEMU_MOPTS" + fi +else + # Nested VM. Use the first PCI VF + # PCIFN = 0000:00:14.0 or similar + + # Bind cxi1 to get its info + echo 1 > /sys/class/cxi/cxi0/device/sriov_numvfs + PCIFN=$(basename $(readlink /sys/class/cxi/cxi0/device/virtfn0)) + VENDOR=$(cat /sys/class/cxi/cxi0/device/virtfn0/vendor) + DEVICE=$(cat /sys/class/cxi/cxi0/device/virtfn0/device) + + # Unbind VF from cxi core driver. cxi1 no longer exists + echo $PCIFN > /sys/bus/pci/drivers/cxi_core/unbind + + # Bind the VF to vfio driver + modprobe vfio_pci + echo ${VENDOR##*x} ${DEVICE##*x} > /sys/bus/pci/drivers/vfio-pci/new_id + + # Tell qemu to bind the VF + QEMU_OPTS="$QEMU_OPTS -device vfio-pci,host=$PCIFN" +fi + +PATH=$QEMU_DIR:$VIRTME_DIR:/sbin:$PATH + +VIRTME_OPTS="--rwdir=$(pwd) --pwd" + +if [[ $KDIR ]]; then + VIRTME_OPTS="--kdir $KDIR --mods=auto $VIRTME_OPTS" +else + VIRTME_OPTS="--installed-kernel $VIRTME_OPTS" +fi + +if [[ $MOS ]]; then + QEMU_OPTS="$QEMU_OPTS -m 2048" + KERN_OPTS="$KERN_OPTS --kopt kernelcore=1024M --kopt lwkcpus=0.1-3 --kopt lwkmem=1G" +fi + +SETUP_SCRIPT="`dirname $0`/startvm-setup.sh" + +# Start the VM, execute the script inside, and exit ... +if [[ $RUNCMD ]]; then + virtme-run --script-sh "$SETUP_SCRIPT $RUNCMD" $VIRTME_OPTS $KERN_OPTS $QEMU_OPTS + +# ... 
or start a VM and execute the script but don't exit +elif [[ $USE_XTERM -eq 1 ]]; then + xterm -e "virtme-run --init-sh '$SETUP_SCRIPT' $VIRTME_OPTS $KERN_OPTS $QEMU_OPTS" +else + virtme-run --init-sh "$SETUP_SCRIPT" $VIRTME_OPTS $KERN_OPTS $QEMU_OPTS +fi + +# ... or just start a clean VM +#virtme-run --installed-kernel --pwd $KERN_OPTS $QEMU_OPTS diff --git a/prov/cxi/test/tagged.c b/prov/cxi/test/tagged.c new file mode 100644 index 00000000000..f730f5228e9 --- /dev/null +++ b/prov/cxi/test/tagged.c @@ -0,0 +1,5777 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(tagged, .init = cxit_setup_tagged, .fini = cxit_teardown_tagged, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic send/recv */ +Test(tagged, ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Try invalid lengths */ + ret = fi_tsend(cxit_ep, send_buf, cxit_fi->ep_attr->max_msg_size+1, + NULL, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tsend failed %d", ret); + + free(send_buf); + free(recv_buf); +} + +/* Test basic zero-byte send/recv */ +Test(tagged, zbr) +{ + int ret; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + + ret = fi_trecv(cxit_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + ret = fi_tsend(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, 0, FI_TAGGED | FI_RECV, NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* 
Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Try an unexpected send */ + ret = fi_tsend(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + sleep(1); + + ret = fi_trecv(cxit_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, 0, FI_TAGGED | FI_RECV, NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +static void simple_rdzv(bool check_invalid_length) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 8192; + int send_len = 8192; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 8192 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + if (check_invalid_length) { + ret = fi_tsend(cxit_ep, send_buf, + cxit_fi->ep_attr->max_msg_size+1, + NULL, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tsend failed %d", ret); + } + + free(send_buf); + free(recv_buf); +} + +/* Test basic rendezvous send */ +Test(tagged, rdzv) +{ + simple_rdzv(true); +} + +/* Verify unrestricted non-eager rendezvous get is used if requested */ +Test(tagged, alt_read_rdzv) +{ + char *rdzv_proto; + uint64_t end_pkt_cnt; + uint64_t start_pkt_cnt; + int ret; + + /* If not testing alt_read protocol skip */ + rdzv_proto = getenv("FI_CXI_RDZV_PROTO"); + if (!rdzv_proto || strcmp(rdzv_proto, "alt_read")) { + cr_assert(1); + return; + } + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &start_pkt_cnt, NULL, true); + 
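+	/* The counter is sampled before and after the transfer; the delta
+	 * checked below verifies that the alt_read protocol moved the
+	 * non-eager data using restricted get packets.
+	 */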
cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + simple_rdzv(false); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &end_pkt_cnt, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + /* Some number of non-eager data restricted get packets need + * have been sent. + */ + cr_assert(end_pkt_cnt > start_pkt_cnt, + "Incorrect number of restricted packets"); +} + +Test(tagged, zero_byte_tsend_trecv_iov) +{ + int ret; + struct fi_cq_tagged_entry cqe; + + ret = fi_trecvv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvv failed: %d", ret); + + ret = fi_tsendv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendv failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +Test(tagged, zero_byte_tsend_trecv_msg) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + + rmsg.addr = cxit_ep_fi_addr; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed: %d", ret); + + smsg.addr = cxit_ep_fi_addr; + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +#if ENABLE_DEBUG +/* Verify fallback to default rendezvous proto on H/W resource failure */ +Test(tagged, fail_alt_read_rdzv) +{ + char *rdzv_proto; + uint64_t end_pkt_cnt; + uint64_t start_pkt_cnt; + int ret; + struct cxip_ep *ep = container_of(&cxit_ep->fid, + struct cxip_ep, ep.fid); + + /* If not testing alt_read protocol skip */ + rdzv_proto = getenv("FI_CXI_RDZV_PROTO"); + if (!rdzv_proto || strcmp(rdzv_proto, "alt_read")) { + cr_assert(1); + return; + } + + /* Force error on allocation of hardware resources required + * by alt_read rendezvous protocol. 
+ */ + ep->ep_obj->txc.force_err |= CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &start_pkt_cnt, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + simple_rdzv(false); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &end_pkt_cnt, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + /* No restricted packets should have been sent */ + cr_assert(end_pkt_cnt == start_pkt_cnt, + "Incorrect number of restricted packets"); +} +#endif /* ENABLE_DEBUG */ + +/* Test basic send/recv w/data */ +Test(tagged, pingdata) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tsenddata(cxit_ep, send_buf, send_len, NULL, data, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsenddata failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, + FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, + NULL, data, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic inject send */ +Test(tagged, inject_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tinject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tinject failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); 
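+
+	/* fi_tinject() buffers the payload before returning and never
+	 * generates a local completion, so the send buffer is immediately
+	 * reusable; the -FI_EAGAIN check on the TX CQ below confirms that
+	 * no TX event is delivered.
+	 */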
+ + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Try invalid lengths */ + ret = fi_tinject(cxit_ep, send_buf, cxit_fi->tx_attr->inject_size+1, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tinject failed %d", ret); + + ret = fi_tinject(cxit_ep, send_buf, 4*1024*1024, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tinject failed %d", ret); + + ret = fi_tinject(cxit_ep, send_buf, cxit_fi->ep_attr->max_msg_size+1, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tinject failed %d", ret); + + free(send_buf); + free(recv_buf); +} + +/* Test basic injectdata */ +Test(tagged, injectdata_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tinjectdata(cxit_ep, send_buf, send_len, data, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tinject failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, + FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, + NULL, data, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendv/recvv */ +Test(tagged, vping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct iovec siovec; + struct iovec riovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + ret = fi_trecvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + ret = fi_tsendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendv failed %d", ret); + + /* Wait for 
async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendmsg/recvmsg */ +Test(tagged, msgping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test FI_FENCE */ +Test(tagged, fence) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, s_page_size); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, s_page_size); + cr_assert(send_buf); + + for (i 
= 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_FENCE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Test rendezvous fence */ + send_len = recv_len = s_page_size; + siovec.iov_len = send_len; + riovec.iov_len = recv_len; + + for (i = 0; i < send_len; i++) { + recv_buf[i] = 0; + send_buf[i] = i + 0xa0; + } + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_FENCE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + + /* progress */ + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +void cxit_tagged_setup_nofence(void) +{ + cxit_setup_getinfo(); + cxit_fi_hints->caps = CXIP_EP_PRI_CAPS; + cxit_setup_rma(); +} + +/* Test messaging without FI_FENCE */ +Test(tagged_nofence, nofence, + .init = cxit_tagged_setup_nofence, + .fini = cxit_teardown_rma) +{ + int ret; + uint8_t *send_buf; + int send_len = 64; + struct fi_msg_tagged smsg = {}; + struct fi_msg msg = {}; + struct iovec siovec; + + send_buf = aligned_alloc(s_page_size, s_page_size); + cr_assert(send_buf); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + 
smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_FENCE); + cr_assert_eq(ret, -FI_EINVAL); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + msg.msg_iov = &siovec; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.context = NULL; + + ret = fi_sendmsg(cxit_ep, &msg, FI_FENCE); + cr_assert_eq(ret, -FI_EINVAL); + + free(send_buf); +} + +/* Test basic sendmsg/recvmsg with data */ +Test(tagged, msgping_wdata) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + smsg.data = data; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_REMOTE_CQ_DATA); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, + FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, NULL, + data, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic injectmsg */ +Test(tagged, inject_msgping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + 
cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_INJECT); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test unexpected send/recv */ +Test(tagged, ux_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Send 64 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert(ret == FI_SUCCESS); + + /* Give some time for the message to move */ + sleep(1); + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_assert(recv_buf[i] == send_buf[i], + "data mismatch, element: %d\n", i); + } + + free(send_buf); + free(recv_buf); +} + +/* Issue a fi_trecvmsg with FI_PEEK and validate result */ +ssize_t try_peek(fi_addr_t addr, uint64_t tag, uint64_t ignore, + ssize_t len, void *context, bool claim) +{ + struct fi_msg_tagged tmsg = { + .msg_iov = NULL, + .iov_count = 0, + .addr = addr, + .tag = tag, + .ignore = ignore, + .context = context, + .data = 0 + }; + struct fi_cq_tagged_entry cqe = {}; + struct fi_cq_err_entry err_cqe = {}; + fi_addr_t from; + ssize_t ret; + + do { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + ret = fi_trecvmsg(cxit_ep, &tmsg, + claim ? 
FI_CLAIM | FI_PEEK : FI_PEEK); + } while (ret == -FI_EAGAIN); + if (ret != FI_SUCCESS) + return ret; + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + if (ret == 1) { + validate_rx_event_mask(&cqe, context, len, + FI_TAGGED | FI_RECV, NULL, 0, + tag, ignore); + cr_assert_eq(from, cxit_ep_fi_addr, + "Invalid source address"); + ret = FI_SUCCESS; + break; + } else if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + cr_assert(err_cqe.err == ENOMSG, "Bad CQE error %d", + err_cqe.err); + cr_assert(err_cqe.buf == 0, "Invalid buffer"); + cr_assert(err_cqe.olen == 0, "Invalid length"); + cr_assert(err_cqe.tag == tag, "Invalid tag"); + cr_assert(err_cqe.err == FI_ENOMSG, + "Invalid error code %d", err_cqe.err); + ret = err_cqe.err; + break; + } + } while (ret == -FI_EAGAIN); + + return ret; +} + +static int wait_peek(fi_addr_t addr, uint64_t tag, uint64_t ignore, + ssize_t len, void *context, bool claim) +{ + int ret; + + do { + ret = try_peek(addr, tag, ignore, len, context, claim); + } while (ret == FI_ENOMSG); + + return ret; +} + +#define PEEK_TAG_BASE 0x0000a000 +#define PEEK_MSG_LEN 64 +#define PEEK_NUM_MSG 4 +#define PEEK_NUM_FAKE_ADDRS 3 + +/* Test fi_trecvmsg using FI_PEEK flag to search unexpected message list. + * Additional message sizes will be tested within the multitudes tests. + */ +Test(tagged, ux_peek) +{ + ssize_t ret; + uint8_t *rx_buf; + uint8_t *tx_buf; + ssize_t rx_len = PEEK_MSG_LEN; + ssize_t tx_len = PEEK_MSG_LEN; + struct fi_cq_tagged_entry cqe; + struct fi_context rx_context[PEEK_NUM_MSG]; + struct fi_context tx_context[PEEK_NUM_MSG]; + struct fi_msg_tagged tmsg = {}; + struct iovec iovec; + fi_addr_t from; + int i, tx_comp; + struct cxip_addr fake_ep_addrs[PEEK_NUM_FAKE_ADDRS]; + + /* Add fake AV entries to test peek for non-matching valid address */ + for (i = 0; i < PEEK_NUM_FAKE_ADDRS; i++) { + fake_ep_addrs[i].nic = i + 0x41c; + fake_ep_addrs[i].pid = i + 0x21; + } + ret = fi_av_insert(cxit_av, (void *)fake_ep_addrs, + PEEK_NUM_FAKE_ADDRS, NULL, 0, NULL); + cr_assert(ret == PEEK_NUM_FAKE_ADDRS); + + rx_buf = aligned_alloc(s_page_size, rx_len * PEEK_NUM_MSG); + cr_assert(rx_buf); + memset(rx_buf, 0, rx_len * PEEK_NUM_MSG); + + tx_buf = aligned_alloc(s_page_size, tx_len * PEEK_NUM_MSG); + cr_assert(tx_buf); + + /* Send messages to build the unexpected list */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + memset(&tx_buf[i * tx_len], 0xa0 + i, tx_len); + iovec.iov_base = &tx_buf[i * tx_len]; + iovec.iov_len = tx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = &tx_context[i]; + + ret = fi_tsendmsg(cxit_ep, &tmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %" PRId64, + ret); + } + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + + /* Any address with bad tag and no context */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + PEEK_NUM_MSG + 1, 0, + tx_len, NULL, false); + cr_assert_eq(ret, FI_ENOMSG, "Peek with invalid tag"); + + /* Any address with bad tag with context */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + PEEK_NUM_MSG + 1, 0, + tx_len, &rx_context[0], false); + cr_assert_eq(ret, FI_ENOMSG, "Peek with invalid tag"); + + /* Non matching valid source address with valid tag */ + ret = try_peek(3, PEEK_TAG_BASE, 0, tx_len, NULL, false); + cr_assert_eq(ret, FI_ENOMSG, "Peek with wrong match address"); + + /* Valid 
with any address and valid tag */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + 1, 0, tx_len, + NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "Peek with invalid tag"); + + /* Valid with expected address and valid tag */ + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + 1, 0, tx_len, + NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "Peek with bad address"); + + /* Valid with any address and good tag when masked correctly */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + 0x20002, + 0x0FFF0000UL, tx_len, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "Peek tag ignore bits failed"); + + /* Valid with expected address and good tag when masked correctly */ + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + 0x20002, + 0x0FFF0000UL, tx_len, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "Peek tag ignore bits failed"); + + /* Verify peek of all sends */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_SUCCESS, "Peek valid tag not found"); + } + + /* Verify peek of all sends in reverse order */ + for (i = PEEK_NUM_MSG - 1; i >= 0; i--) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_SUCCESS, "Peek valid tag not found"); + } + + /* Receive all unexpected sends */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + iovec.iov_base = &rx_buf[i * rx_len]; + iovec.iov_len = rx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = &rx_context[i]; + + ret = fi_trecvmsg(cxit_ep, &tmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_trecvmsg failed %" PRId64, ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert(ret == 1); + cr_assert_eq(from, cxit_ep_fi_addr, "Invalid source address"); + validate_rx_event(&cqe, &rx_context[i], rx_len, + FI_TAGGED | FI_RECV, NULL, 0, + PEEK_TAG_BASE + i); + } + + /* Verify received data */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + ret = memcmp(&tx_buf[i * tx_len], &rx_buf[i * rx_len], tx_len); + cr_assert_eq(ret, 0, "RX buffer data mismatch for msg %d", i); + } + + /* Verify received messages have been removed from unexpected list */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_ENOMSG, + "Peek after receive did not fail %" PRId64, ret); + } + + /* Wait for TX async events to complete, and validate */ + tx_comp = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + if (ret == 1) { + validate_tx_event(&cqe, FI_TAGGED | FI_SEND, + &tx_context[tx_comp]); + tx_comp++; + } + cr_assert(ret == 1 || ret == -FI_EAGAIN, + "Bad fi_cq_read return %" PRId64, ret); + } while (tx_comp < PEEK_NUM_MSG); + cr_assert_eq(tx_comp, PEEK_NUM_MSG, + "Peek tsendmsg only %d TX completions read", tx_comp); + + free(rx_buf); + free(tx_buf); +} + +/* FI_PEEK with FI_CLAIM testing */ +void test_ux_claim(int num_msgs, int msg_len) +{ + ssize_t ret; + uint8_t *rx_buf; + uint8_t *tx_buf; + ssize_t rx_len = msg_len; + ssize_t tx_len = msg_len; + struct fi_cq_tagged_entry cqe; + struct fi_context *rx_context; /* [PEEK_NUM_MSG]; */ + struct fi_context *tx_context; /* [PEEK_NUM_MSG]; */ + struct fi_msg_tagged tmsg = {}; + struct iovec iovec; + fi_addr_t from; + int i, tx_comp; + struct cxip_addr fake_ep_addrs[PEEK_NUM_FAKE_ADDRS]; + + 
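+	/* FI_PEEK | FI_CLAIM reserves a matched unexpected message: the claim
+	 * is recorded in the fi_context passed with the peek, and a later
+	 * fi_trecvmsg() with FI_CLAIM and that same context completes the
+	 * claimed message instead of re-running tag matching.
+	 */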
rx_context = calloc(num_msgs, sizeof(struct fi_context)); + cr_assert_not_null(rx_context); + tx_context = calloc(num_msgs, sizeof(struct fi_context)); + cr_assert_not_null(tx_context); + + rx_buf = aligned_alloc(s_page_size, rx_len * num_msgs); + cr_assert_not_null(rx_buf); + memset(rx_buf, 0, rx_len * num_msgs); + + tx_buf = aligned_alloc(s_page_size, tx_len * num_msgs); + cr_assert_not_null(tx_buf); + + /* Add fake AV entries to test peek for non-matching valid address */ + for (i = 0; i < PEEK_NUM_FAKE_ADDRS; i++) { + fake_ep_addrs[i].nic = i + 0x41c; + fake_ep_addrs[i].pid = i + 0x21; + } + ret = fi_av_insert(cxit_av, (void *)fake_ep_addrs, + PEEK_NUM_FAKE_ADDRS, NULL, 0, NULL); + cr_assert(ret == PEEK_NUM_FAKE_ADDRS); + + /* Send messages to build the unexpected list */ + for (i = 0; i < num_msgs; i++) { + memset(&tx_buf[i * tx_len], 0xa0 + i, tx_len); + iovec.iov_base = &tx_buf[i * tx_len]; + iovec.iov_len = tx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = &tx_context[i]; + + ret = fi_tsendmsg(cxit_ep, &tmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %" PRId64, + ret); + } + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + + /* Any address with bad tag and FI_CLAIM with no context */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + num_msgs + 1, 0, + tx_len, NULL, true); + cr_assert_eq(ret, -FI_EINVAL, + "FI_CLAIM with invalid tag and no context"); + + /* Any address with bad tag and FI_CLAIM with context */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + num_msgs + 1, 0, + tx_len, &rx_context[0], true); + cr_assert_eq(ret, FI_ENOMSG, "FI_CLAIM with invalid tag"); + + /* Non matching valid source address with valid tag and context */ + ret = try_peek(3, PEEK_TAG_BASE, 0, tx_len, &rx_context[0], true); + cr_assert_eq(ret, FI_ENOMSG, "FI_CLAIM with wrong match address"); + + /* Verify peek of all sends */ + for (i = 0; i < num_msgs; i++) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_SUCCESS, "All unexpected tags not found"); + } + + /* Verify peek of all sends in reverse order with FI_CLAIM */ + for (i = num_msgs - 1; i >= 0; i--) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], true); + cr_assert_eq(ret, FI_SUCCESS, + "FI_PEEK | FI_CLAIM valid tag not found"); + } + + /* Verify peek of previously claimed messages fail */ + for (i = 0; i < num_msgs; i++) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_ENOMSG, + "Unexpected message not claimed found"); + } + + /* Receive all claimed unexpected messages */ + for (i = 0; i < num_msgs; i++) { + iovec.iov_base = &rx_buf[i * rx_len]; + iovec.iov_len = rx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = &rx_context[i]; + + ret = fi_trecvmsg(cxit_ep, &tmsg, FI_CLAIM); + cr_assert_eq(ret, FI_SUCCESS, + "fi_trecvmsg FI_CLAIM failed %" PRId64, ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert(ret == 1); + cr_assert_eq(from, cxit_ep_fi_addr, "Invalid source address"); + validate_rx_event(&cqe, &rx_context[i], rx_len, + FI_TAGGED | FI_RECV, NULL, 0, + 
PEEK_TAG_BASE + i); + } + + /* Verify received data */ + for (i = 0; i < num_msgs; i++) { + ret = memcmp(&tx_buf[i * tx_len], &rx_buf[i * rx_len], tx_len); + cr_assert_eq(ret, 0, "RX buffer data mismatch for msg %d", i); + } + + /* Wait for TX async events to complete, and validate */ + tx_comp = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + if (ret == 1) { + validate_tx_event(&cqe, FI_TAGGED | FI_SEND, + &tx_context[tx_comp]); + tx_comp++; + } + cr_assert(ret == 1 || ret == -FI_EAGAIN, + "Bad fi_cq_read return %" PRId64, ret); + } while (tx_comp < num_msgs); + cr_assert_eq(tx_comp, num_msgs, + "Peek tsendmsg only %d TX completions read", tx_comp); + + free(rx_buf); + free(tx_buf); + free(rx_context); + free(tx_context); +} + +/* Test fi_trecvmsg using FI_PEEK and FI_CLAIM flags to search unexpected + * message list and claim the message. + */ +Test(tagged, ux_claim) +{ + test_ux_claim(4, 1024); +} + +Test(tagged, ux_claim_rdzv) +{ + test_ux_claim(4, 65536); +} + +#define PEEK_ORDER_SEND_COUNT 5 +#define PEEK_ORDER_TAG 0x1234ULL + +static void verify_peek_claim_order_same_tag(size_t xfer_base_size, bool claim) +{ + void *buf; + struct fi_context context; + int i; + int ret; + struct fi_cq_tagged_entry cqe; + fi_addr_t from; + struct fi_msg_tagged tmsg = {}; + struct iovec iovec; + size_t buf_size = xfer_base_size + (PEEK_ORDER_SEND_COUNT - 1); + size_t xfer_size; + + buf = malloc(buf_size); + cr_assert_not_null(buf); + + /* Issue sends unexpected to target. Same tagged is used with different + * transfer size. Transfer size identifies operation order. + */ + for (i = 0; i < PEEK_ORDER_SEND_COUNT; i++) { + ret = fi_tsend(cxit_ep, buf, xfer_base_size + i, NULL, + cxit_ep_fi_addr, PEEK_ORDER_TAG, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed: %d", ret); + } + + /* Receives should be processed in order. Order is incrementing receive + * size. + */ + iovec.iov_base = buf; + iovec.iov_len = buf_size; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_ORDER_TAG; + tmsg.ignore = 0; + tmsg.context = &context; + + for (i = 0; i < PEEK_ORDER_SEND_COUNT; i++) { + xfer_size = xfer_base_size + i; + + ret = wait_peek(cxit_ep_fi_addr, PEEK_ORDER_TAG, 0, + xfer_size, tmsg.context, claim); + cr_assert_eq(ret, FI_SUCCESS, "try_peek failed: %d", ret); + + /* With claim, subsequent FI_PEEK without FI_CLAIM should always + * return next message. + */ + if (claim && i < (PEEK_ORDER_SEND_COUNT - 1)) { + ret = wait_peek(cxit_ep_fi_addr, PEEK_ORDER_TAG, 0, + xfer_size + 1, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "try_peek failed: %d", + ret); + } + + /* Recieve unexpected message. If message is FI_CLAIM, + * FI_CONTEXT buffer contains data to progress receive. + */ + ret = fi_trecvmsg(cxit_ep, &tmsg, claim ? FI_CLAIM : 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed: %d", ret); + + do { + /* Process TX CQ (if needed). 
*/ + fi_cq_read(cxit_tx_cq, NULL, 0); + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + cr_assert_eq(from, cxit_ep_fi_addr, + "Invalid user id: expected=%#lx got=%#lx", + cxit_ep_fi_addr, from); + validate_rx_event_mask(&cqe, tmsg.context, xfer_size, + FI_RECV | FI_TAGGED, + NULL, 0, PEEK_ORDER_TAG, 0); + } + + free(buf); +} + +Test(tagged, verify_peek_order_same_tag_idc) +{ + verify_peek_claim_order_same_tag(0, false); +} + +Test(tagged, verify_peek_order_same_tag_eager) +{ + verify_peek_claim_order_same_tag(257, false); +} + +Test(tagged, verify_peek_order_same_tag_rendezvous) +{ + verify_peek_claim_order_same_tag(1048576, false); +} + +Test(tagged, verify_claim_order_same_tag_idc) +{ + verify_peek_claim_order_same_tag(0, true); +} + +Test(tagged, verify_claim_order_same_tag_eager) +{ + verify_peek_claim_order_same_tag(257, true); +} + +Test(tagged, verify_claim_order_same_tag_rendezvous) +{ + verify_peek_claim_order_same_tag(1048576, true); +} + +/* Test MQD get of unexpected message list */ +void verify_ux_dump(int num, ssize_t msg_len) +{ + ssize_t ret; + size_t count; + size_t ux_count; + size_t ux_ret_count; + struct fi_cq_tagged_entry *cq_entry; + fi_addr_t *src_addr; + uint8_t *tx_buf; + ssize_t tx_len = msg_len; + uint8_t *rx_buf; + ssize_t rx_len = msg_len; + struct fi_cq_tagged_entry cqe; + struct fi_msg_tagged tmsg = {}; + struct iovec iovec; + int i; + int tx_comp = 0; + fi_addr_t from; + + rx_buf = aligned_alloc(s_page_size, rx_len * num); + cr_assert(rx_buf); + tx_buf = aligned_alloc(s_page_size, tx_len * num); + cr_assert(tx_buf); + + /* Send messages to build the unexpected list */ + for (i = 0; i < num; i++) { + memset(&tx_buf[i * tx_len], 0xa0 + i, tx_len); + iovec.iov_base = &tx_buf[i * tx_len]; + iovec.iov_len = tx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &tmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %" PRId64, + ret); + } + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + + /* Call first to get number of UX entries */ + ux_ret_count = dom_ops->ep_get_unexp_msgs(cxit_ep, NULL, 0, + NULL, &ux_count); + cr_assert_eq(ux_ret_count, 0, "Num entries returned"); + count = ux_count; + + cq_entry = calloc(ux_count, sizeof(*cq_entry)); + ux_ret_count = dom_ops->ep_get_unexp_msgs(cxit_ep, cq_entry, count, + NULL, &ux_count); + cr_assert(ux_ret_count <= count, "Number UX returned <= count"); + cr_assert_eq(ux_ret_count, num, "Number UX returned wrong"); + + for (i = 0; i < ux_ret_count; i++) { + cr_assert(cq_entry[i].op_context == NULL, "Context"); + cr_assert(cq_entry[i].buf == NULL, "Buf"); + cr_assert(cq_entry[i].tag == PEEK_TAG_BASE + i, "Tag match"); + cr_assert(cq_entry[i].len == tx_len, "Length %ld", + cq_entry[i].len); + cr_assert(cq_entry[i].flags & FI_TAGGED, "FI_TAGGED"); + cr_assert(!(cq_entry[i].flags & FI_REMOTE_CQ_DATA), + "FI_REMOTE_CQ_DATA"); + } + + /* Get entries with source address */ + src_addr = calloc(ux_count, sizeof(*src_addr)); + ux_ret_count = dom_ops->ep_get_unexp_msgs(cxit_ep, cq_entry, count, + src_addr, &ux_count); + cr_assert(ux_ret_count <= count, "Number UX returned <= count"); + cr_assert_eq(ux_ret_count, num, "Number UX returned wrong"); + + for (i = 0; i < ux_ret_count; i++) + cr_assert_eq(src_addr[i], 
cxit_ep_fi_addr, "Source address"); + + /* Receive all unexpected messages */ + for (i = 0; i < num; i++) { + ret = fi_trecv(cxit_ep, &rx_buf[i * rx_len], rx_len, NULL, + cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %ld", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert(ret == 1); + cr_assert_eq(from, cxit_ep_fi_addr, "Invalid source address"); + + validate_rx_event(&cqe, NULL, rx_len, + FI_TAGGED | FI_RECV, NULL, 0, + PEEK_TAG_BASE + i); + } + + /* Verify received data */ + for (i = 0; i < num; i++) { + ret = memcmp(&tx_buf[i * tx_len], &rx_buf[i * rx_len], tx_len); + cr_assert_eq(ret, 0, "RX buffer data mismatch for msg %d", i); + } + + /* Wait for TX async events to complete, and validate */ + tx_comp = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + if (ret == 1) + tx_comp++; + cr_assert(ret == 1 || ret == -FI_EAGAIN, + "Bad fi_cq_read return %ld", ret); + } while (tx_comp < num); + cr_assert_eq(tx_comp, num, + "Peek tsendmsg only %d TX completions read", tx_comp); + + free(src_addr); + free(cq_entry); + free(tx_buf); +} + +Test(tagged, ux_dump_eager) +{ + verify_ux_dump(4, 512); +} + +Test(tagged, ux_dump_rdzv) +{ + verify_ux_dump(4, 16384); +} + +/* Test DIRECTED_RECV send/recv */ +void directed_recv(bool logical) +{ + int i, ret; + uint8_t *recv_buf, + *fake_recv_buf, + *send_buf; + int recv_len = 0x1000; + int send_len = 0x1000; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; +#define N_FAKE_ADDRS 3 + struct cxip_addr fake_ep_addrs[N_FAKE_ADDRS+1]; + fi_addr_t from; + + if (logical) + cxit_av_attr.flags = FI_SYMMETRIC; + cxit_setup_enabled_ep(); + + /* Create multiple logical names for the local EP address */ + for (i = 0; i < N_FAKE_ADDRS; i++) { + fake_ep_addrs[i].nic = i + 0x41c; + fake_ep_addrs[i].pid = i + 0x21; + } + + ret = fi_av_insert(cxit_av, (void *)fake_ep_addrs, 3, NULL, 0, NULL); + cr_assert(ret == 3); + + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + recv_buf = calloc(recv_len, 1); + cr_assert(recv_buf); + + fake_recv_buf = calloc(recv_len, 1); + cr_assert(fake_recv_buf); + + send_buf = malloc(send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post an RX buffer matching each EP name that won't be targeted */ + for (i = 0; i < N_FAKE_ADDRS; i++) { + ret = fi_trecv(cxit_ep, fake_recv_buf, recv_len, NULL, i, 0, 0, + NULL); + cr_assert(ret == FI_SUCCESS); + } + + /* Post short RX buffer matching EP name 3 */ + ret = fi_trecv(cxit_ep, recv_buf, 64, NULL, 3, 0, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Post long RX buffer matching EP name 3 */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, 3, 0, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Send short message to self (FI address 3) */ + send_len = 64; + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, 3, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == 3, "Invalid source address, exp: 3 got: %lu", from); + + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == 
-FI_EAGAIN); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(fake_recv_buf[i], 0, + "fake data corrupted, element[%d] err=%d\n", + i, err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Send long message to self (FI address 3) */ + memset(recv_buf, 0, recv_len); + send_len = 0x1000; + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, 3, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == 3, "Invalid source address, exp: 3 got: %lu", from); + + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(fake_recv_buf[i], 0, + "fake data corrupted, element[%d] err=%d\n", + i, err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Send long UX message to self (FI address 3) */ + memset(recv_buf, 0, recv_len); + send_len = 0x1000; + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, 3, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + sleep(1); + + /* Post long RX buffer matching EP name 3 */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, 3, 0, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + + /* Progress */ + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == 3, "Invalid source address, exp: 3 got: %lu", from); + + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(fake_recv_buf[i], 0, + "fake data corrupted, element[%d] err=%d\n", + i, err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(fake_recv_buf); + free(recv_buf); + + cxit_teardown_tagged(); +} + +Test(tagged_directed, directed) +{ + directed_recv(false); +} + +Test(tagged_directed, directed_logical) +{ + directed_recv(true); +} + +/* Test unexpected send/recv */ +#define RDZV_TAG (46) + +struct tagged_thread_args { + uint8_t *buf; + size_t len; + struct fi_cq_tagged_entry *cqe; + fi_addr_t src_addr; + size_t io_num; + size_t tag; + void *context; +}; + +static void *tsend_worker(void *data) +{ + int ret; + struct tagged_thread_args *args; + uint64_t tag; + + args = (struct tagged_thread_args *)data; + tag = args->tag; + + /* Send 64 bytes to FI address 0 
(self) */ + ret = fi_tsend(cxit_ep, args->buf, args->len, NULL, cxit_ep_fi_addr, + tag, NULL); + cr_assert_eq(ret, FI_SUCCESS, "%s %ld: unexpected ret %d", __func__, + args->io_num, ret); + + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(cxit_tx_cq, args->cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "%s %ld: unexpected ret %d", __func__, + args->io_num, ret); + + pthread_exit(NULL); +} + +static void *trecv_worker(void *data) +{ + int ret; + struct tagged_thread_args *args; + uint64_t tag; + + args = (struct tagged_thread_args *)data; + tag = args->tag; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, args->buf, args->len, NULL, FI_ADDR_UNSPEC, tag, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "%s %ld: unexpected ret %d", __func__, + args->io_num, ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, args->cqe, 1, &args->src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "%s %ld: unexpected ret %d", __func__, + args->io_num, ret); + + pthread_exit(NULL); +} + +Test(tagged, ux_sw_rdzv) +{ + size_t i; + int ret; + uint8_t *recv_buf, *send_buf; + size_t buf_len = 2 * 1024 * 1024; + int recv_len = 4 * 1024; + int send_len = 4 * 1024; + struct fi_cq_tagged_entry rx_cqe, tx_cqe; + pthread_t threads[2]; + struct tagged_thread_args args[2]; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(recv_buf); + memset(recv_buf, 0, buf_len); + + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(send_buf); + + for (i = 0; i < buf_len; i++) + send_buf[i] = i + 0xa0; + + args[0].buf = send_buf; + args[0].len = send_len; + args[0].cqe = &tx_cqe; + args[0].io_num = 0; + args[0].tag = RDZV_TAG; + args[1].buf = recv_buf; + args[1].len = recv_len; + args[1].cqe = &rx_cqe; + args[1].io_num = 1; + args[1].tag = RDZV_TAG; + + /* Give some time for the message to move */ + cr_assert_arr_neq(recv_buf, send_buf, buf_len); + + /* start tsend thread */ + ret = pthread_create(&threads[0], &attr, tsend_worker, + (void *)&args[0]); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* start trecv thread */ + ret = pthread_create(&threads[1], &attr, trecv_worker, + (void *)&args[1]); + cr_assert_eq(ret, 0, "Recv thread create failed %d", ret); + + /* Wait for the threads to complete */ + ret = pthread_join(threads[0], NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + ret = pthread_join(threads[1], NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + pthread_attr_destroy(&attr); + + /* Validate sent data */ + cr_expect_arr_eq(recv_buf, send_buf, recv_len); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + validate_rx_event(&rx_cqe, NULL, recv_len, FI_TAGGED | FI_RECV, NULL, + 0, args[0].tag); + cr_assert_eq(args[1].src_addr, cxit_ep_fi_addr, + "Invalid source address"); + + free(send_buf); + free(recv_buf); +} + +Test(tagged, expected_sw_rdzv) +{ + size_t i; + int ret; + uint8_t *recv_buf, *send_buf; + size_t buf_len = 2 * 1024 * 1024; + int recv_len = 4 * 1024; + int send_len = 4 * 1024; + struct fi_cq_tagged_entry rx_cqe, tx_cqe; + pthread_t threads[2]; + struct tagged_thread_args args[2]; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(recv_buf); + 
memset(recv_buf, 0, buf_len); + + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(send_buf); + + for (i = 0; i < buf_len; i++) + send_buf[i] = i + 0xa0; + + args[0].buf = send_buf; + args[0].len = send_len; + args[0].cqe = &tx_cqe; + args[0].io_num = 0; + args[0].tag = RDZV_TAG; + args[1].buf = recv_buf; + args[1].len = recv_len; + args[1].cqe = &rx_cqe; + args[1].io_num = 1; + args[1].tag = RDZV_TAG; + + /* Give some time for the message to move */ + cr_assert_arr_neq(recv_buf, send_buf, buf_len); + + /* Start trecv thread first so the buffer is ready when the data is sent + */ + ret = pthread_create(&threads[1], &attr, trecv_worker, + (void *)&args[1]); + cr_assert_eq(ret, 0, "Recv thread create failed %d", ret); + + sleep(1); + + /* Start tsend thread to send the data into the ready buffer */ + ret = pthread_create(&threads[0], &attr, tsend_worker, + (void *)&args[0]); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + /* Wait for the threads to complete */ + ret = pthread_join(threads[0], NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + ret = pthread_join(threads[1], NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + pthread_attr_destroy(&attr); + + /* Validate sent data */ + cr_expect_arr_eq(recv_buf, send_buf, recv_len); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + validate_rx_event(&rx_cqe, NULL, recv_len, FI_TAGGED | FI_RECV, NULL, + 0, args[0].tag); + cr_assert_eq(args[1].src_addr, cxit_ep_fi_addr, + "Invalid source address"); + + free(send_buf); + free(recv_buf); +} + +#define NUM_IOS (12) + +struct tagged_event_args { + struct fid_cq *cq; + struct fi_cq_tagged_entry *cqe; + size_t io_num; +}; + +static void *tagged_evt_worker(void *data) +{ + int ret; + struct tagged_event_args *args; + + args = (struct tagged_event_args *)data; + + for (size_t i = 0; i < args->io_num; i++) { + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(args->cq, &args->cqe[i], 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "%ld: unexpected ret %d", i, + ret); + } + + pthread_exit(NULL); +} + +Test(tagged, multitudes_sw_rdzv, .timeout=60) +{ + int ret; + size_t buf_len = 4 * 1024; + struct fi_cq_tagged_entry rx_cqe[NUM_IOS]; + struct fi_cq_tagged_entry tx_cqe[NUM_IOS]; + struct tagged_thread_args tx_args[NUM_IOS]; + struct tagged_thread_args rx_args[NUM_IOS]; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .cqe = tx_cqe, + .io_num = NUM_IOS, + }; + struct tagged_event_args rx_evt_args = { + .cq = cxit_rx_cq, + .cqe = rx_cqe, + .io_num = NUM_IOS, + }; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + /* Issue the Sends */ + for (size_t tx_io = 0; tx_io < NUM_IOS; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + ret = fi_tsend(cxit_ep, tx_args[tx_io].buf, tx_args[tx_io].len, + NULL, cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, tagged_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* Force onloading of UX entries if 
operating in SW EP mode */
+	fi_cq_read(cxit_rx_cq, &rx_cqe, 0);
+
+	/* Peek for each tag on UX list */
+	for (size_t rx_io = 0; rx_io < NUM_IOS; rx_io++) {
+		ret = try_peek(FI_ADDR_UNSPEC, rx_io, 0, buf_len, NULL, false);
+		cr_assert_eq(ret, FI_SUCCESS, "peek of UX message failed");
+	}
+
+	/* Issue the Receives */
+	for (size_t rx_io = 0; rx_io < NUM_IOS; rx_io++) {
+		rx_args[rx_io].len = buf_len;
+		rx_args[rx_io].tag = rx_io;
+		rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len);
+		cr_assert_not_null(rx_args[rx_io].buf);
+		memset(rx_args[rx_io].buf, 0, buf_len);
+
+		ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, rx_args[rx_io].len,
+			       NULL, FI_ADDR_UNSPEC, rx_args[rx_io].tag,
+			       0, NULL);
+		cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d",
+			     rx_io, ret);
+	}
+
+	/* Start processing Receive events */
+	ret = pthread_create(&rx_thread, &attr, tagged_evt_worker,
+			     (void *)&rx_evt_args);
+	cr_assert_eq(ret, 0, "Receive thread create failed %d", ret);
+
+	/* Wait for the RX/TX event threads to complete */
+	ret = pthread_join(tx_thread, NULL);
+	cr_assert_eq(ret, 0, "Send thread join failed %d", ret);
+
+	ret = pthread_join(rx_thread, NULL);
+	cr_assert_eq(ret, 0, "Recv thread join failed %d", ret);
+
+	/* Validate results */
+	for (size_t io = 0; io < NUM_IOS; io++) {
+		/* Validate sent data */
+		cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len);
+		validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL);
+		validate_rx_event(&rx_cqe[io], NULL, buf_len,
+				  FI_TAGGED | FI_RECV, NULL,
+				  0, tx_args[rx_cqe[io].tag].tag);
+
+		free(tx_args[io].buf);
+		free(rx_args[io].buf);
+	}
+
+	pthread_attr_destroy(&attr);
+}
+
+struct multitudes_params {
+	size_t length;
+	size_t num_ios;
+	bool peek;
+	bool claim;
+};
+
+/* This is a parameterized test that executes an arbitrary set of tagged
+ * send/recv operations. The test is configurable via two parameters: length
+ * is the size of the data to be transferred, and num_ios sets the number of
+ * matching send/recv operations launched in each test.
+ *
+ * The test first executes fi_tsend() for `num_ios` buffers. A background
+ * thread is launched to start processing the Cassini events for the Send
+ * operations. The test then pauses for 1 second. After the pause, the test
+ * optionally uses fi_trecvmsg() with FI_PEEK to search the unexpected list
+ * and verify the sent messages are on it. The test then executes fi_trecv()
+ * to receive the buffers that were previously sent. Another background
+ * thread is launched to process the receive events. When all send and
+ * receive operations have completed, the threads exit and the results are
+ * compared to ensure the expected data was returned.
+ *
+ * The length parameter determines how the send and receive operations are
+ * processed: lengths of 2 kiB and below use the eager data path, while
+ * larger buffers cause the SW rendezvous data path to be used.
+ */ +void do_multitudes(struct multitudes_params *param) +{ + int ret; + size_t buf_len = param->length; + struct fi_cq_tagged_entry *rx_cqe; + struct fi_cq_tagged_entry *tx_cqe; + struct tagged_thread_args *tx_args; + struct tagged_thread_args *rx_args; + struct fi_context *rx_ctxts; + struct iovec iovec; + struct fi_msg_tagged tmsg = {}; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .io_num = param->num_ios, + }; + struct tagged_event_args rx_evt_args = { + .cq = cxit_rx_cq, + .io_num = param->num_ios, + }; + char *rx_mode; + bool claim = param->claim; + + /* TODO: Remove after HW FI_CLAIM support is implemented */ + rx_mode = getenv("FI_CXI_RX_MATCH_MODE"); + if (claim && (!rx_mode || strcmp(rx_mode, "software"))) { + cr_assert(1); + return; + } + + tx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(tx_cqe); + + rx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(rx_cqe); + + tx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(tx_args); + + rx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(rx_args); + + rx_ctxts = calloc(param->num_ios, sizeof(struct fi_context)); + cr_assert_not_null(rx_ctxts); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + tx_evt_args.cqe = tx_cqe; + rx_evt_args.cqe = rx_cqe; + + /* Issue the Sends */ + for (size_t tx_io = 0; tx_io < param->num_ios; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + do { + ret = fi_tsend(cxit_ep, tx_args[tx_io].buf, + tx_args[tx_io].len, NULL, + cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, tagged_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + /* Optional peek to see if all send tags are found on ux list */ + if (param->peek) { + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + if (claim) + rx_args[rx_io].context = &rx_ctxts[rx_io]; + + ret = try_peek(FI_ADDR_UNSPEC, rx_io, 0, buf_len, + claim ? 
&rx_ctxts[rx_io] : NULL, claim); + cr_assert_eq(ret, FI_SUCCESS, + "peek of UX message failed"); + } + } + + /* Issue the Receives */ + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + if (claim) { + iovec.iov_base = rx_args[rx_io].buf; + iovec.iov_len = rx_args[rx_io].len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = FI_ADDR_UNSPEC; + tmsg.tag = rx_args[rx_io].tag; + tmsg.ignore = 0; + tmsg.context = &rx_ctxts[rx_io]; + + ret = fi_trecvmsg(cxit_ep, &tmsg, FI_CLAIM); + } else { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, + rx_args[rx_io].tag, 0, NULL); + } + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Start processing Receive events */ + ret = pthread_create(&rx_thread, &attr, tagged_evt_worker, + (void *)&rx_evt_args); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Wait for the RX/TX event threads to complete */ + ret = pthread_join(tx_thread, NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + + ret = pthread_join(rx_thread, NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + /* Validate results */ + for (size_t io = 0; io < param->num_ios; io++) { + /* Validate sent data */ + cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len); + + validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL); + validate_rx_event(&rx_cqe[io], claim ? + rx_args[rx_cqe[io].tag].context : NULL, + buf_len, FI_TAGGED | FI_RECV, NULL, + 0, tx_args[rx_cqe[io].tag].tag); + free(tx_args[io].buf); + free(rx_args[io].buf); + } + + pthread_attr_destroy(&attr); + free(rx_cqe); + free(tx_cqe); + free(tx_args); + free(rx_args); + free(rx_ctxts); +} + +ParameterizedTestParameters(tagged, multitudes) +{ + size_t param_sz; + + static struct multitudes_params params[] = { + {.length = 1024, /* Eager */ + .num_ios = 10, + .peek = true}, + {.length = 2 * 1024, /* Eager */ + .num_ios = 15, + .peek = true}, + {.length = 4 * 1024, /* Rendezvous */ + .num_ios = 12, + .peek = true}, + {.length = 128 * 1024, /* Rendezvous */ + .num_ios = 25, + .peek = true}, + {.length = 1024, /* Eager */ + .num_ios = 10, + .peek = true, + .claim = true, + }, + {.length = 2 * 1024, /* Eager */ + .num_ios = 15, + .peek = true, + .claim = true, + }, + {.length = 4 * 1024, /* Rendezvous */ + .num_ios = 12, + .peek = true, + .claim = true, + }, + {.length = 128 * 1024, /* Rendezvous */ + .num_ios = 25, + .peek = true, + .claim = true, + }, + {.length = 8 * 1024, /* Rendezvous ID > 8 bits */ + .num_ios = 350, + .peek = true, + .claim = false, + }, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct multitudes_params, params, + param_sz); +} + +ParameterizedTest(struct multitudes_params *param, tagged, multitudes, .timeout=60) +{ + do_multitudes(param); +} + +/* Use multitudes test to force transition from hardware + * matching to software matching. LE_POOL resources should + * be set to 60. 
+ */ +ParameterizedTestParameters(tagged, hw2sw_multitudes) +{ + size_t param_sz; + + static struct multitudes_params params[] = { + {.length = 1024, /* Eager */ + .num_ios = 100, + .peek = true + }, + {.length = 2 * 2048, /* Rendezvous */ + .num_ios = 100, + .peek = true + }, + {.length = 8 * 2048, /* Rendezvous */ + .num_ios = 100, + .peek = true + }, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct multitudes_params, params, + param_sz); +} + +/* This test will only require HW to SW matching transition if the + * LE pool resources have been limited (60) and if running in HW offloaded + * mode with RDZV offloaded and the eager long protocol is not used. + */ +ParameterizedTest(struct multitudes_params *param, tagged, hw2sw_multitudes, + .timeout=60, .disabled=false) +{ + do_multitudes(param); +} + +/* This will only test hybrid matching transition when LE resources + * are restricted to no more than 60. + */ +Test(tagged, hw2sw_hybrid_matching, .timeout=60) +{ + int ret; + size_t buf_len = 4096; + struct fi_cq_tagged_entry *rx_cqe; + struct fi_cq_tagged_entry *tx_cqe; + struct tagged_thread_args *tx_args; + struct tagged_thread_args *rx_args; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .io_num = 100, + }; + struct tagged_event_args rx_evt_args = { + .cq = cxit_rx_cq, + .io_num = 100, + }; + + tx_cqe = calloc(100, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(tx_cqe); + + rx_cqe = calloc(100, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(rx_cqe); + + tx_args = calloc(100, sizeof(struct tagged_thread_args)); + cr_assert_not_null(tx_args); + + rx_args = calloc(100, sizeof(struct tagged_thread_args)); + cr_assert_not_null(rx_args); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + tx_evt_args.cqe = tx_cqe; + rx_evt_args.cqe = rx_cqe; + + /* Issue 25 receives for tags 25-49 to pre-load priority list */ + for (size_t rx_io = 25; rx_io < 50; rx_io++) { + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, rx_args[rx_io].tag, + 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Start processing Receive events */ + ret = pthread_create(&rx_thread, &attr, tagged_evt_worker, + (void *)&rx_evt_args); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Issue all of the Sends exhausting resources */ + for (size_t tx_io = 0; tx_io < 100; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + do { + ret = fi_tsend(cxit_ep, tx_args[tx_io].buf, + tx_args[tx_io].len, NULL, + cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, 
tagged_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + /* Issue the remainder of the receives */ + for (size_t rx_io = 0; rx_io < 100; rx_io++) { + if (rx_io >= 25 && rx_io < 50) + continue; + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, rx_args[rx_io].tag, + 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Wait for the RX/TX event threads to complete */ + ret = pthread_join(tx_thread, NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + + ret = pthread_join(rx_thread, NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + /* Validate results */ + for (size_t io = 0; io < 100; io++) { + /* Validate sent data */ + cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len); + + validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL); + + validate_rx_event(&rx_cqe[io], NULL, buf_len, + FI_TAGGED | FI_RECV, NULL, + 0, tx_args[rx_cqe[io].tag].tag); + + free(tx_args[io].buf); + free(rx_args[io].buf); + } + + pthread_attr_destroy(&attr); + free(rx_cqe); + free(tx_cqe); + free(tx_args); + free(rx_args); +} + +#define RECV_INIT 0x77 +#define SEND_INIT ~RECV_INIT + +void do_msg(uint8_t *send_buf, size_t send_len, uint64_t send_tag, + uint8_t *recv_buf, size_t recv_len, uint64_t recv_tag, + uint64_t recv_ignore, bool send_first, size_t buf_size, + bool tagged, bool wdata, uint64_t data, bool match_complete) +{ + int i, ret; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + bool sent = false, + recved = false, + truncated = false; + struct fi_cq_err_entry err_cqe = {}; + size_t recved_len; + static int send_cnt; + static int recv_cnt; + static int recv_errcnt; + + struct fi_msg_tagged tsmsg = {}; + struct fi_msg smsg = {}; + struct iovec siovec; + uint64_t send_flags = 0; + + memset(recv_buf, RECV_INIT, buf_size); + + for (i = 0; i < buf_size; i++) { + if (i < send_len) + send_buf[i] = i + 0xa0; + else + send_buf[i] = SEND_INIT; + } + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + smsg.data = data; + + tsmsg.msg_iov = &siovec; + tsmsg.iov_count = 1; + tsmsg.addr = cxit_ep_fi_addr; + tsmsg.tag = send_tag; + tsmsg.ignore = 0; + tsmsg.context = NULL; + tsmsg.data = data; + + /* FI_REMOTE_CQ_DATA flag is not strictly necessary. 
*/ + if (wdata) + send_flags |= FI_REMOTE_CQ_DATA; + if (match_complete) + send_flags |= FI_MATCH_COMPLETE; + + if (send_first) { + if (tagged) { + ret = fi_tsendmsg(cxit_ep, &tsmsg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsendmsg failed %d", ret); + } else { + ret = fi_sendmsg(cxit_ep, &smsg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, + "fi_sendmsg failed %d", ret); + } + + /* Progress send to ensure it arrives unexpected */ + i = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + sent = true; + break; + } + cr_assert_eq(ret, -FI_EAGAIN, + "send failed %d", ret); + } while (i++ < 100000); + } + + /* Post RX buffer */ + + if (tagged) { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, recv_tag, recv_ignore, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + } else { + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + } + + if (!send_first) { + if (tagged) { + ret = fi_tsendmsg(cxit_ep, &tsmsg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsendmsg failed %d", ret); + } else { + ret = fi_sendmsg(cxit_ep, &smsg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, + "fi_sendmsg failed %d", ret); + } + } + + /* Gather both events, ensure progress on both sides. */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + cr_assert_eq(recved, false); + recved = true; + } else if (ret == -FI_EAVAIL) { + cr_assert_eq(recved, false); + recved = true; + truncated = true; + + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + cr_assert_eq(sent, false); + sent = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } + } while (!(sent && recved)); + + if (truncated) { + cr_assert(err_cqe.op_context == NULL, + "Error RX CQE Context mismatch"); + cr_assert(err_cqe.flags == + ((tagged ? FI_TAGGED : FI_MSG) | FI_RECV | + (wdata ? FI_REMOTE_CQ_DATA : 0UL)), + "Error RX CQE flags mismatch"); + cr_assert(err_cqe.len == recv_len, + "Invalid Error RX CQE length, got: %ld exp: %ld", + err_cqe.len, recv_len); + cr_assert(err_cqe.buf == 0, "Invalid Error RX CQE address"); + cr_assert(err_cqe.data == (wdata ? data : 0UL), + "Invalid Error RX CQE data"); + cr_assert(err_cqe.tag == send_tag, "Invalid Error RX CQE tag"); + cr_assert(err_cqe.olen == (send_len - recv_len), + "Invalid Error RX CQE olen, got: %ld exp: %ld", + err_cqe.olen, send_len - recv_len); + cr_assert(err_cqe.err == FI_ETRUNC, + "Invalid Error RX CQE code\n"); + cr_assert(err_cqe.prov_errno == C_RC_OK, + "Invalid Error RX CQE errno"); + cr_assert(err_cqe.err_data == NULL); + cr_assert(err_cqe.err_data_size == 0); + recved_len = err_cqe.len; + } else { + validate_rx_event(&rx_cqe, NULL, send_len, + (tagged ? FI_TAGGED : FI_MSG) | FI_RECV + | (wdata ? FI_REMOTE_CQ_DATA : 0UL), + NULL, wdata ? data : 0UL, send_tag); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + recved_len = rx_cqe.len; + } + + validate_tx_event(&tx_cqe, (tagged ? 
FI_TAGGED : FI_MSG) | FI_SEND, + NULL); + + /* Validate sent data */ + for (i = 0; i < buf_size; i++) { + uint8_t cmp = RECV_INIT; + if (i < recved_len) + cmp = send_buf[i]; + + cr_expect_eq(recv_buf[i], cmp, + "data mismatch, len: %ld, element[%d], exp=0x%x saw=0x%x, err=%d\n", + recv_len, i, cmp, recv_buf[i], err++); + if (err >= 10) + break; + } + cr_assert_eq(err, 0, "%d data errors seen\n", err); + + /* Check counters */ + send_cnt++; + + if (truncated) + recv_errcnt++; + else + recv_cnt++; + + while (fi_cntr_read(cxit_send_cntr) != send_cnt) + ; + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + while (fi_cntr_readerr(cxit_recv_cntr) != recv_errcnt) + ; + + /* Error count is 7 bits */ + if (recv_errcnt == 127) { + recv_errcnt = 0; + fi_cntr_seterr(cxit_recv_cntr, 0); + } +} + +#define BUF_SIZE (8*1024) +#define SEND_MIN 64 +#define SEND_INC 64 +#define TAG 0x333333333333 +#define IGNORE_ALL (-1ULL & CXIP_TAG_MASK) +#define HDR_DATA 0xabcdabcdabcdabcd + +struct tagged_rx_params { + size_t buf_size; + size_t send_min; + size_t send_inc; + uint64_t send_tag; + int recv_len_off; + uint64_t recv_tag; + uint64_t ignore; + bool ux; + bool tagged; + bool wdata; + uint64_t data; +}; + +static struct tagged_rx_params params[] = { + {.buf_size = BUF_SIZE, /* equal length no data */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true}, + + /* Use CQ data */ + + {.buf_size = BUF_SIZE, /* truncate */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = -8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = -8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = 0, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = 0, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = true}, + {.buf_size = BUF_SIZE, /* excess */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, 
+ .recv_len_off = 8, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = 8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = 8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + + /* Un-tagged variants */ + + {.buf_size = BUF_SIZE, /* equal length no data */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = false}, + + /* Use CQ data */ + + {.buf_size = BUF_SIZE, /* truncate */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = 0, + .ignore = 0, + .ux = true, + 
.tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, +}; + +ParameterizedTestParameters(tagged, rx) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct tagged_rx_params, params, + param_sz); +} + +ParameterizedTest(struct tagged_rx_params *param, tagged, rx, .timeout=30) +{ + uint8_t *recv_buf, + *send_buf; + size_t send_len; + + recv_buf = aligned_alloc(s_page_size, param->buf_size); + cr_assert(recv_buf); + + send_buf = aligned_alloc(s_page_size, param->buf_size); + cr_assert(send_buf); + + for (send_len = param->send_min; + send_len <= param->buf_size; + send_len += param->send_inc) { + do_msg(send_buf, send_len, param->send_tag, + recv_buf, send_len + param->recv_len_off, + param->recv_tag, param->ignore, param->ux, + param->buf_size, param->tagged, + param->wdata, param->data, false); + do_msg(send_buf, send_len, param->send_tag, + recv_buf, send_len + param->recv_len_off, + param->recv_tag, param->ignore, param->ux, + param->buf_size, param->tagged, + param->wdata, param->data, true); + } + + free(send_buf); + free(recv_buf); +} + +#define GB 1024*1024*1024 +Test(tagged, rput_abort, .disabled=true) +{ + size_t recv_len = GB; + size_t send_len = GB; + void *recv_buf; + void *send_buf; + int ret; + uint64_t val __attribute__((unused)); + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + sleep(1); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + ret = fi_tsend(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsend failed %d", ret); + + sleep(1); + val = *(uint64_t *)0; +} + + +Test(tagged, oflow_replenish, .timeout=180) +{ + uint8_t *recv_buf, + *send_buf; + size_t send_len = 1024; + int i; + + recv_buf = aligned_alloc(s_page_size, send_len); + cr_assert(recv_buf); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < 6*1024+1; i++) { + do_msg(send_buf, send_len, 0, + recv_buf, send_len, 0, 0, + true, send_len, true, false, 0, false); + } + + free(send_buf); + free(recv_buf); +} + +/* Test outstanding send cleanup */ +Test(tagged, cleanup_sends) +{ + int i, ret; + uint8_t *send_buf; + int send_len = 64; + int sends = 5; + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + /* Send 64 bytes to self */ + for (i = 0; i < sends; i++) { + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + /* Close Endpoint with outstanding Sends */ +} + +/* Test UX cleanup */ +Test(tagged, ux_cleanup) +{ + int i, ret; + uint8_t *send_buf; + int send_len = 64; + struct fi_cq_tagged_entry cqe; + int sends = 5; + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + /* Send 64 bytes to self */ + for (i = 0; i < 
sends; i++) { + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + } + + validate_tx_event(&cqe, FI_TAGGED | FI_SEND, NULL); + + /* Wait for async event indicating data has been received */ + for (i = 0 ; i < 1000; i++) + fi_cq_readfrom(cxit_rx_cq, &cqe, 1, NULL); + + free(send_buf); + + /* Close Endpoint with UX sends on the RX Queue */ +} + +/* Test outstanding recv cleanup */ +Test(tagged, cleanup_recvs) +{ + int i, ret; + uint8_t *recv_buf; + int recv_len = 64; + int recvs = 5; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < recvs; i++) { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0x0, 0x0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + /* Close Endpoint with outstanding Receives */ +} + +/* Test outstanding recv cancel */ +Test(tagged, cancel_recvs) +{ + int i, ret; + uint8_t *recv_buf; + int recv_len = 64; + int recvs = 5; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < recvs; i++) { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0x0, 0x0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + for (i = 0; i < recvs; i++) { + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cancel failed %d", ret); + } +} + +/* Test cancel with no matching recv */ +Test(tagged, cancel_nomatch) +{ + int ret; + + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_ENOENT, "fi_cancel failed to fail %d", ret); +} + +/* Test outstanding recv cancel events */ +Test(tagged, cancel_recvs_sync) +{ + int i, ret; + uint8_t *recv_buf; + int recv_len = 64; + int recvs = 5; + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_err_entry err_cqe; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < recvs; i++) { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0x0, 0x0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + for (i = 0; i < recvs; i++) { + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cancel failed %d", ret); + } + + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_ENOENT, "fi_cancel failed to fail %d", ret); + + for (i = 0; i < recvs; i++) { + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + if (ret == -FI_EAVAIL) + break; + + cr_assert_eq(ret, -FI_EAGAIN, + "unexpected event %d", ret); + } while (1); + + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + cr_assert(err_cqe.op_context == NULL, + "Error RX CQE Context mismatch"); + cr_assert(err_cqe.flags == (FI_TAGGED | FI_RECV), + "Error RX CQE flags mismatch"); + cr_assert(err_cqe.err == FI_ECANCELED, + "Invalid Error RX CQE code\n"); + cr_assert(err_cqe.prov_errno == 0, + "Invalid Error RX CQE errno"); + } +} + +void cxit_setup_selective_completion(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + cxit_rx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = FI_COMPLETION; + cxit_fi_hints->rx_attr->op_flags = FI_COMPLETION; + cxit_setup_tagged(); +} + +/* Test selective completion behavior with RMA. 
*/ +Test(tagged_sel, selective_completion, + .init = cxit_setup_selective_completion, + .fini = cxit_teardown_tagged) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int buf_len = 0x1000; + int send_len; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged smsg = {}; + struct fi_msg_tagged rmsg = {}; + struct iovec siovec; + struct iovec riovec; + int recv_cnt = 0; + + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(recv_buf); + + riovec.iov_base = recv_buf; + riovec.iov_len = buf_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(send_buf); + + siovec.iov_base = send_buf; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + /* Normal writes generate completions */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + bool sent = false; + bool rcved = false; + + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, send_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + recv_cnt++; + + /* Send to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async events indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) + rcved = true; + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) + sent = true; + } while (!(sent && rcved)); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + } + + /* Request completions from fi_writemsg */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + bool sent = false; + bool rcved = false; + + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecvmsg(cxit_ep, &rmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + recv_cnt++; + + /* Send to self */ + siovec.iov_len = send_len; + ret = fi_tsendmsg(cxit_ep, &smsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async events indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) + rcved = true; + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) + sent = true; + } while (!(sent && rcved)); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + } + + /* 
Suppress completions using fi_writemsg */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_len = send_len; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + recv_cnt++; + + /* Send to self */ + siovec.iov_len = send_len; + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async events indicating data has been received */ + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Inject never generates an event */ + + send_len = 8; + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, send_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + recv_cnt++; + + /* Send 64 bytes to self */ + ret = fi_tinject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +void cxit_setup_selective_completion_suppress(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + cxit_rx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = 0; + cxit_fi_hints->rx_attr->op_flags = 0; + cxit_setup_tagged(); +} + +/* Test selective completion behavior with RMA. 
*/ +Test(tagged_sel, selective_completion_suppress, + .init = cxit_setup_selective_completion_suppress, + .fini = cxit_teardown_tagged) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int buf_len = 0x1000; + int send_len; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged smsg = {}; + struct fi_msg_tagged rmsg = {}; + struct iovec siovec; + struct iovec riovec; + int recv_cnt = 0; + + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(recv_buf); + + riovec.iov_base = recv_buf; + riovec.iov_len = buf_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(send_buf); + + siovec.iov_base = send_buf; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + /* Normal writes do not generate completions */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, send_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + recv_cnt++; + + /* Send to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async events indicating data has been received */ + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Request completions from fi_writemsg */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + bool sent = false; + bool rcved = false; + + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_len = send_len; + ret = fi_trecvmsg(cxit_ep, &rmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + recv_cnt++; + + /* Send to self */ + siovec.iov_len = send_len; + ret = fi_tsendmsg(cxit_ep, &smsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async events indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) + rcved = true; + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) + sent = true; + } while (!(sent && rcved)); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + } + + /* Suppress completions using fi_writemsg */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + memset(recv_buf, 0, send_len); + for (i = 0; i 
< send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_len = send_len; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + recv_cnt++; + + /* Send to self */ + siovec.iov_len = send_len; + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async events indicating data has been received */ + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Inject never generates an event */ + + send_len = 8; + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, send_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + recv_cnt++; + + /* Send 64 bytes to self */ + ret = fi_tinject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async events indicating data has been received */ + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +/* Test match complete */ +Test(tagged, match_comp) +{ + int i, j, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + for (j = 0; j < 100; j++) { + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_MATCH_COMPLETE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid 
source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* UX */ + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_MATCH_COMPLETE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Ensure no TX event is generated */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + } + + free(send_buf); + free(recv_buf); +} + +/* Test eager Send with FI_MORE */ +Test(tagged, esend_more) +{ + int i, ret; + uint8_t *recv_buf, + *recv_buf2, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + recv_buf2 = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf2); + memset(recv_buf2, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + /* Post two Receives */ + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + riovec.iov_base = recv_buf2; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Gather 2 Receive events */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | 
FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Gather 2 Send events */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(recv_buf2[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf2[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test rendezvous Send with FI_MORE */ +Test(tagged, rsend_more) +{ + int i, ret; + uint8_t *recv_buf, + *recv_buf2, + *send_buf; + int recv_len = 0x1000; + int send_len = 0x1000; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + recv_buf2 = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf2); + memset(recv_buf2, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + /* Post two Receives */ + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + riovec.iov_base = recv_buf2; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Gather 2 Receive events */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, 
FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Gather 2 Send events */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(recv_buf2[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf2[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test Receive with FI_MORE */ +Test(tagged, recv_more) +{ + int i, ret; + uint8_t *recv_buf, + *recv_buf2, + *send_buf; + int recv_len = 0x2000; + int send_len = 0x2000; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + struct cxip_ep *ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + + /* FI_MORE has no meaning if receives are not offloaded */ + if (!ep->ep_obj->rxc.msg_offload) { + cr_assert(1); + return; + } + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + recv_buf2 = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf2); + memset(recv_buf2, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + /* Perform 2 Sends */ + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Post two Receives */ + ret = fi_trecvmsg(cxit_ep, &rmsg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + riovec.iov_base = recv_buf2; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Gather 2 Receive events */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, 
"Invalid source address"); + + /* Gather 2 Send events */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(recv_buf2[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf2[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test flow control. + * + * Perform enough Sends to overwhelm target LEs. Flow control recovery is + * transparent. + * + * Post matching Receives and check data to validate correct ordering amid flow + * control recovery. + */ +Test(tagged, fc, .timeout = 180) +{ + int i, j, ret, tx_ret; + uint8_t *send_bufs; + uint8_t *send_buf; + int send_len = 64; + uint8_t *recv_buf; + int recv_len = 64; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + int nsends_concurrent = 3; /* must be less than the LE pool min. */ + int nsends = 14000; + int sends = 0; + uint64_t tag = 0xbeef; + fi_addr_t from; + + send_bufs = aligned_alloc(s_page_size, send_len * nsends_concurrent); + cr_assert(send_bufs); + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < nsends_concurrent - 1; i++) { + send_buf = send_bufs + (i % nsends_concurrent) * send_len; + memset(send_buf, i, send_len); + + tx_ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, tag, NULL); + } + + for (i = nsends_concurrent - 1; i < nsends; i++) { + send_buf = send_bufs + (i % nsends_concurrent) * send_len; + memset(send_buf, i, send_len); + + do { + tx_ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, tag, NULL); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + + /* Just progress */ + fi_cq_read(cxit_tx_cq, NULL, 0); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, FI_SUCCESS, "fi_tsend failed %d", tx_ret); + + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + if (!(++sends % 1000)) + printf("%u Sends complete.\n", sends); + } + + for (i = 0; i < nsends_concurrent - 1; i++) { + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + if (!(++sends % 1000)) + printf("%u Sends complete.\n", sends); + } + + for (i = 0; i < nsends; i++) { + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + assert(ret == -FI_EAGAIN); + + ret = fi_trecv(cxit_ep, recv_buf, 
recv_len, NULL, + FI_ADDR_UNSPEC, tag, 0, NULL); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, recv_len, FI_TAGGED | FI_RECV, + NULL, 0, tag); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + for (j = 0; j < recv_len; j++) { + cr_assert_eq(recv_buf[j], (uint8_t)i, + "data mismatch, recv: %d element[%d], exp=%d saw=%d\n", + i, j, (uint8_t)i, recv_buf[j]); + } + } + + free(send_bufs); + free(recv_buf); +} + +#define FC_TRANS 100 + +static void *fc_sender(void *data) +{ + int i, tx_ret; + uint8_t *send_buf; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe; + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < FC_TRANS; i++) { + memset(send_buf, i, send_len); + + /* Send 64 bytes to self */ + do { + tx_ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0xa, NULL); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, FI_SUCCESS, "fi_tsend failed %d", tx_ret); + + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + } + + free(send_buf); + + pthread_exit(NULL); +} + +static void *fc_recver(void *data) +{ + int i, j, ret; + uint8_t *recv_buf; + int recv_len = 64; + struct fi_cq_tagged_entry rx_cqe; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < 5; i++) { + sleep(1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + for (i = 0; i < FC_TRANS; i++) { + memset(recv_buf, 0, recv_len); + + /* Send 64 bytes to self */ + + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + assert(ret == -FI_EAGAIN); + + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0xa, 0, NULL); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, recv_len, FI_TAGGED | FI_RECV, + NULL, 0, 0xa); + + for (j = 0; j < recv_len; j++) { + cr_assert_eq(recv_buf[j], i, + "data mismatch, element[%d], exp=%d saw=%d\n", + j, i, recv_buf[j]); + } + } + + free(recv_buf); + + pthread_exit(NULL); +} + +/* + * Multi-threaded flow control test. + * + * Run sender and receiver threads. Start sender first to allow it to overwhelm + * target LEs (set artificially low). Software matching is exercised while the + * receiver catches up. Matching is a hybrid of SW/HW as threads race to + * finish. + * + * Run with driver le_pool_max set below FC_TRANS. 
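+ * (The LE pool limit can be lowered through the driver; test.sh's constrained_le_test does this by storing a small le_pools[] max_alloc value via csrutil.)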
+ */ +Test(tagged, fc_mt) +{ + pthread_t send_thread; + pthread_t recv_thread; + pthread_attr_t attr; + int ret; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + ret = pthread_create(&send_thread, &attr, fc_sender, NULL); + cr_assert_eq(ret, 0); + + ret = pthread_create(&recv_thread, &attr, fc_recver, NULL); + cr_assert_eq(ret, 0); + + ret = pthread_join(recv_thread, NULL); + cr_assert_eq(ret, 0); + + ret = pthread_join(send_thread, NULL); + cr_assert_eq(ret, 0); + + pthread_attr_destroy(&attr); +} + +/* Post a bunch of receives to cause append failures. */ +Test(tagged, fc_too_many_recv_early_close) +{ + void *recv_buf; + size_t recv_len = 1; + int i; + int ret; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < 50; i++) { + do { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0xa, 0, NULL); + /* Just progress */ + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + + assert(ret == FI_SUCCESS); + } + + /* Early endpoint close. */ + ret = fi_close(&cxit_ep->fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint"); + cxit_ep = NULL; + + free(recv_buf); +} + +#define RDZV_FC_ITERS 100 +#define RDZV_FC_BATCH 5 + +static void *rdzv_fc_sender(void *data) +{ + int i, j, tx_ret; + int send_id; + uint8_t *send_bufs; + uint8_t *send_buf; + long send_len = (long)data; + struct fi_cq_tagged_entry tx_cqe; + int batch_size = RDZV_FC_BATCH; + + send_bufs = aligned_alloc(s_page_size, send_len * batch_size); + cr_assert(send_bufs); + + for (i = 0; i < RDZV_FC_ITERS; i++) { + for (j = 0; j < batch_size; j++) { + send_id = i * batch_size + j; + send_buf = &send_bufs[j * send_len]; + memset(send_buf, send_id, send_len); + + do { + tx_ret = fi_tsend(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, + send_id, NULL); + + if (tx_ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + sched_yield(); + } + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, FI_SUCCESS, "fi_tsend failed %d", + tx_ret); + } + + for (j = 0; j < batch_size; j++) { + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + if (tx_ret == -FI_EAGAIN) + sched_yield(); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, + "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + } + } + + free(send_bufs); + + pthread_exit(NULL); +} + +static void *rdzv_fc_recver(void *data) +{ + int i, j, k, ret; + int recv_id; + uint8_t *recv_bufs; + uint8_t *recv_buf; + long recv_len = (long)data; + struct fi_cq_tagged_entry rx_cqe; + int batch_size = RDZV_FC_BATCH; + + recv_bufs = aligned_alloc(s_page_size, recv_len * batch_size); + cr_assert(recv_bufs); + + /* Let Sender get ahead and land some UX messages */ + sleep(1); + + for (i = 0; i < RDZV_FC_ITERS; i++) { + + for (j = 0; j < batch_size; j++) { + recv_id = i * batch_size + j; + recv_buf = &recv_bufs[j * recv_len]; + memset(recv_buf, -1, recv_len); + + do { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, + NULL, FI_ADDR_UNSPEC, recv_id, + 0, NULL); + + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + sched_yield(); + } + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", + ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + + if (ret == -FI_EAGAIN) + sched_yield(); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", + ret); + + validate_rx_event(&rx_cqe, NULL, recv_len, + FI_TAGGED 
| FI_RECV, + NULL, 0, rx_cqe.tag); + + recv_id = rx_cqe.tag % batch_size; + recv_buf = &recv_bufs[recv_id * recv_len]; + for (k = 0; k < recv_len; k++) { + cr_assert_eq(recv_buf[k], (uint8_t)rx_cqe.tag, + "data mismatch, element[%d], exp=%d saw=%d\n", + k, (uint8_t)rx_cqe.tag, + recv_buf[k]); + } + } + } + + free(recv_bufs); + + pthread_exit(NULL); +} + +/* + * Rendezvous Send multi-threaded flow control test. + * + * Run with driver le_pool_max set just above RDZV_FC_BATCH. + */ +Test(tagged, rdzv_fc_mt, .timeout = 60) +{ + pthread_t send_thread; + pthread_t recv_thread; + pthread_attr_t attr; + int ret; + long xfer_len; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + for (xfer_len = 64; xfer_len <= 4*1024; xfer_len <<= 2) { + ret = pthread_create(&send_thread, &attr, rdzv_fc_sender, + (void *)xfer_len); + cr_assert_eq(ret, 0); + + ret = pthread_create(&recv_thread, &attr, rdzv_fc_recver, + (void *)xfer_len); + cr_assert_eq(ret, 0); + + ret = pthread_join(recv_thread, NULL); + cr_assert_eq(ret, 0); + + ret = pthread_join(send_thread, NULL); + cr_assert_eq(ret, 0); + + printf("%ld byte Sends complete\n", xfer_len); + } + + pthread_attr_destroy(&attr); +} + +Test(tagged, NC2192) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int send_len = CXIP_RDZV_THRESHOLD - 1; + int recv_len = send_len; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + int sends = (CXIP_OFLOW_BUF_SIZE - CXIP_RDZV_THRESHOLD) / send_len + 1; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + /* Consume 1 oflow byte */ + ret = fi_tsend(cxit_ep, send_buf, 1, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert(ret == FI_SUCCESS); + + for (i = 0; i < sends; i++) { + do { + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 1, NULL); + /* progress */ + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + } + + + /* Force processing in software mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + for (i = 0; i < sends + 1; i++) { + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + } + + for (i = 0; i < sends; i++) { + do { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 1, 0, NULL); + /* progress */ + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + } + + for (i = 0; i < sends; i++) { + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 1); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + } + + /* Match the 1 byte Send */ + do { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + /* progress */ + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + 
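+	/* The 1-byte unexpected Send posted first should now match; its completion must report a length of 1 rather than recv_len. */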
validate_rx_event(&rx_cqe, NULL, 1, FI_TAGGED | FI_RECV, NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + free(send_buf); + free(recv_buf); +} + +TestSuite(tagged_tclass, .init = cxit_setup_tx_alias_tagged, + .fini = cxit_teardown_tx_alias_tagged, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Simple send using both the EP and alias EP with new TC */ +Test(tagged_tclass, ping) +{ + int ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + uint32_t tclass = FI_TC_LOW_LATENCY; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len * 2); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len * 2); + + send_buf = aligned_alloc(s_page_size, send_len * 2); + cr_assert(send_buf); + + /* Post RX buffers */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + ret = fi_trecv(cxit_ep, recv_buf + recv_len, recv_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Update EP alias traffic class */ + ret = fi_set_val(&cxit_tx_alias_ep->fid, FI_OPT_CXI_SET_TCLASS, + (void *)&tclass); + cr_assert_eq(ret, FI_SUCCESS, "fi_set_val failed %d for tc %d\n", + ret, tclass); + + + /* Send 64 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + ret = fi_tsend(cxit_tx_alias_ep, send_buf + send_len, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend for alias failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + free(send_buf); + free(recv_buf); +} + +/* Various tagged protocols using both the original endpoint + * and an alias endpoint modified to use the specified tclass. + * + * Note that receive order is not expected between the original + * and alias EP; tags are used for getting completions. 
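+ * The alias EP's traffic class is changed at runtime via fi_set_val(FI_OPT_CXI_SET_TCLASS), and alias_mask selects which Sends are issued on the alias EP rather than the original EP.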
+ */ +struct multi_tc_params { + size_t length; + size_t num_ios; + uint32_t tclass; + uint32_t alias_mask; + bool peek; +}; + +void do_multi_tc(struct multi_tc_params *param) +{ + int ret; + size_t buf_len = param->length; + struct fid_ep *ep; + struct fi_cq_tagged_entry *rx_cqe; + struct fi_cq_tagged_entry *tx_cqe; + struct tagged_thread_args *tx_args; + struct tagged_thread_args *rx_args; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .io_num = param->num_ios, + }; + struct tagged_event_args rx_evt_args = { + .cq = cxit_rx_cq, + .io_num = param->num_ios, + }; + + tx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(tx_cqe); + + rx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(rx_cqe); + + tx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(tx_args); + + rx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(rx_args); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + tx_evt_args.cqe = tx_cqe; + rx_evt_args.cqe = rx_cqe; + + /* Set alias EP traffic class */ + ret = fi_set_val(&cxit_tx_alias_ep->fid, FI_OPT_CXI_SET_TCLASS, + ¶m->tclass); + cr_assert_eq(ret, FI_SUCCESS, "fi_set_val traffic class"); + + /* Issue the Sends */ + for (size_t tx_io = 0; tx_io < param->num_ios; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + ep = tx_io & param->alias_mask ? cxit_tx_alias_ep : cxit_ep; + do { + ret = fi_tsend(ep, tx_args[tx_io].buf, + tx_args[tx_io].len, NULL, + cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, tagged_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + /* Optional peek to see if all send tags are found on ux list */ + if (param->peek) { + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + ret = try_peek(FI_ADDR_UNSPEC, rx_io, 0, buf_len, + NULL, false); + cr_assert_eq(ret, FI_SUCCESS, + "peek of UX message failed"); + } + } + + /* Issue the Receives */ + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, rx_args[rx_io].tag, + 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Start processing Receive events */ + ret = pthread_create(&rx_thread, &attr, tagged_evt_worker, + (void *)&rx_evt_args); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Wait for the 
RX/TX event threads to complete */ + ret = pthread_join(tx_thread, NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + + ret = pthread_join(rx_thread, NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + /* Validate results */ + for (size_t io = 0; io < param->num_ios; io++) { + /* Validate sent data */ + cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len); + + validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL); + + validate_rx_event(&rx_cqe[io], NULL, buf_len, + FI_TAGGED | FI_RECV, NULL, + 0, tx_args[rx_cqe[io].tag].tag); + + free(tx_args[io].buf); + free(rx_args[io].buf); + } + + pthread_attr_destroy(&attr); + free(rx_cqe); + free(tx_cqe); + free(tx_args); + free(rx_args); +} + +ParameterizedTestParameters(tagged_tclass, multi_tc) +{ + size_t param_sz; + + static struct multi_tc_params params[] = { + {.length = 64, /* Eager IDC */ + .num_ios = 10, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x1}, + {.length = 64, /* Eager IDC */ + .num_ios = 10, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x3}, + {.length = 2 * 1024, /* Eager */ + .num_ios = 15, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x1}, + {.length = 4 * 1024, /* Rendezvous */ + .num_ios = 12, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x1}, + {.length = 128 * 1024, /* Rendezvous */ + .num_ios = 25, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x1}, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct multi_tc_params, params, + param_sz); +} + +ParameterizedTest(struct multi_tc_params *param, tagged_tclass, multi_tc, + .timeout = 60) +{ + do_multi_tc(param); +} + +TestSuite(tagged_src_err, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(tagged_src_err, cap_not_requested) +{ + struct fi_info *info; + int ret; + + /* No hints, both FI_SOURCE and FI_SOURCE_ERR should be removed + * since they are secondary capabilities that impact performance. + */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, NULL, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_SOURCE, 0, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, 0, "FI_SOURCE_ERR"); + fi_freeinfo(info); + + cxit_setup_getinfo(); + cxit_fi_hints->caps = 0; + + /* No caps, both FI_SOURCE and FI_SOURCE_ERR should not be set since + * they are secondary capabilities and they impact performance. 
+ */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + cr_assert_eq(info->caps & FI_SOURCE, 0, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, 0, "FI_SOURCE_ERR"); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(tagged_src_err, hints_check) +{ + struct fi_info *info; + int ret; + + /* If only FI_SOURCE then FI_SOURCE_ERR should not be set */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_SOURCE; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + cr_assert_eq(info->caps & FI_SOURCE, FI_SOURCE, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, 0, "FI_SOURCE_ERR"); + + fi_freeinfo(info); + cxit_teardown_getinfo(); + + /* Validate FI_SOURCE are set if FI_SOURCE_ERR specified in hints */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_SOURCE | FI_SOURCE_ERR; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + cr_assert_eq(info->caps & FI_SOURCE, FI_SOURCE, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, FI_SOURCE_ERR, + "FI_SOURCE_ERR"); + fi_freeinfo(info); + cxit_teardown_getinfo(); + + /* Verify that if hints are specified, but do not include FI_SOURCE + * FI_SOURCE_ERR in capabilities they are not returned. + */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + cr_assert_eq(info->caps & FI_SOURCE, 0, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, 0, "FI_SOURCE_ERR"); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(tagged_src_err, invalid_use) +{ + struct fi_info *info; + int ret; + + cxit_setup_getinfo(); + + /* If no FI_SOURCE then FI_SOURCE_ERR is not allowed */ + cxit_fi_hints->caps = FI_MSG | FI_SOURCE_ERR; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == -FI_ENODATA); + + cxit_teardown_getinfo(); +} + +Test(tagged_src_err, addr) +{ + struct fid_ep *fid_ep; + struct fid_eq *fid_eq; + struct fi_eq_attr eq_attr = { + .size = 32, + .flags = FI_WRITE, + .wait_obj = FI_WAIT_NONE + }; + struct fid_cq *fid_tx_cq; + struct fid_cq *fid_rx_cq; + struct fid_av *fid_av; + struct cxip_addr ep_addr; + fi_addr_t fi_dest_ep_addr; + fi_addr_t fi_src_err_ep_addr; + size_t addr_len = sizeof(ep_addr); + int ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + struct fi_cq_err_entry err_entry; + int i; + + /* Create first EP - adds itself to the AV */ + cxit_setup_enabled_ep(); + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, NULL, 0, NULL); + cr_assert_eq(ret, 1, "First EP AV insert of self %d\n", ret); + + /* Create second EP and resources */ + cr_assert_eq(cxit_fi->caps & + (FI_TAGGED | FI_SOURCE | FI_SOURCE_ERR | FI_DIRECTED_RECV), + (FI_TAGGED | FI_SOURCE | FI_SOURCE_ERR | FI_DIRECTED_RECV), + "info->caps"); + ret = fi_endpoint(cxit_domain, cxit_fi, &fid_ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second EP %d", ret); + ret = fi_eq_open(cxit_fabric, &eq_attr, &fid_eq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second EP EQ %d", 
ret); + ret = fi_ep_bind(fid_ep, &fid_eq->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "Second EP EQ bind %d", ret); + ret = fi_cq_open(cxit_domain, &cxit_tx_cq_attr, &fid_tx_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second EP TXCQ %d", ret); + ret = fi_cq_open(cxit_domain, &cxit_rx_cq_attr, &fid_rx_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second EP RXCQ %d", ret); + ret = fi_ep_bind(fid_ep, &fid_tx_cq->fid, FI_TRANSMIT); + cr_assert_eq(ret, FI_SUCCESS, "Second EP bind TXCQ %d", ret); + ret = fi_ep_bind(fid_ep, &fid_rx_cq->fid, FI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "Second EP bind RXCQ %d", ret); + + /* Needs its own AV */ + ret = fi_av_open(cxit_domain, &cxit_av_attr, &fid_av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second AV %d\n", ret); + ret = fi_ep_bind(fid_ep, &fid_av->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "Second AV bind %d\n", ret); + + ret = fi_enable(fid_ep); + cr_assert_eq(ret, FI_SUCCESS, "Second EP enable %d\n", ret); + ret = fi_getname(&fid_ep->fid, &ep_addr, &addr_len); + cr_assert_eq(ret, FI_SUCCESS, "Second EP getname %d\n", ret); + + /* Insert the second EP address into both AVs, but do not insert + * the first EP address into the second EP's AV. + */ + ret = fi_av_insert(fid_av, (void *)&ep_addr, 1, 0, + 0, NULL); + cr_assert_eq(ret, 1, "Second EP AV insert local %d\n", ret); + + ret = fi_av_insert(cxit_av, (void *)&ep_addr, 1, &fi_dest_ep_addr, + 0, NULL); + cr_assert_eq(ret, 1, "First EP AV insert second EP %d\n", ret); + + /* Setup buffers */ + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Test address not found EP1->EP2 */ + ret = fi_trecv(fid_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + sleep(1); + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, fi_dest_ep_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Receive should get an -FI_EAVAIL with source error info */ + ret = cxit_await_completion(fid_rx_cq, &rx_cqe); + cr_assert_eq(ret, -FI_EAVAIL); + err_entry.err_data_size = sizeof(uint32_t); + err_entry.err_data = malloc(sizeof(uint32_t)); + cr_assert(err_entry.err_data); + + ret = fi_cq_readerr(fid_rx_cq, &err_entry, 0); + cr_assert_eq(ret, 1, "Readerr CQ %d\n", ret); + + /* Insert address from FI_SOURCE_ERR into AV */ + ret = fi_av_insert(fid_av, (void *)err_entry.err_data, 1, + &fi_src_err_ep_addr, 0, NULL); + + cr_assert_eq(ret, 1, "Second EP AV add src address %d\n", ret); + + /* Wait for TX */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "Send completion %d\n", ret); + + /* First EP address should now be found EP1->EP2 */ + ret = fi_trecv(fid_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + sleep(1); + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, fi_dest_ep_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Receive should complete successfully */ + ret = cxit_await_completion(fid_rx_cq, &rx_cqe); + cr_assert_eq(ret, 1); + + /* Wait for TX */ + ret = cxit_await_completion(fid_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "Send completion %d\n", ret); + + /* Validate that the inserted address may be used in send, + * i.e. EP2 can now send to EP1. 
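+ * (fi_src_err_ep_addr was produced by inserting the raw address returned in the FI_SOURCE_ERR err_data into the second EP's AV.)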
+ */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + sleep(1); + + ret = fi_tsend(fid_ep, send_buf, send_len, NULL, fi_src_err_ep_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Receive should complete successfully */ + ret = cxit_await_completion(cxit_rx_cq, &rx_cqe); + cr_assert_eq(ret, 1); + + /* Wait for TX */ + ret = cxit_await_completion(fid_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "Send completion %d\n", ret); + + /* Cleanup Second EP */ + fi_close(&fid_ep->fid); + fi_close(&fid_av->fid); + fi_close(&fid_tx_cq->fid); + fi_close(&fid_rx_cq->fid); + + /* Cleanup First EP */ + cxit_teardown_tagged(); + cxit_teardown_getinfo(); + + free(err_entry.err_data); +} + +TestSuite(tagged_cq_wait, .init = cxit_setup_rma_fd, + .fini = cxit_teardown_rma_fd, + .timeout = CXIT_DEFAULT_TIMEOUT); + +struct fd_params { + size_t length; + size_t num_ios; + int timeout; + bool poll; + bool ux_msg; +}; + +struct tagged_cq_wait_event_args { + struct fid_cq *cq; + struct fi_cq_tagged_entry *cqe; + size_t io_num; + int timeout; + bool poll; +}; + +static void *tagged_cq_wait_evt_worker(void *data) +{ + int ret; + struct tagged_cq_wait_event_args *args; + struct fid *fids[1]; + int cq_fd; + size_t completions = 0; + + args = (struct tagged_cq_wait_event_args *)data; + + if (args->poll) { + ret = fi_control(&args->cq->fid, FI_GETWAIT, &cq_fd); + cr_assert_eq(ret, FI_SUCCESS, "Get CQ wait FD %d", ret); + fids[0] = &args->cq->fid; + } + + while (completions < args->io_num) { + if (args->poll) { + ret = fi_trywait(cxit_fabric, fids, 1); + if (ret == FI_SUCCESS) { + struct pollfd fds; + + fds.fd = cq_fd; + fds.events = POLLIN; + + ret = poll(&fds, 1, args->timeout); + cr_assert_neq(ret, 0, "Poll timed out"); + cr_assert_eq(ret, 1, "Poll error"); + } + ret = fi_cq_read(args->cq, + &args->cqe[completions], 1); + if (ret == 1) + completions++; + } else { + ret = fi_cq_sread(args->cq, &args->cqe[completions], + 1, NULL, args->timeout); + cr_assert_eq(ret, 1, "Completion not received\n"); + completions++; + } + } + + pthread_exit(NULL); +} + +static void cq_wait_post_sends(struct tagged_thread_args *tx_args, + struct fd_params *param) +{ + int ret; + size_t buf_len = param->length; + + /* Issue the Sends */ + for (size_t tx_io = 0; tx_io < param->num_ios; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + do { + ret = fi_tsend(cxit_ep, tx_args[tx_io].buf, + tx_args[tx_io].len, NULL, + cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } +} + +void do_cq_wait(struct fd_params *param) +{ + int ret; + size_t buf_len = param->length; + struct fi_cq_tagged_entry *rx_cqe; + struct fi_cq_tagged_entry *tx_cqe; + struct tagged_thread_args *tx_args; + struct tagged_thread_args *rx_args; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_cq_wait_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .io_num = param->num_ios, + .timeout = param->timeout, + .poll = param->poll, + }; + struct tagged_cq_wait_event_args rx_evt_args = { + .cq = cxit_rx_cq, + 
.io_num = param->num_ios, + .timeout = param->timeout, + .poll = param->poll, + }; + + tx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(tx_cqe); + + rx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(rx_cqe); + + tx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(tx_args); + + rx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(rx_args); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + tx_evt_args.cqe = tx_cqe; + rx_evt_args.cqe = rx_cqe; + + /* Sends first if testing unexpected message operation */ + if (param->ux_msg) { + cq_wait_post_sends(tx_args, param); + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, + tagged_cq_wait_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + } + + /* Issue the Receives */ + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, rx_args[rx_io].tag, + 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Start processing Receive events */ + ret = pthread_create(&rx_thread, &attr, tagged_cq_wait_evt_worker, + (void *)&rx_evt_args); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Sends last for expected messaging */ + if (!param->ux_msg) { + /* Make sure receive has blocked */ + sleep(1); + cq_wait_post_sends(tx_args, param); + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, + tagged_cq_wait_evt_worker, + (void *)&tx_evt_args); + } + + /* Wait for the RX/TX event threads to complete */ + ret = pthread_join(tx_thread, NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + + ret = pthread_join(rx_thread, NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + /* Validate results */ + for (size_t io = 0; io < param->num_ios; io++) { + /* Validate sent data */ + cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len); + + validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL); + + validate_rx_event(&rx_cqe[io], NULL, buf_len, + FI_TAGGED | FI_RECV, NULL, + 0, tx_args[rx_cqe[io].tag].tag); + + free(tx_args[io].buf); + free(rx_args[io].buf); + } + + pthread_attr_destroy(&attr); + free(rx_cqe); + free(tx_cqe); + free(tx_args); + free(rx_args); +} + +ParameterizedTestParameters(tagged_cq_wait, wait_fd) +{ + size_t param_sz; + + static struct fd_params params[] = { + {.length = 1024, + .num_ios = 4, + .timeout = 5000, + .poll = true}, + {.length = 8192, + .num_ios = 4, + .timeout = 5000, + .poll = true}, + {.length = 1024, + .num_ios = 4, + .timeout = 5000, + .poll = false}, + {.length = 8192, + .num_ios = 4, + .timeout = 5000, + .poll = false}, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct fd_params, params, + param_sz); +} + +ParameterizedTest(struct fd_params *param, tagged_cq_wait, wait_fd, + .timeout = 60) +{ + 
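+	/* Each parameter set exercises either FD-based polling (fi_trywait plus poll) or blocking fi_cq_sread completion handling. */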
do_cq_wait(param); +} + +TestSuite(tagged_tx_size, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(tagged_tx_size, force_progress) +{ + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_tagged_entry tx_cqe; + fi_addr_t from; + char *send_buf; + char *recv_buf; + size_t buf_len; + int ret; + int tx_posted; + int rx_posted; + int i; + + /* Limit the TX queue size to 32 */ + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->size = 32; + cxit_setup_rma(); + + cr_assert_eq(cxit_fi_hints->tx_attr->size, + cxit_fi->tx_attr->size, "tx_attr->size"); + + /* Send unexpected rendezvous messages so that completions + * will not occur and verify we get resource management + * at tx_attr->size. + */ + buf_len = 32 * 1024; + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(send_buf); + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(recv_buf); + + ret = 0; + for (tx_posted = 0; tx_posted < cxit_fi->tx_attr->size + 1; + tx_posted++) { + ret = fi_tsend(cxit_ep, send_buf, buf_len, NULL, + cxit_ep_fi_addr, 0, NULL); + if (ret == -FI_EAGAIN) + break; + } + cr_assert_eq(ret, -FI_EAGAIN, "-FI_EAGAIN expected"); + cr_assert(tx_posted <= cxit_fi->tx_attr->size, + "Too many I/O initiated\n"); + + /* Post the receives and get RX completions */ + ret = 0; + for (rx_posted = 0; rx_posted < tx_posted; rx_posted++) { + do { + ret = fi_trecv(cxit_ep, recv_buf, buf_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, + "fi_trecv %d: unexpected ret %d", rx_posted, ret); + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + } + + /* Get TX completions */ + ret = 0; + for (i = 0; i < tx_posted; i++) { + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + } + cr_assert_eq(ret, 1, "bad completion status"); + cr_assert_eq(i, tx_posted, "bad TX completion count"); + + cxit_teardown_rma(); + + free(send_buf); + free(recv_buf); +} diff --git a/prov/cxi/test/tagged_stress.c b/prov/cxi/test/tagged_stress.c new file mode 100644 index 00000000000..4addafaf52a --- /dev/null +++ b/prov/cxi/test/tagged_stress.c @@ -0,0 +1,224 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +#define RECV_INIT 0x77 +#define SEND_INIT ~RECV_INIT + +TestSuite(tagged_stress, .init = cxit_setup_tagged, + .fini = cxit_teardown_tagged, + .timeout = CXIT_DEFAULT_TIMEOUT); + +static void do_msg(uint8_t *send_buf, size_t send_len, uint64_t send_tag, + uint8_t *recv_buf, size_t recv_len, uint64_t recv_tag, + uint64_t recv_ignore, bool send_first, size_t buf_size, + bool tagged, size_t ntrans) +{ + int i, j, ret; + int err = 0; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + int sent = false; + int recved = false; + struct fi_cq_err_entry err_cqe = {}; + + memset(recv_buf, RECV_INIT, send_len * ntrans); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + if (send_first) { + for (j = 0; j < ntrans; j++) { + /* Send 64 bytes to self */ + if (tagged) { + ret = fi_tsend(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, send_tag, + NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsend failed %d", + ret); + } else { + ret = fi_send(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_send 
failed %d", + ret); + } + + /* Progress send to ensure it arrives unexpected */ + i = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + sent++; + break; + } + cr_assert_eq(ret, -FI_EAGAIN, + "send failed %d", ret); + } while (i++ < 10000); + } + } + + /* Post RX buffer */ + + for (j = 0; j < ntrans; j++) { + if (tagged) { + ret = fi_trecv(cxit_ep, recv_buf + j * send_len, + recv_len, NULL, + FI_ADDR_UNSPEC, recv_tag, recv_ignore, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", + ret); + } else { + ret = fi_recv(cxit_ep, recv_buf + j * send_len, + recv_len, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", + ret); + } + } + + if (!send_first) { + for (j = 0; j < ntrans; j++) { + if (tagged) { + ret = fi_tsend(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, send_tag, + NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsend failed %d", + ret); + } else { + ret = fi_send(cxit_ep, send_buf, send_len, + NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_send failed %d", + ret); + } + } + } + + /* Gather both events, ensure progress on both sides. */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + cr_assert_lt(recved, ntrans); + recved++; + } else if (ret == -FI_EAVAIL) { + cr_assert_lt(recved, ntrans); + + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + recved++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + cr_assert_lt(sent, ntrans); + sent++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } + } while (sent < ntrans || recved < ntrans); + + for (i = 0; i < ntrans; i++) { + for (j = 0; j < send_len; j++) { + uint8_t *r = recv_buf + i * send_len; + + cr_expect_eq(r[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + j, send_buf[j], r[j], err++); + } + cr_assert_eq(err, 0, "trans[%d] Data errors seen\n", i); + } +} + +#define BUF_SIZE (128*1024) +#define SEND_MIN 64 +#define TAG 0x333333333333 + +struct tagged_rx_params { + size_t buf_size; + size_t send_min; + uint64_t send_tag; + int recv_len_off; + uint64_t recv_tag; + uint64_t ignore; + bool ux; + bool tagged; + size_t ntrans; +}; + +static struct tagged_rx_params params[] = { + {.buf_size = BUF_SIZE, /* equal length */ + .send_min = SEND_MIN, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true, + .ntrans = 200}, + {.buf_size = BUF_SIZE, /* equal length UX */ + .send_min = SEND_MIN, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = true, + .ntrans = 200}, +}; + +ParameterizedTestParameters(tagged_stress, rx) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct tagged_rx_params, params, + param_sz); +} + +ParameterizedTest(struct tagged_rx_params *param, tagged_stress, rx, + .timeout = 60*10, .disabled = true) +{ + uint8_t *recv_buf, + *send_buf; + size_t send_len; + + recv_buf = aligned_alloc(s_page_size, param->buf_size * param->ntrans); + cr_assert(recv_buf); + + send_buf = aligned_alloc(s_page_size, param->buf_size * param->ntrans); + cr_assert(send_buf); + + for (send_len = param->send_min; + send_len <= param->buf_size; + send_len <<= 1) { + do_msg(send_buf, send_len, param->send_tag, + recv_buf, send_len + param->recv_len_off, + param->recv_tag, param->ignore, param->ux, + 
param->buf_size, param->tagged, param->ntrans); + printf("send_len: %ld completed\n", send_len); + } + + free(send_buf); + free(recv_buf); +} diff --git a/prov/cxi/test/test.sh b/prov/cxi/test/test.sh new file mode 100644 index 00000000000..ea6a913703f --- /dev/null +++ b/prov/cxi/test/test.sh @@ -0,0 +1,450 @@ +#!/bin/bash +# +# set -x +# +# Run CXI unit tests. +# +# ################################################################ +# +# Tests are declared as an array with up to 3 strings: +# 1) the test body +# 2) an optional prolog +# 3) an optional epilog +# +# The strings are executed with the shell 'eval' function. +# They may contain more than one statement separated by semi-colons. +# You probably want to escape your '\$' and '\"' in the strings.... +# Prologs and epilogs may be "". Or absent if both at "". +# Output from test body is captured in $TEST_OUTPUT automatically. +# Output from prologs and epilogs is not captured by default. +# +# Tests are grouped into suites. Since Bash does not +# really have 2 dimensional arrays, the suites are arrays +# of test names, which match the variable names of the tests. +# +# The long suite is selected by default. +# The short and dummy suites can be selected with -s or -d. +# +# A no-execute mode is selected with -n. This prints the +# test name, prolog, body and epilog for every test in the +# selected suite. +# +# See default_env for environment variables common to all tests. +# Overriding for a particular test is supported in the test body. +# +# To disable a test, comment out the name in the suite. +# ################################################################ +# +# The examples: + +dummy_test1=( + "echo \"dummy test\"" + "echo \"dummy prolog\"; echo \$(hostname)" + "echo \"dummy epilog\"" +) + +dummy_test2=( + "echo \"dummy test with epilog but no prolog\"" + "" + "echo \"dummy epilog\"") + +dummy_test3=( + "echo \"simple dummy test\"") + +dummy_test_suite=( + "dummy_test1" + "dummy_test2" + "dummy_test3" +) + +# ################################################################ +# The short tests and short test suite + +short_test1=( + "./cxitest --verbose --filter=\"@(msg*|tagged*|rma*|atomic*)/*\" -j 1 -f --tap=cxitest-short.tap") + +short_test_suite=( + "short_test1" +) + +# ################################################################ +# the long tests and long test suite + +basic_test=("./cxitest --verbose --tap=cxitest.tap -j 1") + +swget_test=( + "FI_CXI_RGET_TC=BULK_DATA ./cxitest --verbose --filter=\"@(tagged|msg)/*\" --tap=cxitest-swget.tap -j1" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=0 > /dev/null" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=1 > /dev/null") + +swget_unaligned_test=( + "FI_CXI_RDZV_THRESHOLD=2036 ./cxitest --verbose --filter=\"@(tagged|msg)/*\" --tap=cxitest-swget-unaligned.tap -j1" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=0 > /dev/null" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=1 > /dev/null") + +constrained_le_test=( + "FI_CXI_DEFAULT_CQ_SIZE=16384 ./cxitest --verbose --filter=\"@(tagged|msg)/fc*\" --tap=cxitest-constrained-le.tap -j1" + "MAX_ALLOC=\$(csrutil dump csr le_pools[63] | grep max_alloc | awk '{print \$3}'); echo \"Saving MAX_ALLOC=\$MAX_ALLOC\"; csrutil store csr le_pools[] max_alloc=10 > /dev/null" + "echo \"Restoring MAX_ALLOC=\$MAX_ALLOC\"; csrutil store csr le_pools[] max_alloc=\$MAX_ALLOC > /dev/null") + +hw_matching_rendezvous_test=( + "FI_CXI_DEVICE_NAME=\"cxi1,cxi0\" FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --verbose -j 1 
--filter=\"tagged_directed/*\" --tap=cxitest-hw-rdzv-tag-matching.tap") + +sw_matching_rendezvous_test=( + "FI_CXI_RX_MATCH_MODE=\"software\" FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --verbose -j 1 --filter=\"@(tagged|msg)/*\" --tap=cxitest-sw-ep-mode.tap") + +fc_eq_space_test=( + "FI_CXI_DEFAULT_CQ_SIZE=64 FI_CXI_DISABLE_EQ_HUGETLB=1 FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --filter=\"msg/fc_no_eq_space_expected_multi_recv\" --verbose -j 1 --tap=cxitest-fc-eq-space.tap") + +fc_eq_20_percent_test=( + "FI_CXI_CQ_FILL_PERCENT=20 FI_CXI_DEFAULT_CQ_SIZE=64 FI_CXI_DISABLE_EQ_HUGETLB=1 FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --filter=\"msg/fc_no_eq_space_expected_multi_recv\" --verbose -j 1 --tap=cxitest-fc-20%-eq-space.tap") + +fi_info_test=( + "./fi_info_test.sh --tap=fi_info.tap") + +unoptimized_mr_test=( + "FI_CXI_OPTIMIZED_MRS=0 ./cxitest --filter=\"amo_hybrid_mr_desc/*\" -j 1 -f --verbose --tap=cxitest-hybrid_mr_desc_unopt_mrs.tap") + +provider_keys_mr_test=( + "CXIP_TEST_PROV_KEY=1 ./cxitest -j 1 -f --verbose --tap=cxitest-prov_key_mrs.tap") + +unoptimized_provider_keys_mr_test=( + "CXIP_TEST_PROV_KEY=1 FI_CXI_OPTIMIZED_MRS=0 ./cxitest --filter=\"@(rma|mr)/*\" -j 1 -f --verbose --tap=cxitest-prov_key_no_opt_mrs.tap") + +provider_keys_std_fallback_test=( + "CXIP_TEST_PROV_KEY=1 FI_MR_CACHE_MONITOR=\"disabled\" ./cxitest --filter=\"mr_resources/opt_fallback\" -j 1 -f --verbose --tap=cxitest-prov_key_opt_to_std.tap") + +zero_eager_size_test=( + "FI_CXI_RDZV_EAGER_SIZE=0 ./cxitest --filter=\"@(tagged|msg)/*\" -j 1 -f --verbose --tap=cxitest-zero-rdzv-eager-size.tap") + +alt_read_rendezvous_test=( + "FI_CXI_RDZV_PROTO=\"alt_read\" ./cxitest --filter=\"tagged/*rdzv\" -j 1 -f --verbose --tap=cxitest-alt-read-rdzv.tap" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=0 > /dev/null" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=1 > /dev/null") + +mr_mode_no_compat_test=( + "FI_CXI_COMPAT=0 ./cxitest -j 1 --filter=\"getinfo_infos/*\" -f --verbose --tap=cxitest-mr-mode-no-compat.tap") + +mr_mode_with_odp_test=( + "FI_CXI_ODP=1 ./cxitest -j 1 --filter=\"getinfo_infos/*\" -f --verbose --tap=cxitest-mr-mode-with-odp.tap") + +mr_mode_with_prov_keys_odp_test=( + "FI_CXI_ODP=1 CXIP_TEST_PROV_KEY=1 ./cxitest -j 1 --filter=\"getinfo_infos/*\" -f --verbose --tap=cxitest-mr-mode-with-prov-key-odp.tap") + +cxi_fork_safe_test=( + "CXI_FORK_SAFE=1 CXI_FORK_SAFE_HP=1 ./cxitest --verbose --tap=cxitest-fork-safe.tap --filter=\"@(rma*|tagged*|msg*|atomic*)/*\" -j 1") + +fork_safe_monitor_disabled_test=( + "FI_MR_CACHE_MONITOR=\"disabled\" ./cxitest --verbose --tap=cxitest-fork_tests-mr_cache_disabled.tap --filter=\"fork/*\" -j 1") + +fork_safe_uffd_test=( + "FI_MR_CACHE_MONITOR=\"uffd\" ./cxitest --verbose --tap=cxitest-fork_tests-mr_cache_uffd.tap --filter=\"fork/*\" -j 1") + +fork_safe_memhooks_test=( + "FI_MR_CACHE_MONITOR=\"memhooks\" ./cxitest --verbose --tap=cxitest-fork_tests-mr_cache_memhooks.tap --filter=\"fork/*\" -j 1") + +fork_safe_kdreg2_test=( + "FI_MR_CACHE_MONITOR=\"kdreg2\" ./cxitest --verbose --tap=cxitest-fork_tests-mr_cache_kdreg2.tap --filter=\"fork/*\" -j 1") + +unlimited_triggered_ops_test=( + "FI_CXI_ENABLE_TRIG_OP_LIMIT=0 ./cxitest -j 1 --verbose --filter=\"deferred_work_trig_op_limit/*\" --tap=cxitest-disable-trig-op-limit.tap") + +long_test_suite=( + "basic_test" + "swget_test" + "swget_unaligned_test" + "constrained_le_test" + "hw_matching_rendezvous_test" + "sw_matching_rendezvous_test" + "fc_eq_space_test" + "fc_eq_20_percent_test" + 
"fi_info_test" + "unoptimized_mr_test" + "provider_keys_mr_test" + "unoptimized_provider_keys_mr_test" + "provider_keys_std_fallback_test" + "zero_eager_size_test" + "alt_read_rendezvous_test" + "mr_mode_no_compat_test" + "mr_mode_with_odp_test" + "mr_mode_with_prov_keys_odp_test" + "cxi_fork_safe_test" + "fork_safe_monitor_disabled_test" + "fork_safe_uffd_test" + "fork_safe_memhooks_test" + "fork_safe_kdreg2_test" + "unlimited_triggered_ops_test" +) + +# ################################################################ + +known_suites=( + "short_test_suite" + "long_test_suite" + "dummy_test_suite" +) + +default_test_suite="long_test_suite" + +# ################################################################ + +default_env=( + "DMA_FAULT_RATE=0.1" + "MALLOC_FAULT_RATE=0.1" + "FI_LOG_LEVEL=warn" + "FI_CXI_FC_RECOVERY=1" + "FI_CXI_ENABLE_TRIG_OP_LIMIT=1" + "FI_MR_CACHE_MONITOR=uffd" +) + +# ################################################################ + +dashes="----------------------------------------------------------------" + +# ################################################################ + +print_suites() { + + for suite in "${known_suites[@]}"; do + echo "Suite: $suite" + local -n tests="$suite" + for test in "${tests[@]}"; do + echo " $test" + done; + done; + + return 0 +} + +# ################################################################ +# Function to run one test +# It expects the following argument: +# test name + +run_one_test() { + if [ $# -eq 0 ]; then + echo "$0 called with no arguments (?)" + exit 1 + fi + local name="$1" + + local -n elements="$name" + local -i num_elements=${#elements[@]} + + if [ $no_execute -ne 0 ]; then + echo $dashes + fi + + if [ $num_elements -lt 1 ]; then + echo "Test $name not found" + return 1 + elif [ $num_elements -gt 3 ]; then + echo "test $1 malformed: maximum 3 elements in array: test prolog epilog" + exit 1 + fi + + local test_body="${elements[0]}" + if [ $num_elements -ge 2 ]; then + local prolog="${elements[1]}" + else + local prolog="" + fi + if [ $num_elements -ge 3 ]; then + local epilog="${elements[2]}" + else + local epilog="" + fi + + local full_test_string="$test_body >> $TEST_OUTPUT 2>&1" + + if [ $no_execute -ne 0 ]; then + echo "Test name: $name" + echo "Prolog: $prolog" + echo "Test body: $full_test_string" + echo "Epilog: $epilog" + return 0 + fi + + if [ -n "$prolog" ]; then + echo "Running $name prolog: $prolog" + eval $prolog + fi + + echo "Running $name: $full_test_string" | tee -a $TEST_OUTPUT + eval $full_test_string + local -i test_result=$? + + if [ $test_result -ne 0 ]; then + echo "Test $name returns non-zero exit code. Possible failures in test teardown." + fi + + if [ -n "$epilog" ]; then + echo "Running $name epilog: $epilog" + eval $epilog + fi + + return $test_result +} + +# ################################################################ +# Function to run a list of tests + +run_tests() { + local ret=0 + for test in $@; do + run_one_test "$test" + local r=$? + if [ $r -ne 0 ]; then + ret=$r + if [ $fail_fast -ne 0 ]; then + break + fi + fi + done + return $ret +} + +# ################################################################ +# Function to run all the tests in a suite +# It expects the following argument: +# suite name + +run_test_suite() { + if [ $# -ne 1 ]; then + echo "$0 called with no arguments (?)" + exit 1 + fi + local suite=$1 + + echo "Running Suite: $suite" + + local -n tests=$suite + + run_tests "${tests[@]}" + local ret=$? 
+
+    if [ $no_execute -ne 0 ]; then
+        echo $dashes
+    fi
+
+    return $ret
+}
+
+# ################################################################
+
+print_help() {
+cat < $TEST_OUTPUT 2>&1"
+#FI_CXI_RX_MATCH_MODE=hybrid ./cxitest --verbose --tap=cxitest.tap -j2 > $TEST_OUTPUT 2>&1
+#if [[ $? -ne 0 ]]; then
+#    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+#    exit 1
+#fi
+
+# Run tests with constrained LE count - Using Flow Control recovery
+MAX_ALLOC=`csrutil dump csr le_pools[63] |grep max_alloc |awk '{print $3}'`
+csrutil store csr le_pools[] max_alloc=10 > /dev/null
+echo "running;FI_CXI_RX_MATCH_MODE=hardware ./cxitest --verbose --filter=\"tagged/fc*\" --tap=cxitest-fc.tap -j1 > $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=hardware ./cxitest --verbose --filter="tagged/fc*" --tap=cxitest-fc.tap -j1 > $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+csrutil store csr le_pools[] max_alloc=$MAX_ALLOC > /dev/null
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+    exit 1
+fi
+
+# Run tests with constrained LE count - Using hybrid operation instead
+# of flow control recovery
+MAX_ALLOC=`csrutil dump csr le_pools[63] |grep max_alloc |awk '{print $3}'`
+csrutil store csr le_pools[] max_alloc=10 > /dev/null
+echo "running;FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 ./cxitest --verbose --filter=\"tagged/fc*\" --tap=cxitest-sw-transition.tap -j1 >> $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 ./cxitest --verbose --filter="tagged/fc*" --tap=cxitest-sw-transition.tap -j1 >> $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+csrutil store csr le_pools[] max_alloc=$MAX_ALLOC > /dev/null
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+    exit 1
+fi
+
+# Run HW to SW hybrid test with constrained LE count and forcing both
+# eager and rendezvous processing
+MAX_ALLOC=`csrutil dump csr le_pools[63] |grep max_alloc |awk '{print $3}'`
+csrutil store csr le_pools[] max_alloc=60 > /dev/null
+echo "running;FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --verbose --filter=\"tagged/hw2sw_*\" --tap=cxitest-hw2sw-transition.tap -j1 >> $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --verbose --filter="tagged/hw2sw_*" --tap=cxitest-hw2sw-transition.tap -j1 >> $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+csrutil store csr le_pools[] max_alloc=$MAX_ALLOC > /dev/null
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+    exit 1
+fi
+
+# Run HW to SW hybrid test with constrained LE count and forcing only eager processing
+MAX_ALLOC=`csrutil dump csr le_pools[63] |grep max_alloc |awk '{print $3}'`
+echo "running;FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=16384 ./cxitest --verbose --filter=\"tagged/hw2sw_*\" --tap=cxitest-hw2sw-eager-transition.tap -j1 >> $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=16384 ./cxitest --verbose --filter="tagged/hw2sw_*" --tap=cxitest-hw2sw-eager-transition.tap -j1 >> $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+csrutil store csr le_pools[] max_alloc=$MAX_ALLOC > /dev/null
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. 
Possible failures in test teardown"
+    exit 1
+fi
+
+# Test scaling of request buffers
+echo "running; FI_CXI_RX_MATCH_MODE=software FI_CXI_REQ_BUF_MIN_POSTED=2 FI_CXI_REQ_BUF_MAX_COUNT=10 ./cxitest --verbose --filter=\"tagged/*fc_mt\" --tap=cxitest-sw-req_buf.tap -j1 >> $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=software FI_CXI_REQ_BUF_MIN_POSTED=2 FI_CXI_REQ_BUF_MAX_COUNT=10 ./cxitest --verbose --filter="tagged/*fc_mt" --tap=cxitest-sw-req_buf.tap -j1 >> $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+    exit 1
+fi
+
+grep "Tested" $TEST_OUTPUT
diff --git a/prov/cxi/test/ze.c b/prov/cxi/test/ze.c
new file mode 100644
index 00000000000..4d08d103966
--- /dev/null
+++ b/prov/cxi/test/ze.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023 Hewlett Packard Enterprise Development LP
+ * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "libcxi/libcxi.h"
+#include "cxip.h"
+#include "cxip_test_common.h"
+
+static uint32_t ze_driver_count = 1;
+static ze_driver_handle_t ze_driver;
+static ze_context_handle_t ze_context;
+static uint32_t ze_device_count = 1;
+static ze_device_handle_t ze_device;
+static ze_command_queue_handle_t ze_cq;
+static const ze_device_mem_alloc_desc_t device_desc = {
+	.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+	.pNext = NULL,
+	.flags = 0,
+	.ordinal = 0,
+};
+static const ze_host_mem_alloc_desc_t host_desc = {
+	.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+	.pNext = NULL,
+	.flags = 0,
+};
+static const ze_command_queue_desc_t cq_desc = {
+	.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+	.pNext = NULL,
+	.ordinal = 0,
+	.index = 0,
+	.flags = 0,
+	.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
+	.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
+};
+static const ze_command_list_desc_t cl_desc = {
+	.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
+	.pNext = NULL,
+	.commandQueueGroupOrdinal = 0,
+	.flags = 0,
+};
+
+static void ze_init(void)
+{
+	ze_result_t ze_ret;
+	ze_context_desc_t ze_context_desc = {};
+
+	ze_ret = zeInit(ZE_INIT_FLAG_GPU_ONLY);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeInit failed: %d", ze_ret);
+
+	ze_ret = zeDriverGet(&ze_driver_count, &ze_driver);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeDriverGet failed: %d",
+		     ze_ret);
+
+	ze_ret = zeContextCreate(ze_driver, &ze_context_desc, &ze_context);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeContextCreate failed: %d",
+		     ze_ret);
+
+	/* Only support a single device. 
*/
+	ze_ret = zeDeviceGet(ze_driver, &ze_device_count, &ze_device);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeDeviceGet failed: %d",
+		     ze_ret);
+
+	ze_ret = zeCommandQueueCreate(ze_context, ze_device, &cq_desc, &ze_cq);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandQueueCreate failed: %d", ze_ret);
+}
+
+static void ze_fini(void)
+{
+	ze_result_t ze_ret;
+
+	ze_ret = zeCommandQueueDestroy(ze_cq);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandQueueDestroy failed: %d", ze_ret);
+
+	ze_ret = zeContextDestroy(ze_context);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeContextDestroy failed: %d", ze_ret);
+}
+
+static void ze_copy(void *dst, const void *src, size_t size)
+{
+	ze_command_list_handle_t cmd_list;
+	ze_result_t ze_ret;
+
+	ze_ret = zeCommandListCreate(ze_context, ze_device, &cl_desc,
+				     &cmd_list);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandListCreate failed: %d", ze_ret);
+
+	ze_ret = zeCommandListAppendMemoryCopy(cmd_list, dst, src, size, NULL,
+					       0, NULL);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandListAppendMemoryCopy failed: %d", ze_ret);
+
+	ze_ret = zeCommandListClose(cmd_list);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandListClose failed: %d", ze_ret);
+
+	ze_ret = zeCommandQueueExecuteCommandLists(ze_cq, 1, &cmd_list, NULL);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandQueueExecuteCommandLists failed: %d", ze_ret);
+}
+
+TestSuite(ze, .timeout = CXIT_DEFAULT_TIMEOUT);
+
+static void ze_message_runner(void *ze_send_buf, void *ze_recv_buf,
+			      size_t buf_size)
+{
+	int ret;
+	char *send_buf;
+	char *recv_buf;
+	struct fi_cq_tagged_entry cqe;
+	int i;
+
+	cxit_setup_msg();
+
+	/* Send and recv buffers are used as bounce buffers for their ze
+	 * counterparts. This is not true for zeMemAllocHost.
+	 */
+	send_buf = malloc(buf_size);
+	cr_assert_neq(send_buf, NULL, "Failed to allocate memory");
+
+	ret = open("/dev/urandom", O_RDONLY);
+	cr_assert_neq(ret, -1, "open failed: %d", -errno);
+	read(ret, send_buf, buf_size);
+	close(ret);
+
+	recv_buf = calloc(1, buf_size);
+	cr_assert_neq(recv_buf, NULL, "Failed to allocate memory");
+
+	ze_copy(ze_send_buf, send_buf, buf_size);
+
+	ret = fi_recv(cxit_ep, ze_recv_buf, buf_size, NULL, cxit_ep_fi_addr,
+		      NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret);
+
+	ret = fi_send(cxit_ep, ze_send_buf, buf_size, NULL, cxit_ep_fi_addr,
+		      NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret);
+
+	do {
+		ret = fi_cq_read(cxit_rx_cq, &cqe, 1);
+	} while (ret == -FI_EAGAIN);
+	cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret);
+
+	do {
+		ret = fi_cq_read(cxit_tx_cq, &cqe, 1);
+	} while (ret == -FI_EAGAIN);
+	cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret);
+
+	ze_copy(recv_buf, ze_recv_buf, buf_size);
+
+	for (i = 0; i < buf_size; i++)
+		cr_assert_eq(send_buf[i], recv_buf[i],
+			     "Data corruption at byte %d", i);
+
+	free(recv_buf);
+	free(send_buf);
+
+	cxit_teardown_msg();
+}
+
+Test(ze, messaging_devMemory)
+{
+	ze_result_t ze_ret;
+	void *ze_send_buf;
+	void *ze_recv_buf;
+	size_t buf_size = 1048576;
+
+	ze_init();
+
+	/* Ze buffers will be used for RDMA. 
*/
+	ze_ret = zeMemAllocDevice(ze_context, &device_desc, buf_size, 0,
+				  ze_device, &ze_send_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemAllocDevice failed: %d",
+		     ze_ret);
+
+	ze_ret = zeMemAllocDevice(ze_context, &device_desc, buf_size, 0,
+				  ze_device, &ze_recv_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemAllocDevice failed: %d",
+		     ze_ret);
+
+	ze_message_runner(ze_send_buf, ze_recv_buf, buf_size);
+
+	ze_ret = zeMemFree(ze_context, ze_recv_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemFree failed: %d",
+		     ze_ret);
+
+	ze_ret = zeMemFree(ze_context, ze_send_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemFree failed: %d",
+		     ze_ret);
+
+	ze_fini();
+}
+
+Test(ze, messaging_hostMemory)
+{
+	ze_result_t ze_ret;
+	void *ze_send_buf;
+	void *ze_recv_buf;
+	size_t buf_size = 1048576;
+
+	ze_init();
+
+	/* Ze buffers will be used for RDMA. */
+	ze_ret = zeMemAllocHost(ze_context, &host_desc, buf_size, 0,
+				&ze_send_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemAllocHost failed: %d",
+		     ze_ret);
+
+	ze_ret = zeMemAllocHost(ze_context, &host_desc, buf_size, 0,
+				&ze_recv_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemAllocHost failed: %d",
+		     ze_ret);
+
+	ze_message_runner(ze_send_buf, ze_recv_buf, buf_size);
+
+	ze_ret = zeMemFree(ze_context, ze_recv_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemFree failed: %d",
+		     ze_ret);
+
+	ze_ret = zeMemFree(ze_context, ze_send_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemFree failed: %d",
+		     ze_ret);
+
+	ze_fini();
+}
diff --git a/src/fabric.c b/src/fabric.c
index b33c6068907..849909fd0d8 100644
--- a/src/fabric.c
+++ b/src/fabric.c
@@ -445,7 +445,7 @@ static struct fi_provider *ofi_get_hook(const char *name)
 static void ofi_ordered_provs_init(void)
 {
 	char *ordered_prov_names[] = {
-		"efa", "psm2", "opx", "verbs",
+		"efa", "psm2", "opx", "verbs", "cxi",
 		"netdir", "psm3", "ucx", "ofi_rxm", "ofi_rxd", "shm",
 		/* Initialize the socket based providers last of the
@@ -545,6 +545,7 @@ static void ofi_register_provider(struct fi_provider *provider, void *dlhandle)
 	    !strcasecmp(provider->name, "efa") ||
 	    !strcasecmp(provider->name, "psm3") ||
 	    !strcasecmp(provider->name, "ucx") ||
+	    !strcasecmp(provider->name, "cxi") ||
 	    ofi_is_util_prov(provider))
 		ofi_prov_ctx(provider)->disable_layering = true;

@@ -898,6 +899,7 @@ void fi_ini(void)

 	ofi_register_provider(PSM3_INIT, NULL);
 	ofi_register_provider(PSM2_INIT, NULL);
+	ofi_register_provider(CXI_INIT, NULL);
 	ofi_register_provider(SHM_INIT, NULL);
 	ofi_register_provider(SM2_INIT, NULL);
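
The src/fabric.c hunks above only register the provider constructor; applications still discover the provider through the normal fi_getinfo() path. The following standalone sketch is not part of the patch: the program structure, the chosen FI_VERSION, the prov_name hint, and the minimal error handling are illustrative assumptions, shown only to demonstrate how a registered cxi provider becomes visible to ordinary libfabric discovery.

/* Hypothetical check (not from this patch): list fi_info entries reported
 * by the cxi provider once fi_ini() has registered CXI_INIT.
 */
#include <stdio.h>
#include <string.h>
#include <rdma/fabric.h>

int main(void)
{
	struct fi_info *hints, *info, *cur;
	int ret;

	hints = fi_allocinfo();
	if (!hints)
		return 1;

	/* Restrict discovery to the cxi provider; fi_freeinfo() later
	 * releases the strdup'd name along with the rest of the hints.
	 */
	hints->fabric_attr->prov_name = strdup("cxi");

	ret = fi_getinfo(FI_VERSION(1, 15), NULL, NULL, 0, hints, &info);
	if (ret) {
		fprintf(stderr, "fi_getinfo failed: %d\n", ret);
		fi_freeinfo(hints);
		return 1;
	}

	for (cur = info; cur; cur = cur->next)
		printf("provider: %s, fabric: %s, domain: %s\n",
		       cur->fabric_attr->prov_name,
		       cur->fabric_attr->name,
		       cur->domain_attr->name);

	fi_freeinfo(info);
	fi_freeinfo(hints);
	return 0;
}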