diff --git a/Makefile.am b/Makefile.am index c18d2fa26b6..e38497b3811 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2017-2018 Intel Corporation, Inc. All right reserved. # Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved. -# (C) Copyright 2020 Hewlett Packard Enterprise Development LP +# (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP # # Makefile.am for libfabric @@ -458,6 +458,7 @@ include prov/verbs/Makefile.include include prov/efa/Makefile.include include prov/psm2/Makefile.include include prov/psm3/Makefile.include +include prov/cxi/Makefile.include include prov/rxm/Makefile.include include prov/mrail/Makefile.include include prov/rxd/Makefile.include diff --git a/configure.ac b/configure.ac index 3e9c31c53e5..db61225cced 100644 --- a/configure.ac +++ b/configure.ac @@ -1004,6 +1004,7 @@ FI_PROVIDER_SETUP([psm3]) FI_PROVIDER_SETUP([sockets]) FI_PROVIDER_SETUP([verbs]) FI_PROVIDER_SETUP([efa]) +FI_PROVIDER_SETUP([cxi]) FI_PROVIDER_SETUP([udp]) FI_PROVIDER_SETUP([tcp]) FI_PROVIDER_SETUP([rxm]) diff --git a/include/ofi_prov.h b/include/ofi_prov.h index 506c1fd8f08..aabce7fc283 100644 --- a/include/ofi_prov.h +++ b/include/ofi_prov.h @@ -48,6 +48,17 @@ * not built: no-op call for ctor */ +#if (HAVE_CXI) && (HAVE_CXI_DL) +# define CXI_INI FI_EXT_INI +# define CXI_INIT NULL +#elif (HAVE_CXI) +# define CXI_INI INI_SIG(fi_cxi_ini) +# define CXI_INIT fi_cxi_ini() +CXI_INI ; +#else +# define CXI_INIT NULL +#endif + /* If HAVE_EFA is defined on Windows, then the VisualStudio project configures * MSBuild to include the efa related files and exclude the verbs related files. * With the verbs related files excluded from the build, we need only ensure diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md new file mode 100644 index 00000000000..7cc3d288675 --- /dev/null +++ b/man/fi_cxi.7.md @@ -0,0 +1,1781 @@ +--- +layout: page +title: fi_cxi(7) +tagline: Libfabric Programmer's Manual +--- +{% include JB/setup %} + +# NAME + +fi_cxi \- The CXI Fabric Provider + +# OVERVIEW + +The CXI provider enables libfabric on Cray's Slingshot network. Slingshot is +comprised of the Rosetta switch and Cassini NIC. Slingshot is an +Ethernet-compliant network. However, The provider takes advantage of proprietary +extensions to support HPC applications. + +The CXI provider supports reliable, connection-less endpoint semantics. It +supports two-sided messaging interfaces with message matching offloaded by the +Cassini NIC. It also supports one-sided RMA and AMO interfaces, light-weight +counting events, triggered operations (via the deferred work API), and +fabric-accelerated small reductions. + +# REQUIREMENTS + +The CXI Provider requires Cassini's optimized HPC protocol which is only +supported in combination with the Rosetta switch. + +The provider uses the libCXI library for control operations and a set of +Cassini-specific header files to enable direct hardware access in the data path. + +# SUPPORTED FEATURES + +The CXI provider supports a subset of OFI features. + +## Endpoint types + +The provider supports the *FI_EP_RDM* endpoint type. + +## Memory registration modes + +The provider implements scalable memory registration. The provider requires +*FI_MR_ENDPOINT*. *FI_MR_ALLOCATED* is required if ODP in not enabled or not +desired. Client specified 32-bit MR keys are the default unless *FI_MR_PROV_KEY* +is specified. For *FI_MR_PROV_KEY* provider generated 64-bit MR keys are used. 
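For illustration only, the sketch below shows one way a registration might be set up when *FI_MR_ENDPOINT* is in effect; the names `domain`, `ep`, `buf`, `buf_len`, and `requested_key` are assumptions for the example and are not taken from this patch, and error handling is omitted.

```c
#include <rdma/fi_domain.h>

/* Sketch: register a remote-access window under FI_MR_ENDPOINT.
 * 'domain', 'ep', 'buf', 'buf_len', and 'requested_key' are assumed
 * to exist already; error handling is omitted. */
struct fid_mr *mr;

/* Client-selected key; with FI_MR_PROV_KEY the requested key is ignored
 * and fi_mr_key() must be used to retrieve the provider-generated key. */
fi_mr_reg(domain, buf, buf_len, FI_REMOTE_WRITE | FI_REMOTE_READ,
	  0, requested_key, 0, &mr, NULL);

/* FI_MR_ENDPOINT requires binding the MR to an enabled endpoint and
 * enabling the MR before its key is exchanged with peers. */
fi_mr_bind(mr, &ep->fid, 0);
fi_mr_enable(mr);

uint64_t rkey = fi_mr_key(mr);   /* advertise this key to remote peers */
```
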
+An RMA initiator can work concurrently with client and provider generated keys. + +In client/server environments, if concerns with stale MR key usage exists, then +*FI_MR_PROV_KEY* generated keys should be used along with *FI_CXI_MR_MATCH_EVENTS=1* +and *FI_CXI_OPTIMIZED_MRS=0*. The former speeds up MR close, allowing non-remote +MR cached keys to be used that enable full remote memory access protection +after an MR is closed, even if that memory remains in the libfabric MR cache. +The latter uses only standard MR which use matching to enable robust key +usage, protecting against a stale MR key matching a newly generated MR keys. + +## Data transfer operations + +The following data transfer interfaces are supported: *FI_ATOMIC*, *FI_MSG*, +*FI_RMA*, *FI_TAGGED*. See DATA TRANSFER OPERATIONS below for more details. + +## Completion events + +The CXI provider supports all CQ event formats. + +## Modes + +The CXI provider does not require any operation modes. + +## Progress + +The CXI provider currently supports *FI_PROGRESS_MANUAL* data and control +progress modes. + +## Multi-threading + +The CXI provider supports FI_THREAD_SAFE and FI_THREAD_DOMAIN threading models. + +## Wait Objects + +The CXI provider supports FI_WAIT_FD and FI_WAIT_POLLFD CQ wait object types. +FI_WAIT_UNSPEC will default to FI_WAIT_FD. However FI_WAIT_NONE should achieve +the lowest latency and reduce interrupt overhead. + +## Additional Features + +The CXI provider also supports the following capabilities and features: + +* *FI_MULTI_RECV* +* *FI_SOURCE* +* *FI_NAMED_RX_CTX* +* *FI_RM_ENABLED* +* *FI_RMA_EVENT* +* *FI_REMOTE_CQ_DATA* +* *FI_MORE* +* *FI_FENCE* + +## Addressing Format + +The CXI provider uses a proprietary address format. This format includes fields +for NIC Address and PID. NIC Address is the topological address of the NIC +endpoint on the fabric. All OFI Endpoints sharing a Domain share the same NIC +Address. PID (for Port ID or Process ID, adopted from the Portals 4 +specification), is analogous to an IP socket port number. Valid PIDs are in the +range [0-510]. + +A third component of Slingshot network addressing is the Virtual Network ID +(VNI). VNI is a protection key used by the Slingshot network to provide +isolation between applications. A VNI defines an isolated PID space for a given +NIC. Therefore, Endpoints must use the same VNI in order to communicate. Note +that VNI is not a field of the CXI address, but rather is specified as part of +the OFI Endpoint auth_key. The combination of NIC Address, VNI, and PID is +unique to a single OFI Endpoint within a Slingshot fabric. + +The NIC Address of an OFI Endpoint is inherited from the Domain. By default, a +PID is automatically assigned to an Endpoint when it is enabled. The address of +an Endpoint can be queried using fi_getname. The address received from +fi_getname may then be inserted into a peer's Address Vector. The resulting FI +address may then be used to perform an RDMA operation. + +Alternatively, a client may manage PID assignment. fi_getinfo may be used to +create an fi_info structure that can be used to create an Endpoint with a +client-specified address. To achieve this, use fi_getinfo with the *FI_SOURCE* +flag set and set node and service strings to represent the local NIC interface +and PID to be assigned to the Endpoint. The NIC interface string should match +the name of an available CXI domain (in the format cxi[0-9]). The PID string +will be interpreted as a 9-bit integer. 
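The following sketch illustrates this client-managed addressing flow; the interface name `cxi0` and PID `120` are illustrative assumptions rather than values prescribed by this document, and error handling is omitted.

```c
#include <string.h>
#include <rdma/fabric.h>

/* Sketch: request a client-managed address on local interface "cxi0"
 * with PID 120. Both values are assumptions for the example. */
struct fi_info *hints, *info;

hints = fi_allocinfo();
hints->fabric_attr->prov_name = strdup("cxi");
hints->ep_attr->type = FI_EP_RDM;

/* With FI_SOURCE, node names the local CXI interface and service names
 * the PID to assign to the Endpoint. */
fi_getinfo(FI_VERSION(1, 15), "cxi0", "120", FI_SOURCE, hints, &info);

/* info may now be passed to fi_fabric()/fi_domain()/fi_endpoint() to
 * create an Endpoint bound to the requested address. */
```
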
Address conflicts will be detected when +the Endpoint is enabled. + +## Authorization Keys + +The CXI authorization key format is defined by struct cxi_auth_key. This +structure is defined in fi_cxi_ext.h. + +```c +struct cxi_auth_key { + uint32_t svc_id; + uint16_t vni; +}; +``` + +The CXI authorization key format includes a VNI and CXI service ID. VNI is a +component of the CXI Endpoint address that provides isolation. A CXI service is +a software container which defines a set of local CXI resources, VNIs, and +Traffic Classes which a libfabric user can access. + +Two endpoints must use the same VNI in order to communicate. Generally, a +parallel application should be assigned to a unique VNI on the fabric in order +to achieve network traffic and address isolation. Typically a privileged +entity, like a job launcher, will allocate one or more VNIs for use by the +libfabric user. + +The CXI service API is provided by libCXI. It enables a privileged entity, like +an application launcher, to control an unprivileged process's access to NIC +resources. Generally, a parallel application should be assigned to a unique CXI +service in order to control access to local resources, VNIs, and Traffic +Classes. + +While a libfabric user provided authorization key is optional, it is highly +encouraged that libfabric users provide an authorization key through the domain +attribute hints during `fi_getinfo()`. How libfabric users acquire the +authorization key may vary between the users and is outside the scope of this +document. + +If an authorization key is not provided by the libfabric user, the CXI provider +will attempt to generate an authorization key on behalf of the user. The +following outlines how the CXI provider will attempt to generate an +authorization key. + +1. Query for the following environment variables and generate an authorization +key using them. + * *SLINGSHOT_VNIS*: Comma separated list of VNIs. The CXI provider will only + use the first VNI if multiple are provide. Example: `SLINGSHOT_VNIS=234`. + * *SLINGSHOT_DEVICES*: Comma separated list of device names. Each device index + will use the same index to lookup the service ID in *SLINGSHOT_SVC_IDS*. + Example: `SLINGSHOT_DEVICES=cxi0,cxi1`. + * *SLINGSHOT_SVC_IDS*: Comma separated list of pre-configured CXI service IDs. + Each service ID index will use the same index to lookup the CXI device in + *SLINGSHOT_DEVICES*. Example: `SLINGSHOT_SVC_IDS=5,6`. + + **Note:** How valid VNIs and device services are configured is outside + the responsibility of the CXI provider. + +2. Query pre-configured device services and find first entry with same UID as +the libfabric user. + +3. Query pre-configured device services and find first entry with same GID as +the libfabric user. + +4. Query pre-configured device services and find first entry which does not +restrict member access. If enabled, the default service is an example of an +unrestricted service. + + **Note:** There is a security concern with such services since it allows + for multiple independent libfabric users to use the same service. + +**Note:** For above entries 2-4, it is possible the found device service does +not restrict VNI access. For such cases, the CXI provider will query +*FI_CXI_DEFAULT_VNI* to assign a VNI. + +During Domain allocation, if the domain auth_key attribute is NULL, the CXI +provider will attempt to generate a valid authorization key. If the domain +auth_key attribute is valid (i.e. 
not NULL and encoded authorization key has +been verified), the CXI provider will use the encoded VNI and service ID. +Failure to generate a valid authorization key will result in Domain allocation +failure. + +During Endpoint allocation, if the endpoint auth_key attribute is NULL, the +Endpoint with inherit the parent Domain's VNI and service ID. If the Endpoint +auth_key attribute is valid, the encoded VNI and service ID must match the +parent Domain's VNI and service ID. Allocating an Endpoint with a different VNI +and service from the parent Domain is not supported. + +The following is the expected parallel application launch workflow with +CXI integrated launcher and CXI authorization key aware libfabric user: + +1. A parallel application is launched. +2. The launcher allocates one or more VNIs for use by the application. +3. The launcher communicates with compute node daemons where the application + will be run. +4. The launcher compute node daemon configures local CXI interfaces. libCXI is + used to allocate one or more services for the application. The service will + define the local resources, VNIs, and Traffic Classes that the application + may access. Service allocation policies must be defined by the launcher. + libCXI returns an ID to represent a service. +5. The launcher forks application processes. +6. The launcher provides one or more service IDs and VNI values to the + application processes. +7. Application processes select from the list of available service IDs and VNIs + to form an authorization key to use for Endpoint allocation. + +## Address Vectors + +The CXI provider supports both *FI_AV_TABLE* and *FI_AV_MAP* with the same +internal implementation. + +The CXI provider uses the *FI_SYMMETRIC* AV flag for optimization. When used +with *FI_AV_TABLE*, the CXI provider can use the fi_addr_t index as an endpoint +identifier instead of a network address. The benefit of this is when running +with FI_SOURCE, a reverse lookup is not needed to generate the source fi_addr_t +for target CQ events. Note: FI_SOURCE_ERR should not be used for this +configuration. + +If the AV is not configured with *FI_SYMMETRIC*, *FI_AV_USER_ID* is supported +as a flag which can be passed into AV insert. + +Since scalable EPs are not supported, fi_av_attr::rx_ctx_bits must be zero. + +The following AV capabilities and flags are not supported: FI_SHARED_AV, +FI_SYNC_ERR, FI_EVENT, and FI_READ. + +## Operation flags + +The CXI provider supports the following Operation flags: + +*FI_MORE* +: When *FI_MORE* is specified in a data transfer operation, the provider will + defer submission of RDMA commands to hardware. When one or more data + transfer operations is performed using *FI_MORE*, followed by an operation + without *FI_MORE*, the provider will submit the entire batch of queued + operations to hardware using a single PCIe transaction, improving PCIe + efficiency. + + When *FI_MORE* is used, queued commands will not be submitted to hardware + until another data transfer operation is performed without *FI_MORE*. + +*FI_TRANSMIT_COMPLETE* +: By default, all CXI provider completion events satisfy the requirements of + the 'transmit complete' completion level. Transmit complete events are + generated when the intiator receives an Ack from the target NIC. The Ack is + generated once all data has been received by the target NIC. Transmit + complete events do not guarantee that data is visibile to the target + process. 
+ +*FI_DELIVERY_COMPLETE* +: When the 'delivery complete' completion level is used, the event guarantees + that data is visible to the target process. To support this, hardware at + the target performs a zero-byte read operation to flush data across the + PCIe bus before generating an Ack. Flushing reads are performed + unconditionally and will lead to higher latency. + +*FI_MATCH_COMPLETE* +: When the 'match complete' completion level is used, the event guarantees + that the message has been matched to a client-provided buffer. All messages + longer than the eager threshold support this guarantee. When 'match + complete' is used with a Send that is shorter than the eager threshold, an + additional handshake may be performed by the provider to notify the + initiator that the Send has been matched. + +The CXI provider also supports the following operation flags: + +* *FI_INJECT* +* *FI_FENCE* +* *FI_COMPLETION* +* *FI_REMOTE_CQ_DATA* + +## Scalable Endpoints + +Scalable Endpoints (SEPs) support is not enabled in the CXI provider. Future +releases of the provider will re-introduce SEP support. + +## Messaging + +The CXI provider supports both tagged (*FI_TAGGED*) and untagged (*FI_MSG*) +two-sided messaging interfaces. In the normal case, message matching is +performed by hardware. In certain low resource conditions, the responsibility to +perform message matching may be transferred to software. Specification +of the receive message matching mode in the environment (*FI_CXI_RX_MATCH_MODE*) +controls the initial matching mode and whether hardware matching can +transparently transition matching to software where a hybrid of hardware +and software receive matching is done. + +If a Send operation arrives at a node where there is no matching Receive +operation posted, it is considered unexpected. Unexpected messages are +supported. The provider manages buffers to hold unexpected message data. + +Unexpected message handling is transparent to clients. Despite that, clients +should take care to avoid excessive use of unexpected messages by pre-posting +Receive operations. An unexpected message ties up hardware and memory resources +until it is matched with a user buffer. + +The CXI provider implements several message protocols internally. A message +protocol is selected based on payload length. Short messages are transferred +using the eager protocol. In the eager protocol, the entire message payload is +sent along with the message header. If an eager message arrives unexpectedly, +the entire message is buffered at the target until it is matched to a Receive +operation. + +Long messages are transferred using a rendezvous protocol. The threshold at +which the rendezvous protocol is used is controlled with the +*FI_CXI_RDZV_THRESHOLD* and *FI_CXI_RDZV_GET_MIN* environment variables. + +In the rendezvous protocol, a portion of the message payload is sent +along with the message header. Once the header is matched to a Receive +operation, the remainder of the payload is pulled from the source using an RDMA +Get operation. If the message arrives unexpectedly, the eager portion of the +payload is buffered at the target until it is matched to a Receive operation. +In the normal case, the Get is performed by hardware and the operation +completes without software progress. + +Unexpected rendezvous protocol messages can not complete and release source side +buffer resources until a matching receive is posted at the destination and the +non-eager data is read from the source with a rendezvous get DMA. 
The number of +rendezvous messages that may be outstanding is limited by the minimum of the +hints->tx_attr->size value specified and the number of rendezvous operation ID +mappings available. FI_TAGGED rendezvous messages have 32K-256 ID mappings, +FI_MSG rendezvous messages are limited to 256 ID mappings. While this +works well with MPI, care should be taken that this minimum is large enough to +ensure applications written in a manner that assumes unlimited resources and +use FI_MSG rendezvous messaging do not induce a software deadlock. If FI_MSG +rendezvous messaging is done in a unexpected manner that may exceed the FI_MSG +ID mappings available, it may be sufficient to reduce the number of rendezvous +operations by increasing the rendezvous threshold. See *FI_CXI_RDZV_THRESHOLD* +for information. + +Message flow-control is triggered when hardware message matching resources +become exhausted. Messages may be dropped and retransmitted in order to +recover; impacting performance significantly. Programs should be careful to avoid +posting large numbers of unmatched receive operations and to minimize the +number of outstanding unexpected messages to prevent message flow-control. +If the RX message matching mode is configured to support hybrid mode, when +resources are exhausted, hardware will transition to hybrid operation where +hardware and software share matching responsibility. + +To help avoid this condition, increase Overflow buffer space using environment +variables *FI_CXI_OFLOW_\**, and for software and hybrid RX match modes +increase Request buffer space using the variables *FI_CXI_REQ_\**. + +## Message Ordering + +The CXI provider supports the following ordering rules: + +* All message Send operations are always ordered. +* RMA Writes may be ordered by specifying *FI_ORDER_RMA_WAW*. +* AMOs may be ordered by specifying *FI_ORDER_AMO_{WAW|WAR|RAW|RAR}*. +* RMA Writes may be ordered with respect to AMOs by specifying *FI_ORDER_WAW*. + Fetching AMOs may be used to perform short reads that are ordered with + respect to RMA Writes. + +Ordered RMA size limits are set as follows: + +* *max_order_waw_size* is -1. RMA Writes and non-fetching AMOs of any size are + ordered with respect to each other. +* *max_order_raw_size* is -1. Fetching AMOs of any size are ordered with + respect to RMA Writes and non-fetching AMOs. +* *max_order_war_size* is -1. RMA Writes and non-fetching AMOs of any size are + ordered with respect to fetching AMOs. + +## PCIe Ordering + +Generally, PCIe writes are strictly ordered. As an optimization, PCIe TLPs may +have the Relaxed Order (RO) bit set to allow writes to be reordered. Cassini +sets the RO bit in PCIe TLPs when possible. Cassini sets PCIe RO as follows: + +* Ordering of messaging operations is established using completion events. + Therefore, all PCIe TLPs related to two-sided message payloads will have RO + set. +* Every PCIe TLP associated with an unordered RMA or AMO operation will have RO + cleared. +* PCIe TLPs associated with the last packet of an ordered RMA or AMO operation + will have RO cleared. +* PCIe TLPs associated with the body packets (all except the last packet of an + operation) of an ordered RMA operation will have RO set. + +## Translation + +The CXI provider supports two translation mechanisms: Address Translation +Services (ATS) and NIC Translation Agent (NTA). Use the environment variable +*FI_CXI_ATS* to select between translation mechanisms. + +ATS refers to NIC support for PCIe rev. 4 ATS, PRI and PASID features. 
ATS +enables the NIC to efficiently access the entire virtual address space of a +process. ATS mode currently supports AMD hosts using the iommu_v2 API. + +The NTA is an on-NIC translation unit. The NTA supports two-level page tables +and additional hugepage sizes. Most CPUs support 2MB and 1GB hugepage sizes. +Other hugepage sizes may be supported by SW to enable the NIC to cache more +address space. + +ATS and NTA both support on-demand paging (ODP) in the event of a page fault. +Use the environment variable *FI_CXI_ODP* to enable ODP. + +With ODP enabled, buffers used for data transfers are not required to be backed +by physical memory. An un-populated buffer that is referenced by the NIC will +incur a network page fault. Network page faults will significantly impact +application performance. Clients should take care to pre-populate buffers used +for data-tranfer operations to avoid network page faults. Copy-on-write +semantics work as expected with ODP. + +With ODP disabled, all buffers used for data transfers are backed by pinned +physical memory. Using Pinned mode avoids any overhead due to network page +faults but requires all buffers to be backed by physical memory. Copy-on-write +semantics are broken when using pinned memory. See the Fork section for more +information. + +## Translation Cache + +Mapping a buffer for use by the NIC is an expensive operation. To avoid this +penalty for each data transfer operation, the CXI provider maintains an internal +translation cache. + +When using the ATS translation mode, the provider does not maintain translations +for individual buffers. It follows that translation caching is not required. + +## Triggered Operation + +The CXI provider supports triggered operations through the deferred work queue +API. The following deferred work queue operations are supported: FI_OP_SEND, +FI_OP_TSEND, FI_OP_READ, FI_OP_WRITE, FI_OP_ATOMIC, FI_OP_FETCH_ATOMIC, and +FI_OP_COMPARE_ATOMIC. FI_OP_RECV and FI_OP_TRECV are also supported, but with +only a threshold of zero. + +The CXI provider backs each triggered operation by hardware resources. +Exhausting triggered operation resources leads to indeterminate behavior and +should be prevented. + +The CXI provider offers two methods to prevent triggered operation resource +exhaustion. + +### Experimental FI_CXI_ENABLE_TRIG_OP_LIMIT Environment Variable + +When FI_CXI_ENABLE_TRIG_OP_LIMIT is enabled, the CXI provider will use +semaphores to coordinate triggered operation usage between threads and across +processes using the same service ID. When triggered operation resources are +exhausted, fi_control(FI_QUEUE_WORK) will return -FI_ENOSPC. It is up to the +libfabric user to recover from this situation. + +**Note:** Preventing triggered operation resource exhaustion with this method +may be expensive and result in a negative performance impact. It is encouraged +libfabric users avoid method unless absolutely needed. By default, +FI_CXI_ENABLE_TRIG_OP_LIMIT is disabled. + +**Note:** Named semaphores are used to coordinated triggered operation resource +usage across multiple processes. System/node software may need to be implemented +to ensure all semaphores are unlinked during unexpected application termination. + +**Note:** This feature is considered experimental and implementation may be +subjected to changed. + +### CXI Domain get_dwq_depth Extension + +The CXI domain get_dwq_depth extension returns the deferred work queue queue +depth (i.e. 
the number of triggered operation resources assigned to the service +ID used by the fi_domain). Libfabric users can use the returned queue depth to +coordinate resource usage. + +For example, suppose the job launcher has configured a service ID with for 512 +triggered operation resources. Since the CXI provider needs to consume 8 per +service ID, 504 should be usable by libfabric users. If the libfabric user knows +there are *N* processes using a given service ID and NIC, it can divide the 504 +triggered operation resource among all *N* processes. + +**Note:** This is the preferred method to prevent triggered operation resource +exhaustion since it does not introduce semaphores into the +fi_control(FI_QUEUE_WORK) critical path. + +## Fork Support + +The following subsections outline the CXI provider fork support. + +### RDMA and Fork Overview + +Under Linux, `fork()` is implemented using copy-on-write (COW) pages, so the +only penalty that it incurs is the time and memory required to duplicate the +parent's page tables, mark all of the process’s page structs as read only and +COW, and create a unique task structure for the child. + +Due to the Linux COW fork policy, both parent and child processes’ virtual +addresses are mapped to the same physical address. The first process to write +to the virtual address will get a new physical page, and thus a new physical +address, with the same content as the previous physical page. + +The Linux COW fork policy is problematic for RDMA NICs. RDMA NICs require +memory to be registered with the NIC prior to executing any RDMA operations. In +user-space, memory registration results in establishing a virtual address to +physical address mapping with the RDMA NIC. This resulting RDMA NIC +mapping/memory region does not get updated when the Linux COW fork policy is +executed. + +Consider the following example: +- Process A is planning to perform RDMA with virtual address 0xffff0000 and a +size of 4096. This virtual address maps to physical address 0x1000. +- Process A registers this virtual address range with the RDMA NIC. The RDMA +NIC device driver programs its page tables to establish the virtual address +0xffff0000 to physical address 0x1000 mapping. +- Process A decides to fork Process B. Virtual address 0xffff0000 will now be +subjected to COW. +- Process A decides to write to virtual address 0xffff0000 before doing the +RDMA operation. This will trigger the Linux COW fork policy resulting in the +following: + - Process A: Virtual address 0xffff0000 maps to new physical address + 0x2000 + - Process B: Virtual address 0xffff0000 maps to previous physical address + 0x1000 +- Process A now executes an RDMA operation using the mapping/memory region +associated with virtual address 0xffff0000. Since COW occurred, the RDMA NIC +executes the RDMA operation using physical address 0x1000 which belongs to +Process B. This results in data corruption. + +The crux of the issue is the parent issuing forks while trying to do RDMA +operations to registered memory regions. Excluding software RDMA emulation, two +options exist for RDMA NIC vendors to resolve this data corruption issue. +- Linux `madvise()` MADV_DONTFORK and MADV_DOFORK +- RDMA NIC support for on-demand paging (ODP) + +#### Linux madvise() MADV_DONTFORK and MADV_DOFORK + +The generic (i.e. 
non-vendor specific) RDMA NIC solution to the Linux COW fork +policy and RDMA problem is to use the following `madvise()` operations during +memory registration and deregistration: +- MADV_DONTFORK: Do not make the pages in this range available to the child +after a `fork()`. This is useful to prevent copy-on-write semantics from +changing the physical location of a page if the parent writes to it after a +`fork()`. (Such page relocations cause problems for hardware that DMAs into the +page.) +- MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the default +behavior, whereby a mapping is inherited across `fork()`. + +In the Linux kernel, MADV_DONTFORK will result in the virtual memory area struct +(VMA) being marked with the VM_DONTCOPY flag. VM_DONTCOPY signals to the Linux +kernel to not duplicate this VMA on fork. This effectively leaves a hole in +child process address space. Should the child reference the virtual address +corresponding to the VMA which was not duplicated, it will segfault. + +In the previous example, if Process A issued `madvise(0xffff0000, 4096, +MADV_DONTFORK)` before performing RDMA memory registration, the physical address +0x1000 would have remained with Process A. This would prevent the Process A data +corruption as well. If Process B were to reference virtual address 0xffff0000, it +will segfault due to the hole in the virtual address space. + +Using `madvise()` with MADV_DONTFORK may be problematic for applications +performing RDMA and page aliasing. Paging aliasing is where the parent process +uses part or all of a page to share information with the child process. If RDMA is +also being used for a separate portion of this page, the child process will +segfault when an access causes page aliasing. + +#### RDMA NIC Support for ODP + +An RDMA NIC vendor specific solution to the Linux COW fork policy and RDMA +problem is to use ODP. ODP allows for the RDMA NIC to generate page requests for +translations it does not have a physical address for. The following is an +updated example with ODP: +- Process A is planning to perform RDMA with virtual address 0xffff0000 and a +size of 4096. This virtual address maps to physical address 0x1000. +- Process A registers this virtual address range with the RDMA NIC. The RDMA NIC +device driver may optionally program its page tables to establish the virtual +address 0xffff0000 to physical address 0x1000 mapping. +- Process A decides to fork Process B. Virtual address 0xffff0000 will now be +subjected to COW. +- Process A decides to write to virtual address 0xffff0000 before doing the RDMA +operation. This will trigger the Linux COW fork policy resulting in the +following: + - Process A: Virtual address 0xffff0000 maps to new physical address 0x2000 + - Process B: Virtual address 0xffff0000 maps to previous physical address + 0x1000 + - RDMA NIC device driver: Receives MMU invalidation event for Process A + virtual address range 0xffff0000 through 0xffff0ffe. The device driver + updates the corresponding memory region to no longer reference physical + address 0x1000. +- Process A now executes an RDMA operation using the memory region associated +with 0xffff0000. The RDMA NIC will recognize the corresponding memory region as +no longer having a valid physical address. The RDMA NIC will then signal to the +device driver to fault in the corresponding address, if necessary, and update +the physical address associated with the memory region. In this case, the memory +region will be updated with physical address 0x2000. 
Once completed, the device +driver signals to the RDMA NIC to continue the RDMA operation. Data corruption +does not occur since RDMA occurred to the correct physical address. + +A RDMA NIC vendor specific solution to the Linux COW fork policy and RDMA +problem is to use ODP. ODP allows for the RDMA NIC to generate page requests +for translations it does not have a physical address for. + +### CXI Provider Fork Support + +The CXI provider is subjected to the Linux COW fork policy and RDMA issues +described in section *RDMA and Fork Overview*. To prevent data corruption with +fork, the CXI provider supports the following options: +- CXI specific fork environment variables to enable `madvise()` MADV_DONTFORK +and MADV_DOFORK +- ODP Support* + +**Formal ODP support pending.* + +#### CXI Specific Fork Environment Variables + +The CXI software stack has two environment variables related to fork: +0 CXI_FORK_SAFE: Enables base fork safe support. With this environment variable +set, regardless of value, libcxi will issue `madvise()` with MADV_DONTFORK on +the virtual address range being registered for RDMA. In addition, libcxi always +align the `madvise()` to the system default page size. On x86, this is 4 KiB. To +prevent redundant `madvise()` calls with MADV_DONTFORK against the same virtual +address region, reference counting is used against each tracked `madvise()` +region. In addition, libcxi will spilt and merge tracked `madvise()` regions if +needed. Once the reference count reaches zero, libcxi will call `madvise()` with +MADV_DOFORK, and no longer track the region. +- CXI_FORK_SAFE_HP: With this environment variable set, in conjunction with +CXI_FORK_SAFE, libcxi will not assume the page size is system default page size. +Instead, libcxi will walk `/proc//smaps` to determine the correct page size +and align the `madvise()` calls accordingly. This environment variable should be +set if huge pages are being used for RDMA. To amortize the per memory +registration walk of `/proc//smaps`, the libfabric MR cache should be used. + +Setting these environment variables will prevent data corruption when the parent +issues a fork. But it may result in the child process experiencing a segfault if +it references a virtual address being used for RDMA in the parent process. + +#### ODP Support and Fork + +CXI provider ODP support would allow for applications to not have to set +CXI_FORK_SAFE and CXI_FORK_SAFE_HP to prevent parent process data corruption. +Enabling ODP to resolve the RDMA and fork issue may or may not result in a +performance impact. The concern with ODP is if the rate of invalidations and ODP +page requests are relatively high and occur at the same time, ODP timeouts may +occur. This would result in application libfabric data transfer operations +failing. + +Please refer to the *CXI Provider ODP Support* for more information on how to +enable/disable ODP. + +#### CXI Provider Fork Support Guidance + +Since the CXI provider offloads the majority of the libfabric data transfer +operations to the NIC, thus enabling end-to-end RDMA between libfabric user +buffers, it is subjected to the issue described in section *RDMA and Fork +Overview*. For comparison, software emulated RDMA libfabric providers may not +have these issues since they rely on bounce buffers to facilitate data transfer. + +The following is the CXI provider fork support guidance: +- Enable CXI_FORK_SAFE. If huge pages are also used, CXI_FORK_SAFE_HP should be +enabled as well. 
Since enabling this will result in `madvice()` with +MADV_DONTFORK, the following steps should be taken to prevent a child process +segfault: + - Avoid using stack memory for RDMA + - Avoid child process having to access a virtual address range the parent + process is performing RDMA against + - Use page-aligned heap allocations for RDMA +- Enable ODP and run without CXI_FORK_SAFE and CXI_FORK_SAFE_HP. The +functionality and performance of ODP with fork may be application specific. +Currently, ODP is not formally supported. + +The CXI provider preferred approach is to use CXI_FORK_SAFE and +CXI_FORK_SAFE_HP. While it may require the application to take certain +precautions, it will result in a more portable application regardless of RDMA +NIC. + +## Heterogenous Memory (HMEM) Supported Interfaces + +The CXI provider supports the following OFI iface types: FI_HMEM_CUDA, FI_HMEM_ROCR, and FI_HMEM_ZE. + +### FI_HMEM_ZE Limitations + +The CXI provider only supports GPU direct RDMA with ZE device buffers if implicit scaling +is disabled. The following ZE environment variables disable implicit scaling: +EnableImplicitScaling=0 NEOReadDebugKeys=1. + +For testing purposes only, the implicit scaling check can be disabled by setting the +following environment variable: FI_CXI_FORCE_ZE_HMEM_SUPPORT=1. This may need to be +combined with the following environment variable to get CXI provider memory registration +to work: FI_CXI_DISABLE_HMEM_DEV_REGISTER=1. + +## Collectives (accelerated) + +The CXI provider supports a limited set of collective operations specifically +intended to support use of the hardware-accelerated reduction features of the +CXI-supported NIC and fabric hardware. + +These features are implemented using the (experimental) OFI collectives API. The +implementation supports the following collective functions: + +* **fi_query_collective**() +* **fi_join_collective**() +* **fi_barrier**() +* **fi_broadcast**() +* **fi_reduce**() +* **fi_allreduce**() + +### **fi_query_collective**() + +Standard implementation that exposes the features described below. + +### **fi_join_collective**() + +The **fi_join_collective**() implementation is provider-managed. However, the +*coll_addr* parameter is not useful to the implementation, and must be +specified as FI_ADDR_NOTAVAIL. The *set* parameter must contain fi_addr_t +values that resolve to meaningful CXI addresses in the endpoint *fi_av* +structure. **fi_join_collective**() must be called for every address in the +*set* list, and must be progressed until the join operation is complete. There +is no inherent limit on join concurrency. + +The join will create a multicast tree in the fabric to manage the collective +operations. This operation requires access to a secure Fabric Manager REST API +that constructs this tree, so any application that attempts to use accelerated +collectives will bind to libcurl and associated security libraries, which must +be available on the system. + +There are hard limits to the number of multicast addresses available on a +system, and administrators may impose additional limits on the number of +multicast addresses available to any given collective job. + +### fi_reduction operations + +Payloads are limited to 32-byte data structures, and because they all use the +same underlying hardware model, they are all synchronizing calls. Specifically, +the supported functions are all variants of fi_allreduce(). + +* **fi_barrier** is **fi_allreduce** using an optimized no-data operator. 
+* **fi_broadcast** is **fi_allreduce** using FI_BOR, with data forced to zero for all but the root rank. +* **fi_reduce** is **fi_allreduce** with a result pointer ignored by all but the root rank. + +All functions must be progressed to completion on all ranks participating in +the collective group. There is a hard limit of eight concurrent reductions on +each collective group, and attempts to launch more operations will return +-FI_EAGAIN. + +**allreduce** supports the following hardware-accelerated reduction operators: + +| Operator | Supported Datatypes | +| -------- | --------- | +| FI_BOR | FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 | +| FI_BAND | FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 | +| FI_BXOR | FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 | +| FI_MIN | FI_INT64, FI_DOUBLE | +| FI_MAX | FI_INT64, FI_DOUBLE | +| FI_SUM | FI_INT64, FI_DOUBLE | +| FI_CXI_MINMAXLOC | FI_INT64, FI_DOUBLE | +| FI_CXI_REPSUM | FI_DOUBLE | + +Data space is limited to 32 bytes in all cases except REPSUM, which supports +only a single FI_DOUBLE. + +Only unsigned bitwise operators are supported. + +Only signed integer arithmetic operations are are supported. + +The MINMAXLOC operators are a mixed data representation consisting of two +values, and two indices. Each rank reports its minimum value and rank index, +and its maximum value and rank index. The collective result is the global +minimum value and rank index, and the global maximum value and rank index. Data +structures for these functions can be found int the fi_cxi_ext.h file. The +*datatype* should represent the type of the minimum/maximum values, and the +*count* must be 1. + +The double-precision operators provide an associative (NUM) variant for MIN, +MAX, and MINMAXLOC. Default IEEE behavior is to treat any operation with NaN as +invalid, including comparison, which has the interesting property of causing: + + MIN(NaN, value) => NaN + MAX(NaN, value) => NaN + +This means that if NaN creeps into a MIN/MAX reduction in any rank, it tends to +poison the entire result. The associative variants instead effectively ignore +the NaN, such that: + + MIN(NaN, value) => value + MAX(NaN, value) => value + +The REPSUM operator implements a reproducible (associative) sum of +double-precision values. The payload can accommodate only a single +double-precision value per reduction, so *count* must be 1. + +See: [Berkeley reproducible sum algorithm](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-121.pdf) +https://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-121.pdf + +### double precision rounding + +C99 defines four rounding modes for double-precision SUM, and some systems may +support a "flush-to-zero" mode for each of these, resulting in a total of eight +different modes for double-precision sum. + +The fabric hardware supports all eight modes transparently. + +Although the rounding modes have thread scope, all threads, processes, and +nodes should use the same rounding mode for any single reduction. + +### reduction flags + +The reduction operations supports two flags: + +* **FI_MORE** +* **FI_CXI_PRE_REDUCED** (overloads **FI_SOURCE**) + +The **FI_MORE** flag advises that the *result* data pointer represents an +opaque, local reduction accumulator, and will be used as the destination of the +reduction. This operation can be repeated any number of times to accumulate +results locally, and spans the full set of all supported reduction operators. +The *op*, *count*, and *datatype* values must be consistent for all calls. 
The +operation ignores all global or static variables — it can be treated as a +*pure* function call — and returns immediately. The caller is responsible +for protecting the accumulator memory if it is used by multiple threads or +processes on a compute node. + +If **FI_MORE** is omitted, the destination is the fabric, and this will +initiate a fabric reduction through the associated endpoint. The reduction must +be progressed, and upon successful completion, the *result* data pointer will +be filled with the final reduction result of *count* elements of type +*datatype*. + +The **FI_CXI_PRE_REDUCED** flag advises that the source data pointer represents +an opaque reduction accumulator containing pre-reduced data. The *count* and +*datatype* arguments are ignored. + +if **FI_CXI_PRE_REDUCED** is omitted, the source is taken to be user data with +*count* elements of type *datatype*. + +The opaque reduction accumulator is exposed as **struct cxip_coll_accumulator** +in the fi_cxi_ext.h file. + +**Note**: The opaque reduction accumulator provides extra space for the +expanded form of the reproducible sum, which carries the extra data required to +make the operation reproducible in software. + +# OPTIMIZATION + +## Optimized MRs + +The CXI provider has two separate MR implementations: standard and optimized. +Standard MRs are designed to support applications which require a large number +of remote memory regions. Optimized MRs are designed to support one-sided +programming models that allocate a small number of large remote memory windows. +The CXI provider can achieve higher RMA Write rates when targeting an optimized +MR. + +Both types of MRs are allocated using fi_mr_reg. MRs with client-provided key in +the range [0-99] are optimized MRs. MRs with key greater or equal to 100 are +standard MRs. An application may create a mix of standard and optimized MRs. To +disable the use of optimized MRs, set environment variable +*FI_CXI_OPTIMIZED_MRS=false*. When disabled, all MR keys are available and all MRs +are implemented as standard MRs. All communicating processes must agree on the +use of optimized MRs. + +When FI_MR_PROV_KEY mr_mode is specified caching of remote access MRs is enabled, +which can improve registration/de-registration performance in RPC type applications, +that wrap RMA operations within a message RPC protocol. Optimized MRs will be +preferred, but will fallback to standard MRs if insufficient hardware resources are +available. + +## Optimized RMA + +Optimized MRs are one requirement for the use of low overhead packet formats +which enable higher RMA Write rates. An RMA Write will use the low overhead +format when all the following requirements are met: + +* The Write targets an optimized MR +* The target MR does not require remote completion notifications (no + *FI_RMA_EVENT*) +* The Write does not have ordering requirements (no *FI_RMA_WAW*) + +Theoretically, Cassini has resources to support 64k standard MRs or 2k optimized +MRs. Practically, the limits are much lower and depend greatly on application +behavior. + +Hardware counters can be used to validate the use of the low overhead packets. +The counter C_CNTR_IXE_RX_PTL_RESTRICTED_PKT counts the number of low overhead +packets received at the target NIC. Counter C_CNTR_IXE_RX_PTL_UNRESTRICTED_PKT +counts the number of ordered RDMA packets received at the target NIC. + +Message rate performance may be further optimized by avoiding target counting +events. To avoid counting events, do not bind a counter to the MR. 
To validate +optimal writes without target counting events, monitor the counter: +C_CNTR_LPE_PLEC_HITS. + +## Unreliable AMOs + +By default, all AMOs are resilient to intermittent packet loss in the network. +Cassini implements a connection-based reliability model to support reliable +execution of AMOs. + +The connection-based reliability model may be disabled for AMOs in order to +increase message rate. With reliability disabled, a lost AMO packet will result +in operation failure. A failed AMO will be reported to the client in a +completion event as usual. Unreliable AMOs may be useful for applications that +can tolerate intermittent AMO failures or those where the benefit of increased +message rate outweighs by the cost of restarting after a failure. + +Unreliable, non-fetching AMOs may be performed by specifying the +*FI_CXI_UNRELIABLE* flag. Unreliable, fetching AMOs are not supported. Unreliable +AMOs must target an optimized MR and cannot use remote completion notification. +Unreliable AMOs are not ordered. + +## High Rate Put + +High Rate Put (HRP) is a feature that increases message rate performance of RMA +and unreliable non-fetching AMO operations at the expense of global ordering +guarantees. + +HRP responses are generated by the fabric egress port. Responses are coalesced +by the fabric to achieve higher message rates. The completion event for an HRP +operation guarantees delivery but does not guarantee global ordering. If global +ordering is needed following an HRP operation, the source may follow the +operation with a normal, fenced Put. + +HRP RMA and unreliable AMO operations may be performed by specifying the +*FI_CXI_HRP* flag. HRP AMOs must also use the *FI_CXI_UNRELIABLE* flag. Monitor the +hardware counter C_CNTR_HNI_HRP_ACK at the initiator to validate that HRP is in +use. + +## Counters + +Cassini offloads light-weight counting events for certain types of operations. +The rules for offloading are: + +* Counting events for RMA and AMO source events are always offloaded. +* Counting events for RMA and AMO target events are always offloaded. +* Counting events for Sends are offloaded when message size is less than the + rendezvous threshold. +* Counting events for message Receives are never offloaded by default. + +Software progress is required to update counters unless the criteria for +offloading are met. + +# RUNTIME PARAMETERS + +The CXI provider checks for the following environment variables: + +*FI_CXI_ODP* +: Enables on-demand paging. If disabled, all DMA buffers are pinned. + If enabled and mr_mode bits in the hints exclude FI_MR_ALLOCATED, + then ODP mode will be used. + +*FI_CXI_FORCE_ODP* +: Experimental value that can be used to force the use of ODP mode + even if FI_MR_ALLOCATED is set in the mr_mode hint bits. This is + intended to be used primarily for testing. + +*FI_CXI_ATS* +: Enables PCIe ATS. If disabled, the NTA mechanism is used. + +*FI_CXI_ATS_MLOCK_MODE* +: Sets ATS mlock mode. The mlock() system call may be used in conjunction + with ATS to help avoid network page faults. Valid values are "off" and + "all". When mlock mode is "off", the provider does not use mlock(). An + application using ATS without mlock() may experience network page faults, + reducing network performance. When ats_mlock_mode is set to "all", the + provider uses mlockall() during initialization with ATS. mlockall() causes + all mapped addresses to be locked in RAM at all times. This helps to avoid + most network page faults. 
Using mlockall() may increase pressure on + physical memory. Ignored when ODP is disabled. + +*FI_CXI_RDZV_THRESHOLD* +: Message size threshold for rendezvous protocol. + +*FI_CXI_RDZV_GET_MIN* +: Minimum rendezvous Get payload size. A Send with length less than or equal + to *FI_CXI_RDZV_THRESHOLD* plus *FI_CXI_RDZV_GET_MIN* will be performed + using the eager protocol. Larger Sends will be performed using the + rendezvous protocol with *FI_CXI_RDZV_THRESHOLD* bytes of payload sent + eagerly and the remainder of the payload read from the source using a Get. + *FI_CXI_RDZV_THRESHOLD* plus *FI_CXI_RDZV_GET_MIN* must be less than or + equal to *FI_CXI_OFLOW_BUF_SIZE*. + +*FI_CXI_RDZV_EAGER_SIZE* +: Eager data size for rendezvous protocol. + +*FI_CXI_RDZV_PROTO* +: Direct the provider to use a preferred protocol to transfer non-eager + rendezvous data. + *FI_CXI_RDZV_PROTO*= default | alt_read + + To use an alternate protocol, the CXI driver property rdzv_get_en should be + set to "0". The "alt_read" rendezvous protocol may help improve collective + operation performance. Note that all rendezvous protocol use RDMA to transfer + eager and non-eager rendezvous data. + +*FI_CXI_DISABLE_NON_INJECT_MSG_IDC* +: Experimental option to disable favoring IDC for transmit of small messages + when FI_INJECT is not specified. This can be useful with GPU source buffers + to avoid the host copy in cases a performant copy can not be used. The default + is to use IDC for all messages less than IDC size. + +*FI_CXI_DISABLE_HOST_REGISTER* +: Disable registration of host buffers (overflow and request) with GPU. There + are scenarios where using a large number of processes per GPU results in page + locking excessive amounts of memory degrading performance and/or restricting + process counts. The default is to register buffers with the GPU. + +*FI_CXI_OFLOW_BUF_SIZE* +: Size of overflow buffers. Increasing the overflow buffer size allows for + more unexpected message eager data to be held in single overflow buffer. + The default size is 2MB. + +*FI_CXI_OFLOW_BUF_MIN_POSTED/FI_CXI_OFLOW_BUF_COUNT* +: The minimum number of overflow buffers that should be posted. The default + minimum posted count is 3. Buffers will grow unbounded to support + outstanding unexpected messages. Care should be taken to size appropriately + based on job scale, size of eager data, and the amount of unexpected + message traffic to reduce the need for flow control. + +*FI_CXI_OFLOW_BUF_MAX_CACHED* +: The maximum number of overflow buffers that will be cached. The default + maximum count is 3 * FI_CXI_OFLOW_BUF_MIN_POSTED. A value of zero indicates + that once a overflow buffer is allocated it will be cached and used as + needed. A non-zero value can be used with bursty traffic to shrink the + number of allocated buffers to the maximum count when they are no longer + needed. + +*FI_CXI_SAFE_DEVMEM_COPY_THRESHOLD +: Defines the maximum CPU memcpy size for HMEM device memory that is + accessible by the CPU with load/store operations. + +*FI_CXI_OPTIMIZED_MRS* +: Enables optimized memory regions. See section + *CXI Domain Control Extensions* on how to enable/disable optimized MRs at + the domain level instead of for the global process/job. + +*FI_CXI_MR_MATCH_EVENTS* +: Enabling MR match events in a client/server environment can be used + to ensure that memory backing a memory region cannot be remotely + accessed after the MR has been closed, even if it that memory remains + mapped in the libfabric MR cache. 
Manual progress must be made at the + target to process the MR match event accounting and avoid event queue + overflow. There is a slight additional cost in the creation and + tear-down of MR. This option is disabled by default. + + See section *CXI Domain Control Extensions* on how to enable MR match + events at the domain level instead of for the global process/job. + +*FI_CXI_PROV_KEY_CACHE* +: Enabled by default, the caching of remote MR provider keys can be + disable by setting to 0. + + See section *CXI Domain Control Extensions* on how to disable the + remote provider key cache at the domain level instead of for the + global process/job. + +*FI_CXI_LLRING_MODE* +: Set the policy for use of the low-latency command queue ring mechanism. + This mechanism improves the latency of command processing on an idle + command queue. Valid values are idle, always, and never. + +*FI_CXI_CQ_POLICY* +: Experimental. Set Command Queue write-back policy. Valid values are always, + high_empty, low_empty, and low. "always", "high", and "low" refer to the + frequency of write-backs. "empty" refers to whether a write-back is + performed when the queue becomes empty. + +*FI_CXI_DEFAULT_VNI* +: Default VNI value used only for service IDs where the VNI is not restricted. + +*FI_CXI_EQ_ACK_BATCH_SIZE* +: Number of EQ events to process before writing an acknowledgement to HW. + Batching ACKs amortizes the cost of event acknowledgement over multiple + network operations. + +*FI_CXI_RX_MATCH_MODE* +: Specify the receive message matching mode to be utilized. + *FI_CXI_RX_MATCH_MODE=*hardware | software | hybrid + + *hardware* - Message matching is fully offloaded, if resources become + exhausted flow control will be performed and existing unexpected message + headers will be onloaded to free resources. + + *software* - Message matching is fully onloaded. + + *hybrid* - Message matching begins fully offloaded, if resources become + exhuasted hardware will transition message matching to a hybrid of + hardware and software matching. + + For both *"hybrid"* and *"software"* modes and care should be taken to + minimize the threshold for rendezvous processing + (i.e. *FI_CXI_RDZV_THRESHOLD* + *FI_CXI_RDZV_GET_MIN*). When running in + software endpoint mode the environment variables *FI_CXI_REQ_BUF_SIZE* + and *FI_CXI_REQ_BUF_MIN_POSTED* are used to control the size and number + of the eager request buffers posted to handle incoming unmatched messages. + +*FI_CXI_HYBRID_PREEMPTIVE* +: When in hybrid mode, this variable can be used to enable preemptive + transitions to software matching. This is useful at scale for poorly + written applications with a large number of unexpected messages + where reserved resources may be insufficient to prevent to prevent + starvation of software request list match entries. Default is 0, disabled. + +*FI_CXI_HYBRID_RECV_PREEMPTIVE* +: When in hybrid mode, this variable can be used to enable preemptive + transitions to software matching. This is useful at scale for poorly + written applications with a large number of unmatched posted receives + where reserved resources may be insufficient to prevent starvation of + software request list match entries. Default is 0, disabled. + +*FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE* +: When in hybrid mode, this variable can be used to enable preemptive + transitions to software matching when the number of posted receives + exceeds the user requested RX size attribute. 
This is useful for + applications where they may not know the exact number of posted receives + and they are expereincing application termination due to event queue + overflow. Default is 0, disabled. + +*FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE* +: When in hybrid mode, this variable can be used to enable preemptive + transitions to software matching when the number of hardware unexpected + messages exceeds the user requested RX size attribute. This is useful for + applications where they may not know the exact number of posted receives + and they are expereincing application termination due to event queue + overflow. Default is 0, disabled. + +*FI_CXI_REQ_BUF_SIZE* +: Size of request buffers. Increasing the request buffer size allows for more + unmatched messages to be sent into a single request buffer. The default + size is 2MB. + +*FI_CXI_REQ_BUF_MIN_POSTED* +: The minimum number of request buffers that should be posted. The default + minimum posted count is 4. The number of buffers will grow unbounded to + support outstanding unexpected messages. Care should be taken to size + appropriately based on job scale and the size of eager data to reduce + the need for flow control. + +*FI_CXI_REQ_BUF_MAX_CACHED/FI_CXI_REQ_BUF_MAX_COUNT* +: The maximum number of request buffers that will be cached. The default + maximum count is 0. A value of zero indicates that once a request buffer + is allocated it will be cached and used as needed. A non-zero value can + be used with bursty traffic to shrink the number of allocated buffers to + a maximum count when they are no longer needed. + +*FI_CXI_MSG_LOSSLESS* +: Enable or disable lossless receive matching. If hardware resources are + exhausted, hardware will pause the associated traffic class until a + overflow buffer (hardware match mode) or request buffer (software match + mode or hybrid match mode) is posted. This is considered experimental and + defaults to disabled. + +*FI_CXI_FC_RETRY_USEC_DELAY* +: Number of micro-seconds to sleep before retrying a dropped side-band, flow + control message. Setting to zero will disable any sleep. + +*FI_UNIVERSE_SIZE* +: Defines the maximum number of processes that will be used by distribute + OFI application. Note that this value is used in setting the default + control EQ size, see FI_CXI_CTRL_RX_EQ_MAX_SIZE. + +*FI_CXI_CTRL_RX_EQ_MAX_SIZE* +: Max size of the receive event queue used for side-band/control messages. + Default receive event queue size is based on FI_UNIVERSE_SIZE. Increasing the + receive event queue size can help prevent side-band/control messages from + being dropped and retried but at the cost of additional memory usage. Size is + always aligned up to a 4KiB boundary. + +*FI_CXI_DEFAULT_CQ_SIZE* +: Change the provider default completion queue size expressed in entries. This + may be useful for applications which rely on middleware, and middleware defaults + the completion queue size to the provider default. + +*FI_CXI_DISABLE_EQ_HUGETLB/FI_CXI_DISABLE_CQ_HUGETLB* +: By default, the provider will attempt to allocate 2 MiB hugetlb pages for + provider event queues. Disabling hugetlb support will cause the provider + to fallback to memory allocators using host page sizes. + FI_CXI_DISABLE_EQ_HUGETLB replaces FI_CXI_DISABLE_CQ_HUGETLB, however use + of either is still supported. + +*FI_CXI_DEFAULT_TX_SIZE* +: Set the default tx_attr.size field to be used by the provider if the size + is not specified in the user provided fi_info hints. 
+ +*FI_CXI_DEFAULT_RX_SIZE* +: Set the default rx_attr.size field to be used by the provider if the size + is not specified in the user provided fi_info hints. + +*FI_CXI_SW_RX_TX_INIT_MAX* +: Debug control to override the number of TX operations that can be + outstanding that are initiated by software RX processing. It has no impact + on hardware initiated RX rendezvous gets. + +*FI_CXI_DEVICE_NAME* +: Restrict CXI provider to specific CXI devices. Format is a comma separated + list of CXI devices (e.g. cxi0,cxi1). + +*FI_CXI_TELEMETRY* +: Perform a telemetry delta between fi_domain open and close. Format is a + comma separated list of telemetry files as defined in + /sys/class/cxi/cxi*/device/telemetry/. The ALL-in-binary file in this + directory is invalid. Note that these are per CXI interface counters and not + per CXI process per interface counters. + +*FI_CXI_TELEMETRY_RGID* +: Resource group ID (RGID) to restrict the telemetry collection to. Value less + than 0 is no restrictions. + +*FI_CXI_CQ_FILL_PERCENT* +: Fill percent of underlying hardware event queue used to determine when + completion queue is saturated. A saturated completion queue results in the + provider returning -FI_EAGAIN for data transfer and other related libfabric + operations. + +*FI_CXI_COMPAT* +: Temporary compatibility to allow use of pre-upstream values for FI_ADDR_CXI and + FI_PROTO_CXI. Compatibility can be disabled to verify operation with upstream + constant values and to enable access to conflicting provider values. The default + setting of 1 specifies both old and new constants are supported. A setting of 0 + disables support for old constants and can be used to test that an application is + compatible with the upstream values. A setting of 2 is a safety fallback that if + used the provider will only export fi_info with old constants and will be incompatible + with libfabric clients that been recompiled. + +*FI_CXI_COLL_FABRIC_MGR_URL* +: **accelerated collectives:** Specify the HTTPS address of the fabric manager REST API + used to create specialized multicast trees for accelerated collectives. This parameter + is **REQUIRED** for accelerated collectives, and is a fixed, system-dependent value. + +*FI_CXI_COLL_TIMEOUT_USEC* +: **accelerated collectives:** Specify the reduction engine timeout. This should be + larger than the maximum expected compute cycle in repeated reductions, or acceleration + can create incast congestion in the switches. The relative performance benefit of + acceleration declines with increasing compute cycle time, dropping below one percent at + 32 msec (32000). Using acceleration with compute cycles larger than 32 msec is not + recommended except for experimental purposes. Default is 32 msec (32000), maximum is + 20 sec (20000000). + +*FI_CXI_COLL_USE_DMA_PUT* +: **accelerated collectives:** Use DMA for collective packet put. This uses DMA to + inject reduction packets rather than IDC, and is considered experimental. Default + is false. + +*FI_CXI_DISABLE_HMEM_DEV_REGISTER* +: Disable registering HMEM device buffer for load/store access. Some HMEM devices + (e.g. AMD, Nvidia, and Intel GPUs) support backing the device memory by the PCIe BAR. + This enables software to perform load/stores to the device memory via the BAR instead + of using device DMA engines. Direct load/store access may improve performance. + +*FI_CXI_FORCE_ZE_HMEM_SUPPORT* +: Force the enablement of ZE HMEM support. 
By default, the CXI provider will only + support ZE memory registration if implicit scaling is disabled (i.e. the environment + variables EnableImplicitScaling=0 NEOReadDebugKeys=1 are set). Set + FI_CXI_FORCE_ZE_HMEM_SUPPORT to 1 will cause the CXI provider to skip the implicit + scaling checks. GPU direct RDMA may or may not work in this case. + +*FI_CXI_ENABLE_TRIG_OP_LIMIT* +: Enable enforcement of triggered operation limit. Doing this can prevent + fi_control(FI_QUEUE_WORK) deadlocking at the cost of performance. + +Note: Use the fi_info utility to query provider environment variables: +fi_info -p cxi -e + +# CXI EXTENSIONS + +The CXI provider supports various fabric-specific extensions. Extensions are +accessed using the fi_open_ops function. + +### CXI Domain Control Extensions + +The **fi_control**() function is extended for domain FIDs to query and override +global environment settings for a specific domain. This is useful for example +where the application process also includes a client API that has different +optimizations and protections. + +Command *FI_OPT_CXI_GET_OPTIMIZED* where the argument is a pointer to a bool. +The call returns the setting for optimized MR usage for the domain. The default +is determined by the environment setting of *FI_CXI_OPTIMIZED_MRS*. + +Command *FI_OPT_CXI_SET_OPTIMIZED* where the argument is a pointer to a bool +initialized to true or false. The call enables or disables the use of optimized +MRs for the domain. If the domain is not configured for FI_MR_PROV_KEY MR mode, +the call will fail with -FI_EINVAL, it is not supported for client generated +keys. It must be called prior to MR being created. + +Command *FI_OPT_CXI_GET_MR_MATCH_EVENTS* where the argument is a pointer to a +bool. The call returns the setting for MR Match Event accounting for the +domain. The default is determined by the environment setting of +*FI_CXI_MR_MATCH_EVENTS*. + +Command *FI_OPT_CXI_SET_MR_MATCH_EVENTS* where the argument is a pointer to a +bool initialized to true or false. This call enables or disables the use of MR +Match Event counting. This ensures that memory backing a MR cannot be accessed +after invoking fi_close() on the MR, even if that memory remains in the +libfabric MR cache. Manual progress must be made to process events at the RMA +destination. It can only be changed prior to any EP or MR being created. + +Command *FI_OPT_CXI_GET_PROV_KEY_CACHE* where the argument is a pointer to a +bool. The call returns the setting for enabling use of the remote MR +cache for provider keys for the domain. The default is determined by the +environment setting of *FI_CXI_PROV_KEY_CACHE* and is only valid if +FI_MR_PROV_KEY MR mode is used. + +Command *FI_OPT_CXI_SET_PROV_KEY_CACHE* where the argument is a pointer to a +bool initialized to true or false. This call enables or disables the use of +the remote MR cache for provider keys for the domain. By default the cache +is enabled and can be used for provider keys that do not require events. +The command will fail with -FI_EINVAL if FI_MR_PROV_KEY MR mode is not in use. +It can only be changed prior to any MR being created. + +## CXI Domain Extensions + +CXI domain extensions have been named *FI_CXI_DOM_OPS_6*. The flags parameter +is ignored. The fi_open_ops function takes a `struct fi_cxi_dom_ops`. 
See an +example of usage below: + +```c +struct fi_cxi_dom_ops *dom_ops; + +ret = fi_open_ops(&domain->fid, FI_CXI_DOM_OPS_4, 0, (void **)&dom_ops, NULL); +``` + +The following domain extensions are defined: + +```c +struct fi_cxi_dom_ops { + int (*cntr_read)(struct fid *fid, unsigned int cntr, uint64_t *value, + struct timespec *ts); + int (*topology)(struct fid *fid, unsigned int *group_id, + unsigned int *switch_id, unsigned int *port_id); + int (*enable_hybrid_mr_desc)(struct fid *fid, bool enable); + size_t (*ep_get_unexp_msgs)(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, + size_t count, fi_addr_t *src_addr, + size_t *ux_count); + int (*get_dwq_depth)(struct fid *fid, size_t *depth); +}; +``` + +*cntr_read* extension is used to read hardware counter values. Valid values +of the cntr argument are found in the Cassini-specific header file +cassini_cntr_defs.h. Note that Counter accesses by applications may be +rate-limited to 1HZ. + +*topology* extension is used to return CXI NIC address topology information +for the domain. Currently only a dragonfly fabric topology is reported. + +The enablement of hybrid MR descriptor mode allows for libfabric users +to optionally pass in a valid MR desc for local communications operations. + +The get unexpected message function is used to obtain a list of +unexpected messages associated with an endpoint. The list is returned +as an array of CQ tagged entries set in the following manner: + +``` +struct fi_cq_tagged_entry { + .op_context = NULL, + .flags = any of [FI_TAGGED | FI_MSG | FI_REMOTE_CQ_DATA], + .len = message length, + .buf = NULL, + .data = CQ data if FI_REMOTE_CQ_DATA set + .tag = tag if FI_TAGGED set +}; +``` + +If the src_addr or entry array is NULL, only the ux_count of +available unexpected list entries will be returned. The parameter +count specifies the size of the array provided, if it is 0 then only +the ux_count will be returned. The function returns the number of +entries written to the array or a negative errno. On successful return, +ux_count will always be set to the total number of unexpected messages available. + +*enable_hybrid_mr_desc* is used to enable hybrid MR descriptor mode. Hybrid MR +desc allows for libfabric users to optionally pass in a valid MR desc for local +communication operations. This is currently only used for RMA and AMO transfers. + +*get_dwq_depth* is used to get the depth of the deferred work queue. The depth +is the number of triggered operation commands which can be queued to hardware. +The depth is not per fi_domain but rather per service ID. Since a single service +ID is intended to be shared between all processing using the same NIC in a job +step, the triggered operations are shared across processes. + +*enable_mr_match_events* and *enable_optimized_mrs* have been deprecated +in favor of using the fi_control() API. While the can be still be called via +the domain ops, They will be removed from the domain opts prior to software +release 2.2. + +## CXI Counter Extensions + +CXI counter extensions have been named *FI_CXI_COUNTER_OPS*. The flags parameter +is ignored. The fi_open_ops function takes a `struct fi_cxi_cntr_ops`. See an +example of usage below. + +```c +struct fi_cxi_cntr_ops *cntr_ops; + +ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, (void **)&cntr_ops, NULL); +``` + +The following domain extensions are defined: + +```c +struct fi_cxi_cntr_ops { + /* Set the counter writeback address to a client provided address. 
*/ + int (*set_wb_buffer)(struct fid *fid, const void *buf, size_t len); + + /* Get the counter MMIO region. */ + int (*get_mmio_addr)(struct fid *fid, void **addr, size_t *len); +}; +``` + +## CXI Counter Writeback Flag + +If a client is using the CXI counter extensions to define a counter writeback +buffer, the CXI provider will not update the writeback buffer success or +failure values for each hardware counter success or failure update. This can +especially create issues when clients expect the completion of a deferred +workqueue operation to generate a counter writeback. To support this, the flag +*FI_CXI_CNTR_WB* can be used in conjunction with a deferred workqueue operation +to force a writeback at the completion of the deferred workqueue operation. See +an example of usage below. + +```c +struct fi_op_rma rma = { + /* Signal to the provider the completion of the RMA should trigger a + * writeback. + */ + .flags = FI_CXI_CNTR_WB, +}; + +struct fi_deferred_work rma_work = { + .op_type = FI_OP_READ, + .triggering_counter = cntr, + .completion_cntr = cntr, + .threshold = 1, + .op.rma = &rma, +}; + +ret = fi_control(&domain->fid, FI_QUEUE_WORK, &rma_work); +``` + +**Note:** Using *FI_CXI_CNTR_WB* will lead to additional hardware usage. To +conserve hardware resources, it is recommended to only use the *FI_CXI_CNTR_WB* +when a counter writeback is absolutely required. + +## CXI Alias EP Overrides + +A transmit alias endpoint can be created and configured to utilize +a different traffic class than the original endpoint. This provides a +lightweight mechanism to utilize multiple traffic classes within a process. +Message order between the original endpoint and the alias endpoint is +not defined/guaranteed. See example usage below for setting the traffic +class of a transmit alias endpoint. + +```c +#include +#include +#include // Ultimately fi_ext.h + +struct fid_ep *ep; +. . . + +struct fid_ep *alias_ep = NULL; +uint32_t tclass = FI_TC_LOW_LATENCY; +uint64_t op_flags = FI_TRANSMIT | desired data operation flags; + +ret = fi_ep_alias(ep, &alias_ep, op_flags); +if (ret) + error; + +ret = fi_set_val(&alias_ep->fid, FI_OPT_CXI_SET_TCLASS, (void *)&tlcass); +if (ret) + error; +``` + +In addition, the alias endpoint message order may be modified to override +the default endpoint message order. Message order between the modified +alias endpoint and the original endpoint is not guaranteed. See example +usage below for setting the traffic class of a transmit alias endpoint. + +```c +uint64_t msg_order = FI_ORDER_RMA_WAW; + +ret = fi_set_val(&alias_ep->fid, FI_OPT_CXI_SET_MSG_ORDER, + (void *)&msg_order); +if (ret) + error; +``` + +When an endpoint does not support FI_FENCE (e.g. optimized MR), a provider +specific transmit flag, FI_CXI_WEAK_FENCE, may be specified on an alias EP +to issue a FENCE operation to create a data ordering point for the alias. +This is supported for one-sided operations only. + +Alias EP must be closed prior to closing the original EP. + +## PCIe Atomics +The CXI provider has the ability to issue a given libfabric atomic memory +operation as a PCIe operation as compared to a NIC operation. The CXI +provider extension flag FI_CXI_PCIE_AMO is used to signify this. + +Since not all libfabric atomic memory operations can be executed as a PCIe +atomic memory operation, `fi_query_atomic()` could be used to query if a +given libfabric atomic memory operation could be executed as PCIe atomic +memory operation. 
+ +The following is a query to see if a given libfabric operation can be a +PCIe atomic operation. +```c +int ret; +struct fi_atomic_attr out_attrs; + +/* Query if non-fetching PCIe atomic is supported. */ +ret = fi_query_atomic(domain, FI_UINT32, FI_SUM, &out_attrs, FI_CXI_PCIE_AMO); + +/* Query if fetching PCIe atomic is supported. */ +ret = fi_query_atomic(domain, FI_UINT32, FI_SUM, &out_attrs, + FI_FETCH_ATOMIC | FI_CXI_PCIE_AMO); +``` + +The following is how to issue a PCIe atomic operation. +```c +ssize_t ret; +struct fi_msg_atomic msg; +struct fi_ioc resultv; +void *result_desc; +size_t result_count; + +ret = fi_fetch_atomicmsg(ep, &msg, &resultv, &result_desc, result_count, + FI_CXI_PCIE_AMO); +``` + +**Note:** The CXI provider only supports PCIe fetch add for UINT32_T, INT32_t, +UINT64_T, and INT64_t. This support requires enablement of PCIe fetch add in +the CXI driver, and it comes at the cost of losing NIC atomic support for another +libfabric atomic operation. + +**Note:** Ordering between PCIe atomic operations and NIC atomic/RMA operations is +undefined. + +To enable PCIe fetch add for libfabric, the following CXI driver kernel module +parameter must be set to non-zero. + +``` +/sys/module/cxi_core/parameters/amo_remap_to_pcie_fadd +``` + +The following are the possible values for this kernel module and the impact of +each value: +- -1: Disable PCIe fetch add support. FI_CXI_PCIE_AMO is not supported. +- 0: Enable PCIe fetch add support. FI_MIN is not supported. +- 1: Enable PCIe fetch add support. FI_MAX is not supported. +- 2: Enable PCIe fetch add support. FI_SUM is not supported. +- 4: Enable PCIe fetch add support. FI_LOR is not supported. +- 5: Enable PCIe fetch add support. FI_LAND is not supported. +- 6: Enable PCIe fetch add support. FI_BOR is not supported. +- 7: Enable PCIe fetch add support. FI_BAND is not supported. +- 8: Enable PCIe fetch add support. FI_LXOR is not supported. +- 9: Enable PCIe fetch add support. FI_BXOR is not supported. +- 10: Enable PCIe fetch add support. No loss of default CXI provider AMO +functionality. + +Guidance is to default amo_remap_to_pcie_fadd to 10. + +# FABTESTS + +The CXI provider does not currently support fabtests which depend on IP +addressing. + +fabtest RDM benchmarks are supported, like: + +```c +# Start server by specifying source PID and interface +./fabtests/benchmarks/fi_rdm_tagged_pingpong -B 10 -s cxi0 + +# Read server NIC address +CXI0_ADDR=$(cat /sys/class/cxi/cxi0/device/properties/nic_addr) + +# Start client by specifying server PID and NIC address +./fabtests/benchmarks/fi_rdm_tagged_pingpong -P 10 $CXI0_ADDR + +# The client may be bound to a specific interface, like: +./fabtests/benchmarks/fi_rdm_tagged_pingpong -B 10 -s cxi1 -P 10 $CXI0_ADDR +``` + +Some functional fabtests are supported (including fi_bw). Others use IP sockets +and are not yet supported. + +multinode fabtests are not yet supported. + +ubertest is supported for test configs matching the provider's current +capabilities. + +unit tests are supported where the test feature set matches the CXI provider's +current capabilities. + +# ERRATA + +* Fetch and compare type AMOs with FI_DELIVERY_COMPLETE or FI_MATCH_COMPLETE + completion semantics are not supported with FI_RMA_EVENT. + +# Libfabric CXI Provider User Programming and Troubleshooting Guide + +The scope of the following subsection is to provide guidance and/or troubleshooting tips +for users of the libfabric CXI provider. The scope of this section is not a full guide +for user libfabric. 
+
+## Sizing Libfabric Objects Based on Expected Usage
+
+The CXI provider uses various libfabric object size attributes and/or libfabric environment
+variables to size hardware related resources accordingly. Failure to size resources properly
+can result in the CXI provider frequently returning -FI_EAGAIN, which may negatively impact
+performance. The following subsections outline important sizing related attributes and
+environment variables.
+
+### Completion Queue Size Attribute
+
+The CXI provider uses the completion queue size attribute to size various software and hardware
+event queues used to generate libfabric completion events. While the size of the software
+queues may grow, hardware event queue sizes are static. Failing to size hardware queues
+properly may result in the CXI provider returning -FI_EAGAIN frequently for data transfer
+operations. When this error is returned, the user should progress the corresponding endpoint
+completion queues by calling fi_cq_read().
+
+Users are encouraged to set the completion queue size attribute based on the expected
+number of inflight RDMA operations to and from a single endpoint. For users who are
+relying on the provider default value (e.g. MPI), the FI_CXI_DEFAULT_CQ_SIZE environment
+variable can be used to override the provider default value.
+
+### Endpoint Receive Size Attribute
+
+The CXI provider uses the endpoint receive size attribute to size internal command
+and hardware event queues. Failing to size either command queue correctly can result
+in the CXI provider returning -FI_EAGAIN frequently for data transfer operations. When
+this error is returned, the user should progress the corresponding endpoint completion queues
+by calling fi_cq_read().
+
+Users are encouraged to set the endpoint receive size attribute based on the expected
+number of inflight untagged and tagged RDMA operations. For users who are relying on the
+provider default value (e.g. MPI), the FI_CXI_DEFAULT_RX_SIZE environment variable can be
+used to override the provider default value.
+
+### Endpoint Transmit Size Attribute
+
+The CXI provider uses the endpoint transmit size attribute to size internal command
+and hardware event queues. Failing to size either command queue correctly can result
+in the CXI provider returning -FI_EAGAIN frequently for data transfer operations. When
+this error is returned, the user should progress the corresponding endpoint completion queues
+by calling fi_cq_read().
+
+At a minimum, users are encouraged to set the endpoint transmit size attribute based on
+the expected number of inflight, initiator RDMA operations. If users are going to be
+issuing message operations over the CXI provider rendezvous limit (FI_CXI_RDZV_THRESHOLD),
+the transmit size attribute must also include the number of outstanding, unexpected
+rendezvous operations (i.e. inflight, initiator RDMA operations + outstanding, unexpected
+rendezvous operations).
+
+For users who are relying on the provider default value (e.g. MPI), the
+FI_CXI_DEFAULT_TX_SIZE environment variable can be used to override the provider default
+value.
+
+### FI_UNIVERSE_SIZE Environment Variable
+
+The libfabric FI_UNIVERSE_SIZE environment variable defines the number of expected ranks/peers
+an application needs to communicate with. The CXI provider may use this environment variable
+to size resources tied to the number of peers. Users are encouraged to set this environment
+variable accordingly.
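+
+The following is a minimal sketch of how the size attributes discussed above could be
+requested through fi_getinfo() hints and fi_cq_open(). The sizing constants are
+hypothetical placeholders and should be replaced with values reflecting the
+application's expected number of inflight operations.
+
+```c
+#include <string.h>
+#include <rdma/fabric.h>
+#include <rdma/fi_domain.h>
+
+/* Hypothetical sizing values based on expected inflight operations. */
+#define EXPECTED_INFLIGHT_TX 1024
+#define EXPECTED_INFLIGHT_RX 1024
+
+struct fi_info *hints, *info;
+struct fid_fabric *fabric;
+struct fid_domain *domain;
+struct fid_cq *cq;
+struct fi_cq_attr cq_attr = {
+	/* Size the CQ for the expected inflight operations per endpoint. */
+	.size = EXPECTED_INFLIGHT_TX + EXPECTED_INFLIGHT_RX,
+	.format = FI_CQ_FORMAT_TAGGED,
+};
+int ret;
+
+hints = fi_allocinfo();
+hints->fabric_attr->prov_name = strdup("cxi");
+
+/* Request transmit/receive queue sizes matching expected usage rather
+ * than relying on the provider defaults. */
+hints->tx_attr->size = EXPECTED_INFLIGHT_TX;
+hints->rx_attr->size = EXPECTED_INFLIGHT_RX;
+
+ret = fi_getinfo(FI_VERSION(1, 15), NULL, NULL, 0, hints, &info);
+
+ret = fi_fabric(info->fabric_attr, &fabric, NULL);
+ret = fi_domain(fabric, info, &domain, NULL);
+ret = fi_cq_open(domain, &cq_attr, &cq, NULL);
+```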
+
+## Selecting Proper Receive Match Mode
+
+As mentioned in the *Runtime Parameters* section, the CXI provider supports three different
+receive match modes: hardware, hybrid, and software.
+
+Hardware match mode is appropriate for users who can ensure the sum of unexpected messages
+and posted receives does not exceed the configured hardware receive resource limit for the
+application. When resources are consumed, the endpoint will transition into a flow control
+operational mode which requires side-band messaging to recover from. Recovery will involve
+the CXI provider trying to reclaim hardware receive resources to help prevent future
+transitions into flow control. If the CXI provider is unable to reclaim hardware receive
+resources, this can lead to a cycle of entering and exiting flow control which may present
+itself as a hang to the libfabric user. Running with FI_LOG_LEVEL=warn and FI_LOG_PROV=cxi
+will report if this flow control transition is happening.
+
+Hybrid match mode is appropriate for users who are unsure whether the sum of unexpected messages
+and posted receives will exceed the configured hardware receive resource limit for the
+application but want to ensure the application still functions if hardware receive resources
+are consumed. Hybrid match mode extends hardware match mode by allowing for an automated
+transition into software match mode if resources are consumed.
+
+Software match mode is appropriate for users who know the sum of unexpected messages
+and posted receives will exceed the configured hardware receive resource limit for the
+application. In software match mode, the CXI provider maintains a software unexpected message
+and posted receive list rather than offloading to hardware. This avoids having to allocate a
+hardware receive resource for each unexpected message and posted receive.
+
+*Note*: In practice, dependent processes (e.g. a parallel job) will most likely be sharing a
+receive hardware resource pool.
+
+*Note*: Each match mode may still enter flow control. For example, if a user is not draining
+the libfabric completion queue at a reasonable rate, corresponding hardware events may fill
+up, which will trigger flow control.
+
+## Using Hybrid Match Mode Preemptive Options
+
+The high-level objective of the hybrid match mode preemptive environment variables (i.e.
+FI_CXI_HYBRID_PREEMPTIVE, FI_CXI_HYBRID_RECV_PREEMPTIVE,
+FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE, and FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE) is to
+ensure that a process requiring more hardware receive resources does not force other
+processes requiring fewer hardware receive resources into software match mode because no
+hardware receive resources remain available.
+
+For example, consider a parallel application which has multiple processes (i.e. ranks)
+per NIC all sharing the same hardware receive resource pool. Suppose that the application
+communication pattern results in an all-to-one communication to only a single rank (e.g.
+rank 0) while other ranks may be doing communication amongst each other. If the width of
+the all-to-one communication exhausts hardware receive resources, all ranks on the target
+NIC will transition to software match mode. The preemptive options may help ensure that only
+rank 0 would transition to software match mode instead of all the ranks on the target NIC.
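+
+As a sketch of how these settings might be applied, the match mode and preemptive
+options can be exported in the job launch environment or, as shown below, set
+programmatically before the provider is initialized. The combination shown here is
+illustrative, not a recommendation.
+
+```c
+#include <stdlib.h>
+#include <rdma/fabric.h>
+
+struct fi_info *info;
+int ret;
+
+/* Illustrative only: select hybrid matching and opt in to the preemptive
+ * transitions discussed in this section. */
+setenv("FI_CXI_RX_MATCH_MODE", "hybrid", 1);
+setenv("FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE", "1", 1);
+setenv("FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE", "1", 1);
+
+/* Environment variables are read when the provider is initialized, so
+ * they must be set before the first call to fi_getinfo(). */
+ret = fi_getinfo(FI_VERSION(1, 15), NULL, NULL, 0, NULL, &info);
+```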
+ +The FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE and FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE +environment variables will force the transition to software match mode if the user +requested endpoint recieve size attribute is exceeded. The benefit of running with +these enabled is that software match mode transition is 100% in control of the libfabric +user through the receive size attribute. One approach users could take here is set +receive size attribute to expected usage, and if this expected usage is exceeded, only +the offending endpoints will transition to software match mode. + +FI_CXI_HYBRID_PREEMPTIVE and FI_CXI_HYBRID_RECV_PREEMPTIVE environment variables will +force the transition to software match mode if hardware receive resources in the pool +are running low. The CXI provider will do a multi-step process to transition the libfabric +endpoint to software match mode. The benefit of running with these enabled is that the +number of endpoints transitioning to software match mode may be smaller when compared to +forced software match mode transition due to zero hardware resources available. + +## Preventing Messaging Flow Control Due to Hardware Event Queue Sizing + +As much as possible, CXI provider message flow control should be avoided. Flow control +results in expensive, side-band, CXI provider internal messaging to recover from. One +cause for flow control is due to improper hardware event queue sizing. If the hardware +event queue is undersized resulting it filling quicker than expected, the next incoming +message operation targeting a full event queue will result in the message operation +being dropped and flow control triggered. + +The default CXI provider behavior is to size hardware event queues based on endpoint +transmit and receive size attributes. Thus, it is critical for users to set these +attributes accordingly. + +The CQ size can be used to override the CXI provider calcuatled hardware event queue +size based on endpoint transmit and receive size attributes. If the CQ size is greater +than the CXI proviuder calcuation, the value from the CQ size will be used. + +The CQ fill percent can be used to define a threshold for when no new RDMA operations +can be queued until the libfabric CQ a progressed thus draining hardware event queues. + +## Interrupting CXI Provider CQ Error Event Errno + +The following are the libfabric errno value which may be returned in an RDMA CQ error event. + +FI_ETRUNC: Receive message truncation. + +FI_EHOSTUNREACH: Target is unreachable. This is due to connectivity issues, such as downed +links, between the two peers. + +FI_ENOTCONN: Cannot communicate due to no libfabric endpoint configure. In this case, the +target NIC is reachable. + +FI_EIO: Catch all errno. + +# SEE ALSO + +[`fabric`(7)](fabric.7.html), +[`fi_provider`(7)](fi_provider.7.html), diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7 new file mode 100644 index 00000000000..bb8e7cba3e5 --- /dev/null +++ b/man/man7/fi_cxi.7 @@ -0,0 +1,2144 @@ +.\"t +.\" Automatically generated by Pandoc 2.9.2.1 +.\" +.TH "fi_cxi" "7" "2023\-11\-30" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.hy +.SH NAME +.PP +fi_cxi - The CXI Fabric Provider +.SH OVERVIEW +.PP +The CXI provider enables libfabric on Cray\[cq]s Slingshot network. +Slingshot is comprised of the Rosetta switch and Cassini NIC. +Slingshot is an Ethernet-compliant network. +However, The provider takes advantage of proprietary extensions to +support HPC applications. 
+.PP +The CXI provider supports reliable, connection-less endpoint semantics. +It supports two-sided messaging interfaces with message matching +offloaded by the Cassini NIC. +It also supports one-sided RMA and AMO interfaces, light-weight counting +events, triggered operations (via the deferred work API), and +fabric-accelerated small reductions. +.SH REQUIREMENTS +.PP +The CXI Provider requires Cassini\[cq]s optimized HPC protocol which is +only supported in combination with the Rosetta switch. +.PP +The provider uses the libCXI library for control operations and a set of +Cassini-specific header files to enable direct hardware access in the +data path. +.SH SUPPORTED FEATURES +.PP +The CXI provider supports a subset of OFI features. +.SS Endpoint types +.PP +The provider supports the \f[I]FI_EP_RDM\f[R] endpoint type, including +scalable endpoints. +.SS Memory registration modes +.PP +The provider implements scalable memory registration. +The provider requires \f[I]FI_MR_ENDPOINT\f[R]. +\f[I]FI_MR_ALLOCATED\f[R] is required if ODP in not enabled or not +desired. +Client specified 32-bit MR keys are the default unless +\f[I]FI_MR_PROV_KEY\f[R] is specified. +For \f[I]FI_MR_PROV_KEY\f[R] provider generated 64-bit MR keys are used. +An RMA initiator can work concurrently with client and provider +generated keys. +.PP +In client/server environments, if concerns with stale MR key usage +exists, then \f[I]FI_MR_PROV_KEY\f[R] generated keys should be used +along with \f[I]FI_CXI_MR_MATCH_EVENTS=1\f[R] and +\f[I]FI_CXI_OPTIMIZED_MRS=0\f[R]. +The former speeds up MR close, allowing non-remote MR cached keys to be +used that enable full remote memory access protection after an MR is +closed, even if that memory remains in the libfabric MR cache. +The latter uses only standard MR which use matching to enable robust key +usage, protecting against a stale MR key matching a newly generated MR +keys. +.SS Data transfer operations +.PP +The following data transfer interfaces are supported: +\f[I]FI_ATOMIC\f[R], \f[I]FI_MSG\f[R], \f[I]FI_RMA\f[R], +\f[I]FI_TAGGED\f[R]. +See DATA TRANSFER OPERATIONS below for more details. +.SS Completion events +.PP +The CXI provider supports all CQ event formats. +.SS Modes +.PP +The CXI provider does not require any operation modes. +.SS Progress +.PP +The CXI provider currently supports \f[I]FI_PROGRESS_MANUAL\f[R] data +and control progress modes. +.SS Multi-threading +.PP +The CXI provider supports FI_THREAD_SAFE and FI_THREAD_DOMAIN threading +models. +.SS Wait Objects +.PP +The CXI provider supports FI_WAIT_FD and FI_WAIT_POLLFD CQ wait object +types. +FI_WAIT_UNSPEC will default to FI_WAIT_FD. +However FI_WAIT_NONE should achieve the lowest latency and reduce +interrupt overhead. +.SS Additional Features +.PP +The CXI provider also supports the following capabilities and features: +.IP \[bu] 2 +\f[I]FI_MULTI_RECV\f[R] +.IP \[bu] 2 +\f[I]FI_SOURCE\f[R] +.IP \[bu] 2 +\f[I]FI_NAMED_RX_CTX\f[R] +.IP \[bu] 2 +\f[I]FI_RM_ENABLED\f[R] +.IP \[bu] 2 +\f[I]FI_RMA_EVENT\f[R] +.IP \[bu] 2 +\f[I]FI_REMOTE_CQ_DATA\f[R] +.IP \[bu] 2 +\f[I]FI_MORE\f[R] +.IP \[bu] 2 +\f[I]FI_FENCE\f[R] +.SS Addressing Format +.PP +The CXI provider uses a proprietary address format. +This format includes fields for NIC Address and PID. +NIC Address is the topological address of the NIC endpoint on the +fabric. +All OFI Endpoints sharing a Domain share the same NIC Address. +PID (for Port ID or Process ID, adopted from the Portals 4 +specification), is analogous to an IP socket port number. 
+Valid PIDs are in the range [0-510]. +.PP +A third component of Slingshot network addressing is the Virtual Network +ID (VNI). +VNI is a protection key used by the Slingshot network to provide +isolation between applications. +A VNI defines an isolated PID space for a given NIC. +Therefore, Endpoints must use the same VNI in order to communicate. +Note that VNI is not a field of the CXI address, but rather is specified +as part of the OFI Endpoint auth_key. +The combination of NIC Address, VNI, and PID is unique to a single OFI +Endpoint within a Slingshot fabric. +.PP +The NIC Address of an OFI Endpoint is inherited from the Domain. +By default, a PID is automatically assigned to an Endpoint when it is +enabled. +The address of an Endpoint can be queried using fi_getname. +The address received from fi_getname may then be inserted into a +peer\[cq]s Address Vector. +The resulting FI address may then be used to perform an RDMA operation. +.PP +Alternatively, a client may manage PID assignment. +fi_getinfo may be used to create an fi_info structure that can be used +to create an Endpoint with a client-specified address. +To achieve this, use fi_getinfo with the \f[I]FI_SOURCE\f[R] flag set +and set node and service strings to represent the local NIC interface +and PID to be assigned to the Endpoint. +The NIC interface string should match the name of an available CXI +domain (in the format cxi[0-9]). +The PID string will be interpreted as a 9-bit integer. +Address conflicts will be detected when the Endpoint is enabled. +.SS Authorization Keys +.PP +The CXI authorization key format is defined by struct cxi_auth_key. +This structure is defined in fi_cxi_ext.h. +.IP +.nf +\f[C] +struct cxi_auth_key { + uint32_t svc_id; + uint16_t vni; +}; +\f[R] +.fi +.PP +The CXI authorization key format includes a VNI and CXI service ID. +VNI is a component of the CXI Endpoint address that provides isolation. +A CXI service is a software container which defines a set of local CXI +resources, VNIs, and Traffic Classes which a libfabric user can access. +.PP +Two endpoints must use the same VNI in order to communicate. +Generally, a parallel application should be assigned to a unique VNI on +the fabric in order to achieve network traffic and address isolation. +Typically a privileged entity, like a job launcher, will allocate one or +more VNIs for use by the libfabric user. +.PP +The CXI service API is provided by libCXI. +It enables a privileged entity, like an application launcher, to control +an unprivileged process\[cq]s access to NIC resources. +Generally, a parallel application should be assigned to a unique CXI +service in order to control access to local resources, VNIs, and Traffic +Classes. +.PP +While a libfabric user provided authorization key is optional, it is +highly encouraged that libfabric users provide an authorization key +through the domain attribute hints during \f[C]fi_getinfo()\f[R]. +How libfabric users acquire the authorization key may vary between the +users and is outside the scope of this document. +.PP +If an authorization key is not provided by the libfabric user, the CXI +provider will attempt to generate an authorization key on behalf of the +user. +The following outlines how the CXI provider will attempt to generate an +authorization key. +.IP "1." 3 +Query for the following environment variables and generate an +authorization key using them. +.RS 4 +.IP \[bu] 2 +\f[I]SLINGSHOT_VNIS\f[R]: Comma separated list of VNIs. 
+The CXI provider will only use the first VNI if multiple are provide. +Example: \f[C]SLINGSHOT_VNIS=234\f[R]. +.IP \[bu] 2 +\f[I]SLINGSHOT_DEVICES\f[R]: Comma separated list of device names. +Each device index will use the same index to lookup the service ID in +\f[I]SLINGSHOT_SVC_IDS\f[R]. +Example: \f[C]SLINGSHOT_DEVICES=cxi0,cxi1\f[R]. +.IP \[bu] 2 +\f[I]SLINGSHOT_SVC_IDS\f[R]: Comma separated list of pre-configured CXI +service IDs. +Each service ID index will use the same index to lookup the CXI device +in \f[I]SLINGSHOT_DEVICES\f[R]. +Example: \f[C]SLINGSHOT_SVC_IDS=5,6\f[R]. +.PP +\f[B]Note:\f[R] How valid VNIs and device services are configured is +outside the responsibility of the CXI provider. +.RE +.IP "2." 3 +Query pre-configured device services and find first entry with same UID +as the libfabric user. +.IP "3." 3 +Query pre-configured device services and find first entry with same GID +as the libfabric user. +.IP "4." 3 +Query pre-configured device services and find first entry which does not +restrict member access. +If enabled, the default service is an example of an unrestricted +service. +.RS 4 +.PP +\f[B]Note:\f[R] There is a security concern with such services since it +allows for multiple independent libfabric users to use the same service. +.RE +.PP +\f[B]Note:\f[R] For above entries 2-4, it is possible the found device +service does not restrict VNI access. +For such cases, the CXI provider will query \f[I]FI_CXI_DEFAULT_VNI\f[R] +to assign a VNI. +.PP +During Domain allocation, if the domain auth_key attribute is NULL, the +CXI provider will attempt to generate a valid authorization key. +If the domain auth_key attribute is valid (i.e.\ not NULL and encoded +authorization key has been verified), the CXI provider will use the +encoded VNI and service ID. +Failure to generate a valid authorization key will result in Domain +allocation failure. +.PP +During Endpoint allocation, if the endpoint auth_key attribute is NULL, +the Endpoint with inherit the parent Domain\[cq]s VNI and service ID. +If the Endpoint auth_key attribute is valid, the encoded VNI and service +ID must match the parent Domain\[cq]s VNI and service ID. +Allocating an Endpoint with a different VNI and service from the parent +Domain is not supported. +.PP +The following is the expected parallel application launch workflow with +CXI integrated launcher and CXI authorization key aware libfabric user: +.IP "1." 3 +A parallel application is launched. +.IP "2." 3 +The launcher allocates one or more VNIs for use by the application. +.IP "3." 3 +The launcher communicates with compute node daemons where the +application will be run. +.IP "4." 3 +The launcher compute node daemon configures local CXI interfaces. +libCXI is used to allocate one or more services for the application. +The service will define the local resources, VNIs, and Traffic Classes +that the application may access. +Service allocation policies must be defined by the launcher. +libCXI returns an ID to represent a service. +.IP "5." 3 +The launcher forks application processes. +.IP "6." 3 +The launcher provides one or more service IDs and VNI values to the +application processes. +.IP "7." 3 +Application processes select from the list of available service IDs and +VNIs to form an authorization key to use for Endpoint allocation. +.SS Address Vectors +.PP +The CXI provider supports both \f[I]FI_AV_TABLE\f[R] and +\f[I]FI_AV_MAP\f[R] with the same internal implementation. 
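+.PP
+The following is a minimal, illustrative sketch (not taken from the provider
+source) of opening an Address Vector and inserting a peer address, assuming an
+already opened domain.
+The \f[C]expected_peer_count\f[R] value and the \f[C]addr\f[R] buffer, which
+would typically be obtained from the peer via \f[C]fi_getname()\f[R], are
+placeholders.
+.IP
+.nf
+\f[C]
+struct fi_av_attr av_attr = {
+    .type = FI_AV_TABLE,          /* FI_AV_MAP is handled identically */
+    .count = expected_peer_count, /* placeholder sizing hint */
+};
+struct fid_av *av;
+fi_addr_t peer;
+int ret;
+
+ret = fi_av_open(domain, &av_attr, &av, NULL);
+ret = fi_av_insert(av, addr, 1, &peer, 0, NULL);
+\f[R]
+.fi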
+.PP +The CXI provider uses the \f[I]FI_SYMMETRIC\f[R] AV flag for +optimization. +When used with \f[I]FI_AV_TABLE\f[R], the CXI provider can use the +fi_addr_t index as an endpoint identifier instead of a network address. +The benefit of this is when running with FI_SOURCE, a reverse lookup is +not needed to generate the source fi_addr_t for target CQ events. +Note: FI_SOURCE_ERR should not be used for this configuration. +.PP +If the AV is not configured with \f[I]FI_SYMMETRIC\f[R], +\f[I]FI_AV_USER_ID\f[R] is supported as a flag which can be passed into +AV insert. +.PP +Since scalable EPs are not support, fi_av_attr::rx_ctx_bits must be +zero. +.PP +The following AV capabilities and flags are not supported: FI_SHARED_AV, +FI_SYNC_ERR, FI_EVENT, and FI_READ. +.SS Operation flags +.PP +The CXI provider supports the following Operation flags: +.TP +\f[I]FI_MORE\f[R] +When \f[I]FI_MORE\f[R] is specified in a data transfer operation, the +provider will defer submission of RDMA commands to hardware. +When one or more data transfer operations is performed using +\f[I]FI_MORE\f[R], followed by an operation without \f[I]FI_MORE\f[R], +the provider will submit the entire batch of queued operations to +hardware using a single PCIe transaction, improving PCIe efficiency. +.RS +.PP +When \f[I]FI_MORE\f[R] is used, queued commands will not be submitted to +hardware until another data transfer operation is performed without +\f[I]FI_MORE\f[R]. +.RE +.TP +\f[I]FI_TRANSMIT_COMPLETE\f[R] +By default, all CXI provider completion events satisfy the requirements +of the `transmit complete' completion level. +Transmit complete events are generated when the intiator receives an Ack +from the target NIC. +The Ack is generated once all data has been received by the target NIC. +Transmit complete events do not guarantee that data is visibile to the +target process. +.TP +\f[I]FI_DELIVERY_COMPLETE\f[R] +When the `delivery complete' completion level is used, the event +guarantees that data is visible to the target process. +To support this, hardware at the target performs a zero-byte read +operation to flush data across the PCIe bus before generating an Ack. +Flushing reads are performed unconditionally and will lead to higher +latency. +.TP +\f[I]FI_MATCH_COMPLETE\f[R] +When the `match complete' completion level is used, the event guarantees +that the message has been matched to a client-provided buffer. +All messages longer than the eager threshold support this guarantee. +When `match complete' is used with a Send that is shorter than the eager +threshold, an additional handshake may be performed by the provider to +notify the initiator that the Send has been matched. +.PP +The CXI provider also supports the following operation flags: +.IP \[bu] 2 +\f[I]FI_INJECT\f[R] +.IP \[bu] 2 +\f[I]FI_FENCE\f[R] +.IP \[bu] 2 +\f[I]FI_COMPLETION\f[R] +.IP \[bu] 2 +\f[I]FI_REMOTE_CQ_DATA\f[R] +.SS Scalable Endpoints +.PP +Scalable Endpoints (SEPs) support is not enabled in the CXI provider. +Future releases of the provider will re-introduce SEP support. +.SS Messaging +.PP +The CXI provider supports both tagged (\f[I]FI_TAGGED\f[R]) and untagged +(\f[I]FI_MSG\f[R]) two-sided messaging interfaces. +In the normal case, message matching is performed by hardware. +In certain low resource conditions, the responsibility to perform +message matching may be transferred to software. 
+Specification of the receive message matching mode in the environment +(\f[I]FI_CXI_RX_MATCH_MODE\f[R]) controls the initial matching mode and +whether hardware matching can transparently transition matching to +software where a hybrid of hardware and software receive matching is +done. +.PP +If a Send operation arrives at a node where there is no matching Receive +operation posted, it is considered unexpected. +Unexpected messages are supported. +The provider manages buffers to hold unexpected message data. +.PP +Unexpected message handling is transparent to clients. +Despite that, clients should take care to avoid excessive use of +unexpected messages by pre-posting Receive operations. +An unexpected message ties up hardware and memory resources until it is +matched with a user buffer. +.PP +The CXI provider implements several message protocols internally. +A message protocol is selected based on payload length. +Short messages are transferred using the eager protocol. +In the eager protocol, the entire message payload is sent along with the +message header. +If an eager message arrives unexpectedly, the entire message is buffered +at the target until it is matched to a Receive operation. +.PP +Long messages are transferred using a rendezvous protocol. +The threshold at which the rendezvous protocol is used is controlled +with the \f[I]FI_CXI_RDZV_THRESHOLD\f[R] and +\f[I]FI_CXI_RDZV_GET_MIN\f[R] environment variables. +.PP +In the rendezvous protocol, a portion of the message payload is sent +along with the message header. +Once the header is matched to a Receive operation, the remainder of the +payload is pulled from the source using an RDMA Get operation. +If the message arrives unexpectedly, the eager portion of the payload is +buffered at the target until it is matched to a Receive operation. +In the normal case, the Get is performed by hardware and the operation +completes without software progress. +.PP +Unexpected rendezvous protocol messages can not complete and release +source side buffer resources until a matching receive is posted at the +destination and the non-eager data is read from the source with a +rendezvous get DMA. +The number of rendezvous messages that may be outstanding is limited by +the minimum of the hints->tx_attr->size value specified and the number +of rendezvous operation ID mappings available. +FI_TAGGED rendezvous messages have 32K-256 ID mappings, FI_MSG +rendezvous messages are limited to 256 ID mappings. +While this works well with MPI, care should be taken that this minimum +is large enough to ensure applications written in a manner that assumes +unlimited resources and use FI_MSG rendezvous messaging do not induce a +software deadlock. +If FI_MSG rendezvous messaging is done in a unexpected manner that may +exceed the FI_MSG ID mappings available, it may be sufficient to reduce +the number of rendezvous operations by increasing the rendezvous +threshold. +See \f[I]FI_CXI_RDZV_THRESHOLD\f[R] for information. +.PP +Message flow-control is triggered when hardware message matching +resources become exhausted. +Messages may be dropped and retransmitted in order to recover; impacting +performance significantly. +Programs should be careful to avoid posting large numbers of unmatched +receive operations and to minimize the number of outstanding unexpected +messages to prevent message flow-control. 
+If the RX message matching mode is configured to support hybrid mode, +when resources are exhausted, hardware will transition to hybrid +operation where hardware and software share matching responsibility. +.PP +To help avoid this condition, increase Overflow buffer space using +environment variables \f[I]FI_CXI_OFLOW_*\f[R], and for software and +hybrid RX match modes increase Request buffer space using the variables +\f[I]FI_CXI_REQ_*\f[R]. +.SS Message Ordering +.PP +The CXI provider supports the following ordering rules: +.IP \[bu] 2 +All message Send operations are always ordered. +.IP \[bu] 2 +RMA Writes may be ordered by specifying \f[I]FI_ORDER_RMA_WAW\f[R]. +.IP \[bu] 2 +AMOs may be ordered by specifying +\f[I]FI_ORDER_AMO_{WAW|WAR|RAW|RAR}\f[R]. +.IP \[bu] 2 +RMA Writes may be ordered with respect to AMOs by specifying +\f[I]FI_ORDER_WAW\f[R]. +Fetching AMOs may be used to perform short reads that are ordered with +respect to RMA Writes. +.PP +Ordered RMA size limits are set as follows: +.IP \[bu] 2 +\f[I]max_order_waw_size\f[R] is -1. +RMA Writes and non-fetching AMOs of any size are ordered with respect to +each other. +.IP \[bu] 2 +\f[I]max_order_raw_size\f[R] is -1. +Fetching AMOs of any size are ordered with respect to RMA Writes and +non-fetching AMOs. +.IP \[bu] 2 +\f[I]max_order_war_size\f[R] is -1. +RMA Writes and non-fetching AMOs of any size are ordered with respect to +fetching AMOs. +.SS PCIe Ordering +.PP +Generally, PCIe writes are strictly ordered. +As an optimization, PCIe TLPs may have the Relaxed Order (RO) bit set to +allow writes to be reordered. +Cassini sets the RO bit in PCIe TLPs when possible. +Cassini sets PCIe RO as follows: +.IP \[bu] 2 +Ordering of messaging operations is established using completion events. +Therefore, all PCIe TLPs related to two-sided message payloads will have +RO set. +.IP \[bu] 2 +Every PCIe TLP associated with an unordered RMA or AMO operation will +have RO cleared. +.IP \[bu] 2 +PCIe TLPs associated with the last packet of an ordered RMA or AMO +operation will have RO cleared. +.IP \[bu] 2 +PCIe TLPs associated with the body packets (all except the last packet +of an operation) of an ordered RMA operation will have RO set. +.SS Translation +.PP +The CXI provider supports two translation mechanisms: Address +Translation Services (ATS) and NIC Translation Agent (NTA). +Use the environment variable \f[I]FI_CXI_ATS\f[R] to select between +translation mechanisms. +.PP +ATS refers to NIC support for PCIe rev. +4 ATS, PRI and PASID features. +ATS enables the NIC to efficiently access the entire virtual address +space of a process. +ATS mode currently supports AMD hosts using the iommu_v2 API. +.PP +The NTA is an on-NIC translation unit. +The NTA supports two-level page tables and additional hugepage sizes. +Most CPUs support 2MB and 1GB hugepage sizes. +Other hugepage sizes may be supported by SW to enable the NIC to cache +more address space. +.PP +ATS and NTA both support on-demand paging (ODP) in the event of a page +fault. +Use the environment variable \f[I]FI_CXI_ODP\f[R] to enable ODP. +.PP +With ODP enabled, buffers used for data transfers are not required to be +backed by physical memory. +An un-populated buffer that is referenced by the NIC will incur a +network page fault. +Network page faults will significantly impact application performance. +Clients should take care to pre-populate buffers used for data-tranfer +operations to avoid network page faults. +Copy-on-write semantics work as expected with ODP. 
+.PP +With ODP disabled, all buffers used for data transfers are backed by +pinned physical memory. +Using Pinned mode avoids any overhead due to network page faults but +requires all buffers to be backed by physical memory. +Copy-on-write semantics are broken when using pinned memory. +See the Fork section for more information. +.SS Translation Cache +.PP +Mapping a buffer for use by the NIC is an expensive operation. +To avoid this penalty for each data transfer operation, the CXI provider +maintains an internal translation cache. +.PP +When using the ATS translation mode, the provider does not maintain +translations for individual buffers. +It follows that translation caching is not required. +.SS Triggered Operation +.PP +The CXI provider supports triggered operations through the deferred work +queue API. +The following deferred work queue operations are supported: FI_OP_SEND, +FI_OP_TSEND, FI_OP_READ, FI_OP_WRITE, FI_OP_ATOMIC, FI_OP_FETCH_ATOMIC, +and FI_OP_COMPARE_ATOMIC. +FI_OP_RECV and FI_OP_TRECV are also supported, but with only a threshold +of zero. +.PP +The CXI provider backs each triggered operation by hardware resources. +Exhausting triggered operation resources leads to indeterminate behavior +and should be prevented. +.PP +The CXI provider offers two methods to prevent triggered operation +resource exhaustion. +.SS Experimental FI_CXI_ENABLE_TRIG_OP_LIMIT Environment Variable +.PP +When FI_CXI_ENABLE_TRIG_OP_LIMIT is enabled, the CXI provider will use +semaphores to coordinate triggered operation usage between threads and +across processes using the same service ID. +When triggered operation resources are exhausted, +fi_control(FI_QUEUE_WORK) will return -FI_ENOSPC. +It is up to the libfabric user to recover from this situation. +.PP +\f[B]Note:\f[R] Preventing triggered operation resource exhaustion with +this method may be expensive and result in a negative performance +impact. +It is encouraged libfabric users avoid method unless absolutely needed. +By default, FI_CXI_ENABLE_TRIG_OP_LIMIT is disabled. +.PP +\f[B]Note:\f[R] Named semaphores are used to coordinated triggered +operation resource usage across multiple processes. +System/node software may need to be implemented to ensure all semaphores +are unlinked during unexpected application termination. +.PP +\f[B]Note:\f[R] This feature is considered experimental and +implementation may be subjected to changed. +.SS CXI Domain get_dwq_depth Extension +.PP +The CXI domain get_dwq_depth extension returns the deferred work queue +queue depth (i.e.\ the number of triggered operation resources assigned +to the service ID used by the fi_domain). +Libfabric users can use the returned queue depth to coordinate resource +usage. +.PP +For example, suppose the job launcher has configured a service ID with +for 512 triggered operation resources. +Since the CXI provider needs to consume 8 per service ID, 504 should be +usable by libfabric users. +If the libfabric user knows there are \f[I]N\f[R] processes using a +given service ID and NIC, it can divide the 504 triggered operation +resource among all \f[I]N\f[R] processes. +.PP +\f[B]Note:\f[R] This is the preferred method to prevent triggered +operation resource exhaustion since it does not introduce semaphores +into the fi_control(FI_QUEUE_WORK) critical path. +.SS Fork Support +.PP +The following subsections outline the CXI provider fork support. 
+.SS RDMA and Fork Overview +.PP +Under Linux, \f[C]fork()\f[R] is implemented using copy-on-write (COW) +pages, so the only penalty that it incurs is the time and memory +required to duplicate the parent\[cq]s page tables, mark all of the +process\[cq]s page structs as read only and COW, and create a unique +task structure for the child. +.PP +Due to the Linux COW fork policy, both parent and child processes\[cq] +virtual addresses are mapped to the same physical address. +The first process to write to the virtual address will get a new +physical page, and thus a new physical address, with the same content as +the previous physical page. +.PP +The Linux COW fork policy is problematic for RDMA NICs. +RDMA NICs require memory to be registered with the NIC prior to +executing any RDMA operations. +In user-space, memory registration results in establishing a virtual +address to physical address mapping with the RDMA NIC. +This resulting RDMA NIC mapping/memory region does not get updated when +the Linux COW fork policy is executed. +.PP +Consider the following example: - Process A is planning to perform RDMA +with virtual address 0xffff0000 and a size of 4096. +This virtual address maps to physical address 0x1000. +- Process A registers this virtual address range with the RDMA NIC. +The RDMA NIC device driver programs its page tables to establish the +virtual address 0xffff0000 to physical address 0x1000 mapping. +- Process A decides to fork Process B. +Virtual address 0xffff0000 will now be subjected to COW. +- Process A decides to write to virtual address 0xffff0000 before doing +the RDMA operation. +This will trigger the Linux COW fork policy resulting in the following: +- Process A: Virtual address 0xffff0000 maps to new physical address +0x2000 - Process B: Virtual address 0xffff0000 maps to previous physical +address 0x1000 - Process A now executes an RDMA operation using the +mapping/memory region associated with virtual address 0xffff0000. +Since COW occurred, the RDMA NIC executes the RDMA operation using +physical address 0x1000 which belongs to Process B. +This results in data corruption. +.PP +The crux of the issue is the parent issuing forks while trying to do +RDMA operations to registered memory regions. +Excluding software RDMA emulation, two options exist for RDMA NIC +vendors to resolve this data corruption issue. +- Linux \f[C]madvise()\f[R] MADV_DONTFORK and MADV_DOFORK - RDMA NIC +support for on-demand paging (ODP) +.SS Linux madvise() MADV_DONTFORK and MADV_DOFORK +.PP +The generic (i.e.\ non-vendor specific) RDMA NIC solution to the Linux +COW fork policy and RDMA problem is to use the following +\f[C]madvise()\f[R] operations during memory registration and +deregistration: - MADV_DONTFORK: Do not make the pages in this range +available to the child after a \f[C]fork()\f[R]. +This is useful to prevent copy-on-write semantics from changing the +physical location of a page if the parent writes to it after a +\f[C]fork()\f[R]. +(Such page relocations cause problems for hardware that DMAs into the +page.) - MADV_DOFORK: Undo the effect of MADV_DONTFORK, restoring the +default behavior, whereby a mapping is inherited across +\f[C]fork()\f[R]. +.PP +In the Linux kernel, MADV_DONTFORK will result in the virtual memory +area struct (VMA) being marked with the VM_DONTCOPY flag. +VM_DONTCOPY signals to the Linux kernel to not duplicate this VMA on +fork. +This effectively leaves a hole in child process address space. 
+Should the child reference the virtual address corresponding to the VMA +which was not duplicated, it will segfault. +.PP +In the previous example, if Process A issued +\f[C]madvise(0xffff0000, 4096, MADV_DONTFORK)\f[R] before performing +RDMA memory registration, the physical address 0x1000 would have +remained with Process A. +This would prevent the Process A data corruption as well. +If Process B were to reference virtual address 0xffff0000, it will +segfault due to the hole in the virtual address space. +.PP +Using \f[C]madvise()\f[R] with MADV_DONTFORK may be problematic for +applications performing RDMA and page aliasing. +Paging aliasing is where the parent process uses part or all of a page +to share information with the child process. +If RDMA is also being used for a separate portion of this page, the +child process will segfault when an access causes page aliasing. +.SS RDMA NIC Support for ODP +.PP +An RDMA NIC vendor specific solution to the Linux COW fork policy and +RDMA problem is to use ODP. +ODP allows for the RDMA NIC to generate page requests for translations +it does not have a physical address for. +The following is an updated example with ODP: - Process A is planning to +perform RDMA with virtual address 0xffff0000 and a size of 4096. +This virtual address maps to physical address 0x1000. +- Process A registers this virtual address range with the RDMA NIC. +The RDMA NIC device driver may optionally program its page tables to +establish the virtual address 0xffff0000 to physical address 0x1000 +mapping. +- Process A decides to fork Process B. +Virtual address 0xffff0000 will now be subjected to COW. +- Process A decides to write to virtual address 0xffff0000 before doing +the RDMA operation. +This will trigger the Linux COW fork policy resulting in the following: +- Process A: Virtual address 0xffff0000 maps to new physical address +0x2000 - Process B: Virtual address 0xffff0000 maps to previous physical +address 0x1000 - RDMA NIC device driver: Receives MMU invalidation event +for Process A virtual address range 0xffff0000 through 0xffff0ffe. +The device driver updates the corresponding memory region to no longer +reference physical address 0x1000. +- Process A now executes an RDMA operation using the memory region +associated with 0xffff0000. +The RDMA NIC will recognize the corresponding memory region as no longer +having a valid physical address. +The RDMA NIC will then signal to the device driver to fault in the +corresponding address, if necessary, and update the physical address +associated with the memory region. +In this case, the memory region will be updated with physical address +0x2000. +Once completed, the device driver signals to the RDMA NIC to continue +the RDMA operation. +Data corruption does not occur since RDMA occurred to the correct +physical address. +.PP +A RDMA NIC vendor specific solution to the Linux COW fork policy and +RDMA problem is to use ODP. +ODP allows for the RDMA NIC to generate page requests for translations +it does not have a physical address for. +.SS CXI Provider Fork Support +.PP +The CXI provider is subjected to the Linux COW fork policy and RDMA +issues described in section \f[I]RDMA and Fork Overview\f[R]. 
+To prevent data corruption with fork, the CXI provider supports the
+following options:
+.IP \[bu] 2
+CXI specific fork environment variables to enable \f[C]madvise()\f[R]
+MADV_DONTFORK and MADV_DOFORK
+.IP \[bu] 2
+ODP support*
+.PP
+*Formal ODP support is pending.
+.SS CXI Specific Fork Environment Variables
+.PP
+The CXI software stack has two environment variables related to fork:
+.IP \[bu] 2
+CXI_FORK_SAFE: Enables base fork safe support.
+With this environment variable set, regardless of value, libcxi will
+issue \f[C]madvise()\f[R] with MADV_DONTFORK on the virtual address
+range being registered for RDMA.
+In addition, libcxi always aligns the \f[C]madvise()\f[R] call to the
+system default page size.
+On x86, this is 4 KiB.
+To prevent redundant \f[C]madvise()\f[R] calls with MADV_DONTFORK
+against the same virtual address region, reference counting is used
+against each tracked \f[C]madvise()\f[R] region.
+In addition, libcxi will split and merge tracked \f[C]madvise()\f[R]
+regions if needed.
+Once the reference count reaches zero, libcxi will call
+\f[C]madvise()\f[R] with MADV_DOFORK, and no longer track the region.
+.IP \[bu] 2
+CXI_FORK_SAFE_HP: With this environment variable set, in conjunction
+with CXI_FORK_SAFE, libcxi will not assume the page size is the system
+default page size.
+Instead, libcxi will walk \f[C]/proc/<pid>/smaps\f[R] to determine the
+correct page size and align the \f[C]madvise()\f[R] calls accordingly.
+This environment variable should be set if huge pages are being used for
+RDMA.
+To amortize the per memory registration walk of
+\f[C]/proc/<pid>/smaps\f[R], the libfabric MR cache should be used.
+.PP
+Setting these environment variables will prevent data corruption when
+the parent issues a fork, but it may result in the child process
+experiencing a segfault if it references a virtual address being used
+for RDMA in the parent process.
+.SS ODP Support and Fork
+.PP
+CXI provider ODP support would allow applications to avoid setting
+CXI_FORK_SAFE and CXI_FORK_SAFE_HP to prevent parent process data
+corruption.
+Enabling ODP to resolve the RDMA and fork issue may or may not result in
+a performance impact.
+The concern with ODP is that if the rate of invalidations and ODP page
+requests is relatively high and they occur at the same time, ODP
+timeouts may occur.
+This would result in application libfabric data transfer operations
+failing.
+.PP
+Please refer to the \f[I]CXI Provider ODP Support\f[R] section for more
+information on how to enable/disable ODP.
+.SS CXI Provider Fork Support Guidance
+.PP
+Since the CXI provider offloads the majority of the libfabric data
+transfer operations to the NIC, thus enabling end-to-end RDMA between
+libfabric user buffers, it is subject to the issue described in
+section \f[I]RDMA and Fork Overview\f[R].
+For comparison, software emulated RDMA libfabric providers may not have
+these issues since they rely on bounce buffers to facilitate data
+transfer.
+.PP
+The following is the CXI provider fork support guidance:
+.IP \[bu] 2
+Enable CXI_FORK_SAFE.
+If huge pages are also used, CXI_FORK_SAFE_HP should be enabled as well.
+Since enabling this will result in \f[C]madvise()\f[R] with
+MADV_DONTFORK, the following steps should be taken to prevent a child
+process segfault:
+.RS 2
+.IP \[bu] 2
+Avoid using stack memory for RDMA
+.IP \[bu] 2
+Avoid the child process having to access a virtual address range the
+parent process is performing RDMA against
+.IP \[bu] 2
+Use page-aligned heap allocations for RDMA
+.RE
+.IP \[bu] 2
+Enable ODP and run without CXI_FORK_SAFE and CXI_FORK_SAFE_HP.
+The functionality and performance of ODP with fork may be application +specific. +Currently, ODP is not formally supported. +.PP +The CXI provider preferred approach is to use CXI_FORK_SAFE and +CXI_FORK_SAFE_HP. +While it may require the application to take certain precautions, it +will result in a more portable application regardless of RDMA NIC. +.SS Heterogenous Memory (HMEM) Supported Interfaces +.PP +The CXI provider supports the following OFI iface types: FI_HMEM_CUDA, +FI_HMEM_ROCR, and FI_HMEM_ZE. +.SS FI_HMEM_ZE Limitations +.PP +The CXI provider only supports GPU direct RDMA with ZE device buffers if +implicit scaling is disabled. +The following ZE environment variables disable implicit scaling: +EnableImplicitScaling=0 NEOReadDebugKeys=1. +.PP +For testing purposes only, the implicit scaling check can be disabled by +setting the following environment variable: +FI_CXI_FORCE_ZE_HMEM_SUPPORT=1. +This may need to be combined with the following environment variable to +get CXI provider memory registration to work: +FI_CXI_DISABLE_HMEM_DEV_REGISTER=1. +.SS Collectives (accelerated) +.PP +The CXI provider supports a limited set of collective operations +specifically intended to support use of the hardware-accelerated +reduction features of the CXI-supported NIC and fabric hardware. +.PP +These features are implemented using the (experimental) OFI collectives +API. +The implementation supports the following collective functions: +.IP \[bu] 2 +\f[B]fi_query_collective\f[R]() +.IP \[bu] 2 +\f[B]fi_join_collective\f[R]() +.IP \[bu] 2 +\f[B]fi_barrier\f[R]() +.IP \[bu] 2 +\f[B]fi_broadcast\f[R]() +.IP \[bu] 2 +\f[B]fi_reduce\f[R]() +.IP \[bu] 2 +\f[B]fi_allreduce\f[R]() +.SS \f[B]fi_query_collective\f[R]() +.PP +Standard implementation that exposes the features described below. +.SS \f[B]fi_join_collective\f[R]() +.PP +The \f[B]fi_join_collective\f[R]() implementation is provider-managed. +However, the \f[I]coll_addr\f[R] parameter is not useful to the +implementation, and must be specified as FI_ADDR_NOTAVAIL. +The \f[I]set\f[R] parameter must contain fi_addr_t values that resolve +to meaningful CXI addresses in the endpoint \f[I]fi_av\f[R] structure. +\f[B]fi_join_collective\f[R]() must be called for every address in the +\f[I]set\f[R] list, and must be progressed until the join operation is +complete. +There is no inherent limit on join concurrency. +.PP +The join will create a multicast tree in the fabric to manage the +collective operations. +This operation requires access to a secure Fabric Manager REST API that +constructs this tree, so any application that attempts to use +accelerated collectives will bind to libcurl and associated security +libraries, which must be available on the system. +.PP +There are hard limits to the number of multicast addresses available on +a system, and administrators may impose additional limits on the number +of multicast addresses available to any given collective job. +.SS fi_reduction operations +.PP +Payloads are limited to 32-byte data structures, and because they all +use the same underlying hardware model, they are all synchronizing +calls. +Specifically, the supported functions are all variants of +fi_allreduce(). +.IP \[bu] 2 +\f[B]fi_barrier\f[R] is \f[B]fi_allreduce\f[R] using an optimized +no-data operator. +.IP \[bu] 2 +\f[B]fi_broadcast\f[R] is \f[B]fi_allreduce\f[R] using FI_BOR, with data +forced to zero for all but the root rank. 
+.IP \[bu] 2 +\f[B]fi_reduce\f[R] is \f[B]fi_allreduce\f[R] with a result pointer +ignored by all but the root rank. +.PP +All functions must be progressed to completion on all ranks +participating in the collective group. +There is a hard limit of eight concurrent reductions on each collective +group, and attempts to launch more operations will return -FI_EAGAIN. +.PP +\f[B]allreduce\f[R] supports the following hardware-accelerated +reduction operators: +.PP +.TS +tab(@); +l l. +T{ +Operator +T}@T{ +Supported Datatypes +T} +_ +T{ +FI_BOR +T}@T{ +FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 +T} +T{ +FI_BAND +T}@T{ +FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 +T} +T{ +FI_BXOR +T}@T{ +FI_UINT8, FI_UINT16, FI_UINT32, FI_UINT64 +T} +T{ +FI_MIN +T}@T{ +FI_INT64, FI_DOUBLE +T} +T{ +FI_MAX +T}@T{ +FI_INT64, FI_DOUBLE +T} +T{ +FI_SUM +T}@T{ +FI_INT64, FI_DOUBLE +T} +T{ +FI_CXI_MINMAXLOC +T}@T{ +FI_INT64, FI_DOUBLE +T} +T{ +FI_CXI_REPSUM +T}@T{ +FI_DOUBLE +T} +.TE +.PP +Data space is limited to 32 bytes in all cases except REPSUM, which +supports only a single FI_DOUBLE. +.PP +Only unsigned bitwise operators are supported. +.PP +Only signed integer arithmetic operations are are supported. +.PP +The MINMAXLOC operators are a mixed data representation consisting of +two values, and two indices. +Each rank reports its minimum value and rank index, and its maximum +value and rank index. +The collective result is the global minimum value and rank index, and +the global maximum value and rank index. +Data structures for these functions can be found int the fi_cxi_ext.h +file. +The \f[I]datatype\f[R] should represent the type of the minimum/maximum +values, and the \f[I]count\f[R] must be 1. +.PP +The double-precision operators provide an associative (NUM) variant for +MIN, MAX, and MINMAXLOC. +Default IEEE behavior is to treat any operation with NaN as invalid, +including comparison, which has the interesting property of causing: +.IP +.nf +\f[C] +MIN(NaN, value) => NaN +MAX(NaN, value) => NaN +\f[R] +.fi +.PP +This means that if NaN creeps into a MIN/MAX reduction in any rank, it +tends to poison the entire result. +The associative variants instead effectively ignore the NaN, such that: +.IP +.nf +\f[C] +MIN(NaN, value) => value +MAX(NaN, value) => value +\f[R] +.fi +.PP +The REPSUM operator implements a reproducible (associative) sum of +double-precision values. +The payload can accommodate only a single double-precision value per +reduction, so \f[I]count\f[R] must be 1. +.PP +See: Berkeley reproducible sum algorithm +https://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-121.pdf +.SS double precision rounding +.PP +C99 defines four rounding modes for double-precision SUM, and some +systems may support a \[lq]flush-to-zero\[rq] mode for each of these, +resulting in a total of eight different modes for double-precision sum. +.PP +The fabric hardware supports all eight modes transparently. +.PP +Although the rounding modes have thread scope, all threads, processes, +and nodes should use the same rounding mode for any single reduction. +.SS reduction flags +.PP +The reduction operations supports two flags: +.IP \[bu] 2 +\f[B]FI_MORE\f[R] +.IP \[bu] 2 +\f[B]FI_CXI_PRE_REDUCED\f[R] (overloads \f[B]FI_SOURCE\f[R]) +.PP +The \f[B]FI_MORE\f[R] flag advises that the \f[I]result\f[R] data +pointer represents an opaque, local reduction accumulator, and will be +used as the destination of the reduction. 
+This operation can be repeated any number of times to accumulate results +locally, and spans the full set of all supported reduction operators. +The \f[I]op\f[R], \f[I]count\f[R], and \f[I]datatype\f[R] values must be +consistent for all calls. +The operation ignores all global or static variables \[em] it can be +treated as a \f[I]pure\f[R] function call \[em] and returns immediately. +The caller is responsible for protecting the accumulator memory if it is +used by multiple threads or processes on a compute node. +.PP +If \f[B]FI_MORE\f[R] is omitted, the destination is the fabric, and this +will initiate a fabric reduction through the associated endpoint. +The reduction must be progressed, and upon successful completion, the +\f[I]result\f[R] data pointer will be filled with the final reduction +result of \f[I]count\f[R] elements of type \f[I]datatype\f[R]. +.PP +The \f[B]FI_CXI_PRE_REDUCED\f[R] flag advises that the source data +pointer represents an opaque reduction accumulator containing +pre-reduced data. +The \f[I]count\f[R] and \f[I]datatype\f[R] arguments are ignored. +.PP +if \f[B]FI_CXI_PRE_REDUCED\f[R] is omitted, the source is taken to be +user data with \f[I]count\f[R] elements of type \f[I]datatype\f[R]. +.PP +The opaque reduction accumulator is exposed as \f[B]struct +cxip_coll_accumulator\f[R] in the fi_cxi_ext.h file. +.PP +\f[B]Note\f[R]: The opaque reduction accumulator provides extra space +for the expanded form of the reproducible sum, which carries the extra +data required to make the operation reproducible in software. +.SH OPTIMIZATION +.SS Optimized MRs +.PP +The CXI provider has two separate MR implementations: standard and +optimized. +Standard MRs are designed to support applications which require a large +number of remote memory regions. +Optimized MRs are designed to support one-sided programming models that +allocate a small number of large remote memory windows. +The CXI provider can achieve higher RMA Write rates when targeting an +optimized MR. +.PP +Both types of MRs are allocated using fi_mr_reg. +MRs with client-provided key in the range [0-99] are optimized MRs. +MRs with key greater or equal to 100 are standard MRs. +An application may create a mix of standard and optimized MRs. +To disable the use of optimized MRs, set environment variable +\f[I]FI_CXI_OPTIMIZED_MRS=false\f[R]. +When disabled, all MR keys are available and all MRs are implemented as +standard MRs. +All communicating processes must agree on the use of optimized MRs. +.PP +When FI_MR_PROV_KEY mr_mode is specified caching of remote access MRs is +enabled, which can improve registration/de-registration performance in +RPC type applications, that wrap RMA operations within a message RPC +protocol. +Optimized MRs will be preferred, but will fallback to standard MRs if +insufficient hardware resources are available. +.SS Optimized RMA +.PP +Optimized MRs are one requirement for the use of low overhead packet +formats which enable higher RMA Write rates. +An RMA Write will use the low overhead format when all the following +requirements are met: +.IP \[bu] 2 +The Write targets an optimized MR +.IP \[bu] 2 +The target MR does not require remote completion notifications (no +\f[I]FI_RMA_EVENT\f[R]) +.IP \[bu] 2 +The Write does not have ordering requirements (no \f[I]FI_RMA_WAW\f[R]) +.PP +Theoretically, Cassini has resources to support 64k standard MRs or 2k +optimized MRs. +Practically, the limits are much lower and depend greatly on application +behavior. 
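+.PP
+As an illustration of the client-specified key ranges described under
+\f[I]Optimized MRs\f[R] above, the following sketch registers one
+optimized MR (key 0) and one standard MR (key 100) and enables them for
+remote access.
+It assumes \f[C]domain\f[R], \f[C]ep\f[R], and the buffers already
+exist; with \f[I]FI_MR_ENDPOINT\f[R], each MR must be bound to the
+endpoint and enabled before use.
+Error handling is abbreviated.
+.IP
+.nf
+\f[C]
+struct fid_mr *opt_mr, *std_mr;
+int ret;
+
+/* Client-managed key 0 falls in the optimized range [0-99]. */
+ret = fi_mr_reg(domain, opt_buf, opt_len,
+                FI_REMOTE_READ | FI_REMOTE_WRITE, 0, 0, 0, &opt_mr, NULL);
+if (ret)
+    error;
+
+/* Client-managed key 100 selects a standard MR. */
+ret = fi_mr_reg(domain, std_buf, std_len,
+                FI_REMOTE_READ | FI_REMOTE_WRITE, 0, 100, 0, &std_mr, NULL);
+if (ret)
+    error;
+
+/* FI_MR_ENDPOINT: bind each MR to the endpoint, then enable it. */
+ret = fi_mr_bind(opt_mr, &ep->fid, 0);
+if (!ret)
+    ret = fi_mr_enable(opt_mr);
+ret = fi_mr_bind(std_mr, &ep->fid, 0);
+if (!ret)
+    ret = fi_mr_enable(std_mr);
+\f[R]
+.fi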
+.PP +Hardware counters can be used to validate the use of the low overhead +packets. +The counter C_CNTR_IXE_RX_PTL_RESTRICTED_PKT counts the number of low +overhead packets received at the target NIC. +Counter C_CNTR_IXE_RX_PTL_UNRESTRICTED_PKT counts the number of ordered +RDMA packets received at the target NIC. +.PP +Message rate performance may be further optimized by avoiding target +counting events. +To avoid counting events, do not bind a counter to the MR. +To validate optimal writes without target counting events, monitor the +counter: C_CNTR_LPE_PLEC_HITS. +.SS Unreliable AMOs +.PP +By default, all AMOs are resilient to intermittent packet loss in the +network. +Cassini implements a connection-based reliability model to support +reliable execution of AMOs. +.PP +The connection-based reliability model may be disabled for AMOs in order +to increase message rate. +With reliability disabled, a lost AMO packet will result in operation +failure. +A failed AMO will be reported to the client in a completion event as +usual. +Unreliable AMOs may be useful for applications that can tolerate +intermittent AMO failures or those where the benefit of increased +message rate outweighs by the cost of restarting after a failure. +.PP +Unreliable, non-fetching AMOs may be performed by specifying the +\f[I]FI_CXI_UNRELIABLE\f[R] flag. +Unreliable, fetching AMOs are not supported. +Unreliable AMOs must target an optimized MR and cannot use remote +completion notification. +Unreliable AMOs are not ordered. +.SS High Rate Put +.PP +High Rate Put (HRP) is a feature that increases message rate performance +of RMA and unreliable non-fetching AMO operations at the expense of +global ordering guarantees. +.PP +HRP responses are generated by the fabric egress port. +Responses are coalesced by the fabric to achieve higher message rates. +The completion event for an HRP operation guarantees delivery but does +not guarantee global ordering. +If global ordering is needed following an HRP operation, the source may +follow the operation with a normal, fenced Put. +.PP +HRP RMA and unreliable AMO operations may be performed by specifying the +\f[I]FI_CXI_HRP\f[R] flag. +HRP AMOs must also use the \f[I]FI_CXI_UNRELIABLE\f[R] flag. +Monitor the hardware counter C_CNTR_HNI_HRP_ACK at the initiator to +validate that HRP is in use. +.SS Counters +.PP +Cassini offloads light-weight counting events for certain types of +operations. +The rules for offloading are: +.IP \[bu] 2 +Counting events for RMA and AMO source events are always offloaded. +.IP \[bu] 2 +Counting events for RMA and AMO target events are always offloaded. +.IP \[bu] 2 +Counting events for Sends are offloaded when message size is less than +the rendezvous threshold. +.IP \[bu] 2 +Counting events for message Receives are never offloaded by default. +.PP +Software progress is required to update counters unless the criteria for +offloading are met. +.SH RUNTIME PARAMETERS +.PP +The CXI provider checks for the following environment variables: +.TP +\f[I]FI_CXI_ODP\f[R] +Enables on-demand paging. +If disabled, all DMA buffers are pinned. +If enabled and mr_mode bits in the hints exclude FI_MR_ALLOCATED, then +ODP mode will be used. +.TP +\f[I]FI_CXI_FORCE_ODP\f[R] +Experimental value that can be used to force the use of ODP mode even if +FI_MR_ALLOCATED is set in the mr_mode hint bits. +This is intended to be used primarily for testing. +.TP +\f[I]FI_CXI_ATS\f[R] +Enables PCIe ATS. +If disabled, the NTA mechanism is used. 
+.TP +\f[I]FI_CXI_ATS_MLOCK_MODE\f[R] +Sets ATS mlock mode. +The mlock() system call may be used in conjunction with ATS to help +avoid network page faults. +Valid values are \[lq]off\[rq] and \[lq]all\[rq]. +When mlock mode is \[lq]off\[rq], the provider does not use mlock(). +An application using ATS without mlock() may experience network page +faults, reducing network performance. +When ats_mlock_mode is set to \[lq]all\[rq], the provider uses +mlockall() during initialization with ATS. +mlockall() causes all mapped addresses to be locked in RAM at all times. +This helps to avoid most network page faults. +Using mlockall() may increase pressure on physical memory. +Ignored when ODP is disabled. +.TP +\f[I]FI_CXI_RDZV_THRESHOLD\f[R] +Message size threshold for rendezvous protocol. +.TP +\f[I]FI_CXI_RDZV_GET_MIN\f[R] +Minimum rendezvous Get payload size. +A Send with length less than or equal to \f[I]FI_CXI_RDZV_THRESHOLD\f[R] +plus \f[I]FI_CXI_RDZV_GET_MIN\f[R] will be performed using the eager +protocol. +Larger Sends will be performed using the rendezvous protocol with +\f[I]FI_CXI_RDZV_THRESHOLD\f[R] bytes of payload sent eagerly and the +remainder of the payload read from the source using a Get. +\f[I]FI_CXI_RDZV_THRESHOLD\f[R] plus \f[I]FI_CXI_RDZV_GET_MIN\f[R] must +be less than or equal to \f[I]FI_CXI_OFLOW_BUF_SIZE\f[R]. +.TP +\f[I]FI_CXI_RDZV_EAGER_SIZE\f[R] +Eager data size for rendezvous protocol. +.TP +\f[I]FI_CXI_RDZV_PROTO\f[R] +Direct the provider to use a preferred protocol to transfer non-eager +rendezvous data. +\f[I]FI_CXI_RDZV_PROTO\f[R]= default | alt_read +.RS +.PP +To use an alternate protocol, the CXI driver property rdzv_get_en should +be set to \[lq]0\[rq]. +The \[lq]alt_read\[rq] rendezvous protocol may help improve collective +operation performance. +Note that all rendezvous protocol use RDMA to transfer eager and +non-eager rendezvous data. +.RE +.TP +\f[I]FI_CXI_DISABLE_NON_INJECT_MSG_IDC\f[R] +Experimental option to disable favoring IDC for transmit of small +messages when FI_INJECT is not specified. +This can be useful with GPU source buffers to avoid the host copy in +cases a performant copy can not be used. +The default is to use IDC for all messages less than IDC size. +.TP +\f[I]FI_CXI_DISABLE_HOST_REGISTER\f[R] +Disable registration of host buffers (overflow and request) with GPU. +There are scenarios where using a large number of processes per GPU +results in page locking excessive amounts of memory degrading +performance and/or restricting process counts. +The default is to register buffers with the GPU. +.TP +\f[I]FI_CXI_OFLOW_BUF_SIZE\f[R] +Size of overflow buffers. +Increasing the overflow buffer size allows for more unexpected message +eager data to be held in single overflow buffer. +The default size is 2MB. +.TP +\f[I]FI_CXI_OFLOW_BUF_MIN_POSTED/FI_CXI_OFLOW_BUF_COUNT\f[R] +The minimum number of overflow buffers that should be posted. +The default minimum posted count is 3. +Buffers will grow unbounded to support outstanding unexpected messages. +Care should be taken to size appropriately based on job scale, size of +eager data, and the amount of unexpected message traffic to reduce the +need for flow control. +.TP +\f[I]FI_CXI_OFLOW_BUF_MAX_CACHED\f[R] +The maximum number of overflow buffers that will be cached. +The default maximum count is 3 * FI_CXI_OFLOW_BUF_MIN_POSTED. +A value of zero indicates that once a overflow buffer is allocated it +will be cached and used as needed. 
+A non-zero value can be used with bursty traffic to shrink the number of
+allocated buffers to the maximum count when they are no longer needed.
+.TP
+\f[I]FI_CXI_SAFE_DEVMEM_COPY_THRESHOLD\f[R]
+Defines the maximum CPU memcpy size for HMEM device memory that is
+accessible by the CPU with load/store operations.
+.TP
+\f[I]FI_CXI_OPTIMIZED_MRS\f[R]
+Enables optimized memory regions.
+See section \f[I]CXI Domain Control Extensions\f[R] on how to
+enable/disable optimized MRs at the domain level instead of for the
+global process/job.
+.TP
+\f[I]FI_CXI_MR_MATCH_EVENTS\f[R]
+Enabling MR match events in a client/server environment can be used to
+ensure that memory backing a memory region cannot be remotely accessed
+after the MR has been closed, even if that memory remains mapped in
+the libfabric MR cache.
+Manual progress must be made at the target to process the MR match event
+accounting and avoid event queue overflow.
+There is a slight additional cost in the creation and tear-down of MRs.
+This option is disabled by default.
+.RS
+.PP
+See section \f[I]CXI Domain Control Extensions\f[R] on how to enable MR
+match events at the domain level instead of for the global process/job.
+.RE
+.TP
+\f[I]FI_CXI_PROV_KEY_CACHE\f[R]
+Enabled by default, the caching of remote MR provider keys can be
+disabled by setting this variable to 0.
+.RS
+.PP
+See section \f[I]CXI Domain Control Extensions\f[R] on how to disable
+the remote provider key cache at the domain level instead of for the
+global process/job.
+.RE
+.TP
+\f[I]FI_CXI_LLRING_MODE\f[R]
+Set the policy for use of the low-latency command queue ring mechanism.
+This mechanism improves the latency of command processing on an idle
+command queue.
+Valid values are idle, always, and never.
+.TP
+\f[I]FI_CXI_CQ_POLICY\f[R]
+Experimental.
+Set Command Queue write-back policy.
+Valid values are always, high_empty, low_empty, and low.
+\[lq]always\[rq], \[lq]high\[rq], and \[lq]low\[rq] refer to the
+frequency of write-backs.
+\[lq]empty\[rq] refers to whether a write-back is performed when the
+queue becomes empty.
+.TP
+\f[I]FI_CXI_DEFAULT_VNI\f[R]
+Default VNI value used only for service IDs where the VNI is not
+restricted.
+.TP
+\f[I]FI_CXI_EQ_ACK_BATCH_SIZE\f[R]
+Number of EQ events to process before writing an acknowledgement to HW.
+Batching ACKs amortizes the cost of event acknowledgement over multiple
+network operations.
+.TP
+\f[I]FI_CXI_RX_MATCH_MODE\f[R]
+Specify the receive message matching mode to be utilized.
+\f[I]FI_CXI_RX_MATCH_MODE=\f[R]hardware | software | hybrid
+.RS
+.PP
+\f[I]hardware\f[R] - Message matching is fully offloaded; if resources
+become exhausted, flow control will be performed and existing unexpected
+message headers will be onloaded to free resources.
+.PP
+\f[I]software\f[R] - Message matching is fully onloaded.
+.PP
+\f[I]hybrid\f[R] - Message matching begins fully offloaded; if resources
+become exhausted, hardware will transition message matching to a hybrid
+of hardware and software matching.
+.PP
+For both \f[I]\[lq]hybrid\[rq]\f[R] and \f[I]\[lq]software\[rq]\f[R]
+modes, care should be taken to minimize the threshold for rendezvous
+processing (i.e.\ \f[I]FI_CXI_RDZV_THRESHOLD\f[R] +
+\f[I]FI_CXI_RDZV_GET_MIN\f[R]).
+When running in software endpoint mode, the environment variables
+\f[I]FI_CXI_REQ_BUF_SIZE\f[R] and \f[I]FI_CXI_REQ_BUF_MIN_POSTED\f[R]
+are used to control the size and number of the eager request buffers
+posted to handle incoming unmatched messages.
+.RE
+.TP
+\f[I]FI_CXI_HYBRID_PREEMPTIVE\f[R]
+When in hybrid mode, this variable can be used to enable preemptive
+transitions to software matching.
+This is useful at scale for poorly written applications with a large
+number of unexpected messages where reserved resources may be
+insufficient to prevent starvation of software request list match
+entries.
+Default is 0, disabled.
+.TP
+\f[I]FI_CXI_HYBRID_RECV_PREEMPTIVE\f[R]
+When in hybrid mode, this variable can be used to enable preemptive
+transitions to software matching.
+This is useful at scale for poorly written applications with a large
+number of unmatched posted receives where reserved resources may be
+insufficient to prevent starvation of software request list match
+entries.
+Default is 0, disabled.
+.TP
+\f[I]FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE\f[R]
+When in hybrid mode, this variable can be used to enable preemptive
+transitions to software matching when the number of posted receives
+exceeds the user requested RX size attribute.
+This is useful for applications that may not know the exact number of
+posted receives and are experiencing application termination due to
+event queue overflow.
+Default is 0, disabled.
+.TP
+\f[I]FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE\f[R]
+When in hybrid mode, this variable can be used to enable preemptive
+transitions to software matching when the number of hardware unexpected
+messages exceeds the user requested RX size attribute.
+This is useful for applications that may not know the exact number of
+posted receives and are experiencing application termination due to
+event queue overflow.
+Default is 0, disabled.
+.TP
+\f[I]FI_CXI_REQ_BUF_SIZE\f[R]
+Size of request buffers.
+Increasing the request buffer size allows for more unmatched messages to
+be sent into a single request buffer.
+The default size is 2MB.
+.TP
+\f[I]FI_CXI_REQ_BUF_MIN_POSTED\f[R]
+The minimum number of request buffers that should be posted.
+The default minimum posted count is 4.
+The number of buffers will grow unbounded to support outstanding
+unexpected messages.
+Care should be taken to size appropriately based on job scale and the
+size of eager data to reduce the need for flow control.
+.TP
+\f[I]FI_CXI_REQ_BUF_MAX_CACHED/FI_CXI_REQ_BUF_MAX_COUNT\f[R]
+The maximum number of request buffers that will be cached.
+The default maximum count is 0.
+A value of zero indicates that once a request buffer is allocated it
+will be cached and used as needed.
+A non-zero value can be used with bursty traffic to shrink the number of
+allocated buffers to a maximum count when they are no longer needed.
+.TP
+\f[I]FI_CXI_MSG_LOSSLESS\f[R]
+Enable or disable lossless receive matching.
+If hardware resources are exhausted, hardware will pause the associated
+traffic class until an overflow buffer (hardware match mode) or request
+buffer (software match mode or hybrid match mode) is posted.
+This is considered experimental and defaults to disabled.
+.TP
+\f[I]FI_CXI_FC_RETRY_USEC_DELAY\f[R]
+Number of microseconds to sleep before retrying a dropped side-band
+flow control message.
+Setting this to zero will disable any sleep.
+.TP
+\f[I]FI_UNIVERSE_SIZE\f[R]
+Defines the maximum number of processes that will be used by the
+distributed OFI application.
+Note that this value is used in setting the default control EQ size, see
+FI_CXI_CTRL_RX_EQ_MAX_SIZE.
+.TP
+\f[I]FI_CXI_CTRL_RX_EQ_MAX_SIZE\f[R]
+Max size of the receive event queue used for side-band/control messages.
+Default receive event queue size is based on FI_UNIVERSE_SIZE. +Increasing the receive event queue size can help prevent +side-band/control messages from being dropped and retried but at the +cost of additional memory usage. +Size is always aligned up to a 4KiB boundary. +.TP +\f[I]FI_CXI_DEFAULT_CQ_SIZE\f[R] +Change the provider default completion queue size expressed in entries. +This may be useful for applications which rely on middleware, and +middleware defaults the completion queue size to the provider default. +.TP +\f[I]FI_CXI_DISABLE_EQ_HUGETLB/FI_CXI_DISABLE_CQ_HUGETLB\f[R] +By default, the provider will attempt to allocate 2 MiB hugetlb pages +for provider event queues. +Disabling hugetlb support will cause the provider to fallback to memory +allocators using host page sizes. +FI_CXI_DISABLE_EQ_HUGETLB replaces FI_CXI_DISABLE_CQ_HUGETLB, however +use of either is still supported. +.TP +\f[I]FI_CXI_DEFAULT_TX_SIZE\f[R] +Set the default tx_attr.size field to be used by the provider if the +size is not specified in the user provided fi_info hints. +.TP +\f[I]FI_CXI_DEFAULT_RX_SIZE\f[R] +Set the default rx_attr.size field to be used by the provider if the +size is not specified in the user provided fi_info hints. +.TP +\f[I]FI_CXI_SW_RX_TX_INIT_MAX\f[R] +Debug control to override the number of TX operations that can be +outstanding that are initiated by software RX processing. +It has no impact on hardware initiated RX rendezvous gets. +.TP +\f[I]FI_CXI_DEVICE_NAME\f[R] +Restrict CXI provider to specific CXI devices. +Format is a comma separated list of CXI devices (e.g.\ cxi0,cxi1). +.TP +\f[I]FI_CXI_TELEMETRY\f[R] +Perform a telemetry delta between fi_domain open and close. +Format is a comma separated list of telemetry files as defined in +/sys/class/cxi/cxi*/device/telemetry/. +The ALL-in-binary file in this directory is invalid. +Note that these are per CXI interface counters and not per CXI process +per interface counters. +.TP +\f[I]FI_CXI_TELEMETRY_RGID\f[R] +Resource group ID (RGID) to restrict the telemetry collection to. +Value less than 0 is no restrictions. +.TP +\f[I]FI_CXI_CQ_FILL_PERCENT\f[R] +Fill percent of underlying hardware event queue used to determine when +completion queue is saturated. +A saturated completion queue results in the provider returning +-FI_EAGAIN for data transfer and other related libfabric operations. +.TP +\f[I]FI_CXI_COMPAT\f[R] +Temporary compatibility to allow use of pre-upstream values for +FI_ADDR_CXI and FI_PROTO_CXI. +Compatibility can be disabled to verify operation with upstream constant +values and to enable access to conflicting provider values. +The default setting of 1 specifies both old and new constants are +supported. +A setting of 0 disables support for old constants and can be used to +test that an application is compatible with the upstream values. +A setting of 2 is a safety fallback that if used the provider will only +export fi_info with old constants and will be incompatible with +libfabric clients that been recompiled. +.TP +\f[I]FI_CXI_COLL_FABRIC_MGR_URL\f[R] +\f[B]accelerated collectives:\f[R] Specify the HTTPS address of the +fabric manager REST API used to create specialized multicast trees for +accelerated collectives. +This parameter is \f[B]REQUIRED\f[R] for accelerated collectives, and is +a fixed, system-dependent value. +.TP +\f[I]FI_CXI_COLL_TIMEOUT_USEC\f[R] +\f[B]accelerated collectives:\f[R] Specify the reduction engine timeout. 
+Upon expiration, reduction engines in hardware will deliver any partial +results and expire. Any remaining results will arrive individually, without +hardware reduction, unless the retry period (below) expires and re-arms +the reduction. +The relative performance benefit of acceleration declines with +increasing compute cycle time, dropping below one percent at 32 msec +(32000). +Using acceleration with compute cycles larger than 32 msec is not +recommended except for experimental purposes. +.TP +\f[I]FI_CXI_COLL_RETRY_USEC\f[R] +\f[B]accelerated collectives:\f[R] Specify the reduction engine retry +period. Upon expiration, incomplete reductions will be automatically +restarted, forcing partial results from leaf endpoints to be sent again. This +allows dropped packets to be recovered, and prevents potential incast at the +root if many nodes submit late results (unexpectedly long compute cycles). +The relative performance benefit of acceleration declines with +increasing compute cycle time, dropping below one percent at 32 msec +(32000). +Using acceleration with compute cycles larger than 32 msec is not +recommended except for experimental purposes. +.TP +\f[I]FI_CXI_COLL_USE_DMA_PUT\f[R] +\f[B]accelerated collectives:\f[R] Use DMA for collective packet put. +This uses DMA to inject reduction packets rather than IDC, and is +considered experimental. +Default is false. +.TP +\f[I]FI_CXI_DISABLE_HMEM_DEV_REGISTER\f[R] +Disable registering HMEM device buffer for load/store access. +Some HMEM devices (e.g.\ AMD, Nvidia, and Intel GPUs) support backing +the device memory by the PCIe BAR. +This enables software to perform load/stores to the device memory via +the BAR instead of using device DMA engines. +Direct load/store access may improve performance. +.TP +\f[I]FI_CXI_FORCE_ZE_HMEM_SUPPORT\f[R] +Force the enablement of ZE HMEM support. +By default, the CXI provider will only support ZE memory registration if +implicit scaling is disabled (i.e.\ the environment variables +EnableImplicitScaling=0 NEOReadDebugKeys=1 are set). +Set FI_CXI_FORCE_ZE_HMEM_SUPPORT to 1 will cause the CXI provider to +skip the implicit scaling checks. +GPU direct RDMA may or may not work in this case. +.TP +\f[I]FI_CXI_ENABLE_TRIG_OP_LIMIT\f[R] +Enable enforcement of triggered operation limit. +Doing this can prevent fi_control(FI_QUEUE_WORK) deadlocking at the cost +of performance. +.PP +Note: Use the fi_info utility to query provider environment variables: +fi_info -p cxi -e +.SH CXI EXTENSIONS +.PP +The CXI provider supports various fabric-specific extensions. +Extensions are accessed using the fi_open_ops function. +.SS CXI Domain Control Extensions +.PP +The \f[B]fi_control\f[R]() function is extended for domain FIDs to query +and override global environment settings for a specific domain. +This is useful for example where the application process also includes a +client API that has different optimizations and protections. +.PP +Command \f[I]FI_OPT_CXI_GET_OPTIMIZED\f[R] where the argument is a +pointer to a bool. +The call returns the setting for optimized MR usage for the domain. +The default is determined by the environment setting of +\f[I]FI_CXI_OPTIMIZED_MRS\f[R]. +.PP +Command \f[I]FI_OPT_CXI_SET_OPTIMIZED\f[R] where the argument is a +pointer to a bool initialized to true or false. +The call enables or disables the use of optimized MRs for the domain. +If the domain is not configured for FI_MR_PROV_KEY MR mode, the call +will fail with -FI_EINVAL, it is not supported for client generated +keys. 
+It must be called prior to MR being created. +.PP +Command \f[I]FI_OPT_CXI_GET_MR_MATCH_EVENTS\f[R] where the argument is a +pointer to a bool. +The call returns the setting for MR Match Event accounting for the +domain. +The default is determined by the environment setting of +\f[I]FI_CXI_MR_MATCH_EVENTS\f[R]. +.PP +Command \f[I]FI_OPT_CXI_SET_MR_MATCH_EVENTS\f[R] where the argument is a +pointer to a bool initialized to true or false. +This call enables or disables the use of MR Match Event counting. +This ensures that memory backing a MR cannot be accessed after invoking +fi_close() on the MR, even if that memory remains in the libfabric MR +cache. +Manual progress must be made to process events at the RMA destination. +It can only be changed prior to any EP or MR being created. +.PP +Command \f[I]FI_OPT_CXI_GET_PROV_KEY_CACHE\f[R] where the argument is a +pointer to a bool. +The call returns the setting for enabling use of the remote MR cache for +provider keys for the domain. +The default is determined by the environment setting of +\f[I]FI_CXI_PROV_KEY_CACHE\f[R] and is only valid if FI_MR_PROV_KEY MR +mode is used. +.PP +Command \f[I]FI_OPT_CXI_SET_PROV_KEY_CACHE\f[R] where the argument is a +pointer to a bool initialized to true or false. +This call enables or disables the use of the remote MR cache for +provider keys for the domain. +By default the cache is enabled and can be used for provider keys that +do not require events. +The command will fail with -FI_EINVAL if FI_MR_PROV_KEY MR mode is not +in use. +It can only be changed prior to any MR being created. +.SS CXI Domain Extensions +.PP +CXI domain extensions have been named \f[I]FI_CXI_DOM_OPS_6\f[R]. +The flags parameter is ignored. +The fi_open_ops function takes a \f[C]struct fi_cxi_dom_ops\f[R]. +See an example of usage below: +.IP +.nf +\f[C] +struct fi_cxi_dom_ops *dom_ops; + +ret = fi_open_ops(&domain->fid, FI_CXI_DOM_OPS_4, 0, (void **)&dom_ops, NULL); +\f[R] +.fi +.PP +The following domain extensions are defined: +.IP +.nf +\f[C] +struct fi_cxi_dom_ops { + int (*cntr_read)(struct fid *fid, unsigned int cntr, uint64_t *value, + struct timespec *ts); + int (*topology)(struct fid *fid, unsigned int *group_id, + unsigned int *switch_id, unsigned int *port_id); + int (*enable_hybrid_mr_desc)(struct fid *fid, bool enable); + size_t (*ep_get_unexp_msgs)(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, + size_t count, fi_addr_t *src_addr, + size_t *ux_count); + int (*get_dwq_depth)(struct fid *fid, size_t *depth); +}; +\f[R] +.fi +.PP +\f[I]cntr_read\f[R] extension is used to read hardware counter values. +Valid values of the cntr argument are found in the Cassini-specific +header file cassini_cntr_defs.h. +Note that Counter accesses by applications may be rate-limited to 1HZ. +.PP +\f[I]topology\f[R] extension is used to return CXI NIC address topology +information for the domain. +Currently only a dragonfly fabric topology is reported. +.PP +The enablement of hybrid MR descriptor mode allows for libfabric users +to optionally pass in a valid MR desc for local communications +operations. +.PP +The get unexpected message function is used to obtain a list of +unexpected messages associated with an endpoint. 
+The list is returned as an array of CQ tagged entries set in the +following manner: +.IP +.nf +\f[C] +struct fi_cq_tagged_entry { + .op_context = NULL, + .flags = any of [FI_TAGGED | FI_MSG | FI_REMOTE_CQ_DATA], + .len = message length, + .buf = NULL, + .data = CQ data if FI_REMOTE_CQ_DATA set + .tag = tag if FI_TAGGED set +}; +\f[R] +.fi +.PP +If the src_addr or entry array is NULL, only the ux_count of available +unexpected list entries will be returned. +The parameter count specifies the size of the array provided, if it is 0 +then only the ux_count will be returned. +The function returns the number of entries written to the array or a +negative errno. +On successful return, ux_count will always be set to the total number of +unexpected messages available. +.PP +\f[I]enable_hybrid_mr_desc\f[R] is used to enable hybrid MR descriptor +mode. +Hybrid MR desc allows for libfabric users to optionally pass in a valid +MR desc for local communication operations. +This is currently only used for RMA and AMO transfers. +.PP +\f[I]get_dwq_depth\f[R] is used to get the depth of the deferred work +queue. +The depth is the number of triggered operation commands which can be +queued to hardware. +The depth is not per fi_domain but rather per service ID. +Since a single service ID is intended to be shared between all +processing using the same NIC in a job step, the triggered operations +are shared across processes. +.PP +\f[I]enable_mr_match_events\f[R] and \f[I]enable_optimized_mrs\f[R] have +been deprecated in favor of using the fi_control() API. +While the can be still be called via the domain ops, They will be +removed from the domain opts prior to software release 2.2. +.SS CXI Counter Extensions +.PP +CXI counter extensions have been named \f[I]FI_CXI_COUNTER_OPS\f[R]. +The flags parameter is ignored. +The fi_open_ops function takes a \f[C]struct fi_cxi_cntr_ops\f[R]. +See an example of usage below. +.IP +.nf +\f[C] +struct fi_cxi_cntr_ops *cntr_ops; + +ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, (void **)&cntr_ops, NULL); +\f[R] +.fi +.PP +The following domain extensions are defined: +.IP +.nf +\f[C] +struct fi_cxi_cntr_ops { + /* Set the counter writeback address to a client provided address. */ + int (*set_wb_buffer)(struct fid *fid, const void *buf, size_t len); + + /* Get the counter MMIO region. */ + int (*get_mmio_addr)(struct fid *fid, void **addr, size_t *len); +}; +\f[R] +.fi +.SS CXI Counter Writeback Flag +.PP +If a client is using the CXI counter extensions to define a counter +writeback buffer, the CXI provider will not update the writeback buffer +success or failure values for each hardware counter success or failure +update. +This can especially create issues when clients expect the completion of +a deferred workqueue operation to generate a counter writeback. +To support this, the flag \f[I]FI_CXI_CNTR_WB\f[R] can be used in +conjunction with a deferred workqueue operation to force a writeback at +the completion of the deferred workqueue operation. +See an example of usage below. +.IP +.nf +\f[C] +struct fi_op_rma rma = { + /* Signal to the provider the completion of the RMA should trigger a + * writeback. + */ + .flags = FI_CXI_CNTR_WB, +}; + +struct fi_deferred_work rma_work = { + .op_type = FI_OP_READ, + .triggering_counter = cntr, + .completion_cntr = cntr, + .threshold = 1, + .op.rma = &rma, +}; + +ret = fi_control(&domain->fid, FI_QUEUE_WORK, &rma_work); +\f[R] +.fi +.PP +\f[B]Note:\f[R] Using \f[I]FI_CXI_CNTR_WB\f[R] will lead to additional +hardware usage. 
+To conserve hardware resources, it is recommended to only use the +\f[I]FI_CXI_CNTR_WB\f[R] when a counter writeback is absolutely +required. +.SS CXI Alias EP Overrides +.PP +A transmit alias endpoint can be created and configured to utilize a +different traffic class than the original endpoint. +This provides a lightweight mechanism to utilize multiple traffic +classes within a process. +Message order between the original endpoint and the alias endpoint is +not defined/guaranteed. +See example usage below for setting the traffic class of a transmit +alias endpoint. +.IP +.nf +\f[C] +#include +#include +#include // Ultimately fi_ext.h + +struct fid_ep *ep; +\&. . . + +struct fid_ep *alias_ep = NULL; +uint32_t tclass = FI_TC_LOW_LATENCY; +uint64_t op_flags = FI_TRANSMIT | desired data operation flags; + +ret = fi_ep_alias(ep, &alias_ep, op_flags); +if (ret) + error; + +ret = fi_set_val(&alias_ep->fid, FI_OPT_CXI_SET_TCLASS, (void *)&tlcass); +if (ret) + error; +\f[R] +.fi +.PP +In addition, the alias endpoint message order may be modified to +override the default endpoint message order. +Message order between the modified alias endpoint and the original +endpoint is not guaranteed. +See example usage below for setting the traffic class of a transmit +alias endpoint. +.IP +.nf +\f[C] +uint64_t msg_order = FI_ORDER_RMA_WAW; + +ret = fi_set_val(&alias_ep->fid, FI_OPT_CXI_SET_MSG_ORDER, + (void *)&msg_order); +if (ret) + error; +\f[R] +.fi +.PP +When an endpoint does not support FI_FENCE (e.g.\ optimized MR), a +provider specific transmit flag, FI_CXI_WEAK_FENCE, may be specified on +an alias EP to issue a FENCE operation to create a data ordering point +for the alias. +This is supported for one-sided operations only. +.PP +Alias EP must be closed prior to closing the original EP. +.SS PCIe Atomics +.PP +The CXI provider has the ability to issue a given libfabric atomic +memory operation as a PCIe operation as compared to a NIC operation. +The CXI provider extension flag FI_CXI_PCIE_AMO is used to signify this. +.PP +Since not all libfabric atomic memory operations can be executed as a +PCIe atomic memory operation, \f[C]fi_query_atomic()\f[R] could be used +to query if a given libfabric atomic memory operation could be executed +as PCIe atomic memory operation. +.PP +The following is a query to see if a given libfabric operation can be a +PCIe atomic operation. +.IP +.nf +\f[C] +int ret; +struct fi_atomic_attr out_attrs; + +/* Query if non-fetching PCIe atomic is supported. */ +ret = fi_query_atomic(domain, FI_UINT32, FI_SUM, &out_attrs, FI_CXI_PCIE_AMO); + +/* Query if fetching PCIe atomic is supported. */ +ret = fi_query_atomic(domain, FI_UINT32, FI_SUM, &out_attrs, + FI_FETCH_ATOMIC | FI_CXI_PCIE_AMO); +\f[R] +.fi +.PP +The following is how to issue a PCIe atomic operation. +.IP +.nf +\f[C] +ssize_t ret; +struct fi_msg_atomic msg; +struct fi_ioc resultv; +void *result_desc; +size_t result_count; + +ret = fi_fetch_atomicmsg(ep, &msg, &resultv, &result_desc, result_count, + FI_CXI_PCIE_AMO); +\f[R] +.fi +.PP +\f[B]Note:\f[R] The CXI provider only supports PCIe fetch add for +UINT32_T, INT32_t, UINT64_T, and INT64_t. +This support requires enablement of PCIe fetch add in the CXI driver, +and it comes at the cost of losing NIC atomic support for another +libfabric atomic operation. +.PP +\f[B]Note:\f[R] Ordering between PCIe atomic operations and NIC +atomic/RMA operations is undefined. 
+.PP +To enable PCIe fetch add for libfabric, the following CXI driver kernel +module parameter must be set to non-zero. +.IP +.nf +\f[C] +/sys/module/cxi_core/parameters/amo_remap_to_pcie_fadd +\f[R] +.fi +.PP +The following are the possible values for this kernel module and the +impact of each value: - -1: Disable PCIe fetch add support. +FI_CXI_PCIE_AMO is not supported. +- 0: Enable PCIe fetch add support. +FI_MIN is not supported. +- 1: Enable PCIe fetch add support. +FI_MAX is not supported. +- 2: Enable PCIe fetch add support. +FI_SUM is not supported. +- 4: Enable PCIe fetch add support. +FI_LOR is not supported. +- 5: Enable PCIe fetch add support. +FI_LAND is not supported. +- 6: Enable PCIe fetch add support. +FI_BOR is not supported. +- 7: Enable PCIe fetch add support. +FI_BAND is not supported. +- 8: Enable PCIe fetch add support. +FI_LXOR is not supported. +- 9: Enable PCIe fetch add support. +FI_BXOR is not supported. +- 10: Enable PCIe fetch add support. +No loss of default CXI provider AMO functionality. +.PP +Guidance is to default amo_remap_to_pcie_fadd to 10. +.SH FABTESTS +.PP +The CXI provider does not currently support fabtests which depend on IP +addressing. +.PP +fabtest RDM benchmarks are supported, like: +.IP +.nf +\f[C] +# Start server by specifying source PID and interface +\&./fabtests/benchmarks/fi_rdm_tagged_pingpong -B 10 -s cxi0 + +# Read server NIC address +CXI0_ADDR=$(cat /sys/class/cxi/cxi0/device/properties/nic_addr) + +# Start client by specifying server PID and NIC address +\&./fabtests/benchmarks/fi_rdm_tagged_pingpong -P 10 $CXI0_ADDR + +# The client may be bound to a specific interface, like: +\&./fabtests/benchmarks/fi_rdm_tagged_pingpong -B 10 -s cxi1 -P 10 $CXI0_ADDR +\f[R] +.fi +.PP +Some functional fabtests are supported (including fi_bw). +Others use IP sockets and are not yet supported. +.PP +multinode fabtests are not yet supported. +.PP +ubertest is supported for test configs matching the provider\[cq]s +current capabilities. +.PP +unit tests are supported where the test feature set matches the CXI +provider\[cq]s current capabilities. +.SH ERRATA +.IP \[bu] 2 +Fetch and compare type AMOs with FI_DELIVERY_COMPLETE or +FI_MATCH_COMPLETE completion semantics are not supported with +FI_RMA_EVENT. +.SH Libfabric CXI Provider User Programming and Troubleshooting Guide +.PP +The scope of the following subsection is to provide guidance and/or +troubleshooting tips for users of the libfabric CXI provider. +The scope of this section is not a full guide for user libfabric. +.SS Sizing Libfabric Objects Based on Expected Usage +.PP +The CXI provider uses various libfabric object attribute size and/or +libfabric enviroment variables to size hardware related resources +accordingly. +Failure to size resources properly can result in the CXI provider +frequently returning -FI_EAGAIN which may negatively impact performance. +The following subsection outline important sizing related attributes and +environment variables. +.SS Completion Queue Size Attribute +.PP +The CXI provider uses completion queue attribute size to size various +software and hardware event queues used to generate libfabric completion +events. +While the size of the software queues may grow, hardware event queue +sizes are static. +Failing to size hardware queues properly may result in CXI provider +returning -FI_EAGAIN frequently for data transfer operations. +When this error is returned, user should progress the corresponding +endpoint completion queues by calling fi_cq_read(). 
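+.PP
+The following is a minimal sketch of sizing a completion queue for
+expected usage and progressing it when a transfer returns -FI_EAGAIN.
+It assumes \f[C]domain\f[R], \f[C]ep\f[R], \f[C]buf\f[R],
+\f[C]len\f[R], and \f[C]dest_addr\f[R] already exist, and the size value
+is only a placeholder; CQ/EP binding and full error handling are
+omitted.
+.IP
+.nf
+\f[C]
+struct fi_cq_attr cq_attr = {
+    /* Size for the expected number of inflight operations. */
+    .size = 16384,
+    .format = FI_CQ_FORMAT_TAGGED,
+};
+struct fid_cq *cq;
+struct fi_cq_tagged_entry comp;
+ssize_t ret;
+
+ret = fi_cq_open(domain, &cq_attr, &cq, NULL);
+if (ret)
+    error;
+
+/* On -FI_EAGAIN, progress the CQ and retry the operation. */
+do {
+    ret = fi_send(ep, buf, len, NULL, dest_addr, NULL);
+    if (ret == -FI_EAGAIN)
+        (void)fi_cq_read(cq, &comp, 1);
+} while (ret == -FI_EAGAIN);
+\f[R]
+.fi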
+.PP
+Users are encouraged to set the completion queue size attribute based on
+the expected number of inflight RDMA operations to and from a single
+endpoint.
+For users who rely on the provider default value (e.g.\ MPI),
+the FI_CXI_DEFAULT_CQ_SIZE environment variable can be used to override
+the provider default value.
+.SS Endpoint Receive Size Attribute
+.PP
+The CXI provider uses the endpoint receive size attribute to size
+internal command and hardware event queues.
+Failing to size these queues correctly can result in the CXI
+provider returning -FI_EAGAIN frequently for data transfer operations.
+When this error is returned, the user should progress the corresponding
+endpoint completion queues by calling fi_cq_read().
+.PP
+Users are encouraged to set the endpoint receive size attribute based on
+the expected number of inflight untagged and tagged RDMA operations.
+For users who rely on the provider default value (e.g.\ MPI),
+the FI_CXI_DEFAULT_RX_SIZE environment variable can be used to override
+the provider default value.
+.SS Endpoint Transmit Size Attribute
+.PP
+The CXI provider uses the endpoint transmit size attribute to size
+internal command and hardware event queues.
+Failing to size these queues correctly can result in the CXI
+provider returning -FI_EAGAIN frequently for data transfer operations.
+When this error is returned, the user should progress the corresponding
+endpoint completion queues by calling fi_cq_read().
+.PP
+At a minimum, users are encouraged to set the endpoint transmit size
+attribute based on the expected number of inflight, initiator RDMA
+operations.
+If users are going to be issuing message operations over the CXI
+provider rendezvous limit (FI_CXI_RDZV_THRESHOLD), the transmit size
+attribute must also include the number of outstanding, unexpected
+rendezvous operations (i.e.\ inflight, initiator RDMA operations +
+outstanding, unexpected rendezvous operations).
+.PP
+For users who rely on the provider default value (e.g.\ MPI),
+the FI_CXI_DEFAULT_TX_SIZE environment variable can be used to override
+the provider default value.
+.SS FI_UNIVERSE_SIZE Environment Variable
+.PP
+The libfabric FI_UNIVERSE_SIZE environment variable defines the number
+of expected ranks/peers an application needs to communicate with.
+The CXI provider may use this environment variable to size resources
+tied to the number of peers.
+Users are encouraged to set this environment variable accordingly.
+.SS Selecting Proper Receive Match Mode
+.PP
+As mentioned in the \f[I]Runtime Parameters\f[R] section, the CXI
+provider supports three different operational modes: hardware, hybrid,
+and software.
+.PP
+Hardware match mode is appropriate for users who can ensure the sum of
+unexpected messages and posted receives does not exceed the configured
+hardware receive resource limit for the application.
+When resources are consumed, the endpoint will transition into a flow
+control operational mode which requires side-band messaging to recover
+from.
+Recovery will involve the CXI provider trying to reclaim hardware
+receive resources to help prevent future transitions into flow control.
+If the CXI provider is unable to reclaim hardware receive resources,
+this can lead to a cycle of entering and exiting flow control which may
+present itself as a hang to the libfabric user.
+Running with FI_LOG_LEVEL=warn and FI_LOG_PROV=cxi will report if this
+flow control transition is happening.
+.PP
+Hybrid match mode is appropriate for users who are unsure whether the
+sum of unexpected messages and posted receives will exceed the
+configured hardware receive resource limit for the application, but who
+want to ensure the application still functions if hardware receive
+resources are consumed.
+Hybrid match mode extends hardware match mode by allowing for an
+automated transition into software match mode if resources are consumed.
+.PP
+Software match mode is appropriate for users who know the sum of
+unexpected messages and posted receives will exceed the configured
+hardware receive resource limit for the application.
+In software match mode, the CXI provider maintains software unexpected
+message and posted receive lists rather than offloading them to
+hardware.
+This avoids having to allocate a hardware receive resource for each
+unexpected message and posted receive.
+.PP
+\f[I]Note\f[R]: In practice, dependent processes (e.g.\ a parallel job)
+will most likely share a hardware receive resource pool.
+.PP
+\f[I]Note\f[R]: Each match mode may still enter flow control.
+For example, if a user is not draining the libfabric completion queue at
+a reasonable rate, the corresponding hardware events may fill up, which
+will trigger flow control.
+.SS Using Hybrid Match Mode Preemptive Options
+.PP
+The high-level objective of the hybrid match mode preemptive environment
+variables (i.e.
+FI_CXI_HYBRID_PREEMPTIVE, FI_CXI_HYBRID_RECV_PREEMPTIVE,
+FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE, and
+FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE) is to ensure that a process
+requiring more hardware receive resources does not force other processes
+requiring fewer hardware receive resources into software match mode
+because no hardware receive resources remain available.
+.PP
+For example, consider a parallel application which has multiple
+processes (i.e.\ ranks) per NIC all sharing the same hardware receive
+resource pool.
+Suppose that the application communication pattern results in an
+all-to-one communication to only a single rank (e.g.
+rank 0) while other ranks may be communicating amongst each other.
+If the width of the all-to-one communication exhausts the hardware
+receive resources, all ranks on the target NIC will transition to
+software match mode.
+The preemptive options may help ensure that only rank 0 transitions
+to software match mode instead of all the ranks on the target NIC.
+.PP
+The FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE and
+FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE environment variables will force
+the transition to software match mode if the user requested endpoint
+receive size attribute is exceeded.
+The benefit of running with these enabled is that the software match
+mode transition is entirely in the control of the libfabric user through
+the receive size attribute.
+One approach users could take here is to set the receive size attribute
+to the expected usage; if this expected usage is exceeded, only the
+offending endpoints will transition to software match mode.
+.PP
+The FI_CXI_HYBRID_PREEMPTIVE and FI_CXI_HYBRID_RECV_PREEMPTIVE
+environment variables will force the transition to software match mode
+if hardware receive resources in the pool are running low.
+The CXI provider uses a multi-step process to transition the
+libfabric endpoint to software match mode.
+The benefit of running with these enabled is that the number of
+endpoints transitioning to software match mode may be smaller when
+compared to a forced software match mode transition due to zero hardware
+resources being available.
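+.PP
+Because the posted receive and unexpected message preemptive options key
+off the endpoint receive size attribute, one way to keep those
+transitions predictable is to communicate expected usage through the
+endpoint size attributes.
+The following is a minimal sketch of requesting the provider with
+explicit transmit and receive size attributes; the numeric values are
+placeholders, not recommendations.
+.IP
+.nf
+\f[C]
+struct fi_info *hints, *info;
+int ret;
+
+hints = fi_allocinfo();
+if (!hints)
+    error;
+
+hints->fabric_attr->prov_name = strdup("cxi");
+hints->ep_attr->type = FI_EP_RDM;
+
+/* Placeholder sizing based on expected application usage. */
+hints->tx_attr->size = 4096;  /* expected inflight initiator operations */
+hints->rx_attr->size = 8192;  /* expected posted receives + unexpected msgs */
+
+ret = fi_getinfo(fi_version(), NULL, NULL, 0, hints, &info);
+if (ret)
+    error;
+
+fi_freeinfo(hints);
+\f[R]
+.fi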
+.SS Preventing Messaging Flow Control Due to Hardware Event Queue Sizing
+.PP
+As much as possible, CXI provider message flow control should be
+avoided.
+Flow control results in expensive, side-band, CXI provider internal
+messaging to recover from.
+One cause of flow control is improper hardware event queue sizing.
+If the hardware event queue is undersized, resulting in it filling more
+quickly than expected, the next incoming message operation targeting a
+full event queue will result in the message operation being dropped and
+flow control triggered.
+.PP
+The default CXI provider behavior is to size hardware event queues based
+on endpoint transmit and receive size attributes.
+Thus, it is critical for users to set these attributes accordingly.
+.PP
+The CQ size can be used to override the CXI provider calculated hardware
+event queue size based on endpoint transmit and receive size attributes.
+If the CQ size is greater than the CXI provider calculation, the value
+from the CQ size will be used.
+.PP
+The CQ fill percent can be used to define a threshold for when no new
+RDMA operations can be queued until the libfabric CQ is progressed, thus
+draining hardware event queues.
+.SS Interpreting CXI Provider CQ Error Event Errno
+.PP
+The following are the libfabric errno values which may be returned in an
+RDMA CQ error event.
+.PP
+FI_ETRUNC: Receive message truncation.
+.PP
+FI_EHOSTUNREACH: Target is unreachable.
+This is due to connectivity issues, such as downed links, between the
+two peers.
+.PP
+FI_ENOTCONN: Cannot communicate due to no libfabric endpoint being
+configured.
+In this case, the target NIC is reachable.
+.PP
+FI_EIO: Catch-all errno.
+.SH SEE ALSO
+.PP
+\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7)
+.SH AUTHORS
+OpenFabrics.
diff --git a/prov/cxi/.gitignore b/prov/cxi/.gitignore
new file mode 100644
index 00000000000..689e74dcd1d
--- /dev/null
+++ b/prov/cxi/.gitignore
@@ -0,0 +1,2 @@
+test/cxitest
+test/curltest
diff --git a/prov/cxi/Makefile.include b/prov/cxi/Makefile.include
new file mode 100644
index 00000000000..b2619dd2ec6
--- /dev/null
+++ b/prov/cxi/Makefile.include
@@ -0,0 +1,211 @@
+# SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
+#
+# Copyright 2018,2020-2023 Hewlett Packard Enterprise Development LP
+
+if HAVE_CXI
+
+AM_CPPFLAGS += \
+ -I$(top_srcdir)/prov/cxi/include
+
+_cxi_files = \
+ prov/cxi/src/cxip_if.c \
+ prov/cxi/src/cxip_fabric.c \
+ prov/cxi/src/cxip_repsum.c \
+ prov/cxi/src/cxip_coll.c \
+ prov/cxi/src/cxip_zbcoll.c \
+ prov/cxi/src/cxip_curl.c \
+ prov/cxi/src/cxip_dom.c \
+ prov/cxi/src/cxip_ep.c \
+ prov/cxi/src/cxip_txc.c \
+ prov/cxi/src/cxip_rxc.c \
+ prov/cxi/src/cxip_av.c \
+ prov/cxi/src/cxip_avset.c \
+ prov/cxi/src/cxip_eq.c \
+ prov/cxi/src/cxip_cq.c \
+ prov/cxi/src/cxip_cntr.c \
+ prov/cxi/src/cxip_rma.c \
+ prov/cxi/src/cxip_mr.c \
+ prov/cxi/src/cxip_msg.c \
+ prov/cxi/src/cxip_atomic.c \
+ prov/cxi/src/cxip_iomm.c \
+ prov/cxi/src/cxip_faults.c \
+ prov/cxi/src/cxip_info.c \
+ prov/cxi/src/cxip_ctrl.c \
+ prov/cxi/src/cxip_req_buf.c \
+ prov/cxi/src/cxip_rdzv_pte.c \
+ prov/cxi/src/cxip_trace.c \
+ prov/cxi/src/cxip_telemetry.c \
+ prov/cxi/src/cxip_ptelist_buf.c \
+ prov/cxi/src/cxip_evtq.c \
+ prov/cxi/src/cxip_nic.c \
+ prov/cxi/src/cxip_portals_table.c \
+ prov/cxi/src/cxip_pte.c \
+ prov/cxi/src/cxip_cmdq.c
+
+_cxi_headers = \
+ prov/cxi/include/cxip.h \
+ prov/cxi/include/cxip_faults.h \
+ prov/cxi/include/fi_cxi_ext.h
+
+rdmainclude_HEADERS += \
+ prov/cxi/include/fi_cxi_ext.h
+
+# Stand-alone srun tests for 
hardware testing environment +noinst_PROGRAMS += prov/cxi/test/multinode/test_frmwk +prov_cxi_test_multinode_test_frmwk_SOURCES = \ + prov/cxi/test/multinode/multinode_frmwk.h \ + prov/cxi/test/multinode/multinode_frmwk.c \ + prov/cxi/test/multinode/test_frmwk.c +prov_cxi_test_multinode_test_frmwk_CPPFLAGS = \ + $(AM_CPPFLAGS) $(cxi_CPPFLAGS) $(PTHREAD_FLAGS) +prov_cxi_test_multinode_test_frmwk_LDFLAGS = -static +prov_cxi_test_multinode_test_frmwk_LDADD = $(linkback) $(PTHREAD_LIBS) + +noinst_PROGRAMS += prov/cxi/test/multinode/test_zbcoll +prov_cxi_test_multinode_test_zbcoll_SOURCES = \ + prov/cxi/test/multinode/multinode_frmwk.h \ + prov/cxi/test/multinode/multinode_frmwk.c \ + prov/cxi/test/multinode/test_zbcoll.c +prov_cxi_test_multinode_test_zbcoll_CPPFLAGS = \ + $(AM_CPPFLAGS) $(cxi_CPPFLAGS) $(PTHREAD_FLAGS) +prov_cxi_test_multinode_test_zbcoll_LDFLAGS = -static +prov_cxi_test_multinode_test_zbcoll_LDADD = $(linkback) $(PTHREAD_LIBS) + +noinst_PROGRAMS += prov/cxi/test/multinode/test_coll +prov_cxi_test_multinode_test_coll_SOURCES = \ + prov/cxi/test/multinode/multinode_frmwk.h \ + prov/cxi/test/multinode/multinode_frmwk.c \ + prov/cxi/test/multinode/test_coll.c +prov_cxi_test_multinode_test_coll_CPPFLAGS = \ + $(AM_CPPFLAGS) $(cxi_CPPFLAGS) $(PTHREAD_FLAGS) +prov_cxi_test_multinode_test_coll_LDFLAGS = -static +prov_cxi_test_multinode_test_coll_LDADD = $(linkback) $(PTHREAD_LIBS) + +noinst_PROGRAMS += prov/cxi/test/multinode/test_barrier +prov_cxi_test_multinode_test_barrier_SOURCES = \ + prov/cxi/test/multinode/multinode_frmwk.h \ + prov/cxi/test/multinode/multinode_frmwk.c \ + prov/cxi/test/multinode/test_barrier.c +prov_cxi_test_multinode_test_barrier_CPPFLAGS = \ + $(AM_CPPFLAGS) $(cxi_CPPFLAGS) $(PTHREAD_FLAGS) +prov_cxi_test_multinode_test_barrier_LDFLAGS = -static +prov_cxi_test_multinode_test_barrier_LDADD = $(linkback) $(PTHREAD_LIBS) + +if HAVE_CRITERION + +# curltest is not expected to exist outside devel env +noinst_PROGRAMS += prov/cxi/test/curltest +prov_cxi_test_curltest_SOURCES = \ + prov/cxi/test/curl.c +prov_cxi_test_curltest_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_curltest_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_curltest_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) + +bin_PROGRAMS += prov/cxi/test/cxitest +nodist_prov_cxi_test_cxitest_SOURCES = \ + prov/cxi/test/cxip_test_common.c \ + prov/cxi/test/fabric.c \ + prov/cxi/test/domain.c \ + prov/cxi/test/ep.c \ + prov/cxi/test/eq.c \ + prov/cxi/test/cq.c \ + prov/cxi/test/av.c \ + prov/cxi/test/avset.c \ + prov/cxi/test/rma.c \ + prov/cxi/test/tagged.c \ + prov/cxi/test/msg.c \ + prov/cxi/test/atomic.c \ + prov/cxi/test/cntr.c \ + prov/cxi/test/tagged_stress.c \ + prov/cxi/test/mr.c \ + prov/cxi/test/deferred_work.c \ + prov/cxi/test/coll.c \ + prov/cxi/test/ctrl.c \ + prov/cxi/test/lat.c \ + prov/cxi/test/repsum.c \ + prov/cxi/test/auth_key.c \ + prov/cxi/test/fork.c \ + prov/cxi/test/mem_reg.c \ + prov/cxi/test/nic.c + +prov_cxi_test_cxitest_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_cxitest_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_cxitest_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) + +TESTS += prov/cxi/test/cxitest + +# ZE test suite is its own binary +if HAVE_ZE + +bin_PROGRAMS += prov/cxi/test/cxitestze +nodist_prov_cxi_test_cxitestze_SOURCES = \ + prov/cxi/test/cxip_test_common.c \ + prov/cxi/test/ze.c + +prov_cxi_test_cxitestze_CPPFLAGS = $(AM_CPPFLAGS) 
$(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_cxitestze_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_cxitestze_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) -lze_loader + +TESTS += prov/cxi/test/cxitestze + +endif HAVE_ZE + +# CUDA test suite is its own binary +if HAVE_CUDA + +bin_PROGRAMS += prov/cxi/test/cxitestcuda +nodist_prov_cxi_test_cxitestcuda_SOURCES = \ + prov/cxi/test/cxip_test_common.c \ + prov/cxi/test/cuda.c + +prov_cxi_test_cxitestcuda_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_cxitestcuda_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_cxitestcuda_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) -lcudart -lcuda + +TESTS += prov/cxi/test/cxitestcuda + +endif HAVE_CUDA + +# ROCR test suite is its own binary +if HAVE_ROCR + +bin_PROGRAMS += prov/cxi/test/cxitestrocr +nodist_prov_cxi_test_cxitestrocr_SOURCES = \ + prov/cxi/test/cxip_test_common.c \ + prov/cxi/test/rocr.c + +prov_cxi_test_cxitestrocr_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \ + $(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS) +prov_cxi_test_cxitestrocr_LDFLAGS = $(cxitest_LDFLAGS) -static +prov_cxi_test_cxitestrocr_LDADD = $(cxitest_LIBS) $(linkback) $(PTHREAD_LIBS) -lhsa-runtime64 + +TESTS += prov/cxi/test/cxitestrocr + +endif HAVE_ROCR + +endif HAVE_CRITERION + +if HAVE_CXI_DL +pkglib_LTLIBRARIES += libcxi-fi.la +libcxi_fi_la_SOURCES = $(_cxi_files) $(_cxi_headers) $(common_srcs) +libcxi_fi_la_CPPFLAGS = $(cxi_CPPFLAGS) +libcxi_fi_la_LIBADD = $(linkback) $(cxi_LIBS) +libcxi_fi_la_LDFLAGS = $(cxi_LDFLAGS) \ + -module -avoid-version -shared -export-dynamic +libcxi_fi_la_DEPENDENCIES = $(linkback) +else !HAVE_CXI_DL +src_libfabric_la_SOURCES += $(_cxi_files) $(_cxi_headers) +src_libfabric_la_CPPFLAGS += $(cxi_CPPFLAGS) +src_libfabric_la_LIBADD += $(cxi_LIBS) +src_libfabric_la_LDFLAGS += $(cxi_LDFLAGS) +endif !HAVE_CXI_DL + +prov_install_man_pages += man/man7/fi_cxi.7 + +endif HAVE_CXI + +prov_dist_man_pages += man/man7/fi_cxi.7 diff --git a/prov/cxi/configure.m4 b/prov/cxi/configure.m4 new file mode 100644 index 00000000000..ec50e18f33c --- /dev/null +++ b/prov/cxi/configure.m4 @@ -0,0 +1,153 @@ +dnl SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +dnl +dnl Copyright 2018 Hewlett Packard Enterprise Development LP + +dnl CXI provider specific configuration + +dnl Called to configure this provider +dnl +dnl Arguments: +dnl +dnl $1: action if configured successfully +dnl $2: action if not configured successfully +dnl + +AM_CONDITIONAL([HAVE_PMI], [test "x$have_pmi" = "xtrue"]) +AM_CONDITIONAL([HAVE_ZE], [test "$have_ze" = "1" && test "$with_ze" != ""]) +AM_CONDITIONAL([HAVE_CUDA], [test "$have_cuda" = "1" && test "$with_cuda" != ""]) +AM_CONDITIONAL([HAVE_ROCR], [test "$have_rocr" = "1" && test "$with_rocr" != ""]) + + +AC_DEFUN([FI_CXI_CONFIGURE],[ + + cxi_happy=1 + + # Support non-standard install path for cassini headers. This is needed + # by libcxi. + AC_ARG_WITH([cassini-headers], + [AS_HELP_STRING([--with-cassin-headers=DIR], [Install directory for Cassini headers])], + [CPPFLAGS="-I$with_cassini_headers/include $CPPFLAGS"]) + + # Support non-standard install path for cxi kernel UAPI headers. This is + # needed by libcxi. + AC_ARG_WITH([cxi-uapi-headers], + [AS_HELP_STRING([--with-cxi-uapi-headers=DIR], [Install directory for kernel CXI UAPI headers])], + [CPPFLAGS="-I$with_cxi_uapi_headers/include $CPPFLAGS"]) + + # Support non-standard install path for curl. This is needed by CXI provider. 
+ AC_ARG_WITH([curl], + [AS_HELP_STRING([--with-curl=DIR], [Install directory for curl])]) + + # Support non-standard install path for json-c. This is needed by CXI provider. + AC_ARG_WITH([json-c], + [AS_HELP_STRING([--with-json-c=DIR], [Install directory for json-c])]) + + AS_IF([test x"$enable_cxi" != x"no"], + [ + AC_CHECK_HEADER(cxi_prov_hw.h, + [], + [cxi_happy=0]) + + AC_CHECK_HEADER(uapi/misc/cxi.h, + [], + [cxi_happy=0]) + + FI_CHECK_PACKAGE([libcxi], + [libcxi/libcxi.h], + [cxi], + [cxil_open_device], + [], + [$cxi_PREFIX], + [$cxi_LIBDIR], + [], + [cxi_happy=0]) + + cxi_CPPFLAGS=$libcxi_CPPFLAGS + cxi_LDFLAGS=$libcxi_LDFLAGS + cxi_LIBS=$libcxi_LIBS + + if test "$with_cassini_headers" != "" && test "$with_cassini_headers" != "no"; then + cxi_CPPFLAGS="$cxi_CPPFLAGS -I$with_cassini_headers/include" + fi + + if test "$with_cxi_uapi_headers" != "" && test "$with_cxi_uapi_headers" != "no"; then + cxi_CPPFLAGS="$cxi_CPPFLAGS -I$with_cxi_uapi_headers/include" + fi + + # Add on curl if installed in non-default location. + if test "$with_curl" != "" && test "$with_curl" != "no"; then + FI_CHECK_PREFIX_DIR([$with_curl], [curl]) + else + curl_PREFIX="" + curl_LIBDIR="" + fi + + FI_CHECK_PACKAGE([libcurl], + [curl/curl.h], + [curl], + [curl_global_init], + [], + [$curl_PREFIX], + [$curl_LIBDIR], + [], + [cxi_happy=0]) + + cxi_CPPFLAGS="$cxi_CPPFLAGS $libcurl_CPPFLAGS" + cxi_LDFLAGS="$cxi_LDFLAGS $libcurl_LDFLAGS" + cxi_LIBS="$cxi_LIBS $libcurl_LIBS" + + # Add on json if installed in non-default location. + if test "$with_json" != "" && test "$with_json" != "no"; then + FI_CHECK_PREFIX_DIR([$with_json], [json]) + else + json_PREFIX="" + json_LIBDIR="" + fi + + FI_CHECK_PACKAGE([libjson], + [json-c/json.h], + [json-c], + [json_object_get_type], + [], + [$json_PREFIX], + [$json_LIBDIR], + [], + [cxi_happy=0]) + + cxi_CPPFLAGS="$cxi_CPPFLAGS $libjson_CPPFLAGS" + cxi_LDFLAGS="$cxi_LDFLAGS $libjson_LDFLAGS" + cxi_LIBS="$cxi_LIBS $libjson_LIBS" + + # Need to explicitly link to libmath + cxi_LIBS="$cxi_LIBS -lm" + + AC_SUBST(cxi_CPPFLAGS) + AC_SUBST(cxi_LDFLAGS) + AC_SUBST(cxi_LIBS) + + # Checks to enable cxitest + AS_IF([test "$with_criterion" != ""], + [cxitest_CPPFLAGS="-I$with_criterion/include" + cxitest_LDFLAGS="-L$with_criterion/lib64 -Wl,-rpath=$(realpath $with_criterion/lib64)" + cxitest_LIBS="-lcriterion" + have_criterion=true]) + AM_CONDITIONAL([HAVE_CRITERION], [test "x$have_criterion" = "xtrue"]) + + AS_IF([test "$have_ze" = "1" && test "$with_ze" != "" && test x"$with_ze" != x"yes"], + [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_ze/include" + cxitest_LDFLAGS="$cxitest_LDFLAGS -L$with_ze/lib64"]) + AS_IF([test "$have_cuda" = "1" && test "$with_cuda" != "" && test x"$with_cuda" != x"yes"], + [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_cuda/include" + cxitest_LDFLAGS="$cxitest_LDFLAGS -L$with_cuda/lib64"]) + AS_IF([test "$have_rocr" = "1" && test "$with_rocr" != "" && test x"$with_rocr" != x"yes"], + [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_rocr/include" + cxitest_LDFLAGS="$cxitest_LDFLAGS -L$with_rocr/lib"]) + + AC_SUBST(cxitest_CPPFLAGS) + AC_SUBST(cxitest_LDFLAGS) + AC_SUBST(cxitest_LIBS) + ], + [cxi_happy=0]) + + AS_IF([test $cxi_happy -eq 1], [$1], [$2]) +]) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h new file mode 100644 index 00000000000..0a441e3bc2c --- /dev/null +++ b/prov/cxi/include/cxip.h @@ -0,0 +1,3348 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. 
+ * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. + * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + */ + +#ifndef _CXIP_PROV_H_ +#define _CXIP_PROV_H_ + +#include +#include "config.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip_faults.h" +#include "fi_cxi_ext.h" + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +#ifndef FLOOR +#define FLOOR(a, b) ((long long)(a) - (((long long)(a)) % (b))) +#endif + +#ifndef CEILING +#define CEILING(a, b) ((long long)(a) <= 0LL ? 0 : (FLOOR((a)-1, b) + (b))) +#endif + +#define CXIP_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define CXIP_ALIGN(x, a) CXIP_ALIGN_MASK(x, (typeof(x))(a) - 1) +#define CXIP_ALIGN_DOWN(x, a) CXIP_ALIGN((x) - ((a) - 1), (a)) + +#define CXIP_REQ_CLEANUP_TO 3000 + +#define CXIP_BUFFER_ID_MAX (1 << 16) + +/* Scalable EP not supported */ +#define CXIP_EP_MAX_CTX_BITS 0 +#define CXIP_EP_MAX_TX_CNT (1 << CXIP_EP_MAX_CTX_BITS) +#define CXIP_EP_MAX_RX_CNT (1 << CXIP_EP_MAX_CTX_BITS) +#define CXIP_EP_MAX_MSG_SZ ((1ULL << 32) - 1) +#define CXIP_EP_MIN_MULTI_RECV 64 +#define CXIP_EP_MAX_MULTI_RECV ((1 << 24) - 1) + +#define CXIP_TX_COMP_MODES (FI_INJECT_COMPLETE | \ + FI_TRANSMIT_COMPLETE | \ + FI_DELIVERY_COMPLETE | \ + FI_MATCH_COMPLETE) +#define CXIP_TX_OP_FLAGS (FI_INJECT | \ + FI_COMPLETION | \ + CXIP_TX_COMP_MODES | \ + FI_REMOTE_CQ_DATA | \ + FI_MORE | \ + FI_FENCE) +#define CXIP_RX_OP_FLAGS (FI_COMPLETION | \ + FI_MULTI_RECV | \ + FI_MORE) +/* Invalid OP flags for RX that can be silently ignored */ +#define CXIP_RX_IGNORE_OP_FLAGS (FI_REMOTE_CQ_DATA | \ + FI_INJECT) +#define CXIP_WRITEMSG_ALLOWED_FLAGS (FI_INJECT | \ + FI_COMPLETION | \ + FI_MORE | \ + FI_FENCE | \ + CXIP_TX_COMP_MODES) +#define CXIP_READMSG_ALLOWED_FLAGS (FI_COMPLETION | \ + FI_MORE | \ + FI_FENCE | \ + CXIP_TX_COMP_MODES) + +#define CXIP_AMO_MAX_IOV 1 +#define CXIP_EQ_DEF_SZ (1 << 8) +#define CXIP_CQ_DEF_SZ 1024U +#define CXIP_REMOTE_CQ_DATA_SZ 8 + +#define CXIP_PTE_IGNORE_DROPS ((1 << 24) - 1) +#define CXIP_RDZV_THRESHOLD 2048 +#define CXIP_OFLOW_BUF_SIZE (2*1024*1024) +#define CXIP_OFLOW_BUF_MIN_POSTED 3 +#define CXIP_OFLOW_BUF_MAX_CACHED (CXIP_OFLOW_BUF_MIN_POSTED * 3) +#define CXIP_REQ_BUF_SIZE (2*1024*1024) +#define CXIP_REQ_BUF_MIN_POSTED 4 +#define CXIP_REQ_BUF_MAX_CACHED 0 +#define CXIP_UX_BUFFER_SIZE (CXIP_OFLOW_BUF_MIN_POSTED * \ + CXIP_OFLOW_BUF_SIZE) + +/* When device memory is safe to access via load/store then the + * CPU will be used to move data below this threshold. 
+ */ +#define CXIP_SAFE_DEVMEM_COPY_THRESH 4096 + +#define CXIP_EP_PRI_CAPS \ + (FI_RMA | FI_ATOMICS | FI_TAGGED | FI_RECV | FI_SEND | \ + FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE | \ + FI_DIRECTED_RECV | FI_MSG | FI_NAMED_RX_CTX | \ + FI_COLLECTIVE | FI_HMEM) +#define CXIP_EP_SEC_CAPS \ + (FI_SOURCE | FI_SOURCE_ERR | FI_LOCAL_COMM | \ + FI_REMOTE_COMM | FI_RMA_EVENT | FI_MULTI_RECV | FI_FENCE | FI_TRIGGER) +#define CXIP_EP_CAPS (CXIP_EP_PRI_CAPS | CXIP_EP_SEC_CAPS) +#define CXIP_DOM_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID) +#define CXIP_CAPS (CXIP_DOM_CAPS | CXIP_EP_CAPS) +#define CXIP_MSG_ORDER (FI_ORDER_SAS | \ + FI_ORDER_WAW | \ + FI_ORDER_RMA_WAW | \ + FI_ORDER_ATOMIC_WAW | \ + FI_ORDER_ATOMIC_WAR | \ + FI_ORDER_ATOMIC_RAW | \ + FI_ORDER_ATOMIC_RAR) + +#define CXIP_EP_CQ_FLAGS \ + (FI_SEND | FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION | \ + FI_COLLECTIVE) +#define CXIP_EP_CNTR_FLAGS \ + (FI_SEND | FI_RECV | FI_READ | FI_WRITE | FI_REMOTE_READ | \ + FI_REMOTE_WRITE) + +#define CXIP_INJECT_SIZE C_MAX_IDC_PAYLOAD_UNR + +/* Max TX size of 16,384 translate to a 4MiB command queue buffer. */ +#define CXIP_MAX_TX_SIZE 16384U +#define CXIP_DEFAULT_TX_SIZE 512U + +/* Some LEs need to be used for internally by CXI provider. The user facing + * RX size should be updated to reflect this. + * + * Note: This value is an estimation and may be too high. + */ +#define CXI_PROV_LE_PER_EP 1024U + +/* Maximum number of LEs per endpoint. */ +#define LES_PER_EP_MAX 16384U + +#define CXIP_MAX_RX_SIZE (LES_PER_EP_MAX - CXI_PROV_LE_PER_EP) +#define CXIP_DEFAULT_RX_SIZE 512U + +#define CXIP_MAJOR_VERSION 0 +#define CXIP_MINOR_VERSION 1 +#define CXIP_PROV_VERSION FI_VERSION(CXIP_MAJOR_VERSION, \ + CXIP_MINOR_VERSION) +#define CXIP_FI_VERSION FI_VERSION(1, 20) +#define CXIP_WIRE_PROTO_VERSION 1 + +#define CXIP_COLL_MAX_CONCUR 8 +#define CXIP_COLL_MIN_RX_BUFS 8 +#define CXIP_COLL_MIN_RX_SIZE 4096 +#define CXIP_COLL_MIN_MULTI_RECV 64 +#define CXIP_COLL_MAX_DATA_SIZE 32 +#define CXIP_COLL_MAX_SEQNO (1 << 10) +// TODO adjust based on performance testing +#define CXIP_COLL_MIN_RETRY_USEC 1 +#define CXIP_COLL_MAX_RETRY_USEC 32000 +#define CXIP_COLL_MIN_TIMEOUT_USEC 1 +#define CXIP_COLL_MAX_TIMEOUT_USEC 32000 + +#define CXIP_REQ_BUF_HEADER_MAX_SIZE (sizeof(struct c_port_fab_hdr) + \ + sizeof(struct c_port_unrestricted_hdr)) +#define CXIP_REQ_BUF_HEADER_MIN_SIZE (sizeof(struct c_port_fab_hdr) + \ + sizeof(struct c_port_small_msg_hdr)) + +extern int s_page_size; +extern char cxip_prov_name[]; +extern struct fi_provider cxip_prov; +extern struct util_prov cxip_util_prov; + +extern int cxip_cq_def_sz; +extern int cxip_eq_def_sz; + +extern struct slist cxip_if_list; + +extern struct fi_fabric_attr cxip_fabric_attr; +extern struct fi_domain_attr cxip_domain_attr; +extern struct fi_ep_attr cxip_ep_attr; +extern struct fi_tx_attr cxip_tx_attr; +extern struct fi_rx_attr cxip_rx_attr; + +enum cxip_ats_mlock_mode { + CXIP_ATS_MLOCK_OFF, + CXIP_ATS_MLOCK_CACHE, + CXIP_ATS_MLOCK_ALL, +}; + +enum cxip_llring_mode { + CXIP_LLRING_NEVER, + CXIP_LLRING_IDLE, + CXIP_LLRING_ALWAYS, +}; + +enum cxip_ep_ptle_mode { + CXIP_PTLTE_HARDWARE_MODE, + CXIP_PTLTE_DEFAULT_MODE = CXIP_PTLTE_HARDWARE_MODE, + CXIP_PTLTE_SOFTWARE_MODE, + CXIP_PTLTE_HYBRID_MODE, +}; + +enum cxip_rdzv_proto { + CXIP_RDZV_PROTO_DEFAULT, /* unrestricted gets */ + CXIP_RDZV_PROTO_ALT_READ, /* restricted gets */ + CXIP_RDZV_PROTO_ALT_WRITE, /* restricted puts */ +}; + +const char *cxip_rdzv_proto_to_str(enum cxip_rdzv_proto proto); + +struct 
cxip_environment { + /* Translation */ + int odp; + int force_odp; + int ats; + int iotlb; + enum cxip_ats_mlock_mode ats_mlock_mode; + + /* Messaging */ + int fork_safe_requested; + enum cxip_ep_ptle_mode rx_match_mode; + int msg_offload; + int hybrid_preemptive; + int hybrid_recv_preemptive; + size_t rdzv_threshold; + size_t rdzv_get_min; + size_t rdzv_eager_size; + int rdzv_aligned_sw_rget; + int disable_non_inject_msg_idc; + int disable_host_register; + size_t oflow_buf_size; + size_t oflow_buf_min_posted; + size_t oflow_buf_max_cached; + size_t safe_devmem_copy_threshold; + size_t req_buf_size; + size_t req_buf_min_posted; + size_t req_buf_max_cached; + int sw_rx_tx_init_max; + int msg_lossless; + size_t default_cq_size; + size_t default_tx_size; + size_t default_rx_size; + int optimized_mrs; + int prov_key_cache; + int mr_match_events; + int disable_eq_hugetlb; + int zbcoll_radix; + + enum cxip_llring_mode llring_mode; + + int cq_policy; + + size_t default_vni; + + size_t eq_ack_batch_size; + int fc_retry_usec_delay; + size_t ctrl_rx_eq_max_size; + char *device_name; + size_t cq_fill_percent; + int enable_unrestricted_end_ro; + int rget_tc; + int cacheline_size; + + char *coll_job_id; + char *coll_job_step_id; + size_t coll_retry_usec; + size_t coll_timeout_usec; + char *coll_fabric_mgr_url; + char *coll_mcast_token; + size_t hwcoll_addrs_per_job; + size_t hwcoll_min_nodes; + int coll_use_dma_put; + + char hostname[255]; + char *telemetry; + int telemetry_rgid; + int disable_hmem_dev_register; + int ze_hmem_supported; + enum cxip_rdzv_proto rdzv_proto; + int enable_trig_op_limit; + int hybrid_posted_recv_preemptive; + int hybrid_unexpected_msg_preemptive; +}; + +extern struct cxip_environment cxip_env; + +static inline bool cxip_software_pte_allowed(void) +{ + return cxip_env.rx_match_mode != CXIP_PTLTE_HARDWARE_MODE; +} + +/* + * The CXI Provider Address format. + * + * A Cassini NIC Address and PID identify a libfabric Endpoint. Cassini + * borrows the name 'PID' from Portals. In CXI, a process can allocate several + * PID values. + * + * The PID value C_PID_ANY is reserved. When used, the library auto-assigns + * a free PID value. A PID value is assigned when network resources are + * allocated. Libfabric clients can achieve this by not specifying a 'service' + * in a call to fi_getinfo() or by not setting src_addr in the fi_info + * structure used to allocate an Endpoint. + */ +struct cxip_addr { + uint32_t pid : C_DFA_PID_BITS_MAX; + uint32_t nic : C_DFA_NIC_BITS; + uint16_t vni; +}; + +#define CXIP_ADDR_EQUAL(a, b) ((a).nic == (b).nic && (a).pid == (b).pid) + +/* + * A PID contains "pid_granule" logical endpoints. The PID granule is set per + * device and can be found in libCXI devinfo. The default pid_granule is 256. 
+ * These endpoints are partitioned by the provider for the following use: + * + * 0 RX Queue PtlTE + * 16 Collective PtlTE entry + * 17-116 Optimized write MR PtlTEs 0-99 + * For Client specified keys: + * 17-116 Non-cached optimized write MR PtlTEs 0-99 + * For Provider specified keys: + * 17-24 Cached optimized write MR PtlTEs 0-7 + * 25-116 Non-cached optimized write MR PtlTEs 8-99 + * 117 Standard client/provider cached/non-cached write MR + * PtlTE / Control messaging + * 127 Rendezvous destination write PtlTE + * 128-227 Optimized read MR PtlTEs 0-99 + * For Client specified keys: + * 128-227 Non-cached optimized read MR PtlTEs 0-99 + * For Provider specified keys: + * 128-135 Cached optimized read MR PtlTEs 0-7 + * 136-227 Non-cached optimized read MR PtlTEs 8-99 + * 228 Standard client or provider cached/non-cached read MR + * PtlTE + * 229-237 Rendezvous restricted read PtlTE (TODO consider merge with MR) + * 255 Rendezvous source PtlTE + * + * Note: Any logical endpoint within a PID granule that issues unrestricted Puts + * MUST be within the logical endpoint range 0 - 127 and unrestricted Gets MUST + * be within the logical endpoint range 128 - 255. + */ +#define CXIP_PTL_IDX_RXQ 0 +#define CXIP_PTL_IDX_WRITE_MR_OPT_BASE 17 +#define CXIP_PTL_IDX_READ_MR_OPT_BASE 128 +#define CXIP_PTL_IDX_MR_OPT_CNT 100 +#define CXIP_PTL_IDX_PROV_NUM_CACHE_IDX 8 +#define CXIP_PTL_IDX_PROV_MR_OPT_CNT \ + (CXIP_PTL_IDX_MR_OPT_CNT - CXIP_PTL_IDX_PROV_NUM_CACHE_IDX) + +/* Map non-cached optimized MR keys (client or FI_MR_PROV_KEY) + * to appropriate PTL index. + */ +#define CXIP_MR_PROV_KEY_MASK ((1ULL << 61) - 1) +#define CXIP_MR_PROV_KEY_ID_MASK ((1ULL << 16) - 1) +#define CXIP_MR_UNCACHED_KEY_TO_IDX(key) ((key) & CXIP_MR_PROV_KEY_ID_MASK) +#define CXIP_PTL_IDX_WRITE_MR_OPT(key) \ + (CXIP_PTL_IDX_WRITE_MR_OPT_BASE + \ + CXIP_MR_UNCACHED_KEY_TO_IDX(key)) +#define CXIP_PTL_IDX_READ_MR_OPT(key) \ + (CXIP_PTL_IDX_READ_MR_OPT_BASE + \ + CXIP_MR_UNCACHED_KEY_TO_IDX(key)) + +/* Map cached FI_MR_PROV_KEY optimized MR LAC to Index */ +#define CXIP_PTL_IDX_WRITE_PROV_CACHE_MR_OPT(lac) \ + (CXIP_PTL_IDX_WRITE_MR_OPT_BASE + (lac)) +#define CXIP_PTL_IDX_READ_PROV_CACHE_MR_OPT(lac) \ + (CXIP_PTL_IDX_READ_MR_OPT_BASE + (lac)) + +#define CXIP_PTL_IDX_WRITE_MR_STD 117 +#define CXIP_PTL_IDX_RDZV_DEST 127 +#define CXIP_PTL_IDX_COLL 6 +#define CXIP_PTL_IDX_CTRL CXIP_PTL_IDX_WRITE_MR_STD +#define CXIP_PTL_IDX_READ_MR_STD 228 +#define CXIP_PTL_IDX_RDZV_RESTRICTED_BASE 229 +#define CXIP_PTL_IDX_RDZV_RESTRICTED(lac) \ + (CXIP_PTL_IDX_RDZV_RESTRICTED_BASE + (lac)) + +#define CXIP_PTL_IDX_RDZV_SRC 255 + +/* The CXI provider supports both provider specified MR keys + * (FI_MR_PROV_KEY MR mode) and client specified keys on a per-domain + * basis. + * + * User specified keys: + * Hardware resources limit the number of active keys to 16 bits. + * Key size is 32-bit so there are only 64K unique keys. + * + * Provider specified keys: + * The key size is 64-bits and is separated from the MR hardware + * resources such that the associated MR can be cached if the + * following criteria are met: + * + * - The associated memory region is non-zero in length + * - The associated memory region mapping is cached + * - The MR is not bound to a counter + * + * Optimized caching is preferred by default. + * TODO: Fallback to standard optimized if PTE can not be allocated. + * + * FI_MR_PROV_KEY MR are associated with a unique domain wide + * 16-bit buffer ID, reducing the overhead of maintaining keys. 
+ * Provider keys should always be preferred over client keys + * unless well known keys are not exchanged between peers. + */ +#define CXIP_MR_KEY_SIZE sizeof(uint32_t) +#define CXIP_MR_KEY_MASK ((1ULL << (8 * CXIP_MR_KEY_SIZE)) - 1) +#define CXIP_MR_VALID_OFFSET_MASK ((1ULL << 56) - 1) + +/* For provider defined keys we define a 64 bit MR key that maps + * to provider required information. + */ +struct cxip_mr_key { + union { + /* Provider generated standard cached */ + struct { + uint64_t lac : 3; + uint64_t lac_off: 58; + uint64_t opt : 1; + uint64_t cached : 1; + uint64_t unused1: 1; + /* shares CXIP_CTRL_LE_TYPE_MR */ + }; + /* Client or Provider non-cached */ + struct { + uint64_t key : 61; + uint64_t unused2: 3; + /* Provider shares opt */ + /* Provider shares cached == 0 */ + /* Provider shares CXIP_CTRL_LE_TYPE_MR */ + }; + /* Provider Key Only */ + struct { + /* Non-cached key consists of unique MR ID and sequence + * number. The same MR ID can be used with sequence + * number to create 2^44 unique keys. That is, a + * single standard MR repeatedly created and destroyed + * every micro-second, would take months before + * it repeated. + */ + uint64_t id : 16; /* Unique - 64K MR */ + uint64_t seqnum : 44; /* Sequence with random seed */ + uint64_t events : 1; /* Requires event generation */ + uint64_t unused3: 2; + uint64_t is_prov: 1; + /* Overloads CXIP_CTRL_LE_TYPE_MR and must be cleared + * before appending MR LE or TX using in match bits. + */ + }; + uint64_t raw; + }; +}; + +#define CXIP_MR_PROV_KEY_SIZE sizeof(struct cxip_mr_key) +#define CXIP_NUM_CACHED_KEY_LE 8 + +struct cxip_domain; +struct cxip_mr_domain; +struct cxip_mr; + +/* CXI provider MR operations that are specific for the MR + * based on MR key type and caching. + */ +struct cxip_mr_util_ops { + bool is_cached; + int (*init_key)(struct cxip_mr *mr, uint64_t req_key); + int (*enable_opt)(struct cxip_mr *mr); + int (*disable_opt)(struct cxip_mr *mr); + int (*enable_std)(struct cxip_mr *mr); + int (*disable_std)(struct cxip_mr *mr); +}; + +struct cxip_ep_obj; + +/* + * cxip_ctrl_mr_cache_flush() - Flush LE associated with remote MR cache. + */ +void cxip_ctrl_mr_cache_flush(struct cxip_ep_obj *ep_obj); + +/* + * cxip_adjust_remote_offset() - Update address with the appropriate offset + * for key. + */ +static inline +uint64_t cxip_adjust_remote_offset(uint64_t *addr, uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.cached) { + *addr += cxip_key.lac_off; + if (*addr & ~CXIP_MR_VALID_OFFSET_MASK) + return -FI_EINVAL; + } + return FI_SUCCESS; +} + +int cxip_generic_mr_key_to_ptl_idx(struct cxip_domain *dom, + uint64_t key, bool write); +bool cxip_generic_is_mr_key_opt(uint64_t key); +bool cxip_generic_is_mr_key_events(uint64_t caps, uint64_t key); +bool cxip_generic_is_valid_mr_key(uint64_t key); + +/* Messaging Match Bit layout */ +#define CXIP_TX_ID_WIDTH 11 +#define CXIP_TAG_WIDTH 48 +#define CXIP_RDZV_ID_CMD_WIDTH 8 +#define CXIP_RDZV_ID_HIGH_WIDTH 7 +#define CXIP_TOTAL_RDZV_ID_WIDTH (CXIP_RDZV_ID_CMD_WIDTH + \ + CXIP_RDZV_ID_HIGH_WIDTH) +#define CXIP_TAG_MASK ((1UL << CXIP_TAG_WIDTH) - 1) + +/* Define several types of LEs */ +enum cxip_le_type { + CXIP_LE_TYPE_RX = 0, /* RX data LE */ + CXIP_LE_TYPE_ZBP, /* Zero-byte Put control message LE. Used to + * exchange data in the EQ header_data and + * match_bits fields. Unexpected headers are + * disabled. 
+ */ +}; + +enum cxip_ctrl_le_type { + CXIP_CTRL_LE_TYPE_MR = 0, /* Memory Region LE */ + CXIP_CTRL_LE_TYPE_CTRL_MSG, /* Control Message LE */ +}; + +enum cxip_ctrl_msg_type { + CXIP_CTRL_MSG_FC_NOTIFY = 0, + CXIP_CTRL_MSG_FC_RESUME, + CXIP_CTRL_MSG_ZB_DATA, +}; + +union cxip_match_bits { + struct { + uint64_t tag : CXIP_TAG_WIDTH; /* User tag value */ + uint64_t tx_id : CXIP_TX_ID_WIDTH; /* Prov. tracked ID */ + uint64_t cq_data : 1; /* Header data is valid */ + uint64_t tagged : 1; /* Tagged API */ + uint64_t match_comp : 1; /* Notify initiator on match */ + uint64_t rdzv_done : 1; /* Notify initiator when rdzv done */ + uint64_t le_type : 1; + }; + /* Rendezvous protocol request, overloads match_comp and rdzv_done + * to specify requested protocol. + */ + struct { + uint64_t pad0 : 61; + uint64_t rdzv_proto : 2; + uint64_t pad1 : 1; + }; + /* Split TX ID for rendezvous operations. */ + struct { + uint64_t pad2 : CXIP_TAG_WIDTH; /* User tag value */ + uint64_t rdzv_id_hi : CXIP_RDZV_ID_HIGH_WIDTH; + uint64_t rdzv_lac : 4; /* Rendezvous Get LAC */ + }; + struct { + uint64_t rdzv_id_lo : CXIP_RDZV_ID_CMD_WIDTH; + }; + /* Control LE match bit format for notify/resume */ + struct { + uint64_t txc_id : 8; + uint64_t rxc_id : 8; + uint64_t drops : 16; + uint64_t pad3 : 29; + uint64_t ctrl_msg_type: 2; + uint64_t ctrl_le_type : 1; + }; + /* Control LE match bit format for zbcollectives */ + struct { + uint64_t zb_data :61; + uint64_t zb_pad : 3; + /* shares ctrl_le_type == CXIP_CTRL_LE_TYPE_CTRL_MSG + * shares ctrl_msg_type == CXIP_CTRL_MSG_ZB_BCAST + */ + }; + /* Control LE match bit format for cached MR */ + struct { + uint64_t mr_lac : 3; + uint64_t mr_lac_off : 58; + uint64_t mr_opt : 1; + uint64_t mr_cached : 1; + uint64_t mr_unused : 1; + /* shares ctrl_le_type == CXIP_CTRL_LE_TYPE_MR */ + }; + struct { + uint64_t mr_key : 61; + uint64_t mr_pad : 3; + /* shares mr_opt + * shares mr_cached == 0 + * shares ctrl_le_type == CXIP_CTRL_LE_TYPE_MR + */ + }; + struct { + uint64_t unused2 : 63; + uint64_t is_prov : 1; + /* Indicates provider generated key and shares ctrl_le_type == + * CXIP_CTRL_LE_TYPE_MR so it must be cleared before matching. + */ + }; + uint64_t raw; +}; +#define CXIP_IS_PROV_MR_KEY_BIT (1ULL << 63) +#define CXIP_KEY_MATCH_BITS(key) ((key) & ~CXIP_IS_PROV_MR_KEY_BIT) + +/* libcxi Wrapper Structures */ + +#define CXI_PLATFORM_ASIC 0 +#define CXI_PLATFORM_NETSIM 1 +#define CXI_PLATFORM_Z1 2 +#define CXI_PLATFORM_FPGA 3 + +/* + * CXI Device wrapper + * + * There will be one of these for every local Cassini device on the node. + */ +struct cxip_if { + struct slist_entry if_entry; + + /* Device description */ + struct cxil_devinfo *info; + int speed; + int link; + + struct cxil_dev *dev; + + /* PtlTEs (searched during state change events) */ + struct dlist_entry ptes; + + ofi_atomic32_t ref; + ofi_spin_t lock; +}; + +/* + * CXI communication profile wrapper. + * + * The wrapper is used to remap user requested traffic class to a communication + * profile which actually can be allocated. + */ +struct cxip_remap_cp { + struct dlist_entry remap_entry; + struct cxi_cp remap_cp; + struct cxi_cp *hw_cp; +}; + +/* + * CXI Logical Network Interface (LNI) wrapper + * + * An LNI is a container used allocate resources from a NIC. + */ +struct cxip_lni { + struct cxip_if *iface; + struct cxil_lni *lni; + + /* Hardware communication profiles */ + struct cxi_cp *hw_cps[16]; + int n_cps; + + /* Software remapped communication profiles. 
*/ + struct dlist_entry remap_cps; + + ofi_spin_t lock; +}; + +/* A portals table define a network endpoint address. The endpoint address is + * a {NIC + PID} and this can be configured against multiple VNIs + */ +struct cxip_portals_table { + struct cxip_lni *lni; + uint32_t pid; + struct cxil_domain **doms; + size_t doms_count; +}; + +int cxip_portals_table_alloc(struct cxip_lni *lni, uint16_t *vni, + size_t vni_count, uint32_t pid, + struct cxip_portals_table **ptable); +void cxip_portals_table_free(struct cxip_portals_table *ptable); + +struct cxip_pte_map_entry { + struct dlist_entry entry; + struct cxil_pte_map *map; +}; + +/* + * CXI Portal Table Entry (PtlTE) wrapper + * + * Represents PtlTE mapped in a CXI domain. + */ +struct cxip_pte { + struct dlist_entry pte_entry; + struct cxip_portals_table *ptable; + struct cxil_pte *pte; + enum c_ptlte_state state; + struct dlist_entry map_list; + + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event); + void *ctx; +}; + +/* + * CXI Command Queue wrapper + */ +struct cxip_cmdq { + struct cxi_cq *dev_cmdq; + struct c_cstate_cmd c_state; + enum cxip_llring_mode llring_mode; + + struct cxi_cp *cur_cp; + struct cxip_lni *lni; +}; + +int cxip_cmdq_emit_idc_put(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags); +int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, + uint64_t flags); +int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); +int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, + uint64_t flags, bool fetching, bool flush); + +/* OFI Provider Structures */ + +/* + * CXI Provider Fabric object + */ +struct cxip_fabric { + struct util_fabric util_fabric; + ofi_atomic32_t ref; +}; + +/* + * CXI Provider Memory Descriptor + */ +struct cxip_md { + struct cxip_domain *dom; + struct cxi_md *md; + struct ofi_mr_info info; + uint64_t handle; + bool handle_valid; + bool cached; +}; + +#define CXIP_MR_DOMAIN_HT_BUCKETS 16 + +struct cxip_mr_domain { + struct dlist_entry buckets[CXIP_MR_DOMAIN_HT_BUCKETS]; + ofi_spin_t lock; +}; + +void cxip_mr_domain_init(struct cxip_mr_domain *mr_domain); +void cxip_mr_domain_fini(struct cxip_mr_domain *mr_domain); + +struct cxip_telemetry { + struct cxip_domain *dom; + + /* List of telemetry entries to being monitored. */ + struct dlist_entry telemetry_list; +}; + +void cxip_telemetry_dump_delta(struct cxip_telemetry *telemetry); +void cxip_telemetry_free(struct cxip_telemetry *telemetry); +int cxip_telemetry_alloc(struct cxip_domain *dom, + struct cxip_telemetry **telemetry); + +#define TELEMETRY_ENTRY_NAME_SIZE 64U + +struct cxip_telemetry_entry { + struct cxip_telemetry *telemetry; + struct dlist_entry telemetry_entry; + + /* Telemetry name. */ + char name[TELEMETRY_ENTRY_NAME_SIZE]; + + /* Telemetry value. 
*/ + unsigned long value; +}; + +/* + * CXI Provider Domain object + */ +struct cxip_domain { + struct util_domain util_domain; + struct cxip_fabric *fab; + ofi_spin_t lock; + ofi_atomic32_t ref; + + uint32_t tclass; + + struct cxip_eq *eq; //unused + struct cxip_eq *mr_eq; //unused + + /* Assigned NIC address */ + uint32_t nic_addr; + + /* Device info */ + struct cxip_if *iface; + + /* Device partition */ + struct cxip_lni *lni; + + /* Trigger and CT support */ + struct cxip_cmdq *trig_cmdq; + struct ofi_genlock trig_cmdq_lock; + bool cntr_init; + + /* Provider generated RKEYs, else client */ + bool is_prov_key; + + /* Can disable caching of provider generated RKEYs */ + bool prov_key_cache; + + /* Provider generated RKEYs optimized MR disablement/enablement */ + bool optimized_mrs; + + /* Enable MR match event counting enables a more robust + * MR when using FI_MR_PROV_KEY. It disables hardware cached + * MR keys and ensures memory backing a MR cannot be + * remotely accessed even if that memory remains in the + * libfabric MR cache. + */ + bool mr_match_events; + + /* Domain wide MR resources. + * Req IDs are control buffer IDs to map MR or MR cache to an LE. + * MR IDs are used by non-cached provider key MR to decouple the + * MR and Req ID, and do not map directly to the MR LE. + */ + ofi_spin_t ctrl_id_lock; + struct indexer req_ids; + struct indexer mr_ids; + + /* If FI_MR_PROV_KEY is not cached, keys include a sequence number + * to reduce the likelyhood of a stale key being used to access + * a recycled MR key. + */ + uint32_t prov_key_seqnum; + + /* Translation cache */ + struct ofi_mr_cache iomm; + bool odp; + bool ats; + bool hmem; + + /* ATS translation support */ + struct cxip_md scalable_md; + bool scalable_iomm; + bool rocr_dev_mem_only; + + /* Domain state */ + bool enabled; + + /* List of allocated resources used for deferred work queue processing. + */ + struct dlist_entry txc_list; + struct dlist_entry cntr_list; + struct dlist_entry cq_list; + + struct fi_hmem_override_ops hmem_ops; + bool hybrid_mr_desc; + + /* Container of in-use MRs against this domain. */ + struct cxip_mr_domain mr_domain; + + /* Counters collected for the duration of the domain existence. */ + struct cxip_telemetry *telemetry; + + /* NIC AMO operation which is remapped to a PCIe operation. */ + int amo_remap_to_pcie_fadd; + + /* Maximum number of triggered operations configured for the service + * ID. + */ + int max_trig_op_in_use; + sem_t *trig_op_lock; + + /* Domain has been configured with FI_AV_AUTH_KEY. */ + bool av_auth_key; + + /* This is only valid if FI_AV_AUTH_KEY is false. */ + struct cxi_auth_key auth_key; + + /* Maximum number of auth keys requested by user. */ + size_t auth_key_entry_max; + + /* Domain has been configured with FI_AV_USER_ID. */ + bool av_user_id; +}; + +static inline bool cxip_domain_mr_cache_enabled(struct cxip_domain *dom) +{ + return dom->iomm.domain == &dom->util_domain; +} + +static inline bool cxip_domain_mr_cache_iface_enabled(struct cxip_domain *dom, + enum fi_hmem_iface iface) +{ + return cxip_domain_mr_cache_enabled(dom) && dom->iomm.monitors[iface]; +} + +int cxip_domain_valid_vni(struct cxip_domain *dom, unsigned int vni); + +/* This structure implies knowledge about the breakdown of the NIC address, + * which is taken from the AMA, that the provider does not know in a flexible + * way. However, the domain fi_open_ops() API includes a topology function + * that requires knowledge of the address breakdown into topology components. 
+ * TODO: Research a less restricted way to get this information. + */ +#define CXIP_ADDR_PORT_BITS 6 +#define CXIP_ADDR_SWITCH_BITS 5 +#define CXIP_ADDR_GROUP_BITS 9 +#define CXIP_ADDR_FATTREE_PORT_BITS 6 +#define CXIP_ADDR_FATTREE_SWITCH_BITS 14 + +struct cxip_topo_addr { + union { + uint32_t addr; + struct { + uint32_t port_num:CXIP_ADDR_PORT_BITS; + uint32_t switch_num:CXIP_ADDR_SWITCH_BITS; + uint32_t group_num:CXIP_ADDR_GROUP_BITS; + } dragonfly; + struct { + uint32_t port_num:CXIP_ADDR_FATTREE_PORT_BITS; + uint32_t switch_num:CXIP_ADDR_FATTREE_SWITCH_BITS; + } fat_tree; + }; +}; + +static inline ssize_t +cxip_copy_to_hmem_iov(struct cxip_domain *domain, enum fi_hmem_iface hmem_iface, + uint64_t device, const struct iovec *hmem_iov, + size_t hmem_iov_count, uint64_t hmem_iov_offset, + const void *src, size_t size) +{ + return domain->hmem_ops.copy_to_hmem_iov(hmem_iface, device, hmem_iov, + hmem_iov_count, + hmem_iov_offset, src, size); +} + +/* + * Event Queue + * + * libfabric fi_eq implementation. + * + * Created in cxip_eq_open(). + */ +struct cxip_eq { + struct util_eq util_eq; + struct fi_eq_attr attr; + struct dlist_entry ep_list; + ofi_mutex_t list_lock; +}; + +#define CXIP_EQ_MAP_FLAGS \ + (CXI_MAP_WRITE | CXI_MAP_PIN | CXI_MAP_IOVA_ALLOC) + +/* + * RMA request + * + * Support structures, accumulated in a union. + */ +struct cxip_req_rma { + struct cxip_txc *txc; + struct cxip_md *local_md; // RMA target buffer + void *ibuf; +}; + +struct cxip_req_amo { + struct cxip_txc *txc; + struct cxip_md *result_md; + struct cxip_md *oper1_md; + char result[16]; + char oper1[16]; + bool tmp_result; + bool tmp_oper1; + void *ibuf; + bool fetching_amo_flush; + uint8_t fetching_amo_flush_event_count; + unsigned int fetching_amo_flush_event_rc; + struct cxip_cntr *fetching_amo_flush_cntr; +}; + +/* Used with receive request to maintain state associated + * with MQD support for dumping unexpected messages. + */ +struct cxip_ux_dump_state { + bool done; + + size_t max_count; /* Number entries/src_addr provided */ + size_t ret_count; /* Number of UX entries returned */ + size_t ux_count; /* Total UX entries available */ + + struct fi_cq_tagged_entry *entry; + fi_addr_t *src_addr; +}; + +struct cxip_req_recv { + /* Receive parameters */ + struct dlist_entry rxc_entry; + struct cxip_rxc *rxc; // receive context + struct cxip_cntr *cntr; + void *recv_buf; // local receive buffer + struct cxip_md *recv_md; // local receive MD + uint32_t ulen; // User buffer length + bool tagged; + uint64_t tag; + uint64_t ignore; + uint32_t match_id; + uint64_t flags; + + /* FI_CLAIM work around to hold UX remote offsets for duration of + * H/W UX entry matching and deletion. Array of 8-byte unexpected + * headers remote offsets, and current remote offset used when + * processing search results to match remote offsets. 
+ */ + uint64_t *ule_offsets; + uint64_t ule_offset; + unsigned int num_ule_offsets; + unsigned int cur_ule_offsets; + bool offset_found; + + /* UX list dump state */ + struct cxip_ux_dump_state *ux_dump; + + /* Control info */ + int rc; // DMA return code + uint32_t rlen; // Send length + uint64_t oflow_start; // Overflow buffer address + uint16_t vni; // VNI operation came in on + uint32_t initiator; // DMA initiator address + uint32_t rdzv_id; // DMA initiator rendezvous ID + uint8_t rdzv_lac; // Rendezvous source LAC + bool done_notify; // Must send done notification + enum cxip_rdzv_proto rdzv_proto; + int rdzv_events; // Processed rdzv event count + enum c_event_type rdzv_event_types[4]; + uint32_t rdzv_initiator; // Rendezvous initiator used for mrecvs + uint32_t rget_nic; + uint32_t rget_pid; + bool software_list; // Appended to HW or SW + bool canceled; // Request canceled? + bool unlinked; + bool multi_recv; + bool tgt_event; + uint64_t start_offset; + uint64_t mrecv_bytes; + uint64_t mrecv_unlink_bytes; + bool auto_unlinked; + bool hw_offloaded; + struct cxip_req *parent; + struct dlist_entry children; + uint64_t src_offset; + uint16_t rdzv_mlen; +}; + +struct cxip_req_send { + /* Send parameters */ + struct cxip_txc *txc; + struct cxip_cntr *cntr; + const void *buf; // local send buffer + size_t len; // request length + struct cxip_md *send_md; // send buffer memory descriptor + struct cxip_addr caddr; + fi_addr_t dest_addr; + bool tagged; + uint32_t tclass; + uint64_t tag; + uint64_t data; + uint64_t flags; + void *ibuf; + + /* Control info */ + struct dlist_entry txc_entry; + struct cxip_fc_peer *fc_peer; + union { + int rdzv_id; // SW RDZV ID for long messages + int tx_id; + }; + int rc; // DMA return code + int rdzv_send_events; // Processed event count +}; + +struct cxip_req_rdzv_src { + struct dlist_entry list; + struct cxip_txc *txc; + uint32_t lac; + int rc; +}; + +struct cxip_req_search { + struct cxip_rxc *rxc; + bool complete; + int puts_pending; +}; + +struct cxip_req_coll { + struct cxip_coll_pte *coll_pte; + struct cxip_coll_buf *coll_buf; + uint32_t mrecv_space; + size_t hw_req_len; + bool isred; + enum c_return_code cxi_rc; +}; + +enum cxip_req_type { + CXIP_REQ_RMA, + CXIP_REQ_AMO, + CXIP_REQ_OFLOW, + CXIP_REQ_RECV, + CXIP_REQ_SEND, + CXIP_REQ_RDZV_SRC, + CXIP_REQ_SEARCH, + CXIP_REQ_COLL, + CXIP_REQ_RBUF, +}; + +/* + * Async Request + * + * Support structure. + * + * Created in cxip_cq_req_alloc(). + * + * This implements an async-request/callback mechanism. It uses the libfabric + * utility pool, which provides a pool of reusable memory objects that supports + * a fast lookup through the req_id index value, and can be bound to a CQ. + * + * The request is allocated and bound to the CQ, and then the command is + * issued. When the completion queue signals completion, this request is found, + * and the callback function is called. + */ +struct cxip_req { + /* Control info */ + struct dlist_entry evtq_entry; + void *req_ctx; + struct cxip_cq *cq; // request CQ + struct cxip_evtq *evtq; // request event queue + int req_id; // fast lookup in index table + int (*cb)(struct cxip_req *req, const union c_event *evt); + // completion event callback + bool discard; + + /* Triggered related fields. 
*/ + bool triggered; + uint64_t trig_thresh; + struct cxip_cntr *trig_cntr; + + /* CQ event fields, set according to fi_cq.3 + * - set by provider + * - returned to user in completion event + */ + uint64_t context; + uint64_t flags; + uint64_t data_len; + uint64_t buf; + uint64_t data; + uint64_t tag; + fi_addr_t addr; + + /* Request parameters */ + enum cxip_req_type type; + union { + struct cxip_req_rma rma; + struct cxip_req_amo amo; + struct cxip_req_recv recv; + struct cxip_req_send send; + struct cxip_req_rdzv_src rdzv_src; + struct cxip_req_search search; + struct cxip_req_coll coll; + }; +}; + +static inline bool cxip_is_trig_req(struct cxip_req *req) +{ + return req->trig_cntr != NULL; +} + +struct cxip_ctrl_req_mr { + struct cxip_mr *mr; +}; + +struct cxip_ctrl_send { + uint32_t nic_addr; + uint32_t pid; + union cxip_match_bits mb; +}; + +struct cxip_ctrl_req { + struct dlist_entry ep_entry; + struct cxip_ep_obj *ep_obj; + int req_id; + int (*cb)(struct cxip_ctrl_req *req, const union c_event *evt); + + union { + struct cxip_ctrl_req_mr mr; + struct cxip_ctrl_send send; + }; +}; + +struct cxip_mr_lac_cache { + /* MR referencing the associated MR cache LE, can only + * be flushed if reference count is 0. + */ + ofi_atomic32_t ref; + union cxip_match_bits mb; + struct cxip_ctrl_req *ctrl_req; +}; + +struct cxip_fc_peer { + struct dlist_entry txc_entry; + struct cxip_txc *txc; + struct cxip_ctrl_req req; + struct cxip_addr caddr; + struct dlist_entry msg_queue; + uint16_t pending; + uint16_t dropped; + uint16_t pending_acks; + bool replayed; + unsigned int retry_count; +}; + +struct cxip_fc_drops { + struct dlist_entry rxc_entry; + struct cxip_rxc *rxc; + struct cxip_ctrl_req req; + uint32_t nic_addr; + uint32_t pid; + uint16_t drops; + unsigned int retry_count; +}; + +/* Completion queue specific wrapper around CXI event queue. */ +struct cxip_cq_eq { + struct cxi_eq *eq; + void *buf; + size_t len; + struct cxi_md *md; + bool mmap; + unsigned int unacked_events; + struct c_eq_status prev_eq_status; + bool eq_saturated; +}; + +struct cxip_evtq { + struct cxi_eq *eq; + void *buf; + size_t len; + struct cxi_md *md; + bool mmap; + unsigned int unacked_events; + unsigned int ack_batch_size; + struct c_eq_status prev_eq_status; + bool eq_saturated; + + /* Point back to CQ */ + struct cxip_cq *cq; + + /* Protected with ep_ob->lock */ + struct ofi_bufpool *req_pool; + struct indexer req_table; + struct dlist_entry req_list; +}; + +/* + * CXI Libfbric software completion queue + */ +struct cxip_cq { + struct util_cq util_cq; + struct fi_cq_attr attr; + + /* Implement our own CQ ep_list_lock since common code util_cq + * implementation is a mutex and can not be optimized. This lock + * is always taken walking the CQ EP, but can be optimized to no-op. + */ + struct ofi_genlock ep_list_lock; + + /* Internal CXI wait object allocated only if required. */ + struct cxil_wait_obj *priv_wait; + + /* CXI specific fields. 
*/ + struct cxip_domain *domain; + unsigned int ack_batch_size; + struct dlist_entry dom_entry; +}; + +static inline uint16_t cxip_evtq_eqn(struct cxip_evtq *evtq) +{ + return evtq->eq->eqn; +} + +/* + * CXI libfabric completion counter + */ +struct cxip_cntr { + struct fid_cntr cntr_fid; + struct cxip_domain *domain; // parent domain + ofi_atomic32_t ref; + struct fi_cntr_attr attr; // copy of user or default attributes + struct fid_wait *wait; + /* Contexts to which counter is bound */ + struct dlist_entry ctx_list; + + ofi_mutex_t lock; + + struct cxi_ct *ct; + struct c_ct_writeback *wb; + uint64_t wb_device; + enum fi_hmem_iface wb_iface; + uint64_t wb_handle; + bool wb_handle_valid; + struct c_ct_writeback lwb; + + struct dlist_entry dom_entry; +}; + +struct cxip_ux_send { + struct dlist_entry rxc_entry; + struct cxip_req *req; + union c_event put_ev; + bool claimed; /* Reserved with FI_PEEK | FI_CLAIM */ +}; + +/* Key used to associate PUT and PUT_OVERFLOW events */ +union cxip_def_event_key { + struct { + uint64_t initiator : 32; + uint64_t rdzv_id : 15; + uint64_t pad0 : 16; + uint64_t rdzv : 1; + }; + struct { + uint64_t start_addr : 57; + uint64_t pad1 : 7; + }; + uint64_t raw; +}; + +struct cxip_deferred_event { + struct dlist_entry rxc_entry; + union cxip_def_event_key key; + struct cxip_req *req; + union c_event ev; + uint64_t mrecv_start; + uint32_t mrecv_len; + + struct cxip_ux_send *ux_send; +}; + +/* A very specific (non-generic) hash table is used to map + * deferred CXI events to associate PUT and PUT_OVERFLOW events. + * Hash entries are added and removed at a high rate and the + * overhead of generic implementations is insufficient. + */ +#define CXIP_DEF_EVENT_HT_BUCKETS 256 + +struct def_event_ht { + struct dlist_entry bh[CXIP_DEF_EVENT_HT_BUCKETS]; +}; + +/* + * Zero-buffer collectives. 
+ */ +#define ZB_NOSIM -1 +#define ZB_ALLSIM -2 + +struct cxip_zbcoll_obj; +typedef void (*zbcomplete_t)(struct cxip_zbcoll_obj *zb, void *usrptr); + +struct cxip_zbcoll_cb_obj { + zbcomplete_t usrfunc; // callback function + void *usrptr; // callback data +}; + +/* Used to track state for one or more zbcoll endpoints */ +struct cxip_zbcoll_state { + struct cxip_zbcoll_obj *zb; // backpointer to zbcoll_obj + uint64_t *dataptr; // user-supplied target + uint64_t dataval; // collective data + int num_relatives; // number of nearest relatives + int *relatives; // nearest relative indices + int contribs; // contribution count + int grp_rank; // local rank within group +}; + +/* Used to track concurrent zbcoll operations */ +struct cxip_zbcoll_obj { + struct dlist_entry ready_link; // link to zb_coll ready_list + struct cxip_ep_obj *ep_obj; // backpointer to endpoint + struct cxip_zbcoll_state *state;// state array + struct cxip_addr *caddrs; // cxip addresses in collective + int num_caddrs; // number of cxip addresses + zbcomplete_t userfunc; // completion callback function + void *userptr; // completion callback data + uint64_t *grpmskp; // pointer to global group mask + uint32_t *shuffle; // TEST shuffle array + int simcount; // TEST count of states + int simrank; // TEST simulated rank + int simref; // TEST zb0 reference count + int busy; // serialize collectives in zb + int grpid; // zb collective grpid + int error; // error code + int reduce; // set to report reduction data +}; + +/* zbcoll extension to struct cxip_ep_obj */ +struct cxip_ep_zbcoll_obj { + struct dlist_entry ready_list; // zbcoll ops ready to advance + struct cxip_zbcoll_obj **grptbl;// group lookup table + uint64_t grpmsk; // mask of used grptbl entries + int refcnt; // grptbl reference count + bool disable; // low level tests + ofi_spin_t lock; // group ID negotiation lock + ofi_atomic32_t dsc_count; // cumulative RCV discard count + ofi_atomic32_t err_count; // cumulative ACK error count + ofi_atomic32_t ack_count; // cumulative ACK success count + ofi_atomic32_t rcv_count; // cumulative RCV success count +}; + +/* + * Collectives context. + * + * Extension to cxip_ep_obj for collectives. + * + * Initialized in cxip_coll_init() during EP creation. + */ +struct cxip_ep_coll_obj { + struct index_map mcast_map; // mc address -> object + struct dlist_entry mc_list; // list of mcast addresses + struct cxip_coll_pte *coll_pte; // PTE extensions + struct dlist_ts sched_list; // scheduled actions + struct cxip_cmdq *rx_cmdq; // shared with STD EP + struct cxip_cmdq *tx_cmdq; // shared with STD EP + struct cxip_cntr *rx_cntr; // shared with STD EP + struct cxip_cntr *tx_cntr; // shared with STD EP + struct cxip_evtq *rx_evtq; // shared with STD EP + struct cxip_evtq *tx_evtq; // shared with STD EP + struct cxip_eq *eq; // shared with STD EP + ofi_atomic32_t num_mc; // count of MC objects + ofi_atomic32_t join_cnt; // advanced on every join + size_t min_multi_recv; // trigger value to rotate bufs + size_t buffer_size; // size of receive buffers + size_t buffer_count; // count of receive buffers + bool join_busy; // serialize joins on a node + bool is_hwroot; // set if ep is hw_root + bool enabled; // enabled +}; + +/* Receive context state machine. + * TODO: Handle unexpected RMA. + */ +enum cxip_rxc_state { + /* Initial state of an RXC. All user posted receives are rejected until + * the RXC has been enabled. + * + * Note that an RXC can be transitioned from any state into + * RXC_DISABLED. 
+ * + * Validate state changes: + * RXC_ENABLED: User has successfully enabled the RXC. + * RXC_ENABLED_SOFTWARE: User has successfully initialized the RXC + * in a software only RX matching mode. + */ + RXC_DISABLED = 0, + + /* User posted receives are matched against the software unexpected + * list before being offloaded to hardware. Hardware matches against + * the corresponding PtlTE priority and overflow list. + * + * Validate state changes: + * RXC_ONLOAD_FLOW_CONTROL: Several scenarios can initiate this state + * change. + * 1. Hardware fails to allocate an LE for an unexpected message + * or a priority list LE append fails, and hybrid mode is not + * enabled. Hardware transitions the PtlTE from enabled to disabled. + * 2. Hardware fails to allocate an LE during an overflow list + * append. The PtlTE remains in the enabled state but appends to + * the overflow list are disabled. Software manually disables + * the PtlTE. + * 3. Hardware fails to successfully match on the overflow list. + * Hardware automatically transitions the PtlTE from enabled to + * disabled. + * RXC_ONLOAD_FLOW_CONTROL_REENABLE: Several scenarios can initiate + * it this state change: + * 1. The hardware EQ is full, hardware transitions the PtlTE from + * enabled/software managed to disabled to recover drops, but it + * can re-enable if an LE resource is not recovered. + * 2. Running "hardware" RX match mode and matching failed because + * the overflow list buffers were full. Hardware transitions the + * PtlTE from enabled to disabled. The overflow list must be + * replenished and processing can continue if an LE resource is not + * recovered. + * 3. Running "hybrid" or "software" RX match mode and a message + * is received, but there is not a buffer available on the request + * list. Hardware transitions the PtlTE from software managed to + * disabled. The request list must be replenished and processing + * can continue if an LE resource is not recovered. + * RXC_PENDING_PTLTE_SOFTWARE_MANAGED: When the provider is configured + * to run in "hybrid" RX match mode and hardware fails to allocate an + * LE for an unexpected message match or an priority list append fails. + * Hardware will automatically transition the PtlTE from enabled to + * software managed and onload of UX messages will be initiated. + */ + RXC_ENABLED, + + /* The NIC has initiated a transition to software managed EP matching. + * + * Software must onload/reonload the hardware unexpected list while + * creating a pending unexpected list from entries received on the PtlTE + * request list. Any in flight appends will fail and be added to + * a receive replay list, further attempts to post receive operations + * will return -FI_EAGAIN. When onloading completes, the pending + * UX list is appended to the onloaded UX list and then failed appends + * are replayed prior to enabling the posting of receive operations. + * + * Validate state changes: + * RXC_ENABLED_SOFTWARE: The HW to SW transition onloading has + * completed and the onloaded and pending request UX list have been + * combined. + */ + RXC_PENDING_PTLTE_SOFTWARE_MANAGED, + + /* Executing as a software managed PtlTE either due to hybrid + * transition from hardware or initial startup in software + * RX matching mode. + * + * Validate state changes: + * RXC_PENDING_PTLTE_HARDWARE: TODO: When able, software may + * initiate a transition from software managed mode back to + * fully offloaded operation. 
+ * RXC_ONLODAD_FLOW_CONTROL_REENABLE: Hardware was unable to match + * on the request list or the EQ is full. Hardware has disabled the + * PtlTE initiating flow control. Operation can continue if LE + * resources are not recovered as long as request buffers can be + * replenished. + */ + RXC_ENABLED_SOFTWARE, + + /* TODO: Hybrid RX match mode PtlTE is transitioning from software + * managed operation back to fully offloaded operation. + * + * Validate state changes: + * RXC_ENABLED: Hybrid software managed PtlTE successfully + * transitions back to fully offloaded operation. + * RXC_ENABLED_SOFTWARE: Hybrid software managed PtlTE was + * not able to transition to fully offloaded operation. + */ + RXC_PENDING_PTLTE_HARDWARE, + + /* Software has encountered a condition which requires manual transition + * of the PtlTE into disable. This state change occurs when a posted + * receive could not be appended due to LE exhaustion and software + * managed EP PtlTE operation has been disabled or is not possible. + * + * Validate state changes: + * RXC_ONLOAD_FLOW_CONTROL: PtlTE disabled event has successfully been + * received and onloading can begin. + */ + RXC_PENDING_PTLTE_DISABLE, + + /* Flow control has occurred and the PtlTE is disabled. Software is + * in the process of onloading the hardware unexpected headers to free + * up LEs. User posted receives are matched against the software + * unexpected list. If a match is not found on the software unexpected + * list, -FI_EAGAIN is returned to the user. Hardware matching is + * disabled. + * + * Validate state changes: + * RXC_ONLOAD_FLOW_CONTROL_REENABLE: An unexpected list entry matched + * a user posted receive, the search and delete command free a + * unexpected list entry, or a transition to software managed EP is + * occuring. + */ + RXC_ONLOAD_FLOW_CONTROL, + + /* PtlTE is in the same state as RXC_ONLOAD_FLOW_CONTROL, but the RXC + * should attempt to be re-enabled. + * + * Validate state changes: + * RXC_FLOW_CONTROL: Onloading of the unexpected headers has completed. + */ + RXC_ONLOAD_FLOW_CONTROL_REENABLE, + + /* Software is performing sideband communication to recover the dropped + * messages. User posted receives are matched against the software + * unexpected list. If a match is not found on the software unexpected + * list, -FI_EAGAIN is returned to the user. Hardware matching is + * disabled. + * + * If an append fails due to RC_NO_SPACE while in the RXC_FLOW_CONTROL + * state, hardware LEs are exhausted and no more LEs can be freed by + * onloading unexpected headers into software. This is a fatal event + * which requires software endpoint mode to workaround. + * + * Validate state changes: + * RXC_ENABLED: Sideband communication is complete and PtlTE is + * successfully re-enabled. + * RXC_SOFTWARE_MANAGED: When executing in "hybrid" or "software" + * RX match mode and processing has requested to re-enable as a + * software managed EP. + */ + RXC_FLOW_CONTROL, +}; + +#define CXIP_COUNTER_BUCKETS 31U +#define CXIP_BUCKET_MAX (CXIP_COUNTER_BUCKETS - 1) +#define CXIP_LIST_COUNTS 3U + +struct cxip_msg_counters { + /* Histogram counting the number of messages based on priority, buffer + * type (HMEM), and message size. 
+ */ + ofi_atomic32_t msg_count[CXIP_LIST_COUNTS][OFI_HMEM_MAX][CXIP_COUNTER_BUCKETS]; +}; + +/* Returns the most significant bit set (indexed from 1 - the LSB) */ +static inline int fls64(uint64_t x) +{ + if (!x) + return 0; + + return (sizeof(x) * 8) - __builtin_clzl(x); +} + +static inline void cxip_msg_counters_init(struct cxip_msg_counters *cntrs) +{ + int i; + int j; + int k; + + for (i = 0; i < CXIP_LIST_COUNTS; i++) { + for (j = 0; j < OFI_HMEM_MAX; j++) { + for (k = 0; k < CXIP_COUNTER_BUCKETS; k++) + ofi_atomic_initialize32(&cntrs->msg_count[i][j][k], 0); + } + } +} + +static inline void +cxip_msg_counters_msg_record(struct cxip_msg_counters *cntrs, + enum c_ptl_list list, enum fi_hmem_iface buf_type, + size_t msg_size) +{ + unsigned int bucket; + + /* Buckets to bytes + * Bucket 0: 0 bytes + * Bucket 1: 1 byte + * Bucket 2: 2 bytes + * Bucket 3: 4 bytes + * ... + * Bucket CXIP_BUCKET_MAX: (1 << (CXIP_BUCKET_MAX - 1)) + */ + + /* Round size up to the nearest power of 2. */ + bucket = fls64(msg_size); + if ((1ULL << bucket) < msg_size) + bucket++; + + bucket = MIN(CXIP_BUCKET_MAX, bucket); + + ofi_atomic_add32(&cntrs->msg_count[list][buf_type][bucket], 1); +} + +/* + * The default for the number of SW initiated TX operation that may + * be initiated by RX processing and be outstanding. This has no + * impact on hardware initiated rendezvous gets. This value can be + * adjusted if necessary with FI_CXI_SW_RX_TX_INIT_MAX=#. + */ +#define CXIP_SW_RX_TX_INIT_MAX_DEFAULT 1024 +#define CXIP_SW_RX_TX_INIT_MIN 64 + +/* If a restricted rendezvous protocol notify done message + * cannot be delivered due to EQ full, delay before retrying. + */ +#define CXIP_DONE_NOTIFY_RETRY_DELAY_US 100 +/* + * Endpoint object receive context + */ +struct cxip_rxc { + void *context; + struct cxip_cq *recv_cq; + struct cxip_cntr *recv_cntr; + + struct cxip_ep_obj *ep_obj; // parent EP object + struct cxip_domain *domain; // parent domain + uint8_t pid_bits; + + struct dlist_entry ep_list; // contains EPs using shared context + + struct fi_rx_attr attr; + bool selective_completion; + bool sw_ep_only; + + struct cxip_evtq rx_evtq; + struct cxip_pte *rx_pte; // HW RX Queue + struct cxip_cmdq *rx_cmdq; // RX CMDQ for posting receive buffers + struct cxip_cmdq *tx_cmdq; // TX CMDQ for Message Gets + + /* Number of unexpected list entries in HW. */ + ofi_atomic32_t orx_hw_ule_cnt; + ofi_atomic32_t orx_reqs; // outstanding receive requests + ofi_atomic32_t orx_tx_reqs; // outstanding RX initiated TX requests + int32_t max_tx; + unsigned int recv_appends; + + /* Window when FI_CLAIM mutual exclusive access is required */ + bool hw_claim_in_progress; + + size_t min_multi_recv; + int max_eager_size; + + /* Flow control/software state change metrics */ + int num_fc_eq_full; + int num_fc_no_match; + int num_fc_unexp; + int num_fc_append_fail; + int num_fc_req_full; + int num_sc_nic_hw2sw_append_fail; + int num_sc_nic_hw2sw_unexp; + + /* Unexpected message handling */ + struct cxip_ptelist_bufpool *req_list_bufpool; + struct cxip_ptelist_bufpool *oflow_list_bufpool; + + /* Defer events to wait for both put and put overflow */ + struct def_event_ht deferred_events; + + struct dlist_entry fc_drops; + struct dlist_entry replay_queue; + struct dlist_entry sw_ux_list; + struct dlist_entry sw_pending_ux_list; + int sw_ux_list_len; + int sw_pending_ux_list_len; + + /* Array of 8-byte of unexpected headers remote offsets. */ + uint64_t *ule_offsets; + unsigned int num_ule_offsets; + + /* Current remote offset to be processed. 
Incremented after processing + * a search and delete put event. + */ + unsigned int cur_ule_offsets; + + /* Software receive queue. User posted requests are queued here instead + * of on hardware if the RXC is in software endpoint mode. + */ + struct dlist_entry sw_recv_queue; + + enum cxip_rxc_state state; + enum cxip_rxc_state prev_state; + enum cxip_rxc_state new_state; + enum c_sc_reason fc_reason; + + bool msg_offload; + uint64_t rget_align_mask; + + /* RXC drop count used for FC accounting. */ + int drop_count; + bool hmem; + + struct cxip_msg_counters cntrs; +}; + +static inline void cxip_copy_to_md(struct cxip_md *md, void *dest, + const void *src, size_t size) +{ + ssize_t ret __attribute__((unused)); + struct iovec iov; + + /* Favor CPU store access instead of relying on HMEM copy functions. */ + if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) { + ret = ofi_hmem_dev_reg_copy_to_hmem(md->info.iface, md->handle, + dest, src, size); + assert(ret == FI_SUCCESS); + } else { + iov.iov_base = dest; + iov.iov_len = size; + + ret = md->dom->hmem_ops.copy_to_hmem_iov(md->info.iface, + md->info.device, &iov, + 1, 0, src, size); + assert(ret == size); + } +} + +static inline void cxip_copy_from_md(struct cxip_md *md, void *dest, + const void *src, size_t size) +{ + ssize_t ret __attribute__((unused)); + struct iovec iov; + + /* Favor CPU store access instead of relying on HMEM copy functions. */ + if (md->handle_valid && size <= cxip_env.safe_devmem_copy_threshold) { + ret = ofi_hmem_dev_reg_copy_from_hmem(md->info.iface, + md->handle, + dest, src, size); + assert(ret == FI_SUCCESS); + } else { + iov.iov_base = (void *)src; + iov.iov_len = size; + + + ret = md->dom->hmem_ops.copy_from_hmem_iov(dest, size, + md->info.iface, + md->info.device, + &iov, 1, 0); + assert(ret == size); + } +} + +/* PtlTE buffer pool - Common PtlTE request/overflow list buffer + * management. + * + * Only C_PTL_LIST_REQUEST and C_PTL_LIST_OVERFLOW are supported. + */ +struct cxip_ptelist_bufpool_attr { + enum c_ptl_list list_type; + + /* Callback to handle PtlTE link error/unlink events */ + int (*ptelist_cb)(struct cxip_req *req, const union c_event *event); + size_t buf_size; + size_t min_space_avail; + size_t min_posted; + size_t max_posted; + size_t max_cached; +}; + +struct cxip_ptelist_bufpool { + struct cxip_ptelist_bufpool_attr attr; + struct cxip_rxc *rxc; + size_t buf_alignment; + + /* Ordered list of buffers emitted to hardware */ + struct dlist_entry active_bufs; + + /* List of consumed buffers which cannot be reposted yet + * since unexpected entries have not been matched. + */ + struct dlist_entry consumed_bufs; + + /* List of available buffers that may be appended to the list. + * These could be from a previous append failure or be cached + * from previous message processing to avoid map/unmap of + * list buffer. + */ + struct dlist_entry free_bufs; + + ofi_atomic32_t bufs_linked; + ofi_atomic32_t bufs_allocated; + ofi_atomic32_t bufs_free; +}; + +struct cxip_ptelist_req { + /* Pending list of unexpected header entries which could not be placed + * on the RX context unexpected header list due to put events being + * received out-of-order. + */ + struct dlist_entry pending_ux_list; +}; + +struct cxip_ptelist_buf { + struct cxip_ptelist_bufpool *pool; + + /* RX context the request buffer is posted on. */ + struct cxip_rxc *rxc; + enum cxip_le_type le_type; + struct dlist_entry buf_entry; + struct cxip_req *req; + + /* Memory mapping of req_buf field. 
*/ + struct cxip_md *md; + + /* The number of bytes consume by hardware when the request buffer was + * unlinked. + */ + size_t unlink_length; + + /* Current offset into the buffer where packets/data are landing. When + * the cur_offset is equal to unlink_length, software has completed + * event processing for the buffer. + */ + size_t cur_offset; + + /* Request list specific control information */ + struct cxip_ptelist_req request; + + /* The number of unexpected headers posted placed on the RX context + * unexpected header list which have not been matched. + */ + ofi_atomic32_t refcount; + + /* Buffer used to land packets. */ + char *data; +}; + +int cxip_ptelist_bufpool_init(struct cxip_rxc *rxc, + struct cxip_ptelist_bufpool **pool, + struct cxip_ptelist_bufpool_attr *attr); +void cxip_ptelist_bufpool_fini(struct cxip_ptelist_bufpool *pool); +int cxip_ptelist_buf_replenish(struct cxip_ptelist_bufpool *pool, + bool seq_restart); +void cxip_ptelist_buf_link_err(struct cxip_ptelist_buf *buf, + int rc_link_error); +void cxip_ptelist_buf_unlink(struct cxip_ptelist_buf *buf); +void cxip_ptelist_buf_put(struct cxip_ptelist_buf *buf, bool repost); +void cxip_ptelist_buf_get(struct cxip_ptelist_buf *buf); +void cxip_ptelist_buf_consumed(struct cxip_ptelist_buf *buf); + +/* + * cxip_req_bufpool_init() - Initialize PtlTE request list buffer management + * object. + */ +int cxip_req_bufpool_init(struct cxip_rxc *rxc); +void cxip_req_bufpool_fini(struct cxip_rxc *rxc); + +/* + * cxip_oflow_bufpool_init() - Initialize PtlTE overflow list buffer management + * object. + */ +int cxip_oflow_bufpool_init(struct cxip_rxc *rxc); +void cxip_oflow_bufpool_fini(struct cxip_rxc *rxc); + +void _cxip_req_buf_ux_free(struct cxip_ux_send *ux, bool repost); +void cxip_req_buf_ux_free(struct cxip_ux_send *ux); + +#define CXIP_RDZV_IDS (1 << CXIP_TOTAL_RDZV_ID_WIDTH) +#define CXIP_RDZV_IDS_MULTI_RECV (1 << CXIP_RDZV_ID_CMD_WIDTH) +#define CXIP_TX_IDS (1 << CXIP_TX_ID_WIDTH) + +/* One per LAC */ +#define RDZV_SRC_LES 8U +#define RDZV_NO_MATCH_PTES 8U + +/* Base rendezvous PtlTE object */ +struct cxip_rdzv_pte { + struct cxip_txc *txc; + struct cxip_pte *pte; + + /* Count of the number of buffers successfully linked on this PtlTE. */ + ofi_atomic32_t le_linked_success_count; + + /* Count of the number of buffers failed to link on this PtlTE. */ + ofi_atomic32_t le_linked_failure_count; +}; + +/* Matching PtlTE for user generated unrestricted get DMA */ +struct cxip_rdzv_match_pte { + struct cxip_rdzv_pte base_pte; + + /* Request structure used to handle zero byte puts used for match + * complete. + */ + struct cxip_req *zbp_req; + + /* Request structures used to handle rendezvous source/data transfers. + * There is one request structure (and LE) for each LAC. + */ + struct cxip_req *src_reqs[RDZV_SRC_LES]; +}; + +/* Matching PtlTE for user generated restricted get DMA. One PtlTE + * per LAC used. + */ +struct cxip_rdzv_nomatch_pte { + struct cxip_rdzv_pte base_pte; + struct cxip_req *le_req; +}; + +#if ENABLE_DEBUG +/* Defines to force hard to test TXC error path failures; + * only valid for debug unit testing. See txc->force_err. 
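+ * (Illustrative use, not part of the provider: a debug-only unit test could
+ * set txc->force_err |= CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC to exercise
+ * the alternate read protocol allocation failure path.)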
+ */ +#define CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC (1 << 0) +#endif + +/* + * Endpoint object transmit context + */ +struct cxip_txc { + void *context; + bool enabled; + bool hrp_war_req; // Non-fetching 32-bit HRP + + bool hmem; + + struct cxip_cq *send_cq; + struct cxip_cntr *send_cntr; + struct cxip_cntr *read_cntr; + struct cxip_cntr *write_cntr; + + struct cxip_ep_obj *ep_obj; // parent EP object + struct cxip_domain *domain; // parent domain + uint8_t pid_bits; + + struct dlist_entry ep_list; // contains EPs using shared context + + struct fi_tx_attr attr; // attributes + bool selective_completion; + uint32_t tclass; + + /* TX H/W Event Queue */ + struct cxip_evtq tx_evtq; + + /* Inject buffers for EP, protected by ep_obj->lock */ + struct ofi_bufpool *ibuf_pool; + + struct cxip_cmdq *tx_cmdq; // added during cxip_txc_enable() + ofi_atomic32_t otx_reqs; // outstanding transmit requests + + struct cxip_req *rma_write_selective_completion_req; + struct cxip_req *rma_read_selective_completion_req; + struct cxip_req *amo_selective_completion_req; + struct cxip_req *amo_fetch_selective_completion_req; + + /* Rendezvous related structures */ + struct cxip_rdzv_match_pte *rdzv_pte; + struct cxip_rdzv_nomatch_pte *rdzv_nomatch_pte[RDZV_NO_MATCH_PTES]; + struct indexer rdzv_ids; + struct indexer msg_rdzv_ids; + enum cxip_rdzv_proto rdzv_proto; + + /* Match complete IDs */ + struct indexer tx_ids; + + int max_eager_size; + int rdzv_eager_size; + struct cxip_cmdq *rx_cmdq; // Target cmdq for Rendezvous buffers + +#if ENABLE_DEBUG + uint64_t force_err; +#endif + /* Flow Control recovery */ + struct dlist_entry msg_queue; + struct dlist_entry fc_peers; + + struct dlist_entry dom_entry; +}; + +int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags); +int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_full_dma_cmd *dma, uint64_t flags); +int cxip_txc_emit_idc_amo(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); +int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_dma_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); + +void cxip_txc_flush_msg_trig_reqs(struct cxip_txc *txc); + +/* + * Base Endpoint Object + * + * Support structure, libfabric fi_endpoint implementation. + * + * This is the meat of the endpoint object. It has been separated from cxip_ep + * to support aliasing. + */ +struct cxip_ep_obj { + /* Allow lock to be optimized out with FI_THREAD_DOMAIN */ + struct ofi_genlock lock; + struct cxip_domain *domain; + struct cxip_av *av; + + /* Domain has been configured with FI_AV_AUTH_KEY. */ + bool av_auth_key; + + /* This is only valid if FI_AV_AUTH_KEY is false. */ + struct cxi_auth_key auth_key; + + /* Array of VNIs if FI_AV_AUTH_KEY is true. 
*/ + uint16_t *vnis; + size_t vni_count; + + bool enabled; + + struct cxil_wait_obj *ctrl_wait; + struct cxi_eq *ctrl_tgt_evtq; + struct cxi_eq *ctrl_tx_evtq; + + struct cxip_addr src_addr; + fi_addr_t fi_addr; + + /* ASIC version associated with EP/Domain */ + enum cassini_version asic_ver; + + struct cxip_txc txc; + struct cxip_rxc rxc; + + /* Command queues. Each EP has 1 transmit and 1 target + * command queue that can be shared. An optional 2nd transmit + * command queue may be created for RX initiated rgets. + */ + struct cxip_cmdq *txq; + ofi_atomic32_t txq_ref; + struct cxip_cmdq *tgq; + ofi_atomic32_t tgq_ref; + struct cxip_cmdq *rx_txq; + + /* Portals flow-control recovery messaging uses a credit + * scheme to avoid over-running the associated event queue. + */ + struct cxip_cmdq *ctrl_txq; + struct cxip_cmdq *ctrl_tgq; + unsigned int ctrl_tx_credits; + struct cxip_pte *ctrl_pte; + struct cxip_ctrl_req ctrl_msg_req; + + /* Libfabric software EQ resource */ + struct cxip_eq *eq; + struct dlist_entry eq_link; + + /* Values at base EP creation */ + uint64_t caps; + struct fi_ep_attr ep_attr; + struct fi_tx_attr tx_attr; + struct fi_rx_attr rx_attr; + + /* Collectives support */ + struct cxip_ep_coll_obj coll; + struct cxip_ep_zbcoll_obj zbcoll; + + /* Flow control recovery event queue buffers */ + void *ctrl_tgt_evtq_buf; + struct cxi_md *ctrl_tgt_evtq_buf_md; + void *ctrl_tx_evtq_buf; + struct cxi_md *ctrl_tx_evtq_buf_md; + + /* FI_MR_PROV_KEY caching, protected with ep_obj->lock */ + struct cxip_mr_lac_cache std_mr_cache[CXIP_NUM_CACHED_KEY_LE]; + struct cxip_mr_lac_cache opt_mr_cache[CXIP_NUM_CACHED_KEY_LE]; + struct dlist_entry mr_list; + + size_t txq_size; + size_t tgq_size; + ofi_atomic32_t ref; + struct cxip_portals_table *ptable; +}; + +/* + * CXI endpoint implementations to support FI_CLASS_EP. + */ +struct cxip_ep { + struct fid_ep ep; + struct fi_tx_attr tx_attr; + struct fi_rx_attr rx_attr; + struct cxip_ep_obj *ep_obj; + int is_alias; +}; + +size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, size_t count, + fi_addr_t *src_addr, size_t *ux_count); +int cxip_build_ux_entry_info(struct cxip_ep *ep, + struct fi_cq_tagged_entry *entry, size_t count, + fi_addr_t *src_addr, size_t *ux_count); + +enum cxip_mr_state { + CXIP_MR_DISABLED = 1, + CXIP_MR_ENABLED, + CXIP_MR_LINKED, + CXIP_MR_UNLINKED, + CXIP_MR_LINK_ERR, +}; + +/* + * Memory Region + * + * libfabric fi_mr implementation. + * + * Created in cxip_regattr(). + */ +struct cxip_mr { + struct fid_mr mr_fid; + struct cxip_domain *domain; // parent domain + struct cxip_ep *ep; // endpoint for remote memory + uint64_t key; // memory key + uint64_t flags; // special flags + struct fi_mr_attr attr; // attributes + struct cxip_cntr *cntr; // if bound to cntr + + /* Indicates if FI_RMA_EVENT was specified at creation and + * will be used to enable fi_writedata() and fi_inject_writedata() + * support for this MR (TODO). + */ + bool rma_events; + + /* If requested then count MR events to determine if RMA are in + * progress. At close if no RMA are in progress bypass the invalidate + * of the PTLTE LE. This improves non-cached key close performance, + * enabling their use so that after closing the MR the associated + * memory cannot be remotely accessed, even if it remains in the + * libfabric MR cache. 
+ */ + bool count_events; + ofi_atomic32_t match_events; + ofi_atomic32_t access_events; + + ofi_spin_t lock; + + struct cxip_mr_util_ops *mr_util; + bool enabled; + struct cxip_pte *pte; + enum cxip_mr_state mr_state; + int64_t mr_id; // Non-cached provider key uniqueness + struct cxip_ctrl_req req; + bool optimized; + + void *buf; // memory buffer VA + uint64_t len; // memory length + struct cxip_md *md; // buffer IO descriptor + struct dlist_entry ep_entry; + + struct dlist_entry mr_domain_entry; +}; + +struct cxip_av_auth_key_entry { + ofi_atomic32_t use_cnt; + ofi_atomic32_t ref_cnt; + UT_hash_handle hh; + struct dlist_entry entry; + struct cxi_auth_key key; + fi_addr_t fi_addr; +}; + +struct cxip_av_entry { + ofi_atomic32_t use_cnt; + UT_hash_handle hh; + struct cxip_addr addr; + fi_addr_t fi_addr; + struct cxip_av_auth_key_entry *auth_key; +}; + +struct cxip_av { + struct fid_av av_fid; + struct cxip_domain *domain; + + /* List of endpoints bound to this AV. Each bind takes a reference + * as well. + */ + struct dlist_entry ep_list; + ofi_atomic32_t ref; + + /* Memory used to implement lookups. Two data structures are used. + * 1. ibuf pool for O(1) lookup on the data path + * 2. hash table for O(1) on the receive path + */ + struct cxip_av_entry *av_entry_hash; + struct ofi_bufpool *av_entry_pool; + ofi_atomic32_t av_entry_cnt; + + /* Memory used to support AV authorization key. Three data structures + * are needed. + * 1. ibuf pool for memory allocation and lookup O(1) access. + * 2. hash table for O(1) reverse lookup + * 3. List for iterating + */ + struct cxip_av_auth_key_entry *auth_key_entry_hash; + struct ofi_bufpool *auth_key_entry_pool; + struct dlist_entry auth_key_entry_list; + ofi_atomic32_t auth_key_entry_cnt; + size_t auth_key_entry_max; + + /* Single lock is used to protect entire AV. With domain level + * threading, this lock is not used. + */ + bool lockless; + pthread_rwlock_t lock; + + /* AV is configured as symmetric. This is an optimization which enables + * endpoints to use logical address. + */ + bool symmetric; + + /* Address vector type. */ + enum fi_av_type type; + + /* Whether or not the AV is operating in FI_AV_AUTH_KEY mode. */ + bool av_auth_key; + + /* Whether or not the AV was opened with FI_AV_USER_ID. */ + bool av_user_id; +}; + +int cxip_av_auth_key_get_vnis(struct cxip_av *av, uint16_t **vni, + size_t *vni_count); +void cxip_av_auth_key_put_vnis(struct cxip_av *av, uint16_t *vni, + size_t vni_count); +extern struct cxip_addr *(*cxip_av_addr_in)(const void *addr); +extern void (*cxip_av_addr_out)(struct cxip_addr *addr_out, + struct cxip_addr *addr); +int cxip_av_lookup_addr(struct cxip_av *av, fi_addr_t fi_addr, + struct cxip_addr *addr); +fi_addr_t cxip_av_lookup_fi_addr(struct cxip_av *av, + const struct cxip_addr *addr); +fi_addr_t cxip_av_lookup_auth_key_fi_addr(struct cxip_av *av, unsigned int vni); +int cxip_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context); +int cxip_av_bind_ep(struct cxip_av *av, struct cxip_ep *ep); +void cxip_av_unbind_ep(struct cxip_av *av, struct cxip_ep *ep); +static inline int cxip_av_entry_count(struct cxip_av *av) +{ + return ofi_atomic_get32(&av->av_entry_cnt); +} + +/* + * AV Set + * + * libfabric fi_av_set implementation. + * + * Created in cxip_av_set(). 
+ */ +struct cxip_av_set { + struct fid_av_set av_set_fid; + struct cxip_av *cxi_av; // associated AV + struct cxip_coll_mc *mc_obj; // reference MC + fi_addr_t *fi_addr_ary; // addresses in set + size_t fi_addr_cnt; // count of addresses + struct cxip_comm_key comm_key; // communication key + uint64_t flags; +}; + +/* Needed for math functions */ +union cxip_dbl_bits { + struct { + uint64_t mantissa:52; + uint64_t exponent:11; + uint64_t sign:1; + } __attribute__((__packed__)); + double dval; + uint64_t ival; +}; + +static inline uint64_t _dbl2bits(double d) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + union cxip_dbl_bits x = {.dval = d}; + return x.ival; +#else +#error "Unsupported processor byte ordering" +#endif +} + +static inline double _bits2dbl(uint64_t i) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + union cxip_dbl_bits x = {.ival = i}; + return x.dval; +#else +#error "Unsupported processor byte ordering" +#endif +} + +static inline void _decompose_dbl(double d, int *sgn, int *exp, + unsigned long *man) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + union cxip_dbl_bits x = {.dval = d}; + *sgn = (x.sign) ? -1 : 1; + *exp = x.exponent; + *man = x.mantissa; +#else +#error "Unsupported processor byte ordering" +#endif +} + +/* data structures for reduction support */ +enum cxip_coll_redtype { + REDTYPE_BYT, + REDTYPE_INT, + REDTYPE_FLT, + REDTYPE_IMINMAX, + REDTYPE_FMINMAX, + REDTYPE_REPSUM +}; + +/* int AND, OR, XOR, MIN, MAX, SUM */ +struct cxip_intval { + int64_t ival[4]; +}; + +/* flt MIN, MAX, SUM */ +struct cxip_fltval { + double fval[4]; +}; + +/* int MINMAXLOC */ +struct cxip_iminmax { + int64_t iminval; + uint64_t iminidx; + int64_t imaxval; + uint64_t imaxidx; +}; + +/* flt MINMAXLOC */ +struct cxip_fltminmax { + double fminval; + uint64_t fminidx; + double fmaxval; + uint64_t fmaxidx; +}; + +/* repsum SUM */ +struct cxip_repsum { + int64_t T[4]; + int32_t M; + int8_t overflow_id; + bool inexact; + bool overflow; + bool invalid; +}; + +/* Collective operation states */ +enum cxip_coll_state { + CXIP_COLL_STATE_NONE, + CXIP_COLL_STATE_READY, + CXIP_COLL_STATE_FAULT, +}; + +/* Similar to C_RC_* provider errors, but pure libfabric */ +/* These should be in priority order, from lowest to highest */ +enum cxip_coll_prov_errno { + CXIP_PROV_ERRNO_OK = -1, // good + CXIP_PROV_ERRNO_PTE = -2, // PTE setup failure + CXIP_PROV_ERRNO_MCAST_INUSE = -3, // multicast in-use + CXIP_PROV_ERRNO_HWROOT_INUSE = -4, // hwroot in-use + CXIP_PROV_ERRNO_MCAST_INVALID = -5, // multicast invalid + CXIP_PROV_ERRNO_HWROOT_INVALID = -6, // hwroot invalid + CXIP_PROV_ERRNO_CURL = -7, // CURL failure + CXIP_PROV_ERRNO_LAST = -8, // last error code (unused) +}; + +/* Rosetta reduction engine error codes */ +typedef enum cxip_coll_rc { + CXIP_COLL_RC_SUCCESS = 0, // good + CXIP_COLL_RC_FLT_INEXACT = 1, // result was rounded + CXIP_COLL_RC_FLT_OVERFLOW = 3, // result too large to represent + CXIP_COLL_RC_FLT_INVALID = 4, // operand was signalling NaN, + // or infinities subtracted + CXIP_COLL_RC_REP_INEXACT = 5, // reproducible sum was rounded + CXIP_COLL_RC_INT_OVERFLOW = 6, // reproducible sum overflow + CXIP_COLL_RC_CONTR_OVERFLOW = 7, // too many contributions seen + CXIP_COLL_RC_OP_MISMATCH = 8, // conflicting opcodes + CXIP_COLL_RC_TX_FAILURE = 9, // internal send error + CXIP_COLL_RC_MAX = 10 +} cxip_coll_rc_t; + +struct cxip_coll_buf { + struct dlist_entry buf_entry; // linked list of buffers + struct cxip_req *req; // associated LINK request + struct cxip_md *cxi_md; // buffer memory descriptor + size_t bufsiz; // 
buffer size in bytes + uint8_t buffer[]; // buffer space itself +}; + +struct cxip_coll_pte { + struct cxip_pte *pte; // Collectives PTE + struct cxip_ep_obj *ep_obj; // Associated endpoint + struct cxip_coll_mc *mc_obj; // Associated multicast object + struct dlist_entry buf_list; // PTE receive buffers + ofi_atomic32_t buf_cnt; // count of linked buffers + ofi_atomic32_t buf_swap_cnt; // for diagnostics + ofi_atomic32_t recv_cnt; // for diagnostics + int buf_low_water; // for diagnostics + bool enabled; // enabled +}; + +/* REQUIRED: + * sizeof(struct cxip_coll_accumulator) >= sizeof(struct cxip_coll_data) + * (opaque) struct cxip_coll_accumulator exported in fi_cxi_ext.h + */ +struct cxip_coll_data { + union { + uint8_t databuf[32]; // raw data buffer + struct cxip_intval intval; // 4 integer values + flags + struct cxip_fltval fltval; // 4 double values + flags + struct cxip_iminmax intminmax; // 1 intminmax structure + flags + struct cxip_fltminmax fltminmax;// 1 fltminmax structure + flags + struct cxip_repsum repsum; // 1 repsum structure + flags + }; + cxip_coll_op_t red_op; // reduction opcode + cxip_coll_rc_t red_rc; // reduction return code + int red_cnt; // reduction contrib count + bool initialized; +}; + +struct cxip_coll_reduction { + struct cxip_coll_mc *mc_obj; // parent mc_obj + uint32_t red_id; // reduction id + uint16_t seqno; // reduction sequence number + uint16_t resno; // reduction result number + struct cxip_req *op_inject_req; // active operation request + enum cxip_coll_state coll_state; // reduction state on node + struct cxip_coll_data accum; // reduction accumulator + void *op_rslt_data; // user recv buffer (or NULL) + int op_data_bytcnt; // bytes in send/recv buffers + void *op_context; // caller's context + bool in_use; // reduction is in-use + bool pktsent; // reduction packet sent + bool completed; // reduction is completed + bool drop_send; // drop the next send operation + bool drop_recv; // drop the next recv operation + enum cxip_coll_rc red_rc; // set by first error + struct timespec tv_expires; // reduction expiration time + uint8_t tx_msg[64]; // static packet memory +}; + +struct cxip_coll_mc { + struct fid_mc mc_fid; + struct dlist_entry entry; // Link to mc object list + struct cxip_ep_obj *ep_obj; // Associated endpoint + struct cxip_av_set *av_set_obj; // associated AV set + struct cxip_zbcoll_obj *zb; // zb object for zbcol + struct cxip_coll_pte *coll_pte; // collective PTE + struct timespec timeout; // state machine timeout + fi_addr_t mynode_fiaddr; // fi_addr of this node + int mynode_idx; // av_set index of this node + uint32_t hwroot_idx; // av_set index of hwroot node + uint32_t mcast_addr; // multicast target address + int tail_red_id; // tail active red_id + int next_red_id; // next available red_id + int max_red_id; // limit total concurrency + int seqno; // rolling seqno for packets + bool arm_disable; // arm-disable for testing + bool is_joined; // true if joined + bool rx_discard; // true to discard RX events + enum cxi_traffic_class tc; // traffic class + enum cxi_traffic_class_type tc_type; // traffic class type + ofi_atomic32_t send_cnt; // for diagnostics + ofi_atomic32_t recv_cnt; // for diagnostics + ofi_atomic32_t pkt_cnt; // for diagnostics + ofi_atomic32_t seq_err_cnt; // for diagnostics + ofi_atomic32_t tmout_cnt; // for diagnostics + ofi_spin_t lock; + + struct cxi_md *reduction_md; // memory descriptor for DMA + struct cxip_coll_reduction reduction[CXIP_COLL_MAX_CONCUR]; +}; + +struct cxip_curl_handle; + +typedef void 
(*curlcomplete_t)(struct cxip_curl_handle *); + +struct cxip_curl_handle { + long status; // HTTP status, 0 for no server, -1 busy + const char *endpoint; // HTTP server endpoint address + const char *request; // HTTP request data + const char *response; // HTTP response data, NULL until complete + curlcomplete_t usrfunc; // user completion function + void *usrptr; // user function argument + void *recv; // opaque + void *headers; // opaque +}; + +/* Low-level CURL POST/DELETE async wrappers */ +enum curl_ops { + CURL_GET, + CURL_PUT, + CURL_POST, + CURL_PATCH, + CURL_DELETE, + CURL_MAX +}; +int cxip_curl_init(void); +void cxip_curl_fini(void); +const char *cxip_curl_opname(enum curl_ops op); +int cxip_curl_perform(const char *endpoint, const char *request, + const char *sessionToken, size_t rsp_init_size, + enum curl_ops op, bool verbose, + curlcomplete_t usrfunc, void *usrptr); +int cxip_curl_progress(struct cxip_curl_handle **handleptr); +void cxip_curl_free(struct cxip_curl_handle *handle); + +static inline void single_to_double_quote(char *str) +{ + do {if (*str == '\'') *str = '"';} while (*(++str)); +} +enum json_type cxip_json_obj(const char *desc, struct json_object *jobj, + struct json_object **jval); +int cxip_json_bool(const char *desc, struct json_object *jobj, bool *val); +int cxip_json_int(const char *desc, struct json_object *jobj, int *val); +int cxip_json_int64(const char *desc, struct json_object *jobj, int64_t *val); +int cxip_json_double(const char *desc, struct json_object *jobj, double *val); +int cxip_json_string(const char *desc, struct json_object *jobj, + const char **val); + +/* Perform zero-buffer collectives */ +void cxip_tree_rowcol(int radix, int nodeidx, int *row, int *col, int *siz); +void cxip_tree_nodeidx(int radix, int row, int col, int *nodeidx); +int cxip_tree_relatives(int radix, int nodeidx, int maxnodes, int *rels); + +int cxip_zbcoll_recv_cb(struct cxip_ep_obj *ep_obj, uint32_t init_nic, + uint32_t init_pid, uint64_t mbv); +void cxip_zbcoll_send(struct cxip_zbcoll_obj *zb, int srcidx, int dstidx, + uint64_t payload); +void cxip_zbcoll_free(struct cxip_zbcoll_obj *zb); +int cxip_zbcoll_alloc(struct cxip_ep_obj *ep_obj, int num_addrs, + fi_addr_t *fiaddrs, int simrank, + struct cxip_zbcoll_obj **zbp); +int cxip_zbcoll_simlink(struct cxip_zbcoll_obj *zb0, + struct cxip_zbcoll_obj *zb); +void cxip_zbcoll_set_user_cb(struct cxip_zbcoll_obj *zb, + zbcomplete_t userfunc, void *userptr); + +int cxip_zbcoll_max_grps(bool sim); +int cxip_zbcoll_getgroup(struct cxip_zbcoll_obj *zb); +void cxip_zbcoll_rlsgroup(struct cxip_zbcoll_obj *zb); +int cxip_zbcoll_broadcast(struct cxip_zbcoll_obj *zb, uint64_t *dataptr); +int cxip_zbcoll_reduce(struct cxip_zbcoll_obj *zb, uint64_t *dataptr); +int cxip_zbcoll_barrier(struct cxip_zbcoll_obj *zb); +void cxip_ep_zbcoll_progress(struct cxip_ep_obj *ep_obj); + +void cxip_zbcoll_reset_counters(struct cxip_ep_obj *ep_obj); +void cxip_zbcoll_get_counters(struct cxip_ep_obj *ep_obj, uint32_t *dsc, + uint32_t *err, uint32_t *ack, uint32_t *rcv); +void cxip_zbcoll_fini(struct cxip_ep_obj *ep_obj); +int cxip_zbcoll_init(struct cxip_ep_obj *ep_obj); + +/* + * CNTR/CQ wait object file list element + * + * Support structure. + * + * Created in cxip_cntr_open(), cxip_cq_open(). 
+ */ +struct cxip_fid_list { + struct dlist_entry entry; + struct fid *fid; +}; + +int cxip_rdzv_match_pte_alloc(struct cxip_txc *txc, + struct cxip_rdzv_match_pte **rdzv_pte); +int cxip_rdzv_nomatch_pte_alloc(struct cxip_txc *txc, int lac, + struct cxip_rdzv_nomatch_pte **rdzv_pte); +int cxip_rdzv_pte_src_req_alloc(struct cxip_rdzv_match_pte *pte, int lac); +void cxip_rdzv_match_pte_free(struct cxip_rdzv_match_pte *pte); +void cxip_rdzv_nomatch_pte_free(struct cxip_rdzv_nomatch_pte *pte); +int cxip_rdzv_pte_zbp_cb(struct cxip_req *req, const union c_event *event); +int cxip_rdzv_pte_src_cb(struct cxip_req *req, const union c_event *event); + +struct cxip_if *cxip_if_lookup_addr(uint32_t nic_addr); +struct cxip_if *cxip_if_lookup_name(const char *name); +int cxip_get_if(uint32_t nic_addr, struct cxip_if **dev_if); +void cxip_put_if(struct cxip_if *dev_if); +int cxip_if_valid_rgroup_vni(struct cxip_if *iface, unsigned int rgroup_id, + unsigned int vni); +int cxip_alloc_lni(struct cxip_if *iface, uint32_t svc_id, + struct cxip_lni **if_lni); +void cxip_free_lni(struct cxip_lni *lni); +const char *cxi_tc_str(enum cxi_traffic_class tc); +enum cxi_traffic_class cxip_ofi_to_cxi_tc(uint32_t ofi_tclass); +int cxip_txq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type); +void cxip_if_init(void); +void cxip_if_fini(void); + +int cxip_pte_set_state(struct cxip_pte *pte, struct cxip_cmdq *cmdq, + enum c_ptlte_state new_state, uint32_t drop_count); +int cxip_pte_set_state_wait(struct cxip_pte *pte, struct cxip_cmdq *cmdq, + struct cxip_evtq *evtq, + enum c_ptlte_state new_state, uint32_t drop_count); +int cxip_pte_append(struct cxip_pte *pte, uint64_t iova, size_t len, + unsigned int lac, enum c_ptl_list list, + uint32_t buffer_id, uint64_t match_bits, + uint64_t ignore_bits, uint32_t match_id, + uint64_t min_free, uint32_t flags, + struct cxip_cntr *cntr, struct cxip_cmdq *cmdq, + bool ring); +int cxip_pte_unlink(struct cxip_pte *pte, enum c_ptl_list list, + int buffer_id, struct cxip_cmdq *cmdq); +int cxip_pte_map(struct cxip_pte *pte, uint64_t pid_idx, bool is_multicast); +int cxip_pte_alloc_nomap(struct cxip_portals_table *ptable, struct cxi_eq *evtq, + struct cxi_pt_alloc_opts *opts, + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event), + void *ctx, struct cxip_pte **pte); +int cxip_pte_alloc(struct cxip_portals_table *ptable, struct cxi_eq *evtq, + uint64_t pid_idx, bool is_multicast, + struct cxi_pt_alloc_opts *opts, + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event), + void *ctx, struct cxip_pte **pte); +void cxip_pte_free(struct cxip_pte *pte); +int cxip_pte_state_change(struct cxip_if *dev_if, const union c_event *event); + +int cxip_cmdq_alloc(struct cxip_lni *lni, struct cxi_eq *evtq, + struct cxi_cq_alloc_opts *cq_opts, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cmdq **cmdq); +void cxip_cmdq_free(struct cxip_cmdq *cmdq); +int cxip_cmdq_emit_c_state(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *cmd); + +int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, + size_t num_events, size_t num_fc_events); +void cxip_evtq_fini(struct cxip_evtq *eq); + +int cxip_domain(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **dom, void *context); + +int cxip_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context); + +int cxip_endpoint(struct fid_domain *domain, struct fi_info 
*info, + struct fid_ep **ep, void *context); + +int cxip_tx_id_alloc(struct cxip_txc *txc, void *ctx); +int cxip_tx_id_free(struct cxip_txc *txc, int id); +void *cxip_tx_id_lookup(struct cxip_txc *txc, int id); +int cxip_rdzv_id_alloc(struct cxip_txc *txc, struct cxip_req *req); +int cxip_rdzv_id_free(struct cxip_txc *txc, int id); +void *cxip_rdzv_id_lookup(struct cxip_txc *txc, int id); +int cxip_ep_cmdq(struct cxip_ep_obj *ep_obj, bool transmit, uint32_t tclass, + struct cxi_eq *evtq, struct cxip_cmdq **cmdq); +void cxip_ep_cmdq_put(struct cxip_ep_obj *ep_obj, bool transmit); + +int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux); +int cxip_recv_req_sw_matcher(struct cxip_req *req); +int cxip_recv_cancel(struct cxip_req *req); +int cxip_fc_process_drops(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, + uint32_t pid, uint16_t drops); +void cxip_recv_pte_cb(struct cxip_pte *pte, const union c_event *event); +void cxip_rxc_req_fini(struct cxip_rxc *rxc); +int cxip_rxc_oflow_init(struct cxip_rxc *rxc); +void cxip_rxc_oflow_fini(struct cxip_rxc *rxc); +int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid); + +void cxip_txc_struct_init(struct cxip_txc *txc, const struct fi_tx_attr *attr, + void *context); +int cxip_txc_enable(struct cxip_txc *txc); +void cxip_txc_disable(struct cxip_txc *txc); +struct cxip_txc *cxip_stx_alloc(const struct fi_tx_attr *attr, void *context); +int cxip_rxc_msg_enable(struct cxip_rxc *rxc, uint32_t drop_count); +int cxip_rxc_enable(struct cxip_rxc *rxc); +void cxip_rxc_disable(struct cxip_rxc *rxc); +void cxip_rxc_struct_init(struct cxip_rxc *rxc, const struct fi_rx_attr *attr, + void *context); + +int cxip_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, + struct fid_eq **eq, void *context); + +bool cxip_evtq_saturated(struct cxip_evtq *evtq); +struct cxip_md *cxip_txc_ibuf_md(void *ibuf); +void *cxip_txc_ibuf_alloc(struct cxip_txc *txc); +void cxip_txc_ibuf_free(struct cxip_txc *txc, void *ibuf); +int cxip_ibuf_chunk_init(struct ofi_bufpool_region *region); +void cxip_ibuf_chunk_fini(struct ofi_bufpool_region *region); +int cxip_evtq_req_cancel(struct cxip_evtq *evtq, void *req_ctx, + void *op_ctx, bool match); +void cxip_evtq_req_discard(struct cxip_evtq *evtq, void *req_ctx); +void cxip_evtq_flush_trig_reqs(struct cxip_evtq *evtq); +int cxip_cq_req_complete(struct cxip_req *req); +int cxip_cq_req_complete_addr(struct cxip_req *req, fi_addr_t src); +int cxip_cq_req_error(struct cxip_req *req, size_t olen, + int err, int prov_errno, void *err_data, + size_t err_data_size, fi_addr_t src_addr); +int proverr2errno(int err); +struct cxip_req *cxip_evtq_req_alloc(struct cxip_evtq *evtq, + int remap, void *req_ctx); +void cxip_evtq_req_free(struct cxip_req *req); +void cxip_evtq_progress(struct cxip_evtq *evtq); + +void cxip_ep_progress(struct fid *fid); +int cxip_ep_peek(struct fid *fid); +void cxip_ep_flush_trig_reqs(struct cxip_ep_obj *ep_obj); + +void cxip_cq_progress(struct cxip_cq *cq); +void cxip_util_cq_progress(struct util_cq *util_cq); +int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context); +int cxip_evtq_adjust_reserved_fc_event_slots(struct cxip_evtq *evtq, int value); +void cxip_cq_flush_trig_reqs(struct cxip_cq *cq); + +void cxip_dom_cntr_disable(struct cxip_domain *dom); +int cxip_cntr_mod(struct cxip_cntr *cxi_cntr, uint64_t value, bool set, + bool err); +int cxip_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, + struct fid_cntr **cntr, void 
*context); + +int cxip_iomm_init(struct cxip_domain *dom); +void cxip_iomm_fini(struct cxip_domain *dom); +int cxip_map(struct cxip_domain *dom, const void *buf, unsigned long len, + uint64_t flags, struct cxip_md **md); +void cxip_unmap(struct cxip_md *md); + +int cxip_ctrl_msg_send(struct cxip_ctrl_req *req); +void cxip_ep_ctrl_progress(struct cxip_ep_obj *ep_obj); +void cxip_ep_ctrl_progress_locked(struct cxip_ep_obj *ep_obj); +void cxip_ep_tx_ctrl_progress(struct cxip_ep_obj *ep_obj); +void cxip_ep_tx_ctrl_progress_locked(struct cxip_ep_obj *ep_obj); +void cxip_ep_tgt_ctrl_progress(struct cxip_ep_obj *ep_obj); +void cxip_ep_tgt_ctrl_progress_locked(struct cxip_ep_obj *ep_obj); +int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj); +void cxip_ep_ctrl_fini(struct cxip_ep_obj *ep_obj); +void cxip_ep_ctrl_del_wait(struct cxip_ep_obj *ep_obj); +int cxip_ep_ctrl_trywait(void *arg); + +int cxip_av_set(struct fid_av *av, struct fi_av_set_attr *attr, + struct fid_av_set **av_set_fid, void * context); + +// TODO: naming convention for testing hooks +void cxip_coll_init(struct cxip_ep_obj *ep_obj); +int cxip_coll_enable(struct cxip_ep *ep); +int cxip_coll_disable(struct cxip_ep_obj *ep_obj); +void cxip_coll_close(struct cxip_ep_obj *ep_obj); +void cxip_coll_populate_opcodes(void); +int cxip_coll_send(struct cxip_coll_reduction *reduction, + int av_set_idx, const void *buffer, size_t buflen, + struct cxi_md *md); +int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, + const struct cxip_coll_data *coll_data, + bool arm, bool retry); + +void cxip_capture_red_id(int *red_id_buf); +ssize_t cxip_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context); +ssize_t cxip_broadcast(struct fid_ep *ep, void *buf, size_t count, + void *desc, fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, + void *context); +ssize_t cxip_reduce(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, enum fi_op op, uint64_t flags, + void *context); +ssize_t cxip_allreduce(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, enum fi_datatype datatype, + enum fi_op op, uint64_t flags, void *context); +int cxip_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, + const struct fid_av_set *coll_av_set, + uint64_t flags, struct fid_mc **mc, void *context); +void cxip_coll_progress_join(struct cxip_ep_obj *ep_obj); + +int cxip_coll_arm_disable(struct fid_mc *mc, bool disable); +void cxip_coll_limit_red_id(struct fid_mc *mc, int max_red_id); +void cxip_coll_drop_send(struct cxip_coll_reduction *reduction); +void cxip_coll_drop_recv(struct cxip_coll_reduction *reduction); + +void cxip_coll_reset_mc_ctrs(struct fid_mc *mc); + +void cxip_dbl_to_rep(struct cxip_repsum *x, double d); +void cxip_rep_to_dbl(double *d, const struct cxip_repsum *x); +void cxip_rep_add(struct cxip_repsum *x, const struct cxip_repsum *y); +double cxip_rep_add_dbl(double d1, double d2); +double cxip_rep_sum(size_t count, double *values); + +int cxip_check_auth_key_info(struct fi_info *info); +int cxip_gen_auth_key(struct fi_info *info, struct cxi_auth_key *key); + +#define CXIP_FC_SOFTWARE_INITIATED -1 + +/* cxip_fc_reason() - Returns the event reason for portal state + * change (FC reason or SC reason). 
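+ *
+ * Returns CXIP_FC_SOFTWARE_INITIATED (-1) when the state change was not
+ * automatically initiated by the NIC (sc_nic_auto is clear), i.e. software
+ * disabled the PtlTE; otherwise returns the hardware supplied sc_reason.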
+ */ +static inline int cxip_fc_reason(const union c_event *event) +{ + if (!event->tgt_long.initiator.state_change.sc_nic_auto) + return CXIP_FC_SOFTWARE_INITIATED; + + return event->tgt_long.initiator.state_change.sc_reason; +} + +static inline void cxip_txq_ring(struct cxip_cmdq *cmdq, bool more, + int otx_reqs) +{ + if (!more) { + switch (cmdq->llring_mode) { + case CXIP_LLRING_IDLE: + if (!otx_reqs) + cxi_cq_ll_ring(cmdq->dev_cmdq); + else + cxi_cq_ring(cmdq->dev_cmdq); + break; + case CXIP_LLRING_ALWAYS: + cxi_cq_ll_ring(cmdq->dev_cmdq); + break; + case CXIP_LLRING_NEVER: + default: + cxi_cq_ring(cmdq->dev_cmdq); + break; + } + } +} + +ssize_t cxip_send_common(struct cxip_txc *txc, uint32_t tclass, + const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context, uint64_t flags, + bool tagged, bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr); + +ssize_t cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags, + bool tagged, struct cxip_cntr *comp_cntr); + +ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, + const void *buf, size_t len, void *desc, + fi_addr_t tgt_addr, uint64_t addr, + uint64_t key, uint64_t data, uint64_t flags, + uint32_t tclass, uint64_t msg_order, void *context, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr); + +/* + * Request variants: + * CXIP_RQ_AMO + * Passes one argument (operand1), and applies that to a remote memory + * address content. + * + * CXIP_RQ_AMO_FETCH + * Passes two arguments (operand1, resultptr), applies operand1 to a + * remote memory address content, and returns the prior content of the + * remote memory in resultptr. + * + * CXIP_RQ_AMO_SWAP + * Passes three arguments (operand1, compare, resultptr). If remote memory + * address content satisfies the comparison operation with compare, + * replaces the remote memory content with operand1, and returns the prior + * content of the remote memory in resultptr. + * + * CXIP_RQ_AMO_PCIE_FETCH + * Passes two arguments (operand1, resultptr), applies operand1 to a + * remote memory address content, and returns the prior content of the + * remote memory in resultptr. + * + * The resulting operation should be a PCIe AMO instead of NIC AMO. 
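+ *
+ * As a rough mapping to the libfabric API: fi_atomic*() calls use
+ * CXIP_RQ_AMO, fi_fetch_atomic*() calls use CXIP_RQ_AMO_FETCH,
+ * fi_compare_atomic*() calls use CXIP_RQ_AMO_SWAP, and a fetching AMO
+ * issued with the provider specific FI_CXI_PCIE_AMO flag uses
+ * CXIP_RQ_AMO_PCIE_FETCH.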
+ */ +enum cxip_amo_req_type { + CXIP_RQ_AMO, + CXIP_RQ_AMO_FETCH, + CXIP_RQ_AMO_SWAP, + CXIP_RQ_AMO_PCIE_FETCH, + CXIP_RQ_AMO_LAST, +}; + +int cxip_amo_common(enum cxip_amo_req_type req_type, struct cxip_txc *txc, + uint32_t tclass, const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **comparedesc, + size_t compare_count, const struct fi_ioc *resultv, + void **resultdesc, size_t result_count, uint64_t flags, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, struct cxip_cntr *comp_cntr); +int _cxip_atomic_opcode(enum cxip_amo_req_type req_type, enum fi_datatype dt, + enum fi_op op, int amo_remap_to_pcie_fadd, + enum c_atomic_op *cop, enum c_atomic_type *cdt, + enum c_cswap_op *copswp, unsigned int *cdtlen); + +static inline void +cxip_domain_add_txc(struct cxip_domain *dom, struct cxip_txc *txc) +{ + ofi_spin_lock(&dom->lock); + dlist_insert_tail(&txc->dom_entry, &dom->txc_list); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_remove_txc(struct cxip_domain *dom, struct cxip_txc *txc) +{ + ofi_spin_lock(&dom->lock); + dlist_remove(&txc->dom_entry); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_add_cntr(struct cxip_domain *dom, struct cxip_cntr *cntr) +{ + ofi_spin_lock(&dom->lock); + dlist_insert_tail(&cntr->dom_entry, &dom->cntr_list); + ofi_atomic_inc32(&dom->ref); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_remove_cntr(struct cxip_domain *dom, struct cxip_cntr *cntr) +{ + ofi_spin_lock(&dom->lock); + dlist_remove(&cntr->dom_entry); + ofi_atomic_dec32(&dom->ref); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_add_cq(struct cxip_domain *dom, struct cxip_cq *cq) +{ + ofi_spin_lock(&dom->lock); + dlist_insert_tail(&cq->dom_entry, &dom->cq_list); + ofi_atomic_inc32(&dom->ref); + ofi_spin_unlock(&dom->lock); +} + +static inline void +cxip_domain_remove_cq(struct cxip_domain *dom, struct cxip_cq *cq) +{ + ofi_spin_lock(&dom->lock); + dlist_remove(&cq->dom_entry); + ofi_atomic_dec32(&dom->ref); + ofi_spin_unlock(&dom->lock); +} + +int cxip_domain_ctrl_id_alloc(struct cxip_domain *dom, + struct cxip_ctrl_req *req); +void cxip_domain_ctrl_id_free(struct cxip_domain *dom, + struct cxip_ctrl_req *req); +int cxip_domain_prov_mr_id_alloc(struct cxip_domain *dom, + struct cxip_mr *mr); +void cxip_domain_prov_mr_id_free(struct cxip_domain *dom, + struct cxip_mr *mr); + +static inline +struct cxip_ctrl_req *cxip_domain_ctrl_id_at(struct cxip_domain *dom, + int buffer_id) +{ + if (ofi_idx_is_valid(&dom->req_ids, buffer_id)) + return ofi_idx_at(&dom->req_ids, buffer_id); + return NULL; +} + +static inline uint32_t cxip_mac_to_nic(struct ether_addr *mac) +{ + return mac->ether_addr_octet[5] | + (mac->ether_addr_octet[4] << 8) | + ((mac->ether_addr_octet[3] & 0xF) << 16); +} + +static inline bool is_netsim(struct cxip_ep_obj *ep_obj) +{ + return (ep_obj->domain->iface->info->device_platform == + CXI_PLATFORM_NETSIM); +} + +/* debugging TRACE functions */ +#define cxip_trace_attr __attribute__((format(__printf__, 1, 2))) +typedef int (*cxip_trace_t)(const char *fmt, ...); +extern cxip_trace_t cxip_trace_attr cxip_trace_fn; + +typedef void (*cxip_trace_flush_t)(void); +extern cxip_trace_flush_t cxip_trace_flush_fn; + +typedef void (*cxip_trace_close_t)(void); +extern cxip_trace_close_t cxip_trace_close_fn; + +typedef bool (*cxip_trace_enable_t)(bool enable); +extern cxip_trace_enable_t cxip_trace_enable_fn; + +extern bool cxip_trace_enabled; // true if tracing is enabled +extern 
bool cxip_trace_append; // append open for trace file +extern bool cxip_trace_linebuf; // set line buffering for trace +extern int cxip_trace_rank; // tracing rank +extern int cxip_trace_numranks; // tracing number of ranks +extern FILE *cxip_trace_fid; // trace output file descriptor + +int cxip_trace_attr cxip_trace(const char *fmt, ...); +void cxip_trace_flush(void); +void cxip_trace_close(void); +bool cxip_trace_enable(bool enable); + +/* debugging TRACE filtering control */ +enum cxip_trace_module { + CXIP_TRC_CTRL, + CXIP_TRC_ZBCOLL, + CXIP_TRC_CURL, + CXIP_TRC_COLL_PKT, + CXIP_TRC_COLL_JOIN, + CXIP_TRC_COLL_DEBUG, + CXIP_TRC_TEST_CODE, + CXIP_TRC_MAX +}; +extern uint64_t cxip_trace_mask; + +static inline void cxip_trace_set(int mod) +{ + cxip_trace_mask |= (1L << mod); +} + +static inline void cxip_trace_clr(int mod) +{ + cxip_trace_mask &= ~(1L << mod); +} + +static inline bool cxip_trace_true(int mod) +{ + return cxip_trace_enabled && (cxip_trace_mask & (1L << mod)); +} + +#if ENABLE_DEBUG +#define CXIP_TRACE(mod, fmt, ...) \ + do {if (cxip_trace_true(mod)) cxip_trace_fn(fmt, ##__VA_ARGS__);} while (0) +#else +#define CXIP_TRACE(mod, fmt, ...) do {} while (0) +#endif + +/* fabric logging implementation functions */ +#define _CXIP_DBG(subsys, fmt, ...) \ + FI_DBG(&cxip_prov, subsys, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) +#define _CXIP_INFO(subsys, fmt, ...) \ + FI_INFO(&cxip_prov, subsys, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) +#define _CXIP_WARN(subsys, fmt, ...) \ + FI_WARN(&cxip_prov, subsys, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) +#define _CXIP_WARN_ONCE(subsys, fmt, ...) \ + FI_WARN_ONCE(&cxip_prov, subsys, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) +#define CXIP_LOG(fmt, ...) \ + fi_log(&cxip_prov, FI_LOG_WARN, FI_LOG_CORE, \ + __func__, __LINE__, "%s: " fmt "", cxip_env.hostname, \ + ##__VA_ARGS__) + +#define CXIP_FATAL(fmt, ...) \ + do { \ + CXIP_LOG(fmt, ##__VA_ARGS__); \ + abort(); \ + } while (0) + +#define TXC_DBG(txc, fmt, ...) \ + _CXIP_DBG(FI_LOG_EP_DATA, "TXC (%#x:%u): " fmt "", \ + (txc)->ep_obj->src_addr.nic, (txc)->ep_obj->src_addr.pid, \ + ##__VA_ARGS__) +#define TXC_WARN(txc, fmt, ...) \ + _CXIP_WARN(FI_LOG_EP_DATA, "TXC (%#x:%u): " fmt "", \ + (txc)->ep_obj->src_addr.nic, (txc)->ep_obj->src_addr.pid, \ + ##__VA_ARGS__) +#define TXC_WARN_RET(txc, ret, fmt, ...) \ + TXC_WARN(txc, "%d:%s: " fmt "", ret, fi_strerror(-ret), ##__VA_ARGS__) +#define TXC_FATAL(txc, fmt, ...) \ + CXIP_FATAL("TXC (%#x:%u):: " fmt "", (txc)->ep_obj->src_addr.nic, \ + (txc)->ep_obj->src_addr.pid, ##__VA_ARGS__) + +#define RXC_DBG(rxc, fmt, ...) \ + _CXIP_DBG(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ + (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) +#define RXC_INFO(rxc, fmt, ...) \ + _CXIP_INFO(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ + (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) +#define RXC_WARN(rxc, fmt, ...) \ + _CXIP_WARN(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ + (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) +#define RXC_WARN_ONCE(rxc, fmt, ...) \ + _CXIP_WARN_ONCE(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ + (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) +#define RXC_FATAL(rxc, fmt, ...) 
\ + CXIP_FATAL("RXC (%#x:%u) PtlTE %u:[Fatal] " fmt "", \ + (rxc)->ep_obj->src_addr.nic, \ + (rxc)->ep_obj->src_addr.pid, \ + (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) + +#define DOM_INFO(dom, fmt, ...) \ + _CXIP_INFO(FI_LOG_DOMAIN, "DOM (cxi%u:%u:%u:%u:%#x): " fmt "", \ + (dom)->iface->info->dev_id, (dom)->lni->lni->id, \ + (dom)->auth_key.svc_id, (dom)->auth_key.vni, \ + (dom)->nic_addr, ##__VA_ARGS__) +#define DOM_WARN(dom, fmt, ...) \ + _CXIP_WARN(FI_LOG_DOMAIN, "DOM (cxi%u:%u:%u:%u:%#x): " fmt "", \ + (dom)->iface->info->dev_id, (dom)->lni->lni->id, \ + (dom)->auth_key.svc_id, (dom)->auth_key.vni, \ + (dom)->nic_addr, ##__VA_ARGS__) + +#define CXIP_UNEXPECTED_EVENT_STS "Unexpected event status, %s rc = %s\n" +#define CXIP_UNEXPECTED_EVENT "Unexpected event %s, rc = %s\n" + +#define CXIP_DEFAULT_CACHE_LINE_SIZE 64 + +#define CXIP_SYSFS_CACHE_LINE_SIZE \ + "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" + +/* cxip_cacheline_size() - Return the CPU cache-line size, if unable to + * read then return the assumed cache size. + */ +static inline int cxip_cacheline_size(void) +{ + FILE *f; + int cache_line_size; + int ret; + + f = fopen(CXIP_SYSFS_CACHE_LINE_SIZE, "r"); + if (!f) { + _CXIP_WARN(FI_LOG_CORE, + "Error %d determining cacheline size\n", + errno); + cache_line_size = CXIP_DEFAULT_CACHE_LINE_SIZE; + } else { + ret = fscanf(f, "%d", &cache_line_size); + if (ret != 1) { + _CXIP_WARN(FI_LOG_CORE, + "Error reading cacheline size\n"); + cache_line_size = CXIP_DEFAULT_CACHE_LINE_SIZE; + } + + fclose(f); + } + + return cache_line_size; +} + +static inline int +cxip_txc_copy_from_hmem(struct cxip_txc *txc, struct cxip_md *hmem_md, + void *dest, const void *hmem_src, size_t size) +{ + enum fi_hmem_iface iface; + uint64_t device; + struct iovec hmem_iov; + struct cxip_domain *domain = txc->domain; + uint64_t flags; + bool unmap_hmem_md = false; + int ret; + + /* Default to memcpy unless FI_HMEM is set. */ + if (!txc->hmem) { + memcpy(dest, hmem_src, size); + return FI_SUCCESS; + } + + /* With HMEM enabled, performing memory registration will also cause + * the device buffer to be registered for CPU load/store access. Being + * able to perform load/store instead of using the generic HMEM copy + * routines and/or HMEM override copy routines can significantly reduce + * latency. Thus, this path is favored. + * + * However, if FORK_SAFE variables are enabled, we avoid this mapping + * to keep from designating the entire page in which the buffer + * resides as don't copy, and take the performance hit. + * + * Memory registration can result in additional latency. Expectation is + * the MR cache can amortize the additional memory registration latency. 
+ */ + if (!cxip_env.fork_safe_requested) { + if (!hmem_md) { + ret = cxip_map(domain, hmem_src, size, 0, &hmem_md); + if (ret) { + TXC_WARN(txc, "cxip_map failed: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + unmap_hmem_md = true; + } + + cxip_copy_from_md(hmem_md, dest, hmem_src, size); + if (unmap_hmem_md) + cxip_unmap(hmem_md); + + return FI_SUCCESS; + } + + /* Slow path HMEM copy path.*/ + iface = ofi_get_hmem_iface(hmem_src, &device, &flags); + hmem_iov.iov_base = (void *)hmem_src; + hmem_iov.iov_len = size; + + ret = domain->hmem_ops.copy_from_hmem_iov(dest, size, iface, device, + &hmem_iov, 1, 0); + if (ret != size) { + if (ret < 0) { + TXC_WARN(txc, "copy_from_hmem_iov failed: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + TXC_WARN(txc, + "copy_from_hmem_iov short copy: expect=%ld got=%d\n", + size, ret); + return -FI_EIO; + } + + return FI_SUCCESS; +} + +size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, size_t count, + fi_addr_t *src_addr, size_t *ux_count); + +int cxip_nic_alloc(struct cxip_if *nic_if, struct fid_nic **fid_nic); + +int cxip_domain_dwq_emit_dma(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_full_dma_cmd *dma, uint64_t flags); +int cxip_domain_dwq_emit_amo(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_dma_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); + +#endif diff --git a/prov/cxi/include/cxip_faults.h b/prov/cxi/include/cxip_faults.h new file mode 100644 index 00000000000..e9b28f17fe9 --- /dev/null +++ b/prov/cxi/include/cxip_faults.h @@ -0,0 +1,148 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + */ + +/* Fault injection. */ + +struct cxip_fault { + char *env; /* Configuration env. var. name */ + int prop; /* Proportion of rand() values */ + size_t count; /* Count of injected faults */ +}; + +extern struct cxip_fault dma_fault; +extern struct cxip_fault malloc_fault; + +void cxip_fault_inject_fini(void); +void cxip_fault_inject_init(void); + +#if ENABLE_DEBUG +#define INJECT_FAULT(fault) \ + ((fault).prop && rand() < (fault).prop && (fault).count++) +#else +#define INJECT_FAULT(fault) 0 +#endif + +#define cxi_cq_emit_dma_f(...) \ + (INJECT_FAULT(dma_fault) ? -ENOSPC : \ + cxi_cq_emit_dma(__VA_ARGS__)) + +#define cxip_pte_unlink_f(...) \ + (INJECT_FAULT(dma_fault) ? -FI_EAGAIN : \ + cxip_pte_unlink(__VA_ARGS__)) + +#define malloc_f(...) \ + (INJECT_FAULT(malloc_fault) ? NULL : \ + malloc(__VA_ARGS__)) + +/** + * Collective traps, can be extended for other uses. + * + * This creates a dlist of "traps" that are keyed to an index, and a trap + * identifier. When the search results in a match of both index and trap, this + * sets the *err variable to the specified trap error, and returns true. + * Otherwise it returns false. + * + * The close, and set functions are generally called in the test code. The + * search function is generally embedded in the provider. + * + * If the trap logic branches on search returning true, search should be a no-op + * with no performance penalty when ENABLE_DEBUG is FALSE. + * + * This will slow operations if ENABLE_DEBUG is TRUE, and there is a large list + * of traps. 
Normally, the test case will set only one trap, since the objective + * is to force a controlled fault and observe the result. + */ +enum { + CXIP_TRAP_NONE = 0, + CXIP_TRAP_GETGRP, + CXIP_TRAP_BCAST, + CXIP_TRAP_REDUCE, + CXIP_TRAP_INITPTE, + CXIP_TRAP_CURLSND, + CXIP_TRAP_CURLRCV, +}; + +#if ENABLE_DEBUG +/* structure used to simulate failures */ +struct _cxip_trap { + struct dlist_entry link; + int index; + int trap; + int err; +}; + +struct dlist_entry _trap_list; +bool _trap_initialized; + +static void _cxip_trap_close(void) +{ + struct _cxip_trap *trap_obj; + + if (!_trap_initialized) + return; + while (!dlist_empty(&_trap_list)) { + dlist_pop_front(&_trap_list, struct _cxip_trap, trap_obj, link); + free(trap_obj); + } +} + +static void _cxip_trap_set(int index, int trap, int err) +{ + struct _cxip_trap *trap_obj; + + if (!_trap_initialized) { + dlist_init(&_trap_list); + _trap_initialized = true; + } + trap_obj = calloc(1, sizeof(*trap_obj)); + if (!trap_obj) + return; + dlist_init(&trap_obj->link); + trap_obj->index = index; + trap_obj->trap = trap; + trap_obj->err = err; + dlist_insert_tail(&_trap_list, &trap_obj->link); +} + +static bool _cxip_trap_search(int index, int trap, int *err) +{ + struct _cxip_trap *trap_obj; + struct dlist_entry *item; + + if (!_trap_initialized) + return false; + + dlist_foreach(&_trap_list, item) { + trap_obj = container_of(item, struct _cxip_trap, link); + if (trap_obj->index != index) + continue; + if (trap_obj->trap != trap) + continue; + dlist_remove(item); + *err = trap_obj->err; + free(trap_obj); + return true; + } + return false; +} + +static inline void cxip_trap_close(void) +{ + _cxip_trap_close(); +} +static inline void cxip_trap_set(int index, int trap, int err) +{ + _cxip_trap_set(index, trap, err); +} +static inline bool cxip_trap_search(int index, int trap, int *err) +{ + return _cxip_trap_search(index, trap, err); +} +#else +static inline void cxip_trap_close(void) {} +static inline void cxip_trap_set(int a, int b, int c) {} +static inline bool cxip_trap_search(int a, int b, int *c) {return false;} +#endif diff --git a/prov/cxi/include/fi_cxi_ext.h b/prov/cxi/include/fi_cxi_ext.h new file mode 100644 index 00000000000..bee868450e1 --- /dev/null +++ b/prov/cxi/include/fi_cxi_ext.h @@ -0,0 +1,455 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020-2022 Hewlett Packard Enterprise Development LP + */ + +#ifndef _FI_CXI_EXT_H_ +#define _FI_CXI_EXT_H_ + +/* CXI provider specific NIC attributes. This information is returned in + * fi_info::nid::prov_attr. + + * Users can optionally modify some fields. Depending on the field adjusted, + * this can impact fi_domain() or other endpoint allocation behavior. + */ + +#define FI_CXI_NIC_ATTR_VER 1U + +struct cxip_nic_attr { + /* Version of NIC attr. Must remain at the top of this struct. */ + uint32_t version; + + /* NIC address. Should never be modified. */ + const unsigned int addr; + + /* On output from fi_getinfo(), rgroup_id will be set in the following + * order: + * 1. Resource group ID returned from SLINGSHOT_SVC_ID environment + * variable + * 2. First resource group ID with matching UID + * 3. First resource group ID with matching GID + * 4. First resource group ID with open permissions + */ + const unsigned int default_rgroup_id; + + /* Default VNI used with the rgroup ID. 
*/ + const unsigned int default_vni; +}; + +/* + * TODO: The following should be integrated into the include/rdma/fi_ext.h + * and are use for provider specific fi_control() operations. + */ +#define FI_PROV_SPECIFIC_CXI (0xccc << 16) + +enum { + FI_OPT_CXI_SET_TCLASS = -FI_PROV_SPECIFIC_CXI, /* uint32_t */ + FI_OPT_CXI_SET_MSG_ORDER, /* uint64_t */ + + /* fid_nic control operation to refresh NIC attributes. */ + FI_OPT_CXI_NIC_REFRESH_ATTR, + + FI_OPT_CXI_SET_MR_MATCH_EVENTS, /* bool */ + FI_OPT_CXI_GET_MR_MATCH_EVENTS, /* bool */ + FI_OPT_CXI_SET_OPTIMIZED_MRS, /* bool */ + FI_OPT_CXI_GET_OPTIMIZED_MRS, /* bool */ + FI_OPT_CXI_SET_PROV_KEY_CACHE, /* bool */ + FI_OPT_CXI_GET_PROV_KEY_CACHE, /* bool */ +}; + +/* + * Execute a given libfabric atomic memory operation as a PCIe operation as + * compared to a NIC operation. + * + * Note: Ordering between PCIe atomic operations and NIC atomic/RMA operations + * is undefined. + * + * Note: This flag overloads the bit used for FI_SOURCE. But, since FI_SOURCE + * is invalid for AMO operations, overloading this bit is not an issue. + */ +#define FI_CXI_PCIE_AMO (1ULL << 57) + +/* + * Flag an accelerated collective as pre-reduced. + * + * This can be passed to the accelerated collectives operations to indicate + * that the supplied data is a pre-reduced cxip_coll_accumulator structure. + * + * Note: This flag overloads FI_CXI_PCIE_AMO. Accelerated collectives do not + * use FI_CXI_PCIE_AMO or FI_SOURCE. + */ +#define FI_CXI_PRE_REDUCED (1ULL << 57) + +/* + * Use CXI High Rate Puts (HRP). Increases message rate performance. Applies to + * RMA and unreliable, non-fetching AMO operations. + */ +#define FI_CXI_HRP (1ULL << 60) + +/* + * Disable AMO reliability. Increases message rate performance. Applies to + * non-fetching AMOs. Required for HRP AMOs. + */ +#define FI_CXI_UNRELIABLE (1ULL << 61) + +/* + * Request a provider specific weak FENCE operation to facilitate an + * EP alias ordering point, when the original EP utilizes PCIe RO=1. + */ +#define FI_CXI_WEAK_FENCE (1ULL << 63) + +/* + * Used in conjunction with the deferred work queue API. If a deferred work + * queue operation has this flag set, the CXI provider will ensure a counter + * writeback occurs once the deferred work queue operation completes. + * Note: Addition hardware resources will be used to ensure a counter writeback + * occurs at the completion of the deferred work queue operation. + */ +#define FI_CXI_CNTR_WB (1ULL << 62) +#define FI_CXI_COUNTER_OPS "cxi_counter_ops" + +struct fi_cxi_cntr_ops { + /* Set the counter writeback address to a client provided address. */ + int (*set_wb_buffer)(struct fid *fid, void *buf, size_t len); + + /* Get the counter MMIO region. */ + int (*get_mmio_addr)(struct fid *fid, void **addr, size_t *len); +}; + +/* Success values cannot exceed FI_CXI_CNTR_SUCCESS_MAX */ +#define FI_CXI_CNTR_SUCCESS_MAX ((1ULL << 48) - 1) + +/* Failure values cannot exceed FI_CXI_CNTR_FAILURE_MAX */ +#define FI_CXI_CNTR_FAILURE_MAX ((1ULL << 7) - 1) + +/* fi_cntr_read() equivalent but for the writeback buffer. */ +static inline uint64_t fi_cxi_cntr_wb_read(const void *wb_buf) +{ + return (*(uint64_t *)wb_buf) & FI_CXI_CNTR_SUCCESS_MAX; +}; + +/* fi_cntr_reader() equivalent but for the writeback buffer. */ +static inline uint64_t fi_cxi_cntr_wb_readerr(const void *wb_buf) +{ + return ((*(uint64_t *)wb_buf) >> 48) & FI_CXI_CNTR_FAILURE_MAX; +}; + +/* Generate a counter success value which can be polled on. 
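+ *
+ * A hedged polling sketch built from the writeback helpers above
+ * (wb_buf is assumed to be the buffer registered through
+ * fi_cxi_cntr_ops::set_wb_buffer, and "expected" a caller-chosen count
+ * no larger than FI_CXI_CNTR_SUCCESS_MAX):
+ *
+ *   while (fi_cxi_cntr_wb_read(wb_buf) < expected &&
+ *          fi_cxi_cntr_wb_readerr(wb_buf) == 0)
+ *       ;   // spin until the NIC writes back "expected" successes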
*/ +static inline int fi_cxi_gen_cntr_success(uint64_t value, uint64_t *cxi_value) +{ + if (value > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + *cxi_value = (1ULL << 63) | value; + return FI_SUCCESS; +}; + +/* fi_cntr_add() equivalent but for the MMIO region. */ +static inline int fi_cxi_cntr_add(void *cntr_mmio, uint64_t value) +{ + /* Success counter is only 48 bits wide. */ + if (value > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + *((uint64_t *)cntr_mmio) = value; + return FI_SUCCESS; +} + +/* fi_cntr_adderr() equivalent but for the MMIO region. */ +static inline int fi_cxi_cntr_adderr(void *cntr_mmio, uint64_t value) +{ + /* Error counter is only 7 bits wide. */ + if (value > FI_CXI_CNTR_FAILURE_MAX) + return -FI_EINVAL; + + *((uint64_t *)cntr_mmio + 8) = value; + return FI_SUCCESS; +} + +/* fi_cntr_set() equivalent but for the MMIO region. */ +static inline int fi_cxi_cntr_set(void *cntr_mmio, uint64_t value) +{ + /* Only set of zero is supported through MMIO region. */ + if (value > 0) + return -FI_EINVAL; + + *((uint64_t *)cntr_mmio + 16) = 0; + return FI_SUCCESS; +} + +/* fi_cntr_seterr() equivalent but for MMIO region. */ +static inline int fi_cxi_cntr_seterr(void *cntr_mmio, uint64_t value) +{ + /* Only set of zero is supported through MMIO region. */ + if (value > 0) + return -FI_EINVAL; + + *((uint64_t *)cntr_mmio + 24) = 0; + return FI_SUCCESS; +} + +/* fi_cntr_add() equivalent but for the MMIO region. */ +static inline void *fi_cxi_get_cntr_add_addr(void *cntr_mmio) +{ + return cntr_mmio; +} + +/* fi_cntr_adderr() equivalent but for the MMIO region. */ +static inline void *fi_cxi_get_cntr_adderr_addr(void *cntr_mmio) +{ + return (void *)((uint64_t *)cntr_mmio + 8); +} + +/* fi_cntr_set() equivalent but for the MMIO region reset. + * NOTE: CXI does not support set to counter MMIO region. Only reset. + */ +static inline void *fi_cxi_get_cntr_reset_addr(void *cntr_mmio) +{ + return (void *)((uint64_t *)cntr_mmio + 16); +} + +/* fi_cntr_seterr() equivalent but for MMIO region reset. + * NOTE: CXI does not support set to counter MMIO region. Only reset. + */ +static inline void *fi_cxi_get_cntr_reseterr_addr(void *cntr_mmio) +{ + return (void *)((uint64_t *)cntr_mmio + 24); +} + +#define FI_CXI_DOM_OPS_1 "dom_ops_v1" +#define FI_CXI_DOM_OPS_2 "dom_ops_v2" +#define FI_CXI_DOM_OPS_3 "dom_ops_v3" +#define FI_CXI_DOM_OPS_4 "dom_ops_v4" +#define FI_CXI_DOM_OPS_5 "dom_ops_v5" +#define FI_CXI_DOM_OPS_6 "dom_ops_v6" + +/* v1 to v6 can use the same struct since they only appended a routine */ +struct fi_cxi_dom_ops { + int (*cntr_read)(struct fid *fid, unsigned int cntr, uint64_t *value, + struct timespec *ts); + int (*topology)(struct fid *fid, unsigned int *group_id, + unsigned int *switch_id, unsigned int *port_id); + + /* Enable hybrid MR desc mode. Hybrid MR desc allows for libfabric users + * to optionally pass in a valid MR desc for local communication + * operations. + * + * When enabled, if the MR desc is NULL, the provider will + * perform internal memory registration. Else, the provider will assume + * the MR desc field is valid and skip internal memory registration. + * + * When disabled, the provider will ignore the MR desc field and always + * perform internal memory registration. This is the default behavior. + * + * All child endpoints will inherit the current domain status of hybrid + * MR desc only during endpoint creation. Dynamically changing the + * domain hybrid MR desc status with endpoint allocate may not propagate + * to child endpoints. 
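+ *
+ * A hypothetical enable sequence (error handling elided; "domain" is
+ * the fid_domain returned by fi_domain(), and FI_CXI_DOM_OPS_6 the
+ * newest ops tag defined above):
+ *
+ *   struct fi_cxi_dom_ops *dom_ops;
+ *
+ *   fi_open_ops(&domain->fid, FI_CXI_DOM_OPS_6, 0,
+ *               (void **)&dom_ops, NULL);
+ *   dom_ops->enable_hybrid_mr_desc(&domain->fid, true);
+ *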
Thus, it is recommended to set hybrid MR desc + * status prior to allocating endpoints. + */ + int (*enable_hybrid_mr_desc)(struct fid *fid, bool enable); + + /* Get unexpected message information. + * + * Obtain a list of unexpected messages associated with the endpoint. + * The list is returned as an array of CQ tagged entries. The following + * is how the fields in fi_cq_tagged_entry are used. + * + * op_context: NULL since this message has not matched a posted receive + * flags: A combination of FI_MSG, FI_TAGGED, FI_RECV, + * and/or FI_REMOTE_CQ_DATA + * len: Unexpected message request length + * data: Completion queue data (only valid if FI_REMOTE_CQ_DATA + * is set) + * tag: Unexpected message tag (only valid if FI_TAGGED is set) + * + * @ep: Endpoint FID to have unexpected messages returned to user. + * @entry: Tagged entry array to be filled in by the provider. If the + * entry is NULL, only ux_count will be set. + * @count: Number of entries in entry and src_addr array. If count is + * zero,then only the ux_count will be set on return. + * @src_addr: Source address array to be filled in by the provider. If + * the entry is NULL, only ux_count will be set. + * @ux_count: Output variable used to return the number of unexpected + * messages queued on the given endpoint. + * + * Return: On success, number of entries copied into the users entry + * and src_addr arrays. On error, -FI_ERRNO. + */ + size_t (*ep_get_unexp_msgs)(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, + size_t count, fi_addr_t *src_addr, + size_t *ux_count); + + /* Get the depth of the deferred work queue. The depth is the number of + * of triggered operation commands which can be queued to hardware. The + * depth is not per fi_domain. The depth is across all processes using + * the same CXI service which usually maps to a job-step. + */ + int (*get_dwq_depth)(struct fid *fid, size_t *depth); + + /* The following two functions have been deprecated in favor of + * using the fi_control() standardized interface. They will be + * removed in a future software release, but are left here initially + * to allow early users to adjust their usage. + */ + int (*enable_mr_match_events)(struct fid *fid, bool enable); + int (*enable_optimized_mrs)(struct fid *fid, bool enable); +}; + +/* + * CXI Authorization Key + */ +struct cxi_auth_key { + /* The CXI service assigned to the Domain and Endpoints. A CXI service + * is associated with a set of local resource limits, VNIs, and Traffic + * Classes. + * + * The svc_id used by an OFI Domain must match all Endpoints belonging + * to the Domain. + */ + uint32_t svc_id; + + /* The Virtual Network ID (VNI) assigned to the Endpoint. Two Endpoints + * must use the same VNI in order to communicate. + * + * Note that while the CXI service may define one or more VNIs which a + * process can access, an Endpoint is assigned to only one. + */ + uint16_t vni; +}; + +/* + * CXI Collectives + */ + +/* + * AV Set communication key. + * + * For production: + * - Set cxip_comm_key.keytype = COMM_KEY_NONE. + * - Initialize cxip_comm_key structure to zeros. + * - Create one av_set on each node. + * - Initialize each av_set to contain the NIC addresses of all endpoints. + * - Call fi_join_collective() once on each endpoint. + * - dest_addr is a multicast address created by the join. + * - hwroot_nic is assigned by the join. + * - The PTE will receive at the multicast ID value, index extension of zero. 
+ * - Sending to the multicast ID will cause delivery to nodes according to the + * tree topology. + * + * For testing with externally established multicast address: + * - NOT IMPLEMENTED. + * + * For testing on a multinode system without multicast: + * - Set cxip_comm_key.keytype = COMM_KEY_UNICAST. + * - Set cxip_comm_key.ucast.hwroot_idx to the desired hw_root index. + * - Create one av_set on each node. + * - Initialize each av_set to contain the NIC addresses of all endpoints. + * - Call fi_join_collective() once one each endpoint. + * - hwroot_nic is the NIC address of the node that serves as the emulated + * hardware root of the tree. + * - The PTE will use the EP source NIC address and process PID, with a + * PID_IDX of CXIP_PTL_IDX_COLL. + * - Sending to any (valid) node address with CXIP_PTL_IDX_COLL will target the + * collectives PTE on that node. + * - The root/leaf send routines will distribute one or more packets to all + * fi_addr_t in the av_set as appropriate. + * + * For testing under NETSIM on a single node: + * - Set cxip_comm_key.keytype = COMM_KEY_RANK. + * - Set cxip_comm_key.rank.hwroot_idx to the desired hw_root index. + * - Set cxip_comm_key.rank.rank to the simulated rank. + * - Create N av_set objects, one for each simulated rank. + * - Call fi_join_collective() once for each simulated endpoint. + * - dest_addr is the MC object index. + * - hwroot_nic is the MC object index for the MC object to serve as the + * simulated hardware root. + * - The PTE will use the EP source NIC address and process PID, with a PID_IDX + * of 16 + dest_addr (MC object index). + * - Sending to the node's own address with a PID_IDX of 16 + MC index will + * target the appropriate MC object. + * - Simulation is limited to 32 simulated endpoints. + */ +enum cxip_comm_key_type { + COMM_KEY_NONE = 0, + COMM_KEY_MULTICAST, + COMM_KEY_UNICAST, + COMM_KEY_RANK, + COMM_KEY_MAX +}; + +typedef unsigned int cxip_coll_op_t; // CXI collective opcode + +struct cxip_coll_mcast_key { + uint32_t hwroot_idx; // index of hwroot in av_set list + uint32_t mcast_addr; // 13-bit multicast address id +}; + +struct cxip_coll_unicast_key { + uint32_t hwroot_idx; // index of hwroot in av_set list + uint32_t mcast_addr; // 13-bit simulated multcast address +}; + +struct cxip_coll_rank_key { + uint32_t hwroot_idx; // index of hwroot in av_set list + uint32_t rank; // rank of this object + bool rx_discard; // clear to report RX events +}; + +struct cxip_comm_key { + enum cxip_comm_key_type keytype; + union { + struct cxip_coll_mcast_key mcast; + struct cxip_coll_unicast_key ucast; + struct cxip_coll_rank_key rank; + }; +}; + +/* Extended reduction opcodes. + * + * Only the following standard FI_ATOMIC operations are supported: + * - FI_MIN : INT or FLT + * - FI_MAX : INT or FLT + * - FI_SUM : INT or FLT + * - FI_BOR : INT + * - FI_BAND : INT + * - FI_BXOR : INT + * + * The codes below extend this standard FI_ATOMIC set to explicitly take + * advantage of extended hardware operations. These can be used as opcodes for + * any of the collective operations, just like FI_MIN or FI_SUM. + * + * Note that the current FI_ATOMIC set ends at opcode == 19. We start this one + * at 32, to accommodate possible expansion of the FI_ATOMIC set, and check for + * overlap during initialization. + */ +enum cxip_coll_op { + FI_CXI_MINMAXLOC = 32, // FLT or INT + FI_CXI_REPSUM, // FLT only + FI_CXI_OP_LAST +}; + +/* Extended accelerated reduction structures. 
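+ *
+ * A hypothetical FI_CXI_MINMAXLOC reduction over double data using the
+ * libfabric collective API (coll_addr is the address produced by a
+ * completed fi_join_collective(); the datatype and count conventions
+ * for the extended opcodes are assumptions here):
+ *
+ *   struct cxip_coll_fltminmax in, out;
+ *
+ *   fi_allreduce(ep, &in, 1, NULL, &out, NULL, coll_addr, FI_DOUBLE,
+ *                (enum fi_op)FI_CXI_MINMAXLOC, 0, NULL);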
+ */ +struct cxip_coll_intminmax { + int64_t minval; + uint64_t minidx; + int64_t maxval; + uint64_t maxidx; +}; + +struct cxip_coll_fltminmax { + double minval; + uint64_t minidx; + double maxval; + uint64_t maxidx; +}; + +/* opaque export of struct cxip_coll_data */ +struct cxip_coll_accumulator { + uint8_t accum[64]; +}; + +#endif /* _FI_CXI_EXT_H_ */ diff --git a/prov/cxi/libfabric-cxi.spec.in b/prov/cxi/libfabric-cxi.spec.in new file mode 100644 index 00000000000..77a0b6613ff --- /dev/null +++ b/prov/cxi/libfabric-cxi.spec.in @@ -0,0 +1,52 @@ +%{!?configopts: %global configopts LDFLAGS=-Wl,--build-id} +%{!?provider: %define provider cxi} +%{!?provider_formal: %define provider_formal cxi} + +Name: libfabric-%{provider} +Version: @VERSION@ +Release: 1%{?dist} +Summary: Dynamic %{provider_formal} provider for user-space Open Fabric Interfaces +Group: System Environment/Libraries +License: GPLv2 or BSD +Url: http://www.github.com/ofiwg/libfabric +Source: http://www.github.org/ofiwg/%{name}/releases/download/v{%version}/libfabric-%{version}.tar.bz2 +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +Requires: libfabric +BuildRequires: libfabric + +%description +libfabric provides a user-space API to access high-performance fabric +services, such as RDMA. + +This RPM provides the %{provider_formal} provider as a "plugin" to an existing +libfabric installation. This plugin will override older %{provider_formal} +provider functionality in the existing libfabric installation. + +%prep +%setup -q -n libfabric-%{version} + +%build +%configure %{configopts} --enable-%{provider}=dl +make %{?_smp_mflags} + +%install +rm -rf %{buildroot} +%makeinstall installdirs + +%clean +rm -rf %{buildroot} + +%files +%defattr(-,root,root,-) +%{_libdir}/libfabric/*.so + +%exclude %{_libdir}/libfabric.* +%exclude %{_libdir}/libfabric/*.la +%exclude %{_libdir}/pkgconfig +%exclude %{_bindir} +%exclude %{_mandir} +%exclude %{_includedir} + +%changelog +* Wed May 24 2017 Open Fabrics Interfaces Working Group +- First release of specfile for packaging a single dl provider. diff --git a/prov/cxi/src/cxip_atomic.c b/prov/cxi/src/cxip_atomic.c new file mode 100644 index 00000000000..c71a762f02a --- /dev/null +++ b/prov/cxi/src/cxip_atomic.c @@ -0,0 +1,1744 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2018,2021-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* Cassini supports ONLY 1-element vectors, and this code presumes that the + * value is 1. + */ +_Static_assert(CXIP_AMO_MAX_IOV == 1, "Unexpected max IOV #"); + +/* Cassini supports ONLY 1-element packed IOVs. + */ +#define CXIP_AMO_MAX_PACKED_IOV (1) + +/** + * Data type codes for all of the supported fi_datatype values. 
+ */ +static enum c_atomic_type _cxip_amo_type_code[FI_DATATYPE_LAST] = { + [FI_INT8] = C_AMO_TYPE_INT8_T, + [FI_UINT8] = C_AMO_TYPE_UINT8_T, + [FI_INT16] = C_AMO_TYPE_INT16_T, + [FI_UINT16] = C_AMO_TYPE_UINT16_T, + [FI_INT32] = C_AMO_TYPE_INT32_T, + [FI_UINT32] = C_AMO_TYPE_UINT32_T, + [FI_INT64] = C_AMO_TYPE_INT64_T, + [FI_UINT64] = C_AMO_TYPE_UINT64_T, + [FI_FLOAT] = C_AMO_TYPE_FLOAT_T, + [FI_DOUBLE] = C_AMO_TYPE_DOUBLE_T, + [FI_FLOAT_COMPLEX] = C_AMO_TYPE_FLOAT_COMPLEX_T, + [FI_DOUBLE_COMPLEX] = C_AMO_TYPE_DOUBLE_COMPLEX_T, +}; +//TODO: C_AMO_TYPE_UINT128_T + +/** + * AMO operation codes for all of the fi_op values. + */ +static enum c_atomic_op _cxip_amo_op_code[FI_ATOMIC_OP_LAST] = { + [FI_MIN] = C_AMO_OP_MIN, + [FI_MAX] = C_AMO_OP_MAX, + [FI_SUM] = C_AMO_OP_SUM, + [FI_LOR] = C_AMO_OP_LOR, + [FI_LAND] = C_AMO_OP_LAND, + [FI_BOR] = C_AMO_OP_BOR, + [FI_BAND] = C_AMO_OP_BAND, + [FI_LXOR] = C_AMO_OP_LXOR, + [FI_BXOR] = C_AMO_OP_BXOR, + [FI_ATOMIC_READ] = C_AMO_OP_SUM, + + /* ATOMIC_WRITE is implemented as a CSWAP NE instead of SWAP. This + * allows for SWAP to be remapped to PCIe fadd. + */ + [FI_ATOMIC_WRITE] = C_AMO_OP_CSWAP, + [FI_CSWAP] = C_AMO_OP_CSWAP, + [FI_CSWAP_NE] = C_AMO_OP_CSWAP, + [FI_CSWAP_LE] = C_AMO_OP_CSWAP, + [FI_CSWAP_LT] = C_AMO_OP_CSWAP, + [FI_CSWAP_GE] = C_AMO_OP_CSWAP, + [FI_CSWAP_GT] = C_AMO_OP_CSWAP, + [FI_MSWAP] = C_AMO_OP_AXOR, /* special handling */ +}; + +/** + * AMO swap operation codes for the CSWAP comparison conditions. + */ +static enum c_cswap_op _cxip_amo_swpcode[FI_ATOMIC_OP_LAST] = { + [FI_CSWAP] = C_AMO_OP_CSWAP_EQ, + [FI_CSWAP_NE] = C_AMO_OP_CSWAP_NE, + [FI_CSWAP_LE] = C_AMO_OP_CSWAP_LE, + [FI_CSWAP_LT] = C_AMO_OP_CSWAP_LT, + [FI_CSWAP_GE] = C_AMO_OP_CSWAP_GE, + [FI_CSWAP_GT] = C_AMO_OP_CSWAP_GT, +}; + +/** + * Multi-dimensional array defining supported/unsupported operations. Bits + * correspond to the 14 possible fi_datatype values. The OP_VALID() macro will + * return a 1 if the (request,op,dt) triple is supported by Cassini. + */ +static uint16_t _cxip_amo_valid[CXIP_RQ_AMO_LAST][FI_ATOMIC_OP_LAST] = { + + [CXIP_RQ_AMO] = { + [FI_MIN] = 0x03ff, + [FI_MAX] = 0x03ff, + [FI_SUM] = 0x0fff, + [FI_LOR] = 0x00ff, + [FI_LAND] = 0x00ff, + [FI_LXOR] = 0x00ff, + [FI_BOR] = 0x00ff, + [FI_BAND] = 0x00ff, + [FI_BXOR] = 0x00ff, + [FI_ATOMIC_WRITE] = 0x0fff, + }, + + [CXIP_RQ_AMO_FETCH] = { + [FI_MIN] = 0x03ff, + [FI_MAX] = 0x03ff, + [FI_SUM] = 0x0fff, + [FI_LOR] = 0x00ff, + [FI_LAND] = 0x00ff, + [FI_LXOR] = 0x00ff, + [FI_BOR] = 0x00ff, + [FI_BAND] = 0x00ff, + [FI_BXOR] = 0x00ff, + [FI_ATOMIC_WRITE] = 0x0fff, + [FI_ATOMIC_READ] = 0x0fff, + }, + + [CXIP_RQ_AMO_SWAP] = { + [FI_CSWAP] = 0x0fff, + [FI_CSWAP_NE] = 0x0fff, + [FI_CSWAP_LE] = 0x03ff, + [FI_CSWAP_LT] = 0x03ff, + [FI_CSWAP_GE] = 0x03ff, + [FI_CSWAP_GT] = 0x03ff, + [FI_MSWAP] = 0x00ff, + }, + + [CXIP_RQ_AMO_PCIE_FETCH] = { + [FI_MIN] = 0x0, + [FI_MAX] = 0x0, + [FI_SUM] = 0xf0, + [FI_LOR] = 0x0, + [FI_LAND] = 0x0, + [FI_LXOR] = 0x0, + [FI_BOR] = 0x0, + [FI_BAND] = 0x0, + [FI_BXOR] = 0x0, + [FI_ATOMIC_WRITE] = 0x0, + [FI_ATOMIC_READ] = 0x0, + }, + +}; +#define OP_VALID(rq, op, dt) (_cxip_amo_valid[rq][op] & (1 << dt)) + +/** + * Supply opcodes for a request, and determine if the operation is supported. 
+ * + * @param req_type basic, fetch, or swap + * @param dt data type for operation + * @param op operation + * @param amo_remap_to_pcie_fadd NIC AMO operation which is remapped as PCIe + * fetch add + * @param cop Cassini code for operation + * @param cdt Cassini code for data type + * @param copswp Cassini code for cswap operation + * @param cdtlen Length of datatype in bytes + * + * @return int 0 on success, -FI_EOPNOTSUPP if operation is not supported + */ +int _cxip_atomic_opcode(enum cxip_amo_req_type req_type, enum fi_datatype dt, + enum fi_op op, int amo_remap_to_pcie_fadd, + enum c_atomic_op *cop, enum c_atomic_type *cdt, + enum c_cswap_op *copswp, unsigned int *cdtlen) +{ + int opcode; + int dtcode; + + if (dt < 0 || dt >= FI_DATATYPE_LAST || + op < 0 || op >= FI_ATOMIC_OP_LAST) + return -FI_EINVAL; + + if (!OP_VALID(req_type, op, dt)) + return -FI_EOPNOTSUPP; + + /* If the request is a PCIe fetching AMO, then the remap opcode is + * used. + * + * Note: Only fetching FI_SUM is supported as a PCIe AMO. + */ + if (req_type == CXIP_RQ_AMO_PCIE_FETCH) { + if (amo_remap_to_pcie_fadd >= 0) + opcode = amo_remap_to_pcie_fadd; + else + return -FI_EOPNOTSUPP; + } else { + opcode = _cxip_amo_op_code[op]; + if (opcode == amo_remap_to_pcie_fadd) + return -FI_EOPNOTSUPP; + } + + /* For fetching FI_SUMs done as a PCIe AMO, force signed data types to + * unsigned. This is required by the NIC to allow libfabric to support + * signed PCIe fetching FI_SUMs. + */ + dtcode = _cxip_amo_type_code[dt]; + if (req_type == CXIP_RQ_AMO_PCIE_FETCH) { + if (dtcode == C_AMO_TYPE_INT32_T) + dtcode = C_AMO_TYPE_UINT32_T; + else if (dtcode == C_AMO_TYPE_INT64_T) + dtcode = C_AMO_TYPE_UINT64_T; + } + + if (cop) + *cop = opcode; + if (cdt) + *cdt = dtcode; + if (cdtlen) + *cdtlen = ofi_datatype_size(dt); + if (copswp) { + if (op == FI_ATOMIC_WRITE) + *copswp = C_AMO_OP_CSWAP_NE; + else + *copswp = _cxip_amo_swpcode[op]; + } + + return 0; +} + +/** + * Implementation of the provider *_atomic_valid() functions. + * + * The returned count is the maximum number of atomic objects on which a single + * atomic call can operate. For Cassini, this is 1. + * + * @param ep endpoint + * @param req_type request type + * @param datatype datatype + * @param op operation + * @param count returns count of operations supported + * + * @return int 0 on success, -FI_EOPNOTSUPP if operation not supported + */ +static inline int _cxip_ep_valid(struct fid_ep *fid_ep, + enum cxip_amo_req_type req_type, + enum fi_datatype datatype, + enum fi_op op, + size_t *count) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + int ret; + + /* Check for a valid opcode */ + ret = _cxip_atomic_opcode(req_type, datatype, op, + ep->ep_obj->domain->amo_remap_to_pcie_fadd, + NULL, NULL, NULL, NULL); + if (ret < 0) + return ret; + + /* "Cassini implements single element atomics. There is no hardware + * support for packed atomics or IOVECs." -- CSDG + */ + if (count) + *count = CXIP_AMO_MAX_IOV; + + return 0; +} + +/* + * cxip_amo_inject_cb() - AMO inject event callback. + */ +static int cxip_amo_inject_cb(struct cxip_req *req, const union c_event *event) +{ + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. 
+ */ + if (event->hdr.event_type == C_EVENT_SEND) { + CXIP_WARN(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + int event_rc; + + event_rc = cxi_init_event_rc(event); + int ret_err; + + ret_err = proverr2errno(event_rc); + return cxip_cq_req_error(req, 0, ret_err, + cxi_event_rc(event), NULL, 0, + FI_ADDR_UNSPEC); +} + +/* + * cxip_amo_selective_completion_req() - Return request state associated with + * all AMO inject transactions on the transmit context. + * + * The request is freed when the TXC send CQ is closed. + */ +static struct cxip_req *cxip_amo_selective_completion_req(struct cxip_txc *txc) +{ + if (!txc->amo_selective_completion_req) { + struct cxip_req *req; + bool free_request = false; + + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) + return NULL; + + req->cb = cxip_amo_inject_cb; + req->context = (uint64_t)txc->context; + req->flags = FI_ATOMIC | FI_WRITE; + req->addr = FI_ADDR_UNSPEC; + + if (!txc->amo_selective_completion_req) + txc->amo_selective_completion_req = req; + else + free_request = true; + + if (free_request) + cxip_evtq_req_free(req); + } + + return txc->amo_selective_completion_req; +} + +/* + * cxip_amo_fetching_selective_completion_req() - Return request state + * associated with all fetching AMO inject transactions on the transmit context. + * + * The request is freed when the TXC send CQ is closed. + */ +static struct cxip_req * +cxip_amo_fetching_selective_completion_req(struct cxip_txc *txc) +{ + if (!txc->amo_fetch_selective_completion_req) { + struct cxip_req *req; + bool free_request = false; + + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) + return NULL; + + req->cb = cxip_amo_inject_cb; + req->context = (uint64_t)txc->context; + req->flags = FI_ATOMIC | FI_READ; + req->addr = FI_ADDR_UNSPEC; + + if (!txc->amo_fetch_selective_completion_req) + txc->amo_fetch_selective_completion_req = req; + else + free_request = true; + + if (free_request) + cxip_evtq_req_free(req); + } + + return txc->amo_fetch_selective_completion_req; +} + +/** + * Callback for non-fetching AMO operations. + * + * @param req AMO request structure + * @param event resulting event + */ +static int _cxip_amo_cb(struct cxip_req *req, const union c_event *event) +{ + int ret; + int event_rc; + int ret_err; + int success_event = (req->flags & FI_COMPLETION); + struct cxip_txc *txc = req->amo.txc; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + if (event->hdr.event_type == C_EVENT_SEND) { + TXC_WARN(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + /* Fetching AMO with flush requires two events. Only once two events are + * processed can the user-generated completion queue event be + * generated. In addition, since multiple initiator events are + * generated and zero assumptions can be made about the event order, + * counters cannot be incremented until both events are processed. + * This means that software must modify the counter (i/e it cannot be + * offloaded to hardware). 
+ */ + if (req->amo.fetching_amo_flush) { + req->amo.fetching_amo_flush_event_count++; + + if (event->hdr.event_type == C_EVENT_REPLY) + req->amo.fetching_amo_flush_event_rc = + cxi_init_event_rc(event); + + if (req->amo.fetching_amo_flush_event_count != 2) + return FI_SUCCESS; + + event_rc = req->amo.fetching_amo_flush_event_rc; + + if (req->amo.fetching_amo_flush_cntr) { + if (event_rc == C_RC_OK) + ret = cxip_cntr_mod(req->amo.fetching_amo_flush_cntr, + 1, false, false); + else + ret = cxip_cntr_mod(req->amo.fetching_amo_flush_cntr, + 1, false, true); + + if (ret != FI_SUCCESS) { + req->amo.fetching_amo_flush_event_count--; + return ret; + } + } + } else { + event_rc = cxi_init_event_rc(event); + } + + if (req->amo.result_md) + cxip_unmap(req->amo.result_md); + + if (req->amo.oper1_md) + cxip_unmap(req->amo.oper1_md); + + if (req->amo.ibuf) + cxip_txc_ibuf_free(txc, req->amo.ibuf); + + req->flags &= (FI_ATOMIC | FI_READ | FI_WRITE); + + if (event_rc == C_RC_OK) { + if (success_event) { + ret = cxip_cq_req_complete(req); + if (ret != FI_SUCCESS) + TXC_WARN_RET(txc, ret, + "Failed to report completion\n"); + } + } else { + ret_err = proverr2errno(event_rc); + + ret = cxip_cq_req_error(req, 0, ret_err, + event_rc, NULL, 0, + FI_ADDR_UNSPEC); + + if (ret != FI_SUCCESS) + TXC_WARN_RET(txc, ret, "Failed to report error\n"); + } + + ofi_atomic_dec32(&req->amo.txc->otx_reqs); + cxip_evtq_req_free(req); + + return FI_SUCCESS; +} + +/** + * Return true if vector specification is valid. + * + * vn must be > 0 and <= 1 (CXIP_AMO_MAX_IOV). Formally, we could do this test, + * but formally we would have to loop (once) over the vectors, and test each + * count for being > 0 and <= 1 (CXIP_AMO_MAX_PACKED_IOV). Instead, we just test + * to ensure that each is 1. + * + * @param vn vector element count + * @param v vector pointer + * + * @return bool true if vector is valid, false otherwise + */ +static inline bool _vector_valid(size_t vn, const struct fi_ioc *v) +{ + return (vn == CXIP_AMO_MAX_IOV && v && + v[0].count == CXIP_AMO_MAX_PACKED_IOV && + v[0].addr); +} + +/** + * Return true if RMA vector specification is valid. Note that the address is + * treated as an offset into an RMA MR window, so a value of zero is valid. + * + * @param vn vector element count + * @param v vector pointer + * + * @return bool true if RMA vector is valid, false otherwise + */ +static inline bool _rma_vector_valid(size_t vn, const struct fi_rma_ioc *v) +{ + return (vn == CXIP_AMO_MAX_IOV && v && + v[0].count == CXIP_AMO_MAX_PACKED_IOV); +} + +static bool cxip_amo_emit_idc_req_needed(uint64_t flags, void *result, + struct cxip_mr *result_mr, + bool fetching_amo_flush) +{ + /* User completion events always require a tracking structure. */ + if (flags & FI_COMPLETION) + return true; + + /* If a fetching operation (i.e. result buffer is valid) and the user + * did not provide an MR for the result arg, internal memory + * registration needs to occur. This requires tracking. + */ + if (result && !result_mr) + return true; + + /* Fetching AMO with flush always requires a request struct since two + * operations are required to implement it. + */ + if (fetching_amo_flush) + return true; + + return false; +} + +/* TODO: Update HMEM buf type for 128-bit AMOs. 
*/ +static int cxip_amo_emit_idc(struct cxip_txc *txc, + enum cxip_amo_req_type req_type, + const struct fi_msg_atomic *msg, void *buf, + void *compare, void *result, + struct cxip_mr *result_mr, uint64_t key, + uint64_t remote_offset, union c_fab_addr *dfa, + uint8_t *idx_ext, uint16_t vni, + enum c_atomic_op atomic_op, + enum c_cswap_op cswap_op, + enum c_atomic_type atomic_type, + unsigned int atomic_type_len, uint64_t flags, + uint32_t tclass) +{ + struct cxip_domain *dom = txc->domain; + struct cxip_md *result_md = NULL; + struct c_cstate_cmd cstate_cmd = {}; + struct c_idc_amo_cmd idc_amo_cmd = {}; + struct cxip_req *req = NULL; + bool flush = !!(flags & (FI_DELIVERY_COMPLETE | FI_MATCH_COMPLETE)); + bool fetching = result != NULL; + bool fetching_amo_flush = fetching && flush; + bool restricted = !!(flags & FI_CXI_UNRELIABLE); + int ret; + void *selective_completion_req; + enum cxi_traffic_class_type tc_type; + uint64_t hmem_buf; + uint64_t hmem_compare; + bool tgt_events = cxip_generic_is_mr_key_events(txc->ep_obj->caps, + key); + + /* MR desc cannot be value unless hybrid MR desc is enabled. */ + if (!dom->hybrid_mr_desc) + result_mr = NULL; + + /* Restricted AMOs must target optimized MRs without target events */ + if (restricted && tgt_events) { + TXC_WARN(txc, + "Restricted AMOs with FI_RMA_EVENT not supported\n"); + return -FI_EINVAL; + } + + /* Usage of the FI_CXI_HRP requires FI_CXI_UNRELIABLE. */ + if (flags & FI_CXI_HRP && !(flags & FI_CXI_UNRELIABLE)) { + TXC_WARN(txc, "FI_CXI_HRP requires FI_CXI_UNRELIABLE\n"); + return -FI_EINVAL; + } + + /* Since fetching AMO with flush results in two commands, if + * FI_RMA_EVENT is enabled, this would results in two remote MR counter + * increments. Thus, this functionality cannot be supported. + */ + if (fetching_amo_flush && tgt_events) { + TXC_WARN(txc, + "Fetching AMO with FI_DELIVERY_COMPLETE not supported with FI_RMA_EVENT\n"); + return -FI_EINVAL; + } + + /* Work around for silent drops at the target for non-fetching + * FI_UNIT32 atomic operations when using FI_CXI_HRP. Force + * switching out of HRP if necessary. + */ + if (txc->hrp_war_req && (flags & FI_CXI_HRP) && + req_type == CXIP_RQ_AMO && msg->datatype == FI_UINT32) + flags &= ~FI_CXI_HRP; + + ofi_genlock_lock(&txc->ep_obj->lock); + if (cxip_amo_emit_idc_req_needed(flags, result, result_mr, + fetching_amo_flush)) { + /* if (result && !result_mr) we end up in this branch */ + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) { + TXC_WARN(txc, "Failed to allocate request\n"); + ret = -FI_EAGAIN; + goto err; + } + + /* Values set here are passed back to the user through the CQ */ + if (flags & FI_COMPLETION) + req->context = (uint64_t)msg->context; + else + req->context = (uint64_t)txc->context; + req->flags = FI_ATOMIC; + req->flags |= (req_type == CXIP_RQ_AMO ? FI_WRITE : FI_READ); + req->flags |= (flags & FI_COMPLETION); + req->cb = _cxip_amo_cb; + req->amo.txc = txc; + req->amo.fetching_amo_flush = fetching_amo_flush; + req->type = CXIP_REQ_AMO; + + /* For fetching AMOs, the result buffer (i.e. fetch buffer) must + * always be registered. 
+ */ + if (result) { + if (result_mr) { + result_md = result_mr->md; + } else { + ret = cxip_map(dom, result, atomic_type_len, 0, + &req->amo.result_md); + if (ret) { + TXC_WARN_RET(txc, ret, + "Failed to map result buffer\n"); + goto err_free_req; + } + + result_md = req->amo.result_md; + } + } + } else if (result_mr) { + result_md = result_mr->md; + } + /* else {result == false} */ + + /* Identify the correct traffic class sub-type. */ + if (flags & FI_CXI_HRP) + tc_type = CXI_TC_TYPE_HRP; + else if (flags & FI_CXI_UNRELIABLE) + tc_type = CXI_TC_TYPE_RESTRICTED; + else + tc_type = CXI_TC_TYPE_DEFAULT; + + /* Prepare the c-state command for the AMO IDC operation. */ + if (result) + cstate_cmd.write_lac = result_md->md->lac; + + cstate_cmd.event_send_disable = 1; + cstate_cmd.index_ext = *idx_ext; + cstate_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cstate_cmd.restricted = restricted; + + /* If a request structure is not allocated, success events will be + * disabled. But, if for some reason the operation completes with an + * error, an event will occur. For this case, a TXC inject request is + * allocated. This request enables the reporting of failed operation to + * the completion queue. This request is freed when the TXC is closed. + */ + if (req) { + cstate_cmd.user_ptr = (uint64_t)req; + } else { + if (req_type == CXIP_RQ_AMO) + selective_completion_req = + cxip_amo_selective_completion_req(txc); + else + selective_completion_req = + cxip_amo_fetching_selective_completion_req(txc); + + if (!selective_completion_req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate selective completion request\n"); + goto err_unmap_result_buf; + } + + cstate_cmd.user_ptr = (uint64_t)selective_completion_req; + cstate_cmd.event_success_disable = 1; + } + + /* Fetching AMO with flushes requires a trailing zero-byte put with + * flush. Normal AMOs can use the operation flush functionality. + */ + if (!fetching_amo_flush) { + if (flush) + cstate_cmd.flush = 1; + + if (req_type == CXIP_RQ_AMO) { + if (txc->write_cntr) { + cstate_cmd.event_ct_ack = 1; + cstate_cmd.ct = txc->write_cntr->ct->ctn; + } + } else { + if (txc->read_cntr) { + cstate_cmd.event_ct_reply = 1; + cstate_cmd.ct = txc->read_cntr->ct->ctn; + } + } + } + + /* Prepare the IDC AMO command. */ + idc_amo_cmd.idc_header.dfa = *dfa; + idc_amo_cmd.idc_header.remote_offset = remote_offset; + idc_amo_cmd.atomic_op = atomic_op; + idc_amo_cmd.atomic_type = atomic_type; + idc_amo_cmd.cswap_op = cswap_op; + + /* if (result) {result_md is set} */ + if (result) + idc_amo_cmd.local_addr = CXI_VA_TO_IOVA(result_md->md, result); + + switch (msg->op) { + case FI_MSWAP: + ret = cxip_txc_copy_from_hmem(txc, NULL, &hmem_buf, buf, + atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, &hmem_compare, compare, + atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + hmem_buf &= hmem_compare; + + /* Note: 16-byte value will overflow into op2_word2 */ + memcpy(&idc_amo_cmd.op1_word1, &hmem_buf, atomic_type_len); + break; + + /* FI_ATOMIC_READ is implemented as a sum of zero. Thus, only copy over + * the buffer contents for non-FI_ATOMIC_READ operations. + */ + case FI_ATOMIC_READ: + break; + + /* FI_ATOMIC_WRITE is implemented as a CSWAP NE operation. For this to + * work, the compare buffer (i.e. 
operand 2) needs to have the same + * contents as the write payload (i.e. operand 1). + */ + case FI_ATOMIC_WRITE: + assert(compare == NULL); + + ret = cxip_txc_copy_from_hmem(txc, NULL, &idc_amo_cmd.op2_word1, + buf, atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + /* Fall through. */ + default: + ret = cxip_txc_copy_from_hmem(txc, NULL, &idc_amo_cmd.op1_word1, + buf, atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + } + + if (compare) { + /* Note: 16-byte value will overflow into op2_word2 */ + ret = cxip_txc_copy_from_hmem(txc, NULL, &idc_amo_cmd.op2_word1, + compare, atomic_type_len); + if (ret) { + TXC_WARN(txc, "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + } + + /* Optionally configure the flushing command used for fetching AMOs. */ + if (fetching_amo_flush) { + assert(req != NULL); + if (req_type == CXIP_RQ_AMO) + req->amo.fetching_amo_flush_cntr = txc->write_cntr; + else + req->amo.fetching_amo_flush_cntr = txc->read_cntr; + } + + ret = cxip_txc_emit_idc_amo(txc, vni, cxip_ofi_to_cxi_tc(tclass), + tc_type, &cstate_cmd, &idc_amo_cmd, flags, + fetching, flush); + if (ret) { + TXC_WARN_RET(txc, ret, "Failed to emit IDC amo\n"); + goto err_unmap_result_buf; + } + + ofi_genlock_unlock(&txc->ep_obj->lock); + + return FI_SUCCESS; + +err_unmap_result_buf: + if (req && req->amo.result_md) + cxip_unmap(req->amo.result_md); +err_free_req: + if (req) + cxip_evtq_req_free(req); +err: + ofi_genlock_unlock(&txc->ep_obj->lock); + + TXC_WARN_RET(txc, ret, + "%s IDC %s failed: atomic_op=%u cswap_op=%u atomic_type=%u buf=%p compare=%p result=%p len=%u roffset=%#lx nid=%#x ep=%u idx_ext=%u\n", + restricted ? "Restricted" : "Unrestricted", + fetching ? "FAMO" : "AMO", atomic_op, cswap_op, + atomic_type, buf, compare, result, atomic_type_len, + remote_offset, dfa->unicast.nid, + dfa->unicast.endpoint_defined, *idx_ext); + + return ret; +} + +static bool cxip_amo_emit_dma_req_needed(const struct fi_msg_atomic *msg, + uint64_t flags, void *result, + struct cxip_mr *buf_mr, + struct cxip_mr *result_mr, + bool fetching_amo_flush) +{ + /* To support FI_INJECt + DMA operations, an internal bounce buffer is + * needed. This buffer is tracked in the request structure. + */ + if (flags & FI_INJECT) + return true; + + /* User completion events always require a tracking structure. */ + if (flags & FI_COMPLETION) + return true; + + /* If the user did not provide an MR for the buffer arg, internal memory + * registration needs to occur. This requires tracking. + */ + if (!buf_mr) + return true; + + /* If a fetching operation (i.e. result buffer is valid) and the user + * did not provide an MR for the result arg, internal memory + * registration needs to occur. This requires tracking. + */ + if (result && !result_mr) + return true; + + /* FI_ATOMIC_READ and FI_MSWAP are require the use of an internal bounce + * buffer. This requires tracking. + */ + if (msg->op == FI_ATOMIC_READ || msg->op == FI_MSWAP) + return true; + + /* Fetching AMO with flush always requires a request struct since two + * operations are required to implement it. + */ + if (fetching_amo_flush) + return true; + + return false; +} + +/* TODO: Update HMEM buf type for 128-bit AMOs. 
*/ +static int cxip_amo_emit_dma(struct cxip_txc *txc, + enum cxip_amo_req_type req_type, + const struct fi_msg_atomic *msg, void *buf, + void *compare, void *result, + struct cxip_mr *buf_mr, struct cxip_mr *result_mr, + uint64_t key, uint64_t remote_offset, + union c_fab_addr *dfa, uint8_t *idx_ext, + uint16_t vni, enum c_atomic_op atomic_op, + enum c_cswap_op cswap_op, + enum c_atomic_type atomic_type, + unsigned int atomic_type_len, uint64_t flags, + uint32_t tclass, bool triggered, + uint64_t trig_thresh, struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr) +{ + struct cxip_domain *dom = txc->domain; + struct c_dma_amo_cmd dma_amo_cmd = {}; + bool flush = !!(flags & (FI_DELIVERY_COMPLETE | FI_MATCH_COMPLETE)); + bool fetching = result != NULL; + bool fetching_amo_flush = fetching && flush; + struct cxip_req *req; + struct cxip_cntr *cntr; + int ret; + uint64_t hmem_buf; + uint64_t hmem_compare; + struct cxip_md *buf_md; + struct cxip_md *result_md = NULL; + void *selective_completion_req; + + /* MR desc cannot be value unless hybrid MR desc is enabled. */ + if (!dom->hybrid_mr_desc) { + buf_mr = NULL; + result_mr = NULL; + } + + /* Since fetching AMO with flush results in two commands, if the + * target MR needs events, this would results in two remote MR counter + * increments. Thus, this functionality cannot be supported. + */ + if (fetching_amo_flush && + cxip_generic_is_mr_key_events(txc->ep_obj->caps, key)) { + TXC_WARN(txc, + "Fetching AMO with FI_DELIVERY_COMPLETE not supported with FI_RMA_EVENT\n"); + return -FI_EINVAL; + } + + ofi_genlock_lock(&txc->ep_obj->lock); + if (cxip_amo_emit_dma_req_needed(msg, flags, result, buf_mr, result_mr, + fetching_amo_flush)) { + /* if (result && !result_mr) we end up in this branch */ + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) { + ret = -FI_EAGAIN; + TXC_WARN_RET(txc, ret, "Failed to allocate request\n"); + goto err; + } + + /* Values set here are passed back to the user through the CQ */ + if (flags & FI_COMPLETION) + req->context = (uint64_t)msg->context; + else + req->context = (uint64_t)txc->context; + req->flags = FI_ATOMIC; + req->flags |= (req_type == CXIP_RQ_AMO ? FI_WRITE : FI_READ); + req->flags |= (flags & FI_COMPLETION); + req->cb = _cxip_amo_cb; + req->amo.txc = txc; + req->amo.fetching_amo_flush = fetching_amo_flush; + req->type = CXIP_REQ_AMO; + req->trig_cntr = trig_cntr; + + /* Optionally register result MR. */ + if (result) { + if (!result_mr) { + ret = cxip_map(dom, result, atomic_type_len, 0, + &req->amo.result_md); + if (ret) { + TXC_WARN(txc, + "Failed to map result buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_req; + } + + result_md = req->amo.result_md; + } else { + result_md = result_mr->md; + } + } + + if ((flags & FI_INJECT) || msg->op == FI_ATOMIC_READ || + msg->op == FI_MSWAP) { + /* To support FI_INJECT ot FI_ATOMIC_READ with matching + * AMO commands, an internal buffer is needed to store + * the payload. + */ + req->amo.ibuf = cxip_txc_ibuf_alloc(txc); + if (!req->amo.ibuf) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate ibuf: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + switch (msg->op) { + /* FI_ATOMIC_READ is implemented as a sum of zero. Thus, + * zero internal buffer which is used for the sum + * operand. 
+ */ + case FI_ATOMIC_READ: + memset(req->amo.ibuf, 0, atomic_type_len); + break; + + case FI_MSWAP: + ret = cxip_txc_copy_from_hmem(txc, NULL, + &hmem_buf, buf, + atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, + &hmem_compare, + compare, + atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + + hmem_buf &= hmem_compare; + + memcpy(req->amo.ibuf, &hmem_buf, + atomic_type_len); + break; + + /* Copy over user payload for FI_INJECT operation. */ + default: + ret = cxip_txc_copy_from_hmem(txc, NULL, + req->amo.ibuf, + buf, + atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + } + + buf = req->amo.ibuf; + buf_md = cxip_txc_ibuf_md(req->amo.ibuf); + } else if (buf_mr) { + buf_md = buf_mr->md; + } else { + /* Map user operand buffer for DMA command. */ + ret = cxip_map(dom, buf, atomic_type_len, 0, + &req->amo.oper1_md); + if (ret) { + TXC_WARN(txc, + "Failed to map operand buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_unmap_result_buf; + } + + buf_md = req->amo.oper1_md; + } + } else { + req = NULL; + + if (result) + result_md = result_mr->md; + + buf_md = buf_mr->md; + } + + /* Build up the matching AMO command. */ + dma_amo_cmd.dfa = *dfa; + dma_amo_cmd.index_ext = *idx_ext; + dma_amo_cmd.event_send_disable = 1; + dma_amo_cmd.remote_offset = remote_offset; + dma_amo_cmd.request_len = atomic_type_len; + dma_amo_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + dma_amo_cmd.match_bits = CXIP_KEY_MATCH_BITS(key); + dma_amo_cmd.atomic_op = atomic_op; + dma_amo_cmd.atomic_type = atomic_type; + dma_amo_cmd.cswap_op = cswap_op; + dma_amo_cmd.local_read_addr = CXI_VA_TO_IOVA(buf_md->md, buf); + dma_amo_cmd.lac = buf_md->md->lac; + + if (req) { + dma_amo_cmd.user_ptr = (uint64_t)req; + } else { + if (req_type == CXIP_RQ_AMO) + selective_completion_req = + cxip_amo_selective_completion_req(txc); + else + selective_completion_req = + cxip_amo_fetching_selective_completion_req(txc); + + if (!selective_completion_req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate selective completion request\n"); + goto err_unmap_operand_buf; + } + + dma_amo_cmd.user_ptr = (uint64_t)selective_completion_req; + dma_amo_cmd.event_success_disable = 1; + } + + /* FI_ATOMIC_WRITE is implemented as a CSWAP NE operation. For this to + * work, the compare buffer (i.e. operand 2) needs to have the same + * contents as the write payload (i.e. operand 1). 
+ */ + if (msg->op == FI_ATOMIC_WRITE) { + assert(compare == NULL); + + ret = cxip_txc_copy_from_hmem(txc, NULL, &dma_amo_cmd.op2_word1, + buf, atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + } else if (compare) { + /* Note: 16-byte value will overflow into op2_word2 */ + ret = cxip_txc_copy_from_hmem(txc, NULL, &dma_amo_cmd.op2_word1, + compare, atomic_type_len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_unmap_operand_buf; + } + } + + /* if (result) {result_md is set} */ + if (result) { + dma_amo_cmd.local_write_addr = + CXI_VA_TO_IOVA(result_md->md, result); + dma_amo_cmd.write_lac = result_md->md->lac; + } + + /* Fetching AMO with flushes requires a trailing zero-byte put with + * Normal AMOs can use the operation flush functionality. + */ + if (!fetching_amo_flush) { + dma_amo_cmd.flush = flush; + + if (req_type == CXIP_RQ_AMO) { + cntr = triggered ? comp_cntr : txc->write_cntr; + + if (cntr) { + dma_amo_cmd.event_ct_ack = 1; + dma_amo_cmd.ct = cntr->ct->ctn; + } + } else { + cntr = triggered ? comp_cntr : txc->read_cntr; + + if (cntr) { + dma_amo_cmd.event_ct_reply = 1; + dma_amo_cmd.ct = cntr->ct->ctn; + } + } + } + + /* Optionally configure the flushing command used for fetching AMOs. */ + if (fetching_amo_flush) { + assert(req != NULL); + + if (req_type == CXIP_RQ_AMO) + req->amo.fetching_amo_flush_cntr = txc->write_cntr; + else + req->amo.fetching_amo_flush_cntr = txc->read_cntr; + } + + ret = cxip_txc_emit_dma_amo(txc, vni, cxip_ofi_to_cxi_tc(tclass), + CXI_TC_TYPE_DEFAULT, trig_cntr, trig_thresh, + &dma_amo_cmd, flags, fetching, flush); + if (ret) { + TXC_WARN_RET(txc, ret, "Failed to emit AMO\n"); + goto err_unmap_operand_buf; + } + + ofi_genlock_unlock(&txc->ep_obj->lock); + + return FI_SUCCESS; + +err_unmap_operand_buf: + if (req) { + if (req->amo.ibuf) + cxip_txc_ibuf_free(txc, req->amo.ibuf); + else + cxip_unmap(req->amo.oper1_md); + } +err_unmap_result_buf: + if (req && req->amo.result_md) + cxip_unmap(req->amo.result_md); +err_free_req: + if (req) + cxip_evtq_req_free(req); +err: + ofi_genlock_unlock(&txc->ep_obj->lock); + + TXC_WARN_RET(txc, ret, + "%s %s failed: atomic_op=%u cswap_op=%u atomic_type=%u buf=%p compare=%p result=%p len=%u rkey=%#lx roffset=%#lx nid=%#x ep=%u idx_ext=%u\n", + triggered ? "Triggered" : "DMA", fetching ? "FAMO" : "AMO", + atomic_op, cswap_op, atomic_type, buf, compare, result, + atomic_type_len, key, remote_offset, dfa->unicast.nid, + dfa->unicast.endpoint_defined, *idx_ext); + + return ret; +} + +static bool cxip_amo_is_idc(struct cxip_txc *txc, uint64_t key, bool triggered) +{ + /* Triggered AMOs can never be IDCs. */ + if (triggered) + return false; + + /* Only optimized MR can be used for IDCs. 
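+ * Standard MRs rely on match bits, which the DMA AMO command carries
+ * (see cxip_amo_emit_dma()) but the IDC AMO command does not.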
*/ + return cxip_generic_is_mr_key_opt(key); +} + +int cxip_amo_common(enum cxip_amo_req_type req_type, struct cxip_txc *txc, + uint32_t tclass, const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **comparedesc, + size_t compare_count, const struct fi_ioc *resultv, + void **resultdesc, size_t result_count, uint64_t flags, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, struct cxip_cntr *comp_cntr) +{ + void *buf; + void *compare = NULL; + void *result = NULL; + uint64_t remote_offset; + uint64_t key; + bool idc; + enum c_atomic_op atomic_op; + enum c_cswap_op cswap_op; + enum c_atomic_type atomic_type; + unsigned int atomic_type_len; + struct cxip_addr caddr; + int ret; + union c_fab_addr dfa; + uint8_t idx_ext; + uint32_t pid_idx; + struct cxip_mr *buf_mr = NULL; + struct cxip_mr *result_mr = NULL; + uint16_t vni; + + if (!msg) { + TXC_WARN(txc, "NULL fi_msg_atomic"); + return -FI_EINVAL; + } + + switch (req_type) { + case CXIP_RQ_AMO_SWAP: + /* Must have a valid compare address */ + if (!_vector_valid(compare_count, comparev)) { + TXC_WARN(txc, "compare IOV invalid\n"); + return -FI_EINVAL; + } + + compare = comparev[0].addr; + + /* FALLTHRU */ + case CXIP_RQ_AMO_FETCH: + case CXIP_RQ_AMO_PCIE_FETCH: + /* Must have a valid result address */ + if (!_vector_valid(result_count, resultv)) { + TXC_WARN(txc, "result IOV invalid\n"); + return -FI_EINVAL; + } + + result = resultv[0].addr; + if (resultdesc && resultdesc[0]) + result_mr = resultdesc[0]; + + /* FALLTHRU */ + case CXIP_RQ_AMO: + if (msg->op != FI_ATOMIC_READ) { + if (!_vector_valid(msg->iov_count, msg->msg_iov)) { + TXC_WARN(txc, "msg IOV invalid\n"); + return -FI_EINVAL; + } + buf = msg->msg_iov[0].addr; + if (msg->desc && msg->desc[0]) + buf_mr = msg->desc[0]; + } else { + buf = NULL; + buf_mr = NULL; + } + + /* The supplied RMA address is actually an offset into a + * registered MR. A value of 0 is valid. 
+ */ + if (!_rma_vector_valid(msg->rma_iov_count, msg->rma_iov)) { + TXC_WARN(txc, "RMA IOV invalid\n"); + return -FI_EINVAL; + } + + remote_offset = msg->rma_iov[0].addr; + key = msg->rma_iov[0].key; + + ret = cxip_adjust_remote_offset(&remote_offset, key); + if (ret) { + TXC_WARN(txc, "RMA IOV address overflow\n"); + return -FI_EINVAL; + } + break; + + default: + TXC_WARN(txc, "Invalid AMO request type: %d\n", req_type); + return -FI_EINVAL; + } + + if (!cxip_generic_is_valid_mr_key(key)) { + TXC_WARN(txc, "Invalid remote key: 0x%lx\n", key); + return -FI_EKEYREJECTED; + } + + idc = cxip_amo_is_idc(txc, key, triggered); + + /* Convert FI to CXI codes, fail if operation not supported */ + ret = _cxip_atomic_opcode(req_type, msg->datatype, msg->op, + txc->domain->amo_remap_to_pcie_fadd, + &atomic_op, &atomic_type, &cswap_op, + &atomic_type_len); + if (ret < 0) { + TXC_WARN_RET(txc, ret, "Failed to generate CXI AMO opcodes\n"); + return ret; + } + + /* Look up target CXI address */ + ret = cxip_av_lookup_addr(txc->ep_obj->av, msg->addr, &caddr); + if (ret != FI_SUCCESS) { + TXC_WARN_RET(txc, ret, "Failed to look up dst FI addr\n"); + return ret; + } + + if (txc->ep_obj->av_auth_key) + vni = caddr.vni; + else + vni = txc->ep_obj->auth_key.vni; + + pid_idx = cxip_generic_mr_key_to_ptl_idx(txc->domain, key, !result); + cxi_build_dfa(caddr.nic, caddr.pid, txc->pid_bits, pid_idx, &dfa, + &idx_ext); + if (idc) + ret = cxip_amo_emit_idc(txc, req_type, msg, buf, compare, + result, result_mr, key, remote_offset, + &dfa, &idx_ext, vni, atomic_op, + cswap_op, atomic_type, atomic_type_len, + flags, tclass); + else + ret = cxip_amo_emit_dma(txc, req_type, msg, buf, compare, + result, buf_mr, result_mr, key, + remote_offset, &dfa, &idx_ext, vni, + atomic_op, cswap_op, atomic_type, + atomic_type_len, flags, tclass, + triggered, trig_thresh, trig_cntr, + comp_cntr); + if (ret) + TXC_WARN_RET(txc, ret, + "%s AMO failed: op=%u buf=%p compare=%p result=%p len=%u rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u triggered=%u", + idc ? "IDC" : "DMA", msg->op, buf, compare, result, + atomic_type_len, key, remote_offset, caddr.nic, + caddr.pid, pid_idx, triggered); + else + TXC_DBG(txc, + "%s AMO emitted: op=%u buf=%p compare=%p result=%p len=%u rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u triggered=%u", + idc ? 
"IDC" : "DMA", msg->op, buf, compare, result, + atomic_type_len, key, remote_offset, caddr.nic, + caddr.pid, pid_idx, triggered); + + return ret; +} + +/* + * Libfabric APIs + */ +static ssize_t cxip_ep_atomic_write(struct fid_ep *fid_ep, const void *buf, + size_t count, void *desc, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_ioc oper1 = { + .addr = (void *)buf, + .count = count + }; + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = &oper1, + .desc = &desc, + .iov_count = 1, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, + NULL, NULL, 0, ep->tx_attr.op_flags, false, + 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_writev(struct fid_ep *fid_ep, + const struct fi_ioc *iov, void **desc, + size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = iov, + .desc = desc, + .iov_count = count, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, NULL, + NULL, 0, ep->tx_attr.op_flags, false, 0, + NULL, NULL); +} + +static ssize_t cxip_ep_atomic_writemsg(struct fid_ep *fid_ep, + const struct fi_msg_atomic *msg, + uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + + if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | + FI_CXI_UNRELIABLE | + FI_CXI_HRP | FI_CXI_WEAK_FENCE)) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_amo_common(CXIP_RQ_AMO, txc, ep->tx_attr.tclass, msg, + NULL, NULL, 0, NULL, NULL, 0, flags, false, 0, + NULL, NULL); +} + +static ssize_t cxip_ep_atomic_inject(struct fid_ep *fid_ep, const void *buf, + size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_ioc oper1 = { + .addr = (void *)buf, + .count = count + }; + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = &oper1, + .desc = NULL, + .iov_count = 1, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = NULL + }; + + return cxip_amo_common(CXIP_RQ_AMO, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, NULL, + NULL, 0, FI_INJECT, false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_readwrite(struct fid_ep *fid_ep, const void *buf, + size_t count, void *desc, void *result, + void *result_desc, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_ioc oper1 = { + .addr = (void *)buf, + .count = count + }; + struct fi_ioc resultv = { + .addr = result, + .count = count + }; + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = &oper1, + .desc = &desc, + .iov_count = 1, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO_FETCH, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, + &resultv, &result_desc, 1, ep->tx_attr.op_flags, + false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_readwritev(struct fid_ep *fid_ep, + const struct fi_ioc *iov, + void **desc, size_t count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = iov, + .desc = desc, + .iov_count = count, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO_FETCH, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, NULL, NULL, 0, resultv, + result_desc, result_count, ep->tx_attr.op_flags, + false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_readwritemsg(struct fid_ep *fid_ep, + const struct fi_msg_atomic *msg, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + enum cxip_amo_req_type req_type; + + if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | + FI_CXI_UNRELIABLE | FI_CXI_WEAK_FENCE | + FI_CXI_PCIE_AMO)) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
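+ *
+ * For reference, a hypothetical caller-side PCIe fetch-add using the
+ * FI_CXI_PCIE_AMO flag accepted above (per _cxip_atomic_opcode(), only
+ * fetching FI_SUM is remapped to a PCIe AMO):
+ *
+ *   msg.op = FI_SUM;
+ *   fi_fetch_atomicmsg(ep, &msg, resultv, NULL, 1, FI_CXI_PCIE_AMO);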
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + if (flags & FI_CXI_PCIE_AMO) + req_type = CXIP_RQ_AMO_PCIE_FETCH; + else + req_type = CXIP_RQ_AMO_FETCH; + + return cxip_amo_common(req_type, txc, ep->tx_attr.tclass, msg, NULL, + NULL, 0, resultv, result_desc, result_count, + flags, false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_compwrite(struct fid_ep *fid_ep, const void *buf, + size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_ioc oper1 = { + .addr = (void *)buf, + .count = count + }; + struct fi_ioc comparev = { + .addr = (void *)compare, + .count = count + }; + struct fi_ioc resultv = { + .addr = result, + .count = count + }; + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = &oper1, + .desc = &desc, + .iov_count = 1, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO_SWAP, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, &comparev, + &result_desc, 1, &resultv, &result_desc, 1, + ep->tx_attr.op_flags, false, 0, NULL, NULL); +} + +static ssize_t cxip_ep_atomic_compwritev(struct fid_ep *fid_ep, + const struct fi_ioc *iov, void **desc, + size_t count, + const struct fi_ioc *comparev, + void **compare_desc, + size_t compare_count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + fi_addr_t dest_addr, uint64_t addr, + uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct fi_rma_ioc rma = { + .addr = addr, + .count = 1, + .key = key + }; + struct fi_msg_atomic msg = { + .msg_iov = iov, + .desc = desc, + .iov_count = count, + .addr = dest_addr, + .rma_iov = &rma, + .rma_iov_count = 1, + .datatype = datatype, + .op = op, + .context = context + }; + + return cxip_amo_common(CXIP_RQ_AMO_SWAP, &ep->ep_obj->txc, + ep->tx_attr.tclass, &msg, comparev, compare_desc, + compare_count, resultv, result_desc, + result_count, ep->tx_attr.op_flags, false, 0, + NULL, NULL); +} + +static ssize_t +cxip_ep_atomic_compwritemsg(struct fid_ep *fid_ep, + const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **compare_desc, + size_t compare_count, struct fi_ioc *resultv, + void **result_desc, size_t result_count, + uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + + if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | + FI_CXI_UNRELIABLE | FI_CXI_WEAK_FENCE)) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_amo_common(CXIP_RQ_AMO_SWAP, txc, ep->tx_attr.tclass, msg, + comparev, compare_desc, compare_count, resultv, + result_desc, result_count, flags, false, 0, + NULL, NULL); +} + +static int cxip_ep_atomic_valid(struct fid_ep *ep, + enum fi_datatype datatype, + enum fi_op op, + size_t *count) +{ + return _cxip_ep_valid(ep, CXIP_RQ_AMO, datatype, op, count); +} + +static int cxip_ep_fetch_atomic_valid(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, + size_t *count) +{ + return _cxip_ep_valid(ep, CXIP_RQ_AMO_FETCH, datatype, op, count); +} + +static int cxip_ep_comp_atomic_valid(struct fid_ep *ep, + enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + return _cxip_ep_valid(ep, CXIP_RQ_AMO_SWAP, datatype, op, count); +} + +struct fi_ops_atomic cxip_ep_atomic_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = cxip_ep_atomic_write, + .writev = cxip_ep_atomic_writev, + .writemsg = cxip_ep_atomic_writemsg, + .inject = cxip_ep_atomic_inject, + .readwrite = cxip_ep_atomic_readwrite, + .readwritev = cxip_ep_atomic_readwritev, + .readwritemsg = cxip_ep_atomic_readwritemsg, + .compwrite = cxip_ep_atomic_compwrite, + .compwritev = cxip_ep_atomic_compwritev, + .compwritemsg = cxip_ep_atomic_compwritemsg, + .writevalid = cxip_ep_atomic_valid, + .readwritevalid = cxip_ep_fetch_atomic_valid, + .compwritevalid = cxip_ep_comp_atomic_valid, +}; + +struct fi_ops_atomic cxip_ep_atomic_no_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = fi_no_atomic_write, + .writev = fi_no_atomic_writev, + .writemsg = fi_no_atomic_writemsg, + .inject = fi_no_atomic_inject, + .readwrite = fi_no_atomic_readwrite, + .readwritev = fi_no_atomic_readwritev, + .readwritemsg = fi_no_atomic_readwritemsg, + .compwrite = fi_no_atomic_compwrite, + .compwritev = fi_no_atomic_compwritev, + .compwritemsg = fi_no_atomic_compwritemsg, + .writevalid = fi_no_atomic_writevalid, + .readwritevalid = fi_no_atomic_readwritevalid, + .compwritevalid = fi_no_atomic_compwritevalid, +}; diff --git a/prov/cxi/src/cxip_av.c b/prov/cxi/src/cxip_av.c new file mode 100644 index 00000000000..6dd4aa4e415 --- /dev/null +++ b/prov/cxi/src/cxip_av.c @@ -0,0 +1,947 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2018,2020 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxip.h" + +#include "ofi_osd.h" +#include "ofi_util.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_AV, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_AV, __VA_ARGS__) + +/* + * cxip_parse_cxi_addr() - Parse node and service arguments representing a CXI + * address. 
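+ * The node string may be an Ethernet MAC (converted with ether_aton()) or a
+ * numeric NIC address; the service string, when given, is parsed as a numeric
+ * PID, otherwise C_PID_ANY is used.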
+ */ +static int cxip_parse_cxi_addr(const char *node, const char *service, + struct cxip_addr *addr) +{ + struct ether_addr *mac; + uint32_t scan_nic; + uint32_t scan_pid; + + if (!node) + return -FI_ENODATA; + + mac = ether_aton(node); + if (mac) { + addr->nic = cxip_mac_to_nic(mac); + } else if (sscanf(node, "%i", &scan_nic) == 1) { + addr->nic = scan_nic; + } else { + return -FI_ENODATA; + } + + if (!service) + addr->pid = C_PID_ANY; + else if (sscanf(service, "%i", &scan_pid) == 1) + addr->pid = scan_pid; + else + return -FI_ENODATA; + + return FI_SUCCESS; +} + +static inline void cxip_av_read_lock(struct cxip_av *av) +{ + if (!av->lockless) + pthread_rwlock_rdlock(&av->lock); +} + +static inline void cxip_av_write_lock(struct cxip_av *av) +{ + if (!av->lockless) + pthread_rwlock_wrlock(&av->lock); +} + +static inline void cxip_av_unlock(struct cxip_av *av) +{ + if (!av->lockless) + pthread_rwlock_unlock(&av->lock); +} + +static int cxip_av_insert_addr(struct cxip_av *av, struct cxip_addr *addr, + fi_addr_t *fi_addr, uint64_t flags) +{ + struct cxip_av_entry *entry; + struct cxip_av_auth_key_entry *auth_key_entry = NULL; + struct cxip_addr auth_key_addr = { + .nic = addr->nic, + .pid = addr->pid + }; + + if (flags & FI_AUTH_KEY) { + auth_key_entry = + ofi_bufpool_get_ibuf(av->auth_key_entry_pool, *fi_addr); + if (!auth_key_entry) { + CXIP_WARN("Failed to find auth_key entry\n"); + return -FI_EINVAL; + } + + auth_key_addr.vni = auth_key_entry->key.vni; + } + + CXIP_DBG("Inserting nid=%#x pid=%d vni=%d\n", auth_key_addr.nic, + auth_key_addr.pid, auth_key_addr.vni); + + HASH_FIND(hh, av->av_entry_hash, &auth_key_addr, sizeof(auth_key_addr), + entry); + if (entry) { + if (fi_addr) + *fi_addr = ofi_buf_index(entry); + if (ofi_atomic_inc32(&entry->use_cnt) > 1) + CXIP_WARN("nid=%#x pid=%d inserted multiple times\n", + addr->nic, addr->pid); + + return FI_SUCCESS; + } + + entry = ofi_ibuf_alloc(av->av_entry_pool); + if (!entry) { + CXIP_WARN("Failed to allocated AV entry memory\n"); + if (fi_addr) + *fi_addr = FI_ADDR_NOTAVAIL; + return -FI_ENOMEM; + } + + memcpy(&entry->addr, &auth_key_addr, sizeof(auth_key_addr)); + ofi_atomic_initialize32(&entry->use_cnt, 1); + HASH_ADD(hh, av->av_entry_hash, addr, sizeof(entry->addr), entry); + + if (flags & FI_AV_USER_ID) + entry->fi_addr = *fi_addr; + else if (av->av_user_id) + entry->fi_addr = FI_ADDR_UNSPEC; + else + entry->fi_addr = ofi_buf_index(entry); + + if (fi_addr) + *fi_addr = ofi_buf_index(entry); + + if (auth_key_entry) { + entry->auth_key = auth_key_entry; + ofi_atomic_inc32(&auth_key_entry->ref_cnt); + } + + ofi_atomic_inc32(&av->av_entry_cnt); + + return FI_SUCCESS; +} + +#define AV_INSERT_VALID_FLAGS (FI_MORE | FI_AV_USER_ID | FI_AUTH_KEY) + +static int cxip_av_insert_validate_args(struct fid_av *fid, const void *addr_in, + size_t count, fi_addr_t *fi_addr, + uint64_t flags, void *context) +{ + uint64_t unsupported_flags = flags & ~AV_INSERT_VALID_FLAGS; + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + + if (!addr_in && count) { + CXIP_WARN("NULL addr buffer\n"); + return -FI_EINVAL; + } + + if (unsupported_flags) { + CXIP_WARN("Unsupported AV insert flags: %#lx\n", + unsupported_flags); + return -FI_EINVAL; + } + + /* FI_SYMMETRIC is an optimization using logical matching. This avoids + * doing a reverse lookup for support FI_SOURCE. Since no lookup + * occurs, FI_AV_USER_ID cannot be support. 
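+ * The validation below therefore rejects FI_AV_USER_ID when the AV was
+ * opened with FI_SYMMETRIC.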
+ */ + if (av->symmetric && (flags & FI_AV_USER_ID)) { + CXIP_WARN("FI_SYMMETRIC not supported with FI_AV_USER_ID\n"); + return -FI_EINVAL; + } + + if (av->av_user_id && (flags & FI_AV_USER_ID)) { + CXIP_WARN("FI_AV_USER_ID insert flags not supported with AV opened with FI_AV_USER_ID\n"); + return -FI_EINVAL; + } + + if (!fi_addr && (flags & FI_AV_USER_ID)) { + CXIP_WARN("NULL fi_addr with FI_AV_USER_ID\n"); + return -FI_EINVAL; + } + + if (!av->av_auth_key && (flags & FI_AUTH_KEY)) { + CXIP_WARN("FI_AUTH_KEY requested without FI_AV_AUTH_KEY domain config\n"); + return -FI_EINVAL; + } + + if (av->av_auth_key && !(flags & FI_AUTH_KEY)) { + CXIP_WARN("FI_AUTH_KEY must be used for AVs configured with FI_AV_AUTH_KEY\n"); + return -FI_EINVAL; + } + + if ((flags & FI_AUTH_KEY) && (flags & FI_AV_USER_ID)) { + CXIP_WARN("FI_AUTH_KEY and FI_AV_USER_ID are not supported together\n"); + return -FI_EINVAL; + } + + if ((flags & FI_AUTH_KEY) && !fi_addr) { + CXIP_WARN("NULL fi_addr array used with FI_AUTH_KEY\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +/* NETSIM collectives simulation reqires many-to-one fi_addr to cxip_addr, + * i.e., multiple fi_addr values that resolve to the same target address. The + * new reverse-lookup model requires unique one-to-one, i.e. every cxip_addr + * must be unique. These filter functions allow insert/lookup modifications + * of the values by replacing these functions in the test code. + */ +static struct cxip_addr *insert_in(const void *addr_in) +{ + return (struct cxip_addr *)addr_in; +} +static void insert_out(struct cxip_addr *addr_out, + struct cxip_addr *addr_in) +{ + *addr_out = *addr_in; +} +struct cxip_addr *(*cxip_av_addr_in)(const void *addr) = insert_in; +void (*cxip_av_addr_out)(struct cxip_addr *addr_out, + struct cxip_addr *addr) = insert_out; + +static int cxip_av_insert(struct fid_av *fid, const void *addr_in, size_t count, + fi_addr_t *fi_addr, uint64_t flags, void *context) +{ + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + size_t i; + size_t success_cnt = 0; + int ret; + + ret = cxip_av_insert_validate_args(fid, addr_in, count, fi_addr, flags, + context); + if (ret != FI_SUCCESS) + return ret; + + cxip_av_write_lock(av); + + for (i = 0; i < count; i++) { + ret = cxip_av_insert_addr(av, cxip_av_addr_in(addr_in) + i, + fi_addr ? 
&fi_addr[i] : NULL, flags); + if (ret == FI_SUCCESS) + success_cnt++; + } + + cxip_av_unlock(av); + + return success_cnt; +} + +static int cxip_av_insertsvc_validate_args(struct fid_av *fid, const char *node, + const char *service, + fi_addr_t *fi_addr, uint64_t flags, + void *context) +{ + if (!node) { + CXIP_WARN("NULL node\n"); + return -FI_EINVAL; + } + + if (!service) { + CXIP_WARN("NULL service\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int cxip_av_insertsvc(struct fid_av *fid, const char *node, + const char *service, fi_addr_t *fi_addr, + uint64_t flags, void *context) +{ + struct cxip_addr addr = {}; + int ret; + + ret = cxip_av_insertsvc_validate_args(fid, node, service, fi_addr, + flags, context); + if (ret != FI_SUCCESS) + return ret; + + ret = cxip_parse_cxi_addr(node, service, &addr); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to parse node %s and service %s\n", node, + service); + return ret; + } + + return cxip_av_insert(fid, &addr, 1, fi_addr, flags, context); +} + +int cxip_av_lookup_addr(struct cxip_av *av, fi_addr_t fi_addr, + struct cxip_addr *addr) +{ + struct cxip_av_entry *entry; + + cxip_av_read_lock(av); + + entry = ofi_bufpool_get_ibuf(av->av_entry_pool, fi_addr); + if (entry && addr) + cxip_av_addr_out(addr, &entry->addr); + + cxip_av_unlock(av); + + if (entry) + return FI_SUCCESS; + + CXIP_WARN("Invalid fi_addr %#lx\n", fi_addr); + + return -FI_EINVAL; +} + +fi_addr_t cxip_av_lookup_fi_addr(struct cxip_av *av, + const struct cxip_addr *addr) +{ + struct cxip_av_entry *entry; + struct cxip_addr lookup_addr = *addr; + fi_addr_t fi_addr; + + /* Non-zero VNIs being inserted into the auth_key is ONLY supported with + * FI_AV_AUTH_KEY. + */ + if (!av->av_auth_key) + lookup_addr.vni = 0; + + cxip_av_read_lock(av); + + HASH_FIND(hh, av->av_entry_hash, &lookup_addr, sizeof(lookup_addr), + entry); + fi_addr = entry ? 
entry->fi_addr : FI_ADDR_NOTAVAIL; + + cxip_av_unlock(av); + + return fi_addr; +} + +int cxip_av_bind_ep(struct cxip_av *av, struct cxip_ep *ep) +{ + int ret; + + if (av->domain != ep->ep_obj->domain) { + CXIP_WARN("EP belongs to different domain\n"); + return -FI_EINVAL; + } + + cxip_av_write_lock(av); + ret = fid_list_insert(&av->ep_list, NULL, &ep->ep.fid); + cxip_av_unlock(av); + + if (ret != FI_SUCCESS) { + CXIP_WARN("EP bind failed: %d\n", ret); + return ret; + } + + ofi_atomic_inc32(&av->ref); + return FI_SUCCESS; +} + +void cxip_av_unbind_ep(struct cxip_av *av, struct cxip_ep *ep) +{ + cxip_av_write_lock(av); + fid_list_remove(&av->ep_list, NULL, &ep->ep.fid); + cxip_av_unlock(av); + + ofi_atomic_dec32(&av->ref); +} + +static int cxip_av_lookup(struct fid_av *fid, fi_addr_t fi_addr, void *addr_out, + size_t *addrlen) +{ + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + struct cxip_addr addr; + int ret; + + ret = cxip_av_lookup_addr(av, fi_addr, &addr); + if (ret != FI_SUCCESS) + return ret; + + memcpy(addr_out, &addr, MIN(*addrlen, sizeof(addr))); + *addrlen = sizeof(addr); + + return FI_SUCCESS; +} + +static void cxip_av_remove_addr(struct cxip_av *av, fi_addr_t fi_addr) +{ + struct cxip_av_entry *entry; + int use_cnt; + + entry = ofi_bufpool_get_ibuf(av->av_entry_pool, fi_addr); + if (!entry) { + CXIP_WARN("Invalid fi_addr: %#lx\n", fi_addr); + return; + } + + use_cnt = ofi_atomic_dec32(&entry->use_cnt); + if (use_cnt) + return; + + CXIP_DBG("Removing nid=%#x pid=%d\n", entry->addr.nic, + entry->addr.pid); + + if (entry->auth_key) + ofi_atomic_dec32(&entry->auth_key->ref_cnt); + + ofi_atomic_dec32(&av->av_entry_cnt); + HASH_DELETE(hh, av->av_entry_hash, entry); + ofi_ibuf_free(entry); +} + +static int cxip_av_remove_auth_key(struct cxip_av *av, fi_addr_t fi_addr) +{ + struct cxip_av_auth_key_entry *entry; + int use_cnt; + + entry = ofi_bufpool_get_ibuf(av->auth_key_entry_pool, fi_addr); + if (!entry) { + CXIP_WARN("Invalid fi_addr: %#lx\n", fi_addr); + return -FI_EINVAL; + } + + if (ofi_atomic_get32(&entry->ref_cnt)) { + CXIP_WARN("AV auth key still in use\n"); + return -FI_EBUSY; + } + + use_cnt = ofi_atomic_dec32(&entry->use_cnt); + if (use_cnt) + return FI_SUCCESS; + + CXIP_DBG("vni=%d\n", entry->key.vni); + + ofi_atomic_dec32(&av->auth_key_entry_cnt); + dlist_remove(&entry->entry); + HASH_DELETE(hh, av->auth_key_entry_hash, entry); + ofi_ibuf_free(entry); + + return FI_SUCCESS; +} + +#define AV_REMOVE_VALID_FLAGS FI_AUTH_KEY + +static int cxip_av_remove(struct fid_av *fid, fi_addr_t *fi_addr, + size_t count, uint64_t flags) +{ + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + uint64_t unsupported_flags = flags & ~AV_REMOVE_VALID_FLAGS; + size_t i; + int ret; + + if (unsupported_flags) { + CXIP_WARN("Unsupported flags: %#lx\n", unsupported_flags); + return -FI_EINVAL; + } + + cxip_av_write_lock(av); + + for (i = 0; i < count; i++) { + if (flags & FI_AUTH_KEY) { + ret = cxip_av_remove_auth_key(av, fi_addr[i]); + if (ret != FI_SUCCESS) + return ret; + } else { + cxip_av_remove_addr(av, fi_addr[i]); + } + } + + cxip_av_unlock(av); + + return FI_SUCCESS; +} + +static const char *cxip_av_straddr(struct fid_av *fid, const void *addr, + char *buf, size_t *len) +{ + return ofi_straddr(buf, len, FI_ADDR_CXI, addr); +} + +static int cxip_av_close(struct fid *fid) +{ + struct cxip_av *av = container_of(fid, struct cxip_av, av_fid.fid); + struct cxip_domain *dom = av->domain; + + if (ofi_atomic_get32(&av->ref)) + return -FI_EBUSY; + + 
HASH_CLEAR(hh, av->auth_key_entry_hash); + ofi_bufpool_destroy(av->auth_key_entry_pool); + HASH_CLEAR(hh, av->av_entry_hash); + ofi_bufpool_destroy(av->av_entry_pool); + free(av); + + ofi_atomic_dec32(&dom->ref); + + return FI_SUCCESS; +} + +static int cxip_av_insert_auth_key_validate_args(struct cxip_av *cxi_av, + const void *auth_key, + size_t auth_key_size, + fi_addr_t *fi_addr, + uint64_t flags) +{ + struct cxi_auth_key *key = (struct cxi_auth_key *)auth_key; + + if (!cxi_av->av_auth_key) { + CXIP_WARN("Domain not configured with FI_AV_AUTH_KEY\n"); + return -FI_EINVAL; + } + + if (!auth_key) { + CXIP_WARN("NULL auth_key\n"); + return -FI_EINVAL; + } + + if (auth_key_size != sizeof(struct cxi_auth_key)) { + CXIP_WARN("Bad auth_key_size\n"); + return -FI_EINVAL; + } + + if (!fi_addr) { + CXIP_WARN("NULL fi_addr\n"); + return -FI_EINVAL; + } + + if (flags) { + CXIP_WARN("Invalid flags\n"); + return -FI_EINVAL; + } + + if (ofi_atomic_get32(&cxi_av->auth_key_entry_cnt) >= + cxi_av->auth_key_entry_max) { + CXIP_WARN("AV EP max auth key count limit reached\n"); + return -FI_ENOSPC; + } + + return cxip_domain_valid_vni(cxi_av->domain, key->vni); +} + +static int cxip_av_insert_auth_key(struct fid_av *av, const void *auth_key, + size_t auth_key_size, fi_addr_t *fi_addr, + uint64_t flags) +{ + struct cxip_av *cxi_av = container_of(av, struct cxip_av, av_fid); + struct cxip_av_auth_key_entry *entry; + struct cxi_auth_key key; + int ret; + + ret = cxip_av_insert_auth_key_validate_args(cxi_av, auth_key, + auth_key_size, fi_addr, + flags); + if (ret != FI_SUCCESS) + return ret; + + /* Use a bounce buffer for authorization key to clear the service field. + * The service field is not needed for this AV auth key. + */ + memcpy(&key, auth_key, sizeof(key)); + key.svc_id = 0; + + CXIP_DBG("Inserting VNI=%d\n", key.vni); + + cxip_av_write_lock(cxi_av); + + HASH_FIND(hh, cxi_av->auth_key_entry_hash, &key, sizeof(key), entry); + if (entry) { + *fi_addr = ofi_buf_index(entry); + if (ofi_atomic_inc32(&entry->use_cnt) > 1) + CXIP_WARN("vni=%d inserted multiple times\n", key.vni); + + return FI_SUCCESS; + } + + entry = ofi_ibuf_alloc(cxi_av->auth_key_entry_pool); + if (!entry) { + CXIP_WARN("Failed to allocated AV auth key entry memory\n"); + return -FI_ENOMEM; + } + + memcpy(&entry->key, &key, sizeof(key)); + ofi_atomic_initialize32(&entry->use_cnt, 1); + ofi_atomic_initialize32(&entry->ref_cnt, 0); + HASH_ADD(hh, cxi_av->auth_key_entry_hash, key, sizeof(key), entry); + dlist_insert_tail(&entry->entry, &cxi_av->auth_key_entry_list); + ofi_atomic_inc32(&cxi_av->auth_key_entry_cnt); + + if (cxi_av->av_user_id) + entry->fi_addr = FI_ADDR_UNSPEC; + else + entry->fi_addr = ofi_buf_index(entry); + + *fi_addr = ofi_buf_index(entry); + + cxip_av_unlock(cxi_av); + + return FI_SUCCESS; +} + +static int cxip_av_lookup_auth_key_validate_args(struct cxip_av *cxi_av, + fi_addr_t addr, void *auth_key, + size_t *auth_key_size) +{ + if (!cxi_av->av_auth_key) { + CXIP_WARN("Domain not configured with FI_AV_AUTH_KEY\n"); + return -FI_EINVAL; + } + + if (!auth_key) { + CXIP_WARN("NULL auth_key\n"); + return -FI_EINVAL; + } + + if (!auth_key_size) { + CXIP_WARN("NULL auth_key_size\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int cxip_av_lookup_auth_key(struct fid_av *av, fi_addr_t addr, + void *auth_key, size_t *auth_key_size) +{ + struct cxip_av *cxi_av = container_of(av, struct cxip_av, av_fid); + struct cxip_av_auth_key_entry *entry; + int ret; + + ret = cxip_av_lookup_auth_key_validate_args(cxi_av, addr, 
auth_key, + auth_key_size); + if (ret != FI_SUCCESS) + return ret; + + cxip_av_read_lock(cxi_av); + + entry = ofi_bufpool_get_ibuf(cxi_av->auth_key_entry_pool, addr); + if (entry) { + *auth_key_size = MIN(sizeof(entry->key), *auth_key_size); + memcpy(auth_key, &entry->key, *auth_key_size); + *auth_key_size = sizeof(entry->key); + } + + cxip_av_unlock(cxi_av); + + if (entry) + return FI_SUCCESS; + + CXIP_WARN("Invalid fi_addr %#lx\n", addr); + + return -FI_EINVAL; +} + +fi_addr_t cxip_av_lookup_auth_key_fi_addr(struct cxip_av *av, unsigned int vni) +{ + struct cxip_av_auth_key_entry *entry; + struct cxi_auth_key lookup = { + .vni = vni, + }; + fi_addr_t addr; + + if (!av->av_auth_key) + return FI_ADDR_NOTAVAIL; + + cxip_av_read_lock(av); + + HASH_FIND(hh, av->auth_key_entry_hash, &lookup, sizeof(lookup), entry); + addr = entry ? entry->fi_addr : FI_ADDR_NOTAVAIL; + + cxip_av_unlock(av); + + return addr; +} + +int cxip_av_auth_key_get_vnis(struct cxip_av *av, uint16_t **vni, + size_t *vni_count) +{ + uint16_t *vnis; + size_t count; + struct cxip_av_auth_key_entry *entry; + int i; + int ret = FI_SUCCESS; + + cxip_av_read_lock(av); + + count = ofi_atomic_get32(&av->auth_key_entry_cnt); + if (count == 0) { + CXIP_WARN("AV auth key empty\n"); + ret = -FI_EINVAL; + goto unlock_out; + } + + vnis = calloc(count, sizeof(*vnis)); + if (!vnis) { + CXIP_WARN("Failed to allocate auth key VNI memory\n"); + ret = -FI_ENOMEM;; + goto unlock_out; + } + + i = 0; + dlist_foreach_container(&av->auth_key_entry_list, + struct cxip_av_auth_key_entry, entry, entry) { + ofi_atomic_inc32(&entry->ref_cnt); + vnis[i] = entry->key.vni; + i++; + } + + assert(count == i); + + *vni_count = count; + *vni = vnis; + +unlock_out: + cxip_av_unlock(av); + + return ret; +} + +void cxip_av_auth_key_put_vnis(struct cxip_av *av, uint16_t *vni, + size_t vni_count) +{ + size_t i; + struct cxip_av_auth_key_entry *entry; + + cxip_av_read_lock(av); + + for (i = 0; i < vni_count; i++) { + dlist_foreach_container(&av->auth_key_entry_list, + struct cxip_av_auth_key_entry, entry, entry) { + if (entry->key.vni == vni[i]) { + ofi_atomic_dec32(&entry->ref_cnt); + break; + } + } + } + + cxip_av_unlock(av); + + free(vni); +} + +#define AV_SET_USER_ID_VALID_FLAGS FI_AUTH_KEY + +static int cxip_av_set_user_id(struct fid_av *av, fi_addr_t fi_addr, + fi_addr_t user_id, uint64_t flags) +{ + struct cxip_av *cxi_av = container_of(av, struct cxip_av, av_fid); + struct cxip_av_entry *av_entry = NULL; + struct cxip_av_auth_key_entry *auth_key_entry = NULL; + uint64_t unsupported_flags = flags & ~AV_INSERT_VALID_FLAGS; + + if (!cxi_av->av_user_id) { + CXIP_WARN("AV not opened with FI_AV_AUTH_KEY\n"); + return -FI_EINVAL; + } + + if (unsupported_flags) { + CXIP_WARN("Unsupported AV set user id flags: %#lx\n", + unsupported_flags); + return -FI_EINVAL; + } + + cxip_av_write_lock(cxi_av); + + if (flags & FI_AUTH_KEY) { + auth_key_entry = + ofi_bufpool_get_ibuf(cxi_av->auth_key_entry_pool, + fi_addr); + if (auth_key_entry) + auth_key_entry->fi_addr = user_id; + } else { + av_entry = ofi_bufpool_get_ibuf(cxi_av->av_entry_pool, fi_addr); + if (av_entry) + av_entry->fi_addr = user_id; + } + + cxip_av_unlock(cxi_av); + + if (av_entry || auth_key_entry) + return FI_SUCCESS; + + CXIP_WARN("Invalid fi_addr %#lx\n", fi_addr); + + return -FI_EINVAL; +} + +static struct fi_ops_av cxip_av_fid_ops = { + .size = sizeof(struct fi_ops_av), + .insert = cxip_av_insert, + .insertsvc = cxip_av_insertsvc, + .insertsym = fi_no_av_insertsym, + .remove = cxip_av_remove, + .lookup = 
cxip_av_lookup, + .straddr = cxip_av_straddr, + .av_set = cxip_av_set, + .insert_auth_key = cxip_av_insert_auth_key, + .lookup_auth_key = cxip_av_lookup_auth_key, + .set_user_id = cxip_av_set_user_id, +}; + +static struct fi_ops cxip_av_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_av_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static int cxip_av_open_validate_args(struct cxip_domain *dom, + struct fi_av_attr *attr, + struct fid_av **avp, void *context) +{ + if (!attr) { + CXIP_WARN("NULL AV attributes\n"); + return -FI_EINVAL; + } + + if (!avp) { + CXIP_WARN("NULL AV\n"); + return -FI_EINVAL; + } + + if (!dom->av_user_id && (attr->flags & FI_AV_USER_ID)) { + CXIP_WARN("Domain not configured with FI_AV_USER_ID\n"); + return -FI_EINVAL; + } + + if (attr->rx_ctx_bits) { + CXIP_WARN("rx_ctx_bits non-zero. SEPs not supported.\n"); + return -FI_EINVAL; + } + + if (attr->name) { + CXIP_WARN("Shared AVs not supported\n"); + return -FI_EINVAL; + } + + if (attr->flags & FI_READ) { + CXIP_WARN("FI_READ and shared AVs not supported\n"); + return -FI_EINVAL; + } + + if (attr->flags & FI_EVENT) { + CXIP_WARN("FI_EVENT not supported\n"); + return -FI_EINVAL; + } + + switch (attr->type) { + case FI_AV_UNSPEC: + case FI_AV_MAP: + case FI_AV_TABLE: + break; + default: + CXIP_WARN("Invalid AV type: %d\n", attr->type); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +int cxip_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **avp, void *context) +{ + int ret; + struct cxip_av *av; + struct cxip_domain *dom; + struct ofi_bufpool_attr pool_attr = { + .size = sizeof(struct cxip_av_entry), + .flags = OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_INDEXED, + }; + size_t orig_size; + + dom = container_of(domain, struct cxip_domain, util_domain.domain_fid); + + ret = cxip_av_open_validate_args(dom, attr, avp, context); + if (ret != FI_SUCCESS) + goto err; + + av = calloc(1, sizeof(*av)); + if (!av) { + ret = -FI_ENOMEM; + goto err; + } + + /* Initialize embedded AV fields. */ + av->av_fid.fid.context = context; + av->av_fid.fid.fclass = FI_CLASS_AV; + av->av_fid.fid.ops = &cxip_av_fi_ops; + av->av_fid.ops = &cxip_av_fid_ops; + av->domain = dom; + dlist_init(&av->ep_list); + ofi_atomic_initialize32(&av->ref, 0); + av->lockless = dom->util_domain.threading == FI_THREAD_DOMAIN; + pthread_rwlock_init(&av->lock, NULL); + av->av_entry_hash = NULL; + av->symmetric = !!(attr->flags & FI_SYMMETRIC); + ofi_atomic_initialize32(&av->av_entry_cnt, 0); + av->av_auth_key = dom->av_auth_key; + av->auth_key_entry_hash = NULL; + dlist_init(&av->auth_key_entry_list); + ofi_atomic_initialize32(&av->auth_key_entry_cnt, 0); + av->auth_key_entry_max = dom->auth_key_entry_max; + av->av_user_id = !!(attr->flags & FI_AV_USER_ID); + + /* Cannot support symmetric with AV auth key. */ + if (av->av_auth_key) + av->symmetric = 0; + + /* Only FI_AV_TABLE is implemented. */ + av->type = attr->type == FI_AV_UNSPEC ? FI_AV_TABLE : attr->type; + + /* Allocate buffer pool and size it based on user input. */ + orig_size = attr->count ? 
attr->count : ofi_universe_size; + orig_size = roundup_power_of_two(orig_size); + pool_attr.chunk_cnt = orig_size; + ret = ofi_bufpool_create_attr(&pool_attr, &av->av_entry_pool); + if (ret) { + CXIP_WARN("Failed to allocate buffer pool: %d\n", ret); + goto err_free_av; + } + + pool_attr.size = sizeof(struct cxip_av_auth_key_entry); + pool_attr.chunk_cnt = av->auth_key_entry_max; + ret = ofi_bufpool_create_attr(&pool_attr, &av->auth_key_entry_pool); + if (ret) { + CXIP_WARN("Failed to allocate buffer pool: %d\n", ret); + goto err_free_av_buf_pool; + } + + ofi_atomic_inc32(&dom->ref); + + *avp = &av->av_fid; + + return FI_SUCCESS; + +err_free_av_buf_pool: + ofi_bufpool_destroy(av->av_entry_pool); +err_free_av: + free(av); +err: + return ret; +} diff --git a/prov/cxi/src/cxip_avset.c b/prov/cxi/src/cxip_avset.c new file mode 100644 index 00000000000..4bf7a445f25 --- /dev/null +++ b/prov/cxi/src/cxip_avset.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +/* + * Notes: + * + * To implement this as an extension of util_av_set requires that AV be an + * extension of util_av, which it currently is not. + * + * The bulk of the util code is involved with a point-to-point implementaion of + * collectives, and the util_av_set code is relatively trivial, and also has a + * bad bug in util_av_set_diff(). + * + * Our current plan is to implement only accelerated multicast operations in + * libfabric, and leave all point-to-point implementations to the regular MPI + * algorithms, which will (in general) be better optimized and tunable. + * + * At some future point, we can rework cxip_av to be an extension of util_av, + * eliminate this code in favor of the util_coll code, with custom + * implementations of the accelerated multicast operations. 
+ * + */ +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +int cxip_av_set_union(struct fid_av_set *dst, const struct fid_av_set *src) +{ + /* Must append to end */ + struct cxip_av_set *src_av_set; + struct cxip_av_set *dst_av_set; + size_t temp; + int i,j; + + src_av_set = container_of(src, struct cxip_av_set, av_set_fid); + dst_av_set = container_of(dst, struct cxip_av_set, av_set_fid); + + if (src_av_set->cxi_av != dst_av_set->cxi_av) + return -FI_EINVAL; + + if (dst_av_set->mc_obj) + return -FI_EPERM; + + /* New elements placed at end of dst */ + temp = dst_av_set->fi_addr_cnt; + for (i = 0; i < src_av_set->fi_addr_cnt; i++) { + for (j = 0; j < dst_av_set->fi_addr_cnt; j++) { + if (dst_av_set->fi_addr_ary[j] == + src_av_set->fi_addr_ary[i]) { + /* src[i] already in dst */ + break; + } + } + if (j == dst_av_set->fi_addr_cnt) { + /* src[i] gets added to end of dst */ + dst_av_set->fi_addr_ary[temp++] = + src_av_set->fi_addr_ary[i]; + } + } + /* temp >= dst_av_set->fi_addr_cnt */ + dst_av_set->fi_addr_cnt = temp; + return FI_SUCCESS; +} + +int cxip_av_set_intersect(struct fid_av_set *dst, const struct fid_av_set *src) +{ + /* Must preserve order */ + struct cxip_av_set *src_av_set; + struct cxip_av_set *dst_av_set; + int i,j, temp; + + src_av_set = container_of(src, struct cxip_av_set, av_set_fid); + dst_av_set = container_of(dst, struct cxip_av_set, av_set_fid); + + if (src_av_set->cxi_av != dst_av_set->cxi_av) + return -FI_EINVAL; + + if (dst_av_set->mc_obj) + return -FI_EPERM; + + /* Old elements removed from dst */ + temp = 0; + for (i = 0; i < dst_av_set->fi_addr_cnt; i++) { + for (j = 0; j < src_av_set->fi_addr_cnt; j++) { + if (dst_av_set->fi_addr_ary[i] == + src_av_set->fi_addr_ary[j]) { + /* dst[i] is in src, temp <= i */ + if (temp < i) { + dst_av_set->fi_addr_ary[temp] = + dst_av_set->fi_addr_ary[i]; + } + temp++; + break; + } + } + } + /* temp <= dst_av_set->fi_addr_cnt */ + dst_av_set->fi_addr_cnt = temp; + return FI_SUCCESS; +} + +int cxip_av_set_diff(struct fid_av_set *dst, const struct fid_av_set *src) +{ + /* Must preserve order */ + struct cxip_av_set *src_av_set; + struct cxip_av_set *dst_av_set; + int i,j, temp; + + src_av_set = container_of(src, struct cxip_av_set, av_set_fid); + dst_av_set = container_of(dst, struct cxip_av_set, av_set_fid); + + if (src_av_set->cxi_av != dst_av_set->cxi_av) + return -FI_EINVAL; + + if (dst_av_set->mc_obj) + return -FI_EPERM; + + /* Old elements removed from dst */ + temp = 0; + for (i = 0; i < dst_av_set->fi_addr_cnt; i++) { + for (j = 0; j < src_av_set->fi_addr_cnt; j++) { + if (dst_av_set->fi_addr_ary[i] == + src_av_set->fi_addr_ary[j]) + break; + } + if (j == src_av_set->fi_addr_cnt) { + /* temp <= i */ + if (temp < dst_av_set->fi_addr_cnt) { + dst_av_set->fi_addr_ary[temp] = + dst_av_set->fi_addr_ary[i]; + } + temp++; + } + } + /* temp <= dst_av_set->fi_addr_cnt */ + dst_av_set->fi_addr_cnt = temp; + return FI_SUCCESS; +} + +int cxip_av_set_insert(struct fid_av_set *set, fi_addr_t addr) +{ + /* Must append to end */ + struct cxip_av_set *av_set_obj; + int i; + + av_set_obj = container_of(set, struct cxip_av_set, av_set_fid); + + if (av_set_obj->mc_obj) + return -FI_EPERM; + + /* Do not insert duplicates */ + for (i = 0; i < av_set_obj->fi_addr_cnt; i++) { + if (av_set_obj->fi_addr_ary[i] == addr) + return -FI_EINVAL; + } + /* Append new value */ + av_set_obj->fi_addr_ary[av_set_obj->fi_addr_cnt++] = addr; + return FI_SUCCESS; +} + +int 
cxip_av_set_remove(struct fid_av_set *set, fi_addr_t addr) +{ + /* Must preserve ordering */ + struct cxip_av_set *av_set_obj; + int i; + + av_set_obj = container_of(set, struct cxip_av_set, av_set_fid); + + if (av_set_obj->mc_obj) + return -FI_EPERM; + + for (i = 0; i < av_set_obj->fi_addr_cnt; i++) { + if (av_set_obj->fi_addr_ary[i] == addr) + break; + } + if (i == av_set_obj->fi_addr_cnt) + return -FI_EINVAL; + + for (i++; i < av_set_obj->fi_addr_cnt; i++) + av_set_obj->fi_addr_ary[i-1] = av_set_obj->fi_addr_ary[i]; + av_set_obj->fi_addr_cnt--; + return FI_SUCCESS; +} + +int cxip_av_set_addr(struct fid_av_set *set, fi_addr_t *coll_addr) +{ + *coll_addr = FI_ADDR_NOTAVAIL; + return FI_SUCCESS; +} + +int cxip_close_av_set(struct fid *fid) +{ + struct cxip_av_set *cxi_av_set; + + cxi_av_set = container_of(fid, struct cxip_av_set, av_set_fid.fid); + if (cxi_av_set->mc_obj) + return -FI_EBUSY; + + ofi_atomic_dec32(&cxi_av_set->cxi_av->ref); + + free(cxi_av_set->fi_addr_ary); + free(cxi_av_set); + return FI_SUCCESS; +} + +static struct fi_ops_av_set cxip_av_set_ops= { + .set_union = cxip_av_set_union, + .intersect = cxip_av_set_intersect, + .diff = cxip_av_set_diff, + .insert = cxip_av_set_insert, + .remove = cxip_av_set_remove, + .addr = cxip_av_set_addr +}; + +static struct fi_ops cxip_av_set_fid_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_close_av_set, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static inline int fi_addr_is_valid(struct cxip_av *av, fi_addr_t fi_addr) +{ + return cxip_av_lookup_addr(av, fi_addr, NULL) == FI_SUCCESS; +} + +int cxip_av_set(struct fid_av *av, struct fi_av_set_attr *attr, + struct fid_av_set **av_set_fid, void *context) +{ + struct cxip_av *cxi_av; + struct cxip_av_set *cxi_set; + bool abeg, aend; + fi_addr_t start, end; + size_t count, stride; + fi_addr_t i, j; + int ret; + size_t max_size; + size_t av_entry_count; + + cxi_av = container_of(av, struct cxip_av, av_fid); + av_entry_count = cxip_av_entry_count(cxi_av); + + if (!attr) + return -FI_EINVAL; + + /* We need the AV to stick around now */ + ofi_atomic_inc32(&cxi_av->ref); + + /* May change values below, don't alter struct */ + start = attr->start_addr; + end = attr->end_addr; + count = attr->count; + stride = attr->stride; + abeg = (start != FI_ADDR_NOTAVAIL); + aend = (end != FI_ADDR_NOTAVAIL); + + /* Override everything for UNIVERSE flag */ + if (attr->flags & FI_UNIVERSE) { + start = FI_ADDR_NOTAVAIL; + end = FI_ADDR_NOTAVAIL; + count = FI_ADDR_NOTAVAIL; + stride = 1; + abeg = false; + aend = false; + } + + /* Common error for these syntax tests */ + ret = -FI_EINVAL; + + /* Must specify both, or neither */ + if (abeg != aend) + goto err0; + + /* Cannot specify a range for FI_AV_MAP */ + if (abeg && cxi_av->type == FI_AV_MAP) + goto err0; + + /* Cannot specify a range for empty AV set */ + if (abeg && count == 0) + goto err0; + + /* Comm_key data must match in our structure */ + if (attr->comm_key && attr->comm_key_size && + attr->comm_key_size != sizeof(struct cxip_comm_key)) + goto err0; + + /* Must specify a range if non-sequential stride */ + if (!abeg && stride > 1) + goto err0; + + /* Stride unspecified means sequential */ + if (stride == 0) + stride = 1; + + /* Resolve undefined range and count */ + if (start == FI_ADDR_NOTAVAIL) + start = 0; + if (end == FI_ADDR_NOTAVAIL) + end = 0; + if (count > end - start + 1) + count = end - start + 1; + + cxi_set = calloc(1,sizeof(*cxi_set)); + if (!cxi_set) { + ret = -FI_ENOMEM; + goto err0; + } + 
+ /* Allocate enough space to add all addresses */ + max_size = attr->count ? + attr->count : MAX(ofi_universe_size, av_entry_count); + cxi_set->fi_addr_ary = calloc(max_size, + sizeof(*cxi_set->fi_addr_ary)); + if (!cxi_set->fi_addr_ary) { + ret = -FI_ENOMEM; + goto err1; + } + + /* Add address indices */ + for (i=0, j=start; + i < count && j <= end && j < av_entry_count; + i++, j+=stride) { + /* Skip over invalid addresses as if not there */ + while (!fi_addr_is_valid(cxi_av, i)) { + if (++j >= av_entry_count) + break; + } + if (j >= av_entry_count) + break; + cxi_set->fi_addr_ary[i] = (fi_addr_t)j; + cxi_set->fi_addr_cnt++; + } + + /* copy comm_key from attributes, if present */ + if (attr->comm_key && attr->comm_key_size) { + memcpy(&cxi_set->comm_key, attr->comm_key, + attr->comm_key_size); + } + + cxi_set->av_set_fid.fid.fclass = FI_CLASS_AV_SET; + cxi_set->av_set_fid.fid.context = context; + cxi_set->av_set_fid.fid.ops = &cxip_av_set_fid_ops; + cxi_set->av_set_fid.ops = &cxip_av_set_ops; + cxi_set->cxi_av = cxi_av; + + *av_set_fid = &cxi_set->av_set_fid; + + return FI_SUCCESS; +err1: + free(cxi_set); +err0: + ofi_atomic_dec32(&cxi_av->ref); + return ret; +} diff --git a/prov/cxi/src/cxip_cmdq.c b/prov/cxi/src/cxip_cmdq.c new file mode 100644 index 00000000000..6d4b28efddf --- /dev/null +++ b/prov/cxi/src/cxip_cmdq.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +enum cxi_traffic_class cxip_ofi_to_cxi_tc(uint32_t ofi_tclass) +{ + switch (ofi_tclass) { + case FI_TC_BULK_DATA: + return CXI_TC_BULK_DATA; + case FI_TC_DEDICATED_ACCESS: + return CXI_TC_DEDICATED_ACCESS; + case FI_TC_LOW_LATENCY: + return CXI_TC_LOW_LATENCY; + case FI_TC_BEST_EFFORT: + case FI_TC_NETWORK_CTRL: + case FI_TC_SCAVENGER: + default: + return CXI_TC_BEST_EFFORT; + } +} + +static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxi_cp **cp) +{ + int ret; + int i; + struct cxip_remap_cp *sw_cp; + static const enum cxi_traffic_class remap_tc = CXI_TC_BEST_EFFORT; + + ofi_spin_lock(&lni->lock); + + /* Always prefer SW remapped CPs over allocating HW CP. */ + dlist_foreach_container(&lni->remap_cps, struct cxip_remap_cp, sw_cp, + remap_entry) { + if (sw_cp->remap_cp.vni == vni && sw_cp->remap_cp.tc == tc && + sw_cp->remap_cp.tc_type == tc_type) { + CXIP_DBG("Reusing SW CP: %u VNI: %u TC: %s TYPE: %s\n", + sw_cp->remap_cp.lcid, sw_cp->remap_cp.vni, + cxi_tc_to_str(sw_cp->remap_cp.tc), + cxi_tc_type_to_str(sw_cp->remap_cp.tc_type)); + *cp = &sw_cp->remap_cp; + goto success_unlock; + } + } + + /* Allocate a new SW remapped CP entry and attempt to allocate the + * user requested HW CP. + */ + sw_cp = calloc(1, sizeof(*sw_cp)); + if (!sw_cp) { + ret = -FI_ENOMEM; + goto err_unlock; + } + + ret = cxil_alloc_cp(lni->lni, vni, tc, tc_type, + &lni->hw_cps[lni->n_cps]); + if (ret) { + /* Attempt to fall back to remap traffic class with the same + * traffic class type and allocate HW CP if necessary. 
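+ * The fallback traffic class is remap_tc (CXI_TC_BEST_EFFORT, defined above).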
+ */ + CXIP_WARN("Failed to allocate CP, ret: %d VNI: %u TC: %s TYPE: %s\n", + ret, vni, cxi_tc_to_str(tc), + cxi_tc_type_to_str(tc_type)); + CXIP_WARN("Remapping original TC from %s to %s\n", + cxi_tc_to_str(tc), cxi_tc_to_str(remap_tc)); + + /* Check to see if a matching HW CP has already been allocated. + * If so, reuse the entry. + */ + for (i = 0; i < lni->n_cps; i++) { + if (lni->hw_cps[i]->vni == vni && + lni->hw_cps[i]->tc == remap_tc && + lni->hw_cps[i]->tc_type == tc_type) { + sw_cp->hw_cp = lni->hw_cps[i]; + goto found_hw_cp; + } + } + + /* Attempt to allocated a remapped HW CP. */ + ret = cxil_alloc_cp(lni->lni, vni, remap_tc, tc_type, + &lni->hw_cps[lni->n_cps]); + if (ret) { + CXIP_WARN("Failed to allocate CP, ret: %d VNI: %u TC: %s TYPE: %s\n", + ret, vni, cxi_tc_to_str(remap_tc), + cxi_tc_type_to_str(tc_type)); + ret = -FI_EINVAL; + goto err_free_sw_cp; + } + } + + CXIP_DBG("Allocated CP: %u VNI: %u TC: %s TYPE: %s\n", + lni->hw_cps[lni->n_cps]->lcid, vni, + cxi_tc_to_str(lni->hw_cps[lni->n_cps]->tc), + cxi_tc_type_to_str(lni->hw_cps[lni->n_cps]->tc_type)); + + sw_cp->hw_cp = lni->hw_cps[lni->n_cps++]; + +found_hw_cp: + sw_cp->remap_cp.vni = vni; + sw_cp->remap_cp.tc = tc; + sw_cp->remap_cp.tc_type = tc_type; + sw_cp->remap_cp.lcid = sw_cp->hw_cp->lcid; + dlist_insert_tail(&sw_cp->remap_entry, &lni->remap_cps); + + *cp = &sw_cp->remap_cp; + +success_unlock: + ofi_spin_unlock(&lni->lock); + + return FI_SUCCESS; + +err_free_sw_cp: + free(sw_cp); +err_unlock: + ofi_spin_unlock(&lni->lock); + + return ret; +} + +int cxip_txq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type) +{ + struct cxi_cp *cp; + int ret; + + if (cmdq->cur_cp->vni == vni && cmdq->cur_cp->tc == tc && + cmdq->cur_cp->tc_type == tc_type) + return FI_SUCCESS; + + ret = cxip_cp_get(cmdq->lni, vni, tc, tc_type, &cp); + if (ret != FI_SUCCESS) { + CXIP_DBG("Failed to get CP: %d\n", ret); + return -FI_EOTHER; + } + + ret = cxi_cq_emit_cq_lcid(cmdq->dev_cmdq, cp->lcid); + if (ret) { + CXIP_DBG("Failed to update CMDQ(%p) CP: %d\n", cmdq, ret); + ret = -FI_EAGAIN; + } else { + ret = FI_SUCCESS; + cmdq->cur_cp = cp; + + CXIP_DBG("Updated CMDQ(%p) CP: %d VNI: %u TC: %s TYPE: %s\n", + cmdq, cp->lcid, cp->vni, cxi_tc_to_str(cp->tc), + cxi_tc_type_to_str(cp->tc_type)); + } + + return ret; +} + +/* + * cxip_cmdq_alloc() - Allocate a command queue. + */ +int cxip_cmdq_alloc(struct cxip_lni *lni, struct cxi_eq *evtq, + struct cxi_cq_alloc_opts *cq_opts, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cmdq **cmdq) +{ + int ret; + struct cxi_cq *dev_cmdq; + struct cxip_cmdq *new_cmdq; + struct cxi_cp *cp = NULL; + + new_cmdq = calloc(1, sizeof(*new_cmdq)); + if (!new_cmdq) { + CXIP_WARN("Unable to allocate CMDQ structure\n"); + return -FI_ENOMEM; + } + + if (cq_opts->flags & CXI_CQ_IS_TX) { + ret = cxip_cp_get(lni, vni, tc, tc_type, &cp); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate CP: %d\n", ret); + return ret; + } + cq_opts->lcid = cp->lcid; + + new_cmdq->cur_cp = cp; + + /* Trig command queue can never use LL ring. 
*/ + if (cq_opts->flags & CXI_CQ_TX_WITH_TRIG_CMDS || + lni->iface->info->device_platform == CXI_PLATFORM_NETSIM) + new_cmdq->llring_mode = CXIP_LLRING_NEVER; + else + new_cmdq->llring_mode = cxip_env.llring_mode; + } else { + new_cmdq->llring_mode = CXIP_LLRING_NEVER; + } + + ret = cxil_alloc_cmdq(lni->lni, evtq, cq_opts, &dev_cmdq); + if (ret) { + CXIP_WARN("Failed to allocate %s, ret: %d\n", + cq_opts->flags & CXI_CQ_IS_TX ? "TXQ" : "TGQ", ret); + ret = -FI_ENOSPC; + goto free_cmdq; + } + + new_cmdq->dev_cmdq = dev_cmdq; + new_cmdq->lni = lni; + *cmdq = new_cmdq; + + return FI_SUCCESS; + +free_cmdq: + free(new_cmdq); + + return ret; +} + +/* + * cxip_cmdq_free() - Free a command queue. + */ +void cxip_cmdq_free(struct cxip_cmdq *cmdq) +{ + int ret; + + ret = cxil_destroy_cmdq(cmdq->dev_cmdq); + if (ret) + CXIP_WARN("cxil_destroy_cmdq failed, ret: %d\n", ret); + + free(cmdq); +} + +/* Must hold cmdq->lock. */ +int cxip_cmdq_emit_c_state(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state) +{ + int ret; + + if (memcmp(&cmdq->c_state, c_state, sizeof(*c_state))) { + ret = cxi_cq_emit_c_state(cmdq->dev_cmdq, c_state); + if (ret) { + CXIP_DBG("Failed to issue C_STATE command: %d\n", ret); + return -FI_EAGAIN; + } + + cmdq->c_state = *c_state; + } + + return FI_SUCCESS; +} + +int cxip_cmdq_emit_idc_put(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags) +{ + int ret; + + if (flags & (FI_FENCE | FI_CXI_WEAK_FENCE)) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + } + + ret = cxip_cmdq_emit_c_state(cmdq, c_state); + if (ret) { + CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxi_cq_emit_idc_put(cmdq->dev_cmdq, put, buf, len); + if (ret) { + CXIP_WARN("Failed to emit idc_put command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + + return FI_SUCCESS; +} + +int cxip_cmdq_emit_dma(struct cxip_cmdq *cmdq, struct c_full_dma_cmd *dma, + uint64_t flags) +{ + int ret; + + if (flags & (FI_FENCE | FI_CXI_WEAK_FENCE)) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + } + + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, dma); + if (ret) { + CXIP_WARN("Failed to emit dma command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + + return FI_SUCCESS; +} + +int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush) +{ + struct c_full_dma_cmd flush_cmd; + bool fetching_flush = fetching && flush; + int ret; + + if (fetching_flush) { + memset(&flush_cmd, 0, sizeof(flush_cmd)); + flush_cmd.command.opcode = C_CMD_PUT; + flush_cmd.index_ext = c_state->index_ext; + flush_cmd.event_send_disable = 1; + flush_cmd.dfa = amo->idc_header.dfa; + flush_cmd.remote_offset = amo->idc_header.remote_offset; + flush_cmd.eq = c_state->eq; + flush_cmd.user_ptr = c_state->user_ptr; + flush_cmd.flush = 1; + } + + if (flags & (FI_FENCE | FI_CXI_WEAK_FENCE)) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + } + + ret = cxip_cmdq_emit_c_state(cmdq, 
c_state); + if (ret) { + CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Fetching AMO with flush requires two commands. Ensure there is enough + * space. At worse at least 16x 32-byte slots are needed. + */ + if (fetching_flush && __cxi_cq_free_slots(cmdq->dev_cmdq) < 16) { + CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n"); + return -FI_EAGAIN; + } + + ret = cxi_cq_emit_idc_amo(cmdq->dev_cmdq, amo, fetching); + if (ret) { + CXIP_WARN("Failed to emit IDC amo\n"); + return -FI_EAGAIN; + } + + if (fetching_flush) { + /* CQ space check already occurred. Thus, return code can be + * ignored. + */ + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &flush_cmd); + assert(ret == 0); + } + + return FI_SUCCESS; +} + +int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, + uint64_t flags, bool fetching, bool flush) +{ + struct c_full_dma_cmd flush_cmd; + bool fetching_flush = fetching && flush; + int ret; + + if (fetching_flush) { + memset(&flush_cmd, 0, sizeof(flush_cmd)); + flush_cmd.command.opcode = C_CMD_PUT; + flush_cmd.index_ext = amo->index_ext; + flush_cmd.event_send_disable = 1; + flush_cmd.dfa = amo->dfa; + flush_cmd.remote_offset = amo->remote_offset; + flush_cmd.eq = amo->eq; + flush_cmd.user_ptr = amo->user_ptr; + flush_cmd.flush = 1; + flush_cmd.match_bits = amo->match_bits; + } + + if (flags & (FI_FENCE | FI_CXI_WEAK_FENCE)) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + } + + /* Fetching AMO with flush requires two commands. Ensure there is enough + * space. At worse at least 16x 32-byte slots are needed. + */ + if (fetching_flush && __cxi_cq_free_slots(cmdq->dev_cmdq) < 16) { + CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n"); + return -FI_EAGAIN; + } + + ret = cxi_cq_emit_dma_amo(cmdq->dev_cmdq, amo, fetching); + if (ret) { + CXIP_WARN("Failed to emit DMA amo\n"); + return -FI_EAGAIN; + } + + if (fetching_flush) { + /* CQ space check already occurred. Thus, return code can be + * ignored. + */ + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &flush_cmd); + assert(ret == 0); + } + + return FI_SUCCESS; +} diff --git a/prov/cxi/src/cxip_cntr.c b/prov/cxi/src/cxip_cntr.c new file mode 100644 index 00000000000..2f7354330ac --- /dev/null +++ b/prov/cxi/src/cxip_cntr.c @@ -0,0 +1,865 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "cxip.h" + +#include + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_DATA, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_WARN(FI_LOG_EP_DATA, __VA_ARGS__) +#define CXIP_WARN(...) 
_CXIP_WARN(FI_LOG_EP_DATA, __VA_ARGS__) + +static int cxip_cntr_copy_ct_writeback(struct cxip_cntr *cntr, + struct c_ct_writeback *wb_copy) +{ + struct cxip_domain *dom = cntr->domain; + ssize_t ret __attribute__((unused)); + struct iovec hmem_iov; + + if (cntr->wb_iface == FI_HMEM_SYSTEM) { + memcpy(wb_copy, cntr->wb, sizeof(*cntr->wb)); + return FI_SUCCESS; + } + + if (cntr->wb_handle_valid) { + ret = ofi_hmem_dev_reg_copy_from_hmem(cntr->wb_iface, + cntr->wb_handle, wb_copy, + cntr->wb, + sizeof(*cntr->wb)); + assert(ret == FI_SUCCESS); + return FI_SUCCESS; + } + + hmem_iov.iov_base = cntr->wb; + hmem_iov.iov_len = sizeof(*cntr->wb); + + ret = dom->hmem_ops.copy_from_hmem_iov(wb_copy, sizeof(*cntr->wb), + cntr->wb_iface, cntr->wb_device, + &hmem_iov, 1, 0); + assert(ret == sizeof(*wb_copy)); + return FI_SUCCESS; +} + +static int cxip_cntr_get_ct_error(struct cxip_cntr *cntr, uint64_t *error) +{ + struct c_ct_writeback wb_copy; + int ret; + + /* Only can reference the ct_failure field directly if dealing with + * system memory. Device memory requires a memcpy of the contents into + * system memory. + */ + if (cntr->wb_iface == FI_HMEM_SYSTEM) { + *error = cntr->wb->ct_failure; + return FI_SUCCESS; + } + + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + *error = wb_copy.ct_failure; + return FI_SUCCESS; +} + +static int cxip_cntr_get_ct_success(struct cxip_cntr *cntr, uint64_t *success) +{ + struct c_ct_writeback wb_copy; + int ret; + + /* Only can reference the ct_success field directly if dealing with + * system memory. Device memory requires a memcpy of the contents into + * system memory. + */ + if (cntr->wb_iface == FI_HMEM_SYSTEM) { + *success = cntr->wb->ct_success; + return FI_SUCCESS; + } + + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + *success = wb_copy.ct_success; + return FI_SUCCESS; +} + +#define CT_WRITEBACK_OFFSET 7U + +static int cxip_cntr_clear_ct_writeback(struct cxip_cntr *cntr) +{ + struct iovec hmem_iov; + ssize_t ret __attribute__((unused)); + uint8_t ct_writeback; + + /* Only can reference the ct_success field directly if dealing with + * system memory. Device memory requires a memcpy of the contents into + * device memory. + */ + if (cntr->wb_iface == FI_HMEM_SYSTEM) { + cntr->wb->ct_writeback = 0; + return FI_SUCCESS; + } + + /* Only write to ct_writeback byte. */ + ct_writeback = 0; + hmem_iov.iov_base = (char *)cntr->wb + CT_WRITEBACK_OFFSET; + hmem_iov.iov_len = 1; + + ret = cntr->domain->hmem_ops.copy_to_hmem_iov(cntr->wb_iface, 0, + &hmem_iov, 1, 0, + &ct_writeback, 1); + assert(ret == 1); + + return FI_SUCCESS; +} + +static int cxip_cntr_get_ct_writeback(struct cxip_cntr *cntr) +{ + struct c_ct_writeback wb_copy; + int ret; + + /* Only can reference the ct_writeback field directly if dealing with + * system memory. Device memory requires a memcpy of the contents into + * system memory. 
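+ * Returns the ct_writeback flag value, or a negative value if the copy from
+ * device memory fails.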
+ */ + if (cntr->wb_iface == FI_HMEM_SYSTEM) + return cntr->wb->ct_writeback; + + ret = cxip_cntr_copy_ct_writeback(cntr, &wb_copy); + if (ret) + return ret; + + return wb_copy.ct_writeback; +} + +#define TRIG_OP_LOCK_NAME_FMT "/.uuid%d_cxi%d_vni%d_svcid%d" +#define TRIG_OP_LOCK_NAME_SIZE 256U + +static int cxip_dom_cntr_enable(struct cxip_domain *dom) +{ + char trig_op_lock_name[TRIG_OP_LOCK_NAME_SIZE]; + struct cxi_cq_alloc_opts cq_opts = { + .policy = CXI_CQ_UPDATE_ALWAYS, + }; + int ret; + + ofi_spin_lock(&dom->lock); + + if (dom->cntr_init) { + ofi_spin_unlock(&dom->lock); + return FI_SUCCESS; + } + + assert(dom->enabled); + + ret = snprintf(trig_op_lock_name, TRIG_OP_LOCK_NAME_SIZE, + TRIG_OP_LOCK_NAME_FMT, getuid(), + dom->iface->dev->info.dev_id, dom->auth_key.vni, + dom->auth_key.svc_id); + if (ret >= TRIG_OP_LOCK_NAME_SIZE) { + CXIP_WARN("snprintf buffer too small\n"); + ret = -FI_ENOSPC; + goto err_unlock; + } else if (ret < 0) { + CXIP_WARN("snprintf failed: %d\n", ret); + goto err_unlock; + } + + dom->trig_op_lock = sem_open(trig_op_lock_name, O_CREAT, + S_IRUSR | S_IWUSR, 1); + if (dom->trig_op_lock == SEM_FAILED) { + ret = -errno; + CXIP_WARN("sem_open failed: %d\n", ret); + goto err_unlock; + } + + cq_opts.count = MAX(dom->max_trig_op_in_use, 64); + cq_opts.flags = CXI_CQ_IS_TX | CXI_CQ_TX_WITH_TRIG_CMDS; + cq_opts.policy = CXI_CQ_UPDATE_ALWAYS; + + ret = cxip_cmdq_alloc(dom->lni, NULL, &cq_opts, + dom->auth_key.vni, + cxip_ofi_to_cxi_tc(dom->tclass), + CXI_TC_TYPE_DEFAULT, + &dom->trig_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate trig_cmdq: %d\n", ret); + goto err_close_sem; + } + + if (dom->util_domain.threading == FI_THREAD_DOMAIN) + ofi_genlock_init(&dom->trig_cmdq_lock, OFI_LOCK_NONE); + else + ofi_genlock_init(&dom->trig_cmdq_lock, OFI_LOCK_SPINLOCK); + + dom->cntr_init = true; + + CXIP_DBG("Domain counters enabled: %p\n", dom); + + ofi_spin_unlock(&dom->lock); + + return FI_SUCCESS; + +err_close_sem: + sem_close(dom->trig_op_lock); +err_unlock: + ofi_spin_unlock(&dom->lock); + + return ret; +} + +void cxip_dom_cntr_disable(struct cxip_domain *dom) +{ + char trig_op_lock_name[TRIG_OP_LOCK_NAME_SIZE]; + int ret; + + if (dom->cntr_init) { + ofi_genlock_destroy(&dom->trig_cmdq_lock); + + sem_close(dom->trig_op_lock); + + ret = snprintf(trig_op_lock_name, TRIG_OP_LOCK_NAME_SIZE, + TRIG_OP_LOCK_NAME_FMT, getuid(), + dom->iface->dev->info.dev_id, dom->auth_key.vni, + dom->auth_key.svc_id); + if (ret >= TRIG_OP_LOCK_NAME_SIZE) + CXIP_WARN("snprintf buffer too small\n"); + else if (ret < 0) + CXIP_WARN("snprintf failed: %d\n", ret); + else + sem_unlink(trig_op_lock_name); + + cxip_cmdq_free(dom->trig_cmdq); + } +} + +const struct fi_cntr_attr cxip_cntr_attr = { + .events = FI_CNTR_EVENTS_COMP, + .wait_obj = FI_WAIT_YIELD, + .wait_set = NULL, + .flags = 0, +}; + +/* + * cxip_cntr_mod() - Modify counter value. + * + * Set or increment the success or failure value of a counter by 'value'. 
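+ * Increments and zero resets use the counting-event doorbells; setting a
+ * non-zero value requires a C_CMD_CT_SET command on the domain's triggered
+ * command queue.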
+ */ +int cxip_cntr_mod(struct cxip_cntr *cxi_cntr, uint64_t value, bool set, + bool err) +{ + struct c_ct_cmd cmd; + struct cxip_cmdq *cmdq; + int ret; + + if (!set) { + /* Doorbell supports counter increment */ + if (err) + cxi_ct_inc_failure(cxi_cntr->ct, value); + else + cxi_ct_inc_success(cxi_cntr->ct, value); + } else { + /* Doorbell supports counter reset */ + if (!value) { + if (err) + cxi_ct_reset_failure(cxi_cntr->ct); + else + cxi_ct_reset_success(cxi_cntr->ct); + } else { + memset(&cmd, 0, sizeof(cmd)); + cmdq = cxi_cntr->domain->trig_cmdq; + + /* Use CQ to set a specific counter value */ + cmd.ct = cxi_cntr->ct->ctn; + if (err) { + cmd.set_ct_failure = 1; + cmd.ct_failure = value; + } else { + cmd.set_ct_success = 1; + cmd.ct_success = value; + } + ofi_genlock_lock(&cxi_cntr->domain->trig_cmdq_lock); + + ret = cxi_cq_emit_ct(cmdq->dev_cmdq, C_CMD_CT_SET, + &cmd); + if (ret) { + ofi_genlock_unlock(&cxi_cntr->domain->trig_cmdq_lock); + return -FI_EAGAIN; + } + cxi_cq_ring(cmdq->dev_cmdq); + ofi_genlock_unlock(&cxi_cntr->domain->trig_cmdq_lock); + } + } + + return FI_SUCCESS; +} + +static int cxip_cntr_issue_ct_get(struct cxip_cntr *cntr, bool *issue_ct_get) +{ + int ret; + + /* The calling thread which changes CT writeback bit from 1 to 0 must + * issue a CT get command. + */ + ofi_mutex_lock(&cntr->lock); + + ret = cxip_cntr_get_ct_writeback(cntr); + if (ret < 0) { + CXIP_WARN("Failed to read counter writeback: rc=%d\n", ret); + goto err_unlock; + } + + if (ret) { + ret = cxip_cntr_clear_ct_writeback(cntr); + if (ret) { + CXIP_WARN("Failed to clear counter writeback bit: rc=%d\n", + ret); + goto err_unlock; + } + + *issue_ct_get = true; + } else { + *issue_ct_get = false; + } + + ofi_mutex_unlock(&cntr->lock); + + return FI_SUCCESS; + +err_unlock: + ofi_mutex_unlock(&cntr->lock); + + *issue_ct_get = false; + return ret; +} + +/* + * cxip_cntr_get() - Schedule a counter write-back. + * + * Schedule hardware to write the value of a counter to memory. Avoid + * scheduling multiple write-backs at once. The counter value will appear in + * memory a small amount of time later. + */ +static int cxip_cntr_get(struct cxip_cntr *cxi_cntr, bool force) +{ + struct c_ct_cmd cmd; + struct cxip_cmdq *cmdq; + int ret; + bool issue_ct_get; + + if (!force) { + ret = cxip_cntr_issue_ct_get(cxi_cntr, &issue_ct_get); + if (ret) { + CXIP_WARN("cxip_cntr_issue_ct_get() error: rc=%d\n", + ret); + return ret; + } + + if (!issue_ct_get) + return FI_SUCCESS; + } + + memset(&cmd, 0, sizeof(cmd)); + cmdq = cxi_cntr->domain->trig_cmdq; + + /* Request a write-back */ + cmd.ct = cxi_cntr->ct->ctn; + + ofi_genlock_lock(&cxi_cntr->domain->trig_cmdq_lock); + ret = cxi_cq_emit_ct(cmdq->dev_cmdq, C_CMD_CT_GET, &cmd); + if (ret) { + ofi_genlock_unlock(&cxi_cntr->domain->trig_cmdq_lock); + return -FI_EAGAIN; + } + cxi_cq_ring(cmdq->dev_cmdq); + ofi_genlock_unlock(&cxi_cntr->domain->trig_cmdq_lock); + + return FI_SUCCESS; +} + +/* + * cxip_cntr_progress() - Make CQ progress on bound endpoint. + */ +static void cxip_cntr_progress(struct cxip_cntr *cntr) +{ + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + + /* Lock is used to protect bound context list. Note that + * CQ processing updates counters via doorbells, use of + * cntr->lock is not required by CQ processing. 
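+ * Progress is made by calling cxip_ep_progress() on every bound endpoint.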
+ */ + ofi_mutex_lock(&cntr->lock); + + dlist_foreach(&cntr->ctx_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + cxip_ep_progress(fid_entry->fid); + } + ofi_mutex_unlock(&cntr->lock); +} + +/* + * cxip_cntr_read() - fi_cntr_read() implementation. + */ +static uint64_t cxip_cntr_read(struct fid_cntr *fid_cntr) +{ + struct cxip_cntr *cxi_cntr; + uint64_t success = 0; + int ret; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + cxip_cntr_progress(cxi_cntr); + cxip_cntr_get(cxi_cntr, false); + + /* TODO: Fall back to reading register on error? */ + ret = cxip_cntr_get_ct_success(cxi_cntr, &success); + if (ret != FI_SUCCESS) + CXIP_WARN("Failed to read counter success: rc=%d\n", ret); + + return success; +} + +/* + * cxip_cntr_readerr() - fi_cntr_readerr() implementation. + */ +static uint64_t cxip_cntr_readerr(struct fid_cntr *fid_cntr) +{ + struct cxip_cntr *cxi_cntr; + uint64_t error = 0; + int ret; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + cxip_cntr_progress(cxi_cntr); + cxip_cntr_get(cxi_cntr, false); + + /* TODO: Fall back to reading register on error? */ + ret = cxip_cntr_get_ct_error(cxi_cntr, &error); + if (ret != FI_SUCCESS) + CXIP_WARN("Failed to read counter error: rc=%d\n", ret); + + return error; +} + +/* + * cxip_cntr_add() - fi_cntr_add() implementation. + */ +static int cxip_cntr_add(struct fid_cntr *fid_cntr, uint64_t value) +{ + struct cxip_cntr *cxi_cntr; + + if (value > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + return cxip_cntr_mod(cxi_cntr, value, false, false); +} + +/* + * cxip_cntr_set() - fi_cntr_set() implementation. + */ +static int cxip_cntr_set(struct fid_cntr *fid_cntr, uint64_t value) +{ + struct cxip_cntr *cxi_cntr; + + if (value > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + return cxip_cntr_mod(cxi_cntr, value, true, false); +} + +/* + * cxip_cntr_adderr() - fi_cntr_adderr() implementation. + */ +static int cxip_cntr_adderr(struct fid_cntr *fid_cntr, uint64_t value) +{ + struct cxip_cntr *cxi_cntr; + + if (value > FI_CXI_CNTR_FAILURE_MAX) + return -FI_EINVAL; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + return cxip_cntr_mod(cxi_cntr, value, false, true); +} + +/* + * cxip_cntr_seterr() - fi_cntr_seterr() implementation. + */ +static int cxip_cntr_seterr(struct fid_cntr *fid_cntr, uint64_t value) +{ + struct cxip_cntr *cxi_cntr; + + if (value > FI_CXI_CNTR_FAILURE_MAX) + return -FI_EINVAL; + + cxi_cntr = container_of(fid_cntr, struct cxip_cntr, cntr_fid); + + return cxip_cntr_mod(cxi_cntr, value, true, true); +} + +static int cxip_cntr_emit_trig_event_cmd(struct cxip_cntr *cntr, + uint64_t threshold) +{ + struct c_ct_cmd cmd = { + .trig_ct = cntr->ct->ctn, + .threshold = threshold, + .eq = C_EQ_NONE, + }; + struct cxip_cmdq *cmdq = cntr->domain->trig_cmdq; + int ret; + + /* TODO: Need to handle TLE exhaustion. */ + ofi_genlock_lock(&cntr->domain->trig_cmdq_lock); + ret = cxi_cq_emit_ct(cmdq->dev_cmdq, C_CMD_CT_TRIG_EVENT, &cmd); + if (!ret) + cxi_cq_ring(cmdq->dev_cmdq); + ofi_genlock_unlock(&cntr->domain->trig_cmdq_lock); + + if (ret) + return -FI_EAGAIN; + return FI_SUCCESS; +} + +/* + * cxip_cntr_wait() - fi_cntr_wait() implementation. 
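+ *
+ * Caller-side sketch (illustrative names only; 'cntr' is a fid_cntr opened
+ * with a wait object other than FI_WAIT_NONE; the timeout is in
+ * milliseconds):
+ *
+ *   int ret = fi_cntr_wait(cntr, threshold, 1000);
+ *   if (ret == -FI_ETIMEDOUT)
+ *       ret = fi_cntr_wait(cntr, threshold, 1000);   retry or recover
+ *   if (ret == FI_SUCCESS)
+ *       assert(fi_cntr_read(cntr) >= threshold);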
+ */ +static int cxip_cntr_wait(struct fid_cntr *fid_cntr, uint64_t threshold, + int timeout) +{ + struct cxip_cntr *cntr = + container_of(fid_cntr, struct cxip_cntr, cntr_fid); + uint64_t success = 0; + int ret; + uint64_t endtime; + + + if (cntr->attr.wait_obj == FI_WAIT_NONE || + threshold > FI_CXI_CNTR_SUCCESS_MAX) + return -FI_EINVAL; + + endtime = ofi_timeout_time(timeout); + + /* Use a triggered list entry setup to fire at the user's threshold. + * This will cause a success/error writeback to occur at the desired + * threshold. + */ + ret = cxip_cntr_emit_trig_event_cmd(cntr, threshold); + if (ret) { + CXIP_INFO("Failed to emit trig cmd: %d\n", ret); + return ret; + } + + /* Spin until the trigger list entry fires which updates the CT success + * field. + */ + do { + ret = cxip_cntr_get_ct_success(cntr, &success); + if (ret) { + CXIP_WARN("Failed to read counter success: %d\n", ret); + return ret; + } + + if (success >= threshold) + return FI_SUCCESS; + + if (ofi_adjust_timeout(endtime, &timeout)) + return -FI_ETIMEDOUT; + + /* Only FI_WAIT_YIELD is supported. */ + sched_yield(); + + cxip_cntr_progress(cntr); + + } while (1); + + /* TODO: Triggered operation may get leaked on timeout and threshold + * never met. + */ +} + +/* + * cxip_cntr_control() - fi_control() implementation for counter objects. + */ +static int cxip_cntr_control(struct fid *fid, int command, void *arg) +{ + int ret = FI_SUCCESS; + struct cxip_cntr *cntr; + + cntr = container_of(fid, struct cxip_cntr, cntr_fid); + + switch (command) { + case FI_GETWAIT: + if (cntr->wait) + ret = fi_control(&cntr->wait->fid, + FI_GETWAIT, arg); + else + ret = -FI_EINVAL; + break; + + case FI_GETOPSFLAG: + memcpy(arg, &cntr->attr.flags, sizeof(uint64_t)); + break; + + case FI_SETOPSFLAG: + memcpy(&cntr->attr.flags, arg, sizeof(uint64_t)); + break; + + default: + ret = -FI_EINVAL; + break; + } + + return ret; +} + +/* + * cxip_cntr_enable() - Assign hardware resources to the Counter. + */ +static int cxip_cntr_enable(struct cxip_cntr *cxi_cntr) +{ + int ret; + + ret = cxip_dom_cntr_enable(cxi_cntr->domain); + if (ret != FI_SUCCESS) + return ret; + + cxi_cntr->wb = &cxi_cntr->lwb; + cxi_cntr->wb_iface = FI_HMEM_SYSTEM; + cxi_cntr->wb_handle_valid = false; + + ret = cxil_alloc_ct(cxi_cntr->domain->lni->lni, + cxi_cntr->wb, &cxi_cntr->ct); + if (ret) { + CXIP_WARN("Failed to allocate CT, ret: %d\n", ret); + return -FI_EDOMAIN; + } + + /* Zero the success and failure values. In addition, this will force a + * writeback into the writeback buffer. + */ + cxi_ct_reset_failure(cxi_cntr->ct); + cxi_ct_reset_success(cxi_cntr->ct); + + CXIP_DBG("Counter enabled: %p (CT: %d)\n", cxi_cntr, cxi_cntr->ct->ctn); + + return FI_SUCCESS; +} + +/* + * cxip_cntr_close() - fi_close() implementation for counter objects. + */ +static int cxip_cntr_close(struct fid *fid) +{ + struct cxip_cntr *cntr; + int ret; + + cntr = container_of(fid, struct cxip_cntr, cntr_fid.fid); + if (ofi_atomic_get32(&cntr->ref)) + return -FI_EBUSY; + + assert(dlist_empty(&cntr->ctx_list)); + + if (cntr->wb_iface != FI_HMEM_SYSTEM && + cntr->wb_handle_valid) + ofi_hmem_dev_unregister(cntr->wb_iface, cntr->wb_handle); + + ret = cxil_destroy_ct(cntr->ct); + if (ret) + CXIP_WARN("Failed to free CT, ret: %d\n", ret); + else + CXIP_DBG("Counter disabled: %p\n", cntr); + + ofi_mutex_destroy(&cntr->lock); + + cxip_domain_remove_cntr(cntr->domain, cntr); + + free(cntr); + return 0; +} + +/* Set the counter writeback address to a client provided address. 
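+ *
+ * Reached by clients through the FI_CXI_COUNTER_OPS extension (see
+ * cxip_cntr_ops_open() below). A hedged client-side sketch, where 'cntr' is
+ * an open fid_cntr and 'wb' is a persistent struct c_ct_writeback owned by
+ * the caller:
+ *
+ *   struct fi_cxi_cntr_ops *ops;
+ *   int ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0,
+ *                         (void **)&ops, NULL);
+ *   if (!ret)
+ *       ret = ops->set_wb_buffer(&cntr->fid, &wb, sizeof(wb));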
*/ +int cxip_set_wb_buffer(struct fid *fid, void *buf, size_t len) +{ + int ret; + struct cxip_cntr *cntr; + uint64_t flags; + + if (!buf) + return -FI_EINVAL; + + if (len < sizeof(struct c_ct_writeback)) + return -FI_EINVAL; + + cntr = container_of(fid, struct cxip_cntr, cntr_fid.fid); + + ret = cxil_ct_wb_update(cntr->ct, buf); + if (ret) + return ret; + + if (cntr->wb_iface != FI_HMEM_SYSTEM && + cntr->wb_handle_valid) + ofi_hmem_dev_unregister(cntr->wb_iface, cntr->wb_handle); + + cntr->wb = buf; + cntr->wb_iface = ofi_get_hmem_iface(buf, &cntr->wb_device, &flags); + + if (cntr->wb_iface != FI_HMEM_SYSTEM) { + ret = ofi_hmem_dev_register(cntr->wb_iface, cntr->wb, + sizeof(*cntr->wb), + &cntr->wb_handle); + cntr->wb_handle_valid = (ret == FI_SUCCESS); + } + + /* Force a counter writeback into the user's provider buffer. */ + do { + ret = cxip_cntr_get(cntr, true); + } while (ret == -FI_EAGAIN); + + return ret; +} + +/* Get the counter MMIO region. */ +int cxip_get_mmio_addr(struct fid *fid, void **addr, size_t *len) +{ + struct cxip_cntr *cntr; + + cntr = container_of(fid, struct cxip_cntr, cntr_fid.fid); + + if (!cntr || !cntr->ct) + return -FI_EINVAL; + + *addr = cntr->ct->doorbell; + *len = sizeof(cntr->ct->doorbell); + + return FI_SUCCESS; +} + +static struct fi_cxi_cntr_ops cxip_cntr_ext_ops = { + .set_wb_buffer = cxip_set_wb_buffer, + .get_mmio_addr = cxip_get_mmio_addr, +}; + +static int cxip_cntr_ops_open(struct fid *fid, const char *ops_name, + uint64_t flags, void **ops, void *context) +{ + if (!strcmp(ops_name, FI_CXI_COUNTER_OPS)) { + *ops = &cxip_cntr_ext_ops; + return FI_SUCCESS; + } + + return -FI_EINVAL; +} + +static struct fi_ops_cntr cxip_cntr_ops = { + .size = sizeof(struct fi_ops_cntr), + .readerr = cxip_cntr_readerr, + .read = cxip_cntr_read, + .add = cxip_cntr_add, + .set = cxip_cntr_set, + .wait = cxip_cntr_wait, + .adderr = cxip_cntr_adderr, + .seterr = cxip_cntr_seterr, +}; + +static struct fi_ops cxip_cntr_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_cntr_close, + .bind = fi_no_bind, + .control = cxip_cntr_control, + .ops_open = cxip_cntr_ops_open, +}; + +/* + * cxip_cntr_verify_attr() - Verify counter creation attributes. + */ +static int cxip_cntr_verify_attr(struct fi_cntr_attr *attr) +{ + if (!attr) + return FI_SUCCESS; + + if (attr->events != FI_CNTR_EVENTS_COMP) + return -FI_ENOSYS; + + switch (attr->wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + case FI_WAIT_YIELD: + break; + default: + return -FI_ENOSYS; + } + + if (attr->flags) + return -FI_ENOSYS; + + return FI_SUCCESS; +} + +/* + * cxip_cntr_open() - fi_cntr_open() implementation. 
+ */ +int cxip_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, + struct fid_cntr **cntr, void *context) +{ + int ret; + struct cxip_domain *dom; + struct cxip_cntr *_cntr; + + dom = container_of(domain, struct cxip_domain, util_domain.domain_fid); + + ret = cxip_cntr_verify_attr(attr); + if (ret != FI_SUCCESS) + return ret; + + _cntr = calloc(1, sizeof(*_cntr)); + if (!_cntr) + return -FI_ENOMEM; + + if (!attr) + memcpy(&_cntr->attr, &cxip_cntr_attr, sizeof(cxip_cntr_attr)); + else + memcpy(&_cntr->attr, attr, sizeof(cxip_cntr_attr)); + + ofi_atomic_initialize32(&_cntr->ref, 0); + dlist_init(&_cntr->ctx_list); + + ofi_mutex_init(&_cntr->lock); + + _cntr->cntr_fid.fid.fclass = FI_CLASS_CNTR; + _cntr->cntr_fid.fid.context = context; + _cntr->cntr_fid.fid.ops = &cxip_cntr_fi_ops; + _cntr->cntr_fid.ops = &cxip_cntr_ops; + _cntr->domain = dom; + + ret = cxip_cntr_enable(_cntr); + if (ret) + goto err_free_cntr; + + cxip_domain_add_cntr(dom, _cntr); + + *cntr = &_cntr->cntr_fid; + + return FI_SUCCESS; + +err_free_cntr: + free(_cntr); + + return ret; +} diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c new file mode 100644 index 00000000000..b58983bf62c --- /dev/null +++ b/prov/cxi/src/cxip_coll.c @@ -0,0 +1,3814 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP + * Support for accelerated collective reductions. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cxip.h" + +#ifndef _MM_GET_FLUSH_ZERO_MODE +#define _MM_GET_FLUSH_ZERO_MODE() ({0;}) +#endif + +#define TRACE_PKT(fmt, ...) CXIP_TRACE(CXIP_TRC_COLL_PKT, fmt, \ + ##__VA_ARGS__) +#define TRACE_JOIN(fmt, ...) CXIP_TRACE(CXIP_TRC_COLL_JOIN, fmt, \ + ##__VA_ARGS__) +#define TRACE_DEBUG(fmt, ...) CXIP_TRACE(CXIP_TRC_COLL_DEBUG, fmt, \ + ##__VA_ARGS__) + +// TODO regularize usage of these +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) 
_CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* must all be 0 in production code */ +#define __chk_pkts 1 +#define __trc_pkts 1 +#define __trc_data 1 + +#define MAGIC 0x677d + +/**************************************************************************** + * Reduction packet for hardware accelerated collectives: + * + * +----------------------------------------------------------+ + * | BYTES | Mnemonic | Definition | + * +----------------------------------------------------------+ + * | 48:17 | RED_PAYLOAD | Reduction payload, always 32 bytes | + * | 16:5 | RED_HDR | Reduction Header (below) | + * | 4:0 | RED_PADDING | Padding | + * +----------------------------------------------------------+ + * + * Reduction header format: + * -------------------------------------------------------- + * | Field | Description | Bit | Size (bits) + * -------------------------------------------------------- + * | rt_seqno | Sequence number | 0 | 10 | + * | rt_arm | Multicast arm command | 10 | 1 | + * | rt_op | Reduction operation | 11 | 6 | + * | rt_count | Number of contributions | 17 | 20 | + * | rt_resno | Result number | 37 | 10 | + * | rt_rc | result code | 47 | 4 | + * | rt_repsum_m | Reproducible sum M value | 51 | 8 | + * | rt_repsum_ovfl | Reproducible sum M ovfl | 59 | 2 | + * | rt_pad | Pad to 64 bits | 61 | 3 | + * | rt_cookie | Cookie value | 64 | 32 | + * -------------------------------------------------------- + * + * Note that this header is a 12-byte object, and "network-defined order" means + * big-endian for the entire 12-byte object. Thus, bytes must be swapped so + * that the MSByte of rt_cookie appears at byte 0, and the LS 8 bits of + * rt_seqno appear in byte 11. + * + * The cookie is ignored by reduction hardware, and is used as follows: + * + * mcast_id is the 13-bit multicast address used to disambiguate multiple + * multicast trees, since all incoming collective traffic is received by a + * single PTE bound to the endpoint. + * + * red_id is used to disambiguate packets delivered for different concurrent + * reductions. + * + * magic is a magic number used to positively identify this packet as a + * reduction packet. The basic send/receive code could be used for other kinds + * of restricted IDC packets. At present, all such packets are discarded. + * + * retry is a control bit that can be invoked by the hw root node to initiate a + * retransmission of the data from the leaves, if packets are lost. + */ +struct cxip_coll_cookie { + uint32_t mcast_id:13; + uint32_t red_id:3; + uint32_t magic: 15; + uint32_t retry: 1; +} __attribute__((__packed__)); /* size 4b */ + +/* Packed header bits and cookie from above */ +struct cxip_coll_hdr { + uint64_t seqno:10; + uint64_t arm:1; + uint64_t op:6; + uint64_t redcnt:20; + uint64_t resno:10; + uint64_t red_rc:4; + uint64_t repsum_m:8; + uint64_t repsum_ovflid:2; + uint64_t pad:3; + struct cxip_coll_cookie cookie; +} __attribute__((__packed__)); /* size 12b */ + +/* The following structure is 49 bytes in size, and all of the fields align + * properly for network transmission. + */ +struct red_pkt { + uint8_t pad[5]; /* size 5b offset 0b */ + struct cxip_coll_hdr hdr; /* size 12b offset 5b */ + uint8_t data[32]; /* size 32b offset 17b */ +} __attribute__((__packed__)); /* size 49b */ + +/* Swap byte order in an object of any size. 
Works for even or odd counts */ +static inline +void _swapbyteorder(void *ptr, int count) +{ + uint8_t *p1 = (uint8_t *)ptr; + uint8_t *p2 = p1 + count - 1; + uint8_t swp; + while (p1 < p2) { + swp = *p1; + *p1 = *p2; + *p2 = swp; + p1++; + p2--; + } +} + +/** + * Reformat the packet to accommodate network-ordering (big-endian) Rosetta + * expectations, versus little-endian Intel processing. + * + * Note in particular that the header bytes are treated as a single 12-byte + * object, rather than an 8-byte followed by a 4-byte, i.e. the last byte of the + * cookie is the first byte of the data processed by Rosetta. Note also that + * there is a 5-byte pad at the beginning of the packet, not included in the + * byte-swapping. + * + * This is done in-place for convenience. For reductions, it is copied to a + * properly-aligned data structure for mathematical operations. + */ +static inline +void _swappkt(struct red_pkt *pkt) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + uint64_t *data = (uint64_t *)pkt->data; + int i; + + _swapbyteorder(&pkt->hdr, sizeof(pkt->hdr)); + for (i = 0; i < 4; i++) + _swapbyteorder(&data[i], 8); +#else +#error "Unsupported processor byte ordering" +#endif +} + +/** + * Verificaton of the packet structure, normally disabled. Sizes and offsets + * cannot be checked at compile time. If the structure is wrong, this will + * call abort(). + */ +#define FLDOFFSET(base, fld) ((uint8_t *)&base.fld - (uint8_t *)&base) +__attribute__((unused)) static inline +void check_red_pkt(void) +{ +#if __chk_pkts + static int checked = 0; + struct red_pkt pkt; + uint64_t len, exp; + uint8_t *ptr, offset; + int i, err = 0; + + if (checked) + return; + checked = 1; + + len = sizeof(pkt); + exp = 49; + if (len != exp) { + TRACE_PKT("sizeof(pkt) = %ld, exp %ld\n", len, exp); + err++; + } + len = sizeof(pkt.pad); + exp = 5; + if (len != exp) { + TRACE_PKT("sizeof(pkt.pad) = %ld, exp %ld\n", len, exp); + err++; + } + len = sizeof(pkt.hdr); + exp = 12; + if (len != exp) { + TRACE_PKT("sizeof(pkt.hdr) = %ld, exp %ld\n", len, exp); + err++; + } + len = sizeof(pkt.data); + exp = 32; + if (len != exp) { + TRACE_PKT("sizeof(pkt.data) = %ld, exp %ld\n", len, exp); + err++; + } + len = FLDOFFSET(pkt, hdr); + exp = 5; + if (len != exp) { + TRACE_PKT("offset(pkt.hdr) = %ld, exp %ld\n", len, exp); + err++; + } + len = FLDOFFSET(pkt, data); + exp = 17; + if (len != exp) { + TRACE_PKT("offset(pkt.data) = %ld, exp %ld\n", len, exp); + err++; + } + + /* Arbitrary value between 1,15 inclusive, ensure non-zero fill */ + offset = 13; + + /* Fill, swap, and confirm integrity of all 49 bytes */ + ptr = (uint8_t *)&pkt; + for (i = 0; i < sizeof(pkt); i++) + ptr[i] = i + offset; + _swappkt(&pkt); + _swappkt(&pkt); + for (i = 0; i < sizeof(pkt); i++) + if (ptr[i] != i + offset) { + TRACE_PKT("pkt[%d] = %d, exp %d\n", i, ptr[i], i + offset); + err++; + } + + if (err) { + TRACE_PKT("*** INVALID STRUCTURE see above ***\n"); + abort(); + } +#endif +} + +__attribute__((unused)) static inline +void _dump_red_pkt(struct red_pkt *pkt, char *dir) +{ +#if __trc_pkts + __attribute__((__unused__)) const uint64_t *data + = (const uint64_t *)pkt->data; + __attribute__((__unused__)) int i; + + TRACE_PKT("---------------\n"); + TRACE_PKT("Reduction packet (%s):\n", dir); + TRACE_PKT(" seqno = %d\n", pkt->hdr.seqno); + TRACE_PKT(" arm = %d\n", pkt->hdr.arm); + TRACE_PKT(" op = %d\n", pkt->hdr.op); + TRACE_PKT(" redcnt = %d\n", pkt->hdr.redcnt); + TRACE_PKT(" resno = %d\n", pkt->hdr.resno); + TRACE_PKT(" red_rc = %d\n", pkt->hdr.red_rc); + 
TRACE_PKT(" repsum_m = %d\n", pkt->hdr.repsum_m); + TRACE_PKT(" repsum_ovflid= %d\n", pkt->hdr.repsum_ovflid); + TRACE_PKT(" cookie --\n"); + TRACE_PKT(" .mcast_id = %08x\n", pkt->hdr.cookie.mcast_id); + TRACE_PKT(" .red_id = %08x\n", pkt->hdr.cookie.red_id); + TRACE_PKT(" .magic = %08x\n", pkt->hdr.cookie.magic); + TRACE_PKT(" .retry = %08x\n", pkt->hdr.cookie.retry); + for (i = 0; i < 4; i++) + TRACE_PKT(" ival[%d] = %016lx\n", i, data[i]); + TRACE_PKT("---------------\n"); +#endif +} + +/**************************************************************************** + * Reduction operators for accelerated collectives. + * + * The array lookup is faster than a switch. Non-static initialization makes + * this adaptive to changes in header files (e.g. new opcodes in FI). + */ +#define COLL_OPCODE_BARRIER 0x00 +#define COLL_OPCODE_BIT_AND 0x01 +#define COLL_OPCODE_BIT_OR 0x02 +#define COLL_OPCODE_BIT_XOR 0x03 +#define COLL_OPCODE_INT_MIN 0x10 +#define COLL_OPCODE_INT_MAX 0x11 +#define COLL_OPCODE_INT_MINMAXLOC 0x12 +#define COLL_OPCODE_INT_SUM 0x14 +#define COLL_OPCODE_FLT_MINNUM 0x24 +#define COLL_OPCODE_FLT_MAXNUM 0x25 +#define COLL_OPCODE_FLT_MINMAXNUMLOC 0x26 +#define COLL_OPCODE_FLT_SUM_NOFTZ_RND0 0x28 +#define COLL_OPCODE_FLT_SUM_NOFTZ_RND1 0x29 +#define COLL_OPCODE_FLT_SUM_NOFTZ_RND2 0x2a +#define COLL_OPCODE_FLT_SUM_NOFTZ_RND3 0x2b +#define COLL_OPCODE_FLT_SUM_FTZ_RND0 0x2c +#define COLL_OPCODE_FLT_SUM_FTZ_RND1 0x2d +#define COLL_OPCODE_FLT_SUM_FTZ_RND2 0x2e +#define COLL_OPCODE_FLT_SUM_FTZ_RND3 0x2f +#define COLL_OPCODE_FLT_REPSUM 0x30 +#define COLL_OPCODE_MAX 0x31 + +/* Convert exported op values to Rosetta opcodes */ +static cxip_coll_op_t _int8_16_32_op_to_opcode[FI_CXI_OP_LAST]; +static cxip_coll_op_t _uint8_16_32_op_to_opcode[FI_CXI_OP_LAST]; +static cxip_coll_op_t _int64_op_to_opcode[FI_CXI_OP_LAST]; +static cxip_coll_op_t _uint64_op_to_opcode[FI_CXI_OP_LAST]; +static cxip_coll_op_t _flt_op_to_opcode[FI_CXI_OP_LAST]; +static enum c_return_code _cxip_rc_to_cxi_rc[16]; +static enum cxip_coll_redtype _cxi_op_to_redtype[COLL_OPCODE_MAX]; + +/* One-time dynamic initialization of FI to CXI opcode. 
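+ *
+ * After this runs, translation is a table lookup, e.g. (sketch):
+ *
+ *   cxip_fi2cxi_opcode(FI_SUM, FI_INT64)  resolves to COLL_OPCODE_INT_SUM
+ *   cxip_fi2cxi_opcode(FI_BOR, FI_UINT32) resolves to COLL_OPCODE_BIT_OR
+ *   cxip_fi2cxi_opcode(FI_SUM, FI_DOUBLE) is chosen at call time from the
+ *   current rounding/FTZ mode (see flt_op_to_opcode())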
+ */ +void cxip_coll_populate_opcodes(void) +{ + int i; + + if ((int)FI_CXI_MINMAXLOC < (int)FI_ATOMIC_OP_LAST) { + CXIP_FATAL("Invalid CXI_FMINMAXLOC value\n"); + } + for (i = 0; i < FI_CXI_OP_LAST; i++) { + _int8_16_32_op_to_opcode[i] = -FI_EOPNOTSUPP; + _uint8_16_32_op_to_opcode[i] = -FI_EOPNOTSUPP; + _int64_op_to_opcode[i] = -FI_EOPNOTSUPP; + _uint64_op_to_opcode[i] = -FI_EOPNOTSUPP; + _flt_op_to_opcode[i] = -FI_EOPNOTSUPP; + _cxi_op_to_redtype[i] = REDTYPE_BYT; + } + /* operations supported by 32, 16, and 8 bit signed int operands */ + /* NOTE: executed as packed 64-bit quantities */ + _int8_16_32_op_to_opcode[FI_BOR] = COLL_OPCODE_BIT_OR; + _int8_16_32_op_to_opcode[FI_BAND] = COLL_OPCODE_BIT_AND; + _int8_16_32_op_to_opcode[FI_BXOR] = COLL_OPCODE_BIT_XOR; + + /* operations supported by 32, 16, and 8 bit unsigned int operands */ + _uint8_16_32_op_to_opcode[FI_BOR] = COLL_OPCODE_BIT_OR; + _uint8_16_32_op_to_opcode[FI_BAND] = COLL_OPCODE_BIT_AND; + _uint8_16_32_op_to_opcode[FI_BXOR] = COLL_OPCODE_BIT_XOR; + + /* operations supported by 64 bit signed int operands */ + _int64_op_to_opcode[FI_MIN] = COLL_OPCODE_INT_MIN; + _int64_op_to_opcode[FI_MAX] = COLL_OPCODE_INT_MAX; + _int64_op_to_opcode[FI_SUM] = COLL_OPCODE_INT_SUM; + _int64_op_to_opcode[FI_CXI_MINMAXLOC] = COLL_OPCODE_INT_MINMAXLOC; + + /* operations supported by 64 bit unsigned int operands */ + _uint64_op_to_opcode[FI_BOR] = COLL_OPCODE_BIT_OR; + _uint64_op_to_opcode[FI_BAND] = COLL_OPCODE_BIT_AND; + _uint64_op_to_opcode[FI_BXOR] = COLL_OPCODE_BIT_XOR; + + /* operations supported by 64 bit double operands */ + _flt_op_to_opcode[FI_MIN] = COLL_OPCODE_FLT_MINNUM; + _flt_op_to_opcode[FI_MAX] = COLL_OPCODE_FLT_MAXNUM; + _flt_op_to_opcode[FI_CXI_MINMAXLOC] = COLL_OPCODE_FLT_MINMAXNUMLOC; + _flt_op_to_opcode[FI_CXI_REPSUM] = COLL_OPCODE_FLT_REPSUM; + /* NOTE: FI_SUM handled in flt_op_to_opcode() function */ + + /* cxi_opcode to redtype translation */ + _cxi_op_to_redtype[COLL_OPCODE_BIT_OR] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_BIT_AND] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_BIT_XOR] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_INT_MIN] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_INT_MAX] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_INT_SUM] = REDTYPE_INT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_MINNUM] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_MAXNUM] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_NOFTZ_RND0] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_NOFTZ_RND1] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_NOFTZ_RND2] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_NOFTZ_RND3] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_FTZ_RND0] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_FTZ_RND1] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_FTZ_RND2] = REDTYPE_FLT; + _cxi_op_to_redtype[COLL_OPCODE_FLT_SUM_FTZ_RND3] = REDTYPE_FLT; + + _cxi_op_to_redtype[COLL_OPCODE_INT_MINMAXLOC] = REDTYPE_IMINMAX; + _cxi_op_to_redtype[COLL_OPCODE_FLT_MINMAXNUMLOC] = REDTYPE_FMINMAX; + _cxi_op_to_redtype[COLL_OPCODE_FLT_REPSUM] = REDTYPE_REPSUM; + + for (i = 0; i < 16; i++) + _cxip_rc_to_cxi_rc[i] = C_RC_AMO_ALIGN_ERROR; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_SUCCESS] = C_RC_OK; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_INEXACT] = C_RC_AMO_FP_INEXACT; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_OVERFLOW] = C_RC_AMO_FP_OVERFLOW; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_FLT_INVALID] = C_RC_AMO_FP_INVALID; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_REP_INEXACT] = C_RC_AMO_FP_INEXACT; + 
_cxip_rc_to_cxi_rc[CXIP_COLL_RC_INT_OVERFLOW] = C_RC_AMO_FP_OVERFLOW; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_CONTR_OVERFLOW] = C_RC_AMO_LENGTH_ERROR; + _cxip_rc_to_cxi_rc[CXIP_COLL_RC_OP_MISMATCH] = C_RC_AMO_INVAL_OP_ERROR; +} + +static inline int int8_16_32_op_to_opcode(int op) +{ + return _int8_16_32_op_to_opcode[op]; +} + +static inline int uint8_16_32_op_to_opcode(int op) +{ + return _uint8_16_32_op_to_opcode[op]; +} + +static inline int int64_op_to_opcode(int op) +{ + return _int64_op_to_opcode[op]; +} + +static inline int uint64_op_to_opcode(int op) +{ + return _uint64_op_to_opcode[op]; +} + +static inline int flt_op_to_opcode(int op) +{ + if (op != FI_SUM) + return _flt_op_to_opcode[op]; + + switch (fegetround()) { + case FE_TONEAREST: + return (_MM_GET_FLUSH_ZERO_MODE()) ? + COLL_OPCODE_FLT_SUM_FTZ_RND0 : + COLL_OPCODE_FLT_SUM_NOFTZ_RND0; + case FE_UPWARD: + return (_MM_GET_FLUSH_ZERO_MODE()) ? + COLL_OPCODE_FLT_SUM_FTZ_RND1 : + COLL_OPCODE_FLT_SUM_NOFTZ_RND1; + case FE_DOWNWARD: + return (_MM_GET_FLUSH_ZERO_MODE()) ? + COLL_OPCODE_FLT_SUM_FTZ_RND2 : + COLL_OPCODE_FLT_SUM_NOFTZ_RND2; + case FE_TOWARDZERO: + return (_MM_GET_FLUSH_ZERO_MODE()) ? + COLL_OPCODE_FLT_SUM_FTZ_RND3 : + COLL_OPCODE_FLT_SUM_NOFTZ_RND3; + } + return -FI_EOPNOTSUPP; +} + +/* Convert CXI opcode to reduction data type */ +static inline +enum cxip_coll_redtype _opcode_to_redtype(cxip_coll_op_t cxi_opcode) +{ + return _cxi_op_to_redtype[cxi_opcode]; +} + +/* Convert FI opcode to CXI opcode, depending on FI data type */ +static inline +int cxip_fi2cxi_opcode(enum fi_op op, enum fi_datatype datatype) +{ + switch ((int)datatype) { + case FI_INT8: + case FI_INT16: + case FI_INT32: + return int8_16_32_op_to_opcode(op); + case FI_UINT8: + case FI_UINT16: + case FI_UINT32: + return uint8_16_32_op_to_opcode(op); + case FI_INT64: + return int64_op_to_opcode(op); + case FI_UINT64: + return uint64_op_to_opcode(op); + case FI_DOUBLE: + return flt_op_to_opcode(op); + } + return -FI_EOPNOTSUPP; +} + +/* Determine FI datatype size */ +static inline +int _get_cxi_data_bytcnt(cxip_coll_op_t cxi_opcode, + enum fi_datatype datatype, size_t count) +{ + int size; + + switch (datatype) { + case FI_INT8: + case FI_UINT8: + size = sizeof(uint8_t); + break; + case FI_INT16: + case FI_UINT16: + size = sizeof(uint16_t); + break; + case FI_INT32: + case FI_UINT32: + size = sizeof(uint32_t); + break; + case FI_INT64: + case FI_UINT64: + size = sizeof(uint64_t); + break; + case FI_FLOAT: + size = sizeof(float); + break; + case FI_DOUBLE: + size = sizeof(double); + break; + default: + return -FI_EOPNOTSUPP; + } + switch (cxi_opcode) { + case COLL_OPCODE_INT_MINMAXLOC: + case COLL_OPCODE_FLT_MINMAXNUMLOC: + case COLL_OPCODE_FLT_REPSUM: + size *= 4; + break; + default: + // do nothing, size is correct + break; + } + size *= count; + if (size > CXIP_COLL_MAX_DATA_SIZE) + return -FI_EINVAL; + return size; +} + +/**************************************************************************** + * SEND operation (restricted Put to a remote PTE) + */ + +/* Forward references */ +static void _progress_coll(struct cxip_coll_reduction *reduction, + struct red_pkt *pkt); +static ssize_t _coll_append_buffer(struct cxip_coll_pte *coll_pte, + struct cxip_coll_buf *buf); + +/* Generate a dfa and index extension for a reduction */ +static int _gen_tx_dfa(struct cxip_coll_reduction *reduction, + int av_set_idx, union c_fab_addr *dfa, + uint8_t *index_ext, bool *is_mcast) +{ + struct cxip_ep_obj *ep_obj; + struct cxip_av_set *av_set_obj; + struct cxip_addr dest_caddr; + 
fi_addr_t dest_addr; + int pid_bits; + int idx_ext; + int ret; + + ep_obj = reduction->mc_obj->ep_obj; + av_set_obj = reduction->mc_obj->av_set_obj; + + /* Send address */ + switch (av_set_obj->comm_key.keytype) { + case COMM_KEY_MULTICAST: + /* - destination == multicast ID + * - idx_ext == 0 + * - dfa == multicast destination + * - index_ext == 0 + */ + if (is_netsim(ep_obj)) { + CXIP_WARN("NETSIM does not support mcast\n"); + return -FI_EINVAL; + } + idx_ext = 0; + cxi_build_mcast_dfa(av_set_obj->comm_key.mcast.mcast_addr, + reduction->red_id, idx_ext, + dfa, index_ext); + *is_mcast = true; + break; + case COMM_KEY_UNICAST: + /* - destination == remote node in av_set_obj + * - idx_ext == CXIP_PTL_IDX_COLL + * - dfa = remote nic + * - index_ext == CXIP_PTL_IDX_COLL + */ + if (av_set_idx >= av_set_obj->fi_addr_cnt) { + CXIP_WARN("av_set_idx out-of-range\n"); + return -FI_EINVAL; + } + dest_addr = av_set_obj->fi_addr_ary[av_set_idx]; + ret = cxip_av_lookup_addr(ep_obj->av, dest_addr, &dest_caddr); + if (ret != FI_SUCCESS) + return ret; + pid_bits = ep_obj->domain->iface->dev->info.pid_bits; + cxi_build_dfa(dest_caddr.nic, dest_caddr.pid, pid_bits, + CXIP_PTL_IDX_COLL, dfa, index_ext); + *is_mcast = false; + break; + case COMM_KEY_RANK: + /* - destination == source NIC + * - idx_ext == extended PID + * - dfa == source NIC + * - index_ext == idx_ext offset beyond RXCs (5-bit range) + */ + if (av_set_idx >= av_set_obj->fi_addr_cnt) { + CXIP_WARN("av_set_idx out-of-range\n"); + return -FI_EINVAL; + } + dest_caddr = ep_obj->src_addr; + pid_bits = ep_obj->domain->iface->dev->info.pid_bits; + idx_ext = CXIP_PTL_IDX_COLL + av_set_idx; + cxi_build_dfa(dest_caddr.nic, dest_caddr.pid, pid_bits, + idx_ext, dfa, index_ext); + *is_mcast = false; + break; + default: + CXIP_WARN("unexpected comm_key type: %d\n", + av_set_obj->comm_key.keytype); + return -FI_EINVAL; + } + return FI_SUCCESS; +} + +/** + * Issue a restricted Put to the destination address. + * If md is NULL, this performs an IDC Put, otherwise it issues a DMA Put. + * + * Exported for unit testing. + * + * This will return -FI_EAGAIN on transient errors. 
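+ *
+ * Typical caller pattern (sketch; cxip_coll_send_red_pkt() below is the
+ * production caller, via _send_pkt()), retrying while the command or event
+ * queues are saturated. Passing md == NULL selects the IDC Put path:
+ *
+ *   int ret;
+ *   do {
+ *       ret = cxip_coll_send(reduction, 0, pkt, sizeof(*pkt), NULL);
+ *   } while (ret == -FI_EAGAIN);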
+ */ +int cxip_coll_send(struct cxip_coll_reduction *reduction, + int av_set_idx, const void *buffer, size_t buflen, + struct cxi_md *md) +{ + union c_cmdu cmd = {}; + struct cxip_coll_mc *mc_obj; + struct cxip_ep_obj *ep_obj; + struct cxip_cmdq *cmdq; + union c_fab_addr dfa; + uint8_t index_ext; + bool is_mcast; + int ret; + + if (!buffer) { + CXIP_INFO("no buffer\n"); + return -FI_EINVAL; + } + + mc_obj = reduction->mc_obj; + ep_obj = mc_obj->ep_obj; + cmdq = ep_obj->coll.tx_cmdq; + + ret = _gen_tx_dfa(reduction, av_set_idx, &dfa, &index_ext, &is_mcast); + if (ret) + return ret; + + if (cxip_evtq_saturated(ep_obj->coll.tx_evtq)) { + CXIP_DBG("TX HW EQ saturated\n"); + return -FI_EAGAIN; + } + +#if ENABLE_DEBUG + if (reduction->drop_send) { + reduction->drop_send = false; + goto drop_pkt; + } +#endif + + if (md) { + cmd.full_dma.command.opcode = C_CMD_PUT; + cmd.full_dma.event_send_disable = 1; + cmd.full_dma.event_success_disable = 1; + cmd.full_dma.restricted = 1; + cmd.full_dma.reduction = is_mcast; + cmd.full_dma.index_ext = index_ext; + cmd.full_dma.eq = cxip_evtq_eqn(ep_obj->coll.tx_evtq); + cmd.full_dma.dfa = dfa; + cmd.full_dma.lac = md->lac; + cmd.full_dma.local_addr = CXI_VA_TO_IOVA(md, buffer); + cmd.full_dma.request_len = buflen; + + /* this uses cached values, returns -FI_EAGAIN if queue full */ + ret = cxip_txq_cp_set(cmdq, ep_obj->auth_key.vni, + mc_obj->tc, mc_obj->tc_type); + if (ret) + goto err; + + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &cmd.full_dma); + } else { + cmd.c_state.event_send_disable = 1; + cmd.c_state.event_success_disable = 1; + cmd.c_state.restricted = 1; + cmd.c_state.reduction = is_mcast; + cmd.c_state.index_ext = index_ext; + cmd.c_state.eq = cxip_evtq_eqn(ep_obj->coll.tx_evtq); + cmd.c_state.initiator = CXI_MATCH_ID( + ep_obj->domain->iface->dev->info.pid_bits, + ep_obj->src_addr.pid, ep_obj->src_addr.nic); + + /* this uses cached values, returns -FI_EAGAIN if queue full */ + ret = cxip_txq_cp_set(cmdq, ep_obj->auth_key.vni, + mc_obj->tc, mc_obj->tc_type); + if (ret) + goto err; + + /* returns -FI_EAGAIN on failure */ + ret = cxip_cmdq_emit_c_state(cmdq, &cmd.c_state); + if (ret) { + ret = -FI_EAGAIN; + goto err; + } + + memset(&cmd.idc_put, 0, sizeof(cmd.idc_put)); + cmd.idc_put.idc_header.dfa = dfa; + ret = cxi_cq_emit_idc_put(cmdq->dev_cmdq, &cmd.idc_put, + buffer, buflen); + if (ret) { + ret = -FI_EAGAIN; + goto err; + } + } + + if (ret) { + /* Return error according to Domain Resource Management */ + ret = -FI_EAGAIN; + goto err; + } + + cxi_cq_ring(cmdq->dev_cmdq); + +#if ENABLE_DEBUG +drop_pkt: +#endif + ret = FI_SUCCESS; + ofi_atomic_inc32(&reduction->mc_obj->send_cnt); + +err: + return ret; +} + +/**************************************************************************** + * RECV operation (of restricted Put to a local PTE) + * + * Collectives use a dedicated EP and PTE for each MC object. + * + * Packet space is allocated and linked to the PTE with a request. When a + * packet is received, CXI hardware puts the request pointer and incoming + * packet offset into a hardware-managed CXI event queue. When the CXI evtq + * is progressed, completed hardware events are harvested, and the request + * pointer (along with completion data) is inserted into an OFI CQ for the + * endpont. Reading any OFI CQ bound to that endpoint will harvest all CXI + * (hardware) evtqs bound to that endpoint, but will return only events + * associated with the specified CQ, IF there are multiple CQs. 
+ * + * Collectives services two CXI (hardware) evtqs for each MC object. + * + * The tx_evtq is only used to detect hardware buffer overflow, which + * reflects -FI_EAGAIN back to the client. + * + * The rx_evtq manages PTE events for the collective endpoint. Buffer link + * and unlink events are consumed silently: buffer exhaustion is checked on + * every packet receipt, and will automatically recycle exhausted buffers. + * PUT events are filtered for correct format and passed into the collective + * state machine for processing. All other received packets are discarded. + * + * cxip_cq_req_complete() is used internally for PTE events, and externally + * to report collective operation completions. The internal events are useful + * for certain bench test models, where we need to count the packets received + * as well as the collective completion. In production, we want to disable + * the internal events. This is done independently for each MC object with + * the mc->rx_discard flag. + */ + +/* Report success/error results of an RX event through CQ/counters, and roll + * over the buffers if appropriate. + * + * NOTE: req may be invalid after this call. + * + * Caller must hold ep_obj->lock. + */ +static void _coll_rx_req_report(struct cxip_req *req) +{ + size_t overflow; + int err, ret; + + req->flags &= (FI_RECV | FI_COMPLETION); + + /* Interpret results */ + overflow = req->coll.hw_req_len - req->data_len; + if (req->coll.cxi_rc == C_RC_OK && req->coll.isred && !overflow) { + /* receive success */ + if (req->flags & FI_COMPLETION) { + /* failure means progression is hung */ + ret = cxip_cq_req_complete(req); + if (ret) + CXIP_FATAL( + "cxip_cq_req_complete failed: %d\n", ret); + } + + if (req->coll.coll_pte->ep_obj->coll.rx_cntr) { + /* failure means counts cannot be trusted */ + ret = cxip_cntr_mod( + req->coll.coll_pte->ep_obj->coll.rx_cntr, 1, + false, false); + if (ret) + CXIP_WARN( + "Failed success cxip_cntr_mod: %d\n", + ret); + } + } else { + /* failure */ + if (req->coll.cxi_rc != C_RC_OK) { + /* real network error of some sort */ + err = proverr2errno(req->coll.cxi_rc); + CXIP_WARN("Request error: %p (err: %d, %s)\n", + req, err, cxi_rc_to_str(err)); + } else if (overflow) { + /* can only happen on very large packet (> 64 bytes) */ + err = FI_EMSGSIZE; + CXIP_WARN("Request truncated: %p (err: %d, %s)\n", + req, err, cxi_rc_to_str(err)); + } else { + /* non-reduction packet */ + err = FI_ENOMSG; + CXIP_INFO("Not reduction pkt: %p (err: %d, %s)\n", + req, err, cxi_rc_to_str(err)); + } + + /* failure means progression is hung */ + ret = cxip_cq_req_error(req, overflow, err, + req->coll.cxi_rc, + NULL, 0, FI_ADDR_UNSPEC); + if (ret) + CXIP_FATAL("cxip_cq_req_error: %d\n", ret); + + if (req->coll.coll_pte->ep_obj->coll.rx_cntr) { + /* failure means counts cannot be trusted */ + ret = cxip_cntr_mod( + req->coll.coll_pte->ep_obj->coll.rx_cntr, 1, + false, true); + if (ret) + CXIP_WARN("cxip_cntr_mod: %d\n", ret); + } + } + + /* manage buffer rollover */ + if (req->coll.mrecv_space < + req->coll.coll_pte->ep_obj->coll.min_multi_recv) { + struct cxip_coll_pte *coll_pte = req->coll.coll_pte; + struct cxip_coll_buf *buf = req->coll.coll_buf; + int cnt; + + /* Will be re-incremented when LINK is received */ + cnt = ofi_atomic_dec32(&coll_pte->buf_cnt); + if (req->coll.coll_pte->buf_low_water > cnt) + req->coll.coll_pte->buf_low_water = cnt; + if (cnt <= 0) { + CXIP_WARN("COLL buffers exhausted\n"); + // TODO set flag to shut this down + } + ofi_atomic_inc32(&coll_pte->buf_swap_cnt); + + 
/* Re-use this buffer in the hardware */ + ret = _coll_append_buffer(coll_pte, buf); + if (ret != FI_SUCCESS) + CXIP_WARN("Re-link buffer failed: %d\n", ret); + + /* Hardware has silently unlinked this */ + cxip_evtq_req_free(req); + } +} + +/* Evaluate PUT receive request to see if this is a reduction packet */ +static void _coll_rx_progress(struct cxip_req *req, + const union c_event *event) +{ + struct cxip_coll_mc *mc_obj; + struct cxip_coll_reduction *reduction; + struct red_pkt *pkt; + + /* Raw packet of some sort received */ + ofi_atomic_inc32(&req->coll.coll_pte->recv_cnt); + + /* If not the right size, don't swap bytes */ + if (req->data_len != sizeof(struct red_pkt)) { + CXIP_INFO("Bad coll packet size: %ld\n", req->data_len); + return; + } + + /* If swap doesn't look like reduction packet, swap back and discard */ + pkt = (struct red_pkt *)req->buf; + _swappkt(pkt); + if (pkt->hdr.cookie.magic != MAGIC) + { + CXIP_INFO("Bad coll MAGIC: %x\n", pkt->hdr.cookie.magic); + _swappkt(pkt); + return; + } + /* This is a reduction packet */ + + /* The coll.coll_pte->mc_obj is defined only for COMM_KEY_RANK */ + mc_obj = req->coll.coll_pte->mc_obj; + if (!mc_obj) + mc_obj = ofi_idm_lookup( + &req->coll.coll_pte->ep_obj->coll.mcast_map, + pkt->hdr.cookie.mcast_id); + if (!mc_obj) { + TRACE_PKT("Bad coll lookup: %x\n", pkt->hdr.cookie.mcast_id); + return; + } + /* This is a valid reduction packet */ + ofi_atomic_inc32(&mc_obj->recv_cnt); + req->coll.isred = true; + req->discard = mc_obj->rx_discard; + reduction = &mc_obj->reduction[pkt->hdr.cookie.red_id]; + TRACE_PKT("Valid reduction packet\n"); + +#if ENABLE_DEBUG + /* Test case, simulate packet dropped in-flight */ + if (reduction->drop_recv) { + reduction->drop_recv = false; + return; + } +#endif + + /* Progress the reduction */ + _dump_red_pkt(pkt, "recv"); + ofi_atomic_inc32(&mc_obj->pkt_cnt); + _progress_coll(reduction, pkt); +} + +/* Event-handling callback for posted receive buffers */ +static int _coll_recv_cb(struct cxip_req *req, const union c_event *event) +{ + req->coll.cxi_rc = cxi_tgt_event_rc(event); + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* Enabled */ + if (req->coll.cxi_rc != C_RC_OK) { + CXIP_WARN("LINK error rc: %d\n", req->coll.cxi_rc); + break; + } + CXIP_DBG("LINK event seen\n"); + ofi_atomic_inc32(&req->coll.coll_pte->buf_cnt); + break; + case C_EVENT_UNLINK: + /* Normally disabled, errors only */ + req->coll.cxi_rc = cxi_tgt_event_rc(event); + if (req->coll.cxi_rc != C_RC_OK) { + CXIP_WARN("UNLINK error rc: %d\n", req->coll.cxi_rc); + break; + } + CXIP_DBG("UNLINK event seen\n"); + break; + case C_EVENT_PUT: + req->coll.isred = false; + req->coll.cxi_rc = cxi_tgt_event_rc(event); + if (req->coll.cxi_rc != C_RC_OK) { + CXIP_WARN("PUT error rc: %d\n", req->coll.cxi_rc); + break; + } + CXIP_DBG("PUT event seen\n"); + req->buf = (uint64_t)(CXI_IOVA_TO_VA( + req->coll.coll_buf->cxi_md->md, + event->tgt_long.start)); + req->coll.mrecv_space -= event->tgt_long.mlength; + req->coll.hw_req_len = event->tgt_long.rlength; + req->data_len = event->tgt_long.mlength; + _coll_rx_progress(req, event); + _coll_rx_req_report(req); + break; + default: + req->coll.cxi_rc = cxi_tgt_event_rc(event); + CXIP_WARN(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(req->coll.cxi_rc)); + break; + } + + return FI_SUCCESS; +} + +/* Inject a hardware LE append. Does not generate HW LINK event unless error. 
*/ +static int _hw_coll_recv(struct cxip_coll_pte *coll_pte, struct cxip_req *req) +{ + uint32_t le_flags; + uint64_t recv_iova; + int ret; + + /* C_LE_MANAGE_LOCAL makes Cassini ignore initiator remote_offset in all + * Puts, and causes automatic UNLINK when buffer capacity drops below + * CXIP_COLL_MIN_MULTI_RECV. + * + * C_LE_EVENT_UNLINK_DISABLE prevents generation of UNLINK events. We + * detect UNLINK by counting packets, and presume automatic UNLINK drops + * below CXIP_COLL_MIN_MULTI_RECV. + * + * C_LE_EVENT_UNLINK_DISABLE prevents UNLINK events from being + * generated. Hardware performs UNLINK automatically when buffer + * capacity is below CXIP_COLL_MIN_MULTI_RECV. + * + * C_LE_OP_PUT indicates this is an input buffer that responses to PUT. + * + * C_LE_NO_TRUNCATE is not used, because all packets are a fixed size, + * and CXIP_COLL_MIN_MULTI_RECV is sufficient to guarantee space for one new + * reduction packet. + */ + le_flags = C_LE_EVENT_UNLINK_DISABLE | C_LE_OP_PUT | C_LE_MANAGE_LOCAL; + + recv_iova = CXI_VA_TO_IOVA(req->coll.coll_buf->cxi_md->md, + (uint64_t)req->coll.coll_buf->buffer); + + ret = cxip_pte_append(coll_pte->pte, + recv_iova, + req->coll.coll_buf->bufsiz, + req->coll.coll_buf->cxi_md->md->lac, + C_PTL_LIST_PRIORITY, + req->req_id, + 0, 0, 0, + req->coll.coll_pte->ep_obj->coll.min_multi_recv, + le_flags, coll_pte->ep_obj->coll.rx_cntr, + coll_pte->ep_obj->coll.rx_cmdq, + true); + if (ret != FI_SUCCESS) { + CXIP_WARN("PTE append inject failed: %d\n", ret); + return ret; + } + + return FI_SUCCESS; +} + +/* Append a receive buffer to the PTE, with callback to handle receives. + * + * Caller must hold ep_obj->lock. + */ +static ssize_t _coll_append_buffer(struct cxip_coll_pte *coll_pte, + struct cxip_coll_buf *buf) +{ + struct cxip_req *req; + int ret; + + if (buf->bufsiz && !buf->buffer) { + CXIP_INFO("no buffer\n"); + return -FI_EINVAL; + } + + /* Allocate and populate a new request + * Sets: + * - req->cq + * - req->req_id to request index + * - req->req_ctx to passed context (buf) + * - req->discard to false + * - Inserts into the cq->req_list + */ + req = cxip_evtq_req_alloc(coll_pte->ep_obj->coll.rx_evtq, 1, buf); + if (!req) { + ret = -FI_ENOMEM; + goto recv_unmap; + } + + /* CQ event fields, set according to fi_cq.3 + * - set by provider + * - returned to user in completion event + * uint64_t context; // operation context + * uint64_t flags; // operation flags + * uint64_t data_len; // received data length + * uint64_t buf; // receive buf offset + * uint64_t data; // receive REMOTE_CQ_DATA + * uint64_t tag; // receive tag value on matching interface + * fi_addr_t addr; // sender address (if known) ??? + */ + + /* Request parameters */ + req->type = CXIP_REQ_COLL; + req->flags = (FI_RECV | FI_COMPLETION); + req->cb = _coll_recv_cb; + req->triggered = false; + req->trig_thresh = 0; + req->trig_cntr = NULL; + req->context = (uint64_t)buf; + req->data_len = 0; + req->buf = (uint64_t)buf->buffer; + req->data = 0; + req->tag = 0; + req->coll.coll_pte = coll_pte; + req->coll.coll_buf = buf; + req->coll.mrecv_space = req->coll.coll_buf->bufsiz; + + /* Returns FI_SUCCESS or FI_EAGAIN */ + ret = _hw_coll_recv(coll_pte, req); + if (ret != FI_SUCCESS) + goto recv_dequeue; + + return FI_SUCCESS; + +recv_dequeue: + cxip_evtq_req_free(req); + +recv_unmap: + cxip_unmap(buf->cxi_md); + return ret; +} + +/**************************************************************************** + * PTE management functions. 
+ */ + +/* PTE state-change callback */ + __attribute__((__unused__)) +static void _coll_pte_cb(struct cxip_pte *pte, const union c_event *event) +{ + switch (pte->state) { + case C_PTLTE_ENABLED: + case C_PTLTE_DISABLED: + break; + default: + CXIP_FATAL("Unexpected state received: %u\n", pte->state); + } +} + +/* Enable a collective PTE. Wait for completion. */ +static inline +int _coll_pte_enable(struct cxip_coll_pte *coll_pte, uint32_t drop_count) +{ + return cxip_pte_set_state_wait(coll_pte->pte, + coll_pte->ep_obj->coll.rx_cmdq, + coll_pte->ep_obj->coll.rx_evtq, + C_PTLTE_ENABLED, drop_count); +} + +/* Disable a collective PTE. Wait for completion */ +static inline +int _coll_pte_disable(struct cxip_coll_pte *coll_pte) +{ + return cxip_pte_set_state_wait(coll_pte->pte, + coll_pte->ep_obj->coll.rx_cmdq, + coll_pte->ep_obj->coll.rx_evtq, + C_PTLTE_DISABLED, 0); +} + +/* Destroy and unmap all buffers used by the collectives PTE. + * + * Caller must hold ep_obj->lock. + */ +static void _coll_destroy_buffers(struct cxip_coll_pte *coll_pte) +{ + struct dlist_entry *list = &coll_pte->buf_list; + struct cxip_coll_buf *buf; + + while (!dlist_empty(list)) { + dlist_pop_front(list, struct cxip_coll_buf, buf, buf_entry); + cxip_unmap(buf->cxi_md); + free(buf); + } +} + +/* Adds 'count' buffers of 'size' bytes to the collecives PTE. This succeeds + * fully, or it fails and removes all added buffers. + */ +static int _coll_add_buffers(struct cxip_coll_pte *coll_pte, size_t size, + size_t count) +{ + struct cxip_coll_buf *buf; + int ret, i; + + if (count < CXIP_COLL_MIN_RX_BUFS) { + CXIP_INFO("Buffer count %ld < minimum (%d)\n", + count, CXIP_COLL_MIN_RX_BUFS); + return -FI_EINVAL; + } + + if (size < CXIP_COLL_MIN_RX_SIZE) { + CXIP_INFO("Buffer size %ld < minimum (%d)\n", + size, CXIP_COLL_MIN_RX_SIZE); + return -FI_EINVAL; + } + + CXIP_DBG("Adding %ld buffers of size %ld\n", count, size); + for (i = 0; i < count; i++) { + buf = calloc(1, sizeof(*buf) + size); + if (!buf) { + ret = -FI_ENOMEM; + goto out; + } + ret = cxip_map(coll_pte->ep_obj->domain, (void *)buf->buffer, + size, 0, &buf->cxi_md); + if (ret) + goto del_msg; + buf->bufsiz = size; + dlist_insert_tail(&buf->buf_entry, &coll_pte->buf_list); + + ret = _coll_append_buffer(coll_pte, buf); + if (ret) { + CXIP_WARN("Add buffer %d of %ld: %d\n", + i, count, ret); + goto out; + } + } + /* Block until PTE completes buffer appends */ + do { + sched_yield(); + cxip_evtq_progress(coll_pte->ep_obj->coll.rx_evtq); + } while (ofi_atomic_get32(&coll_pte->buf_cnt) < count); + coll_pte->buf_low_water = (int)count; + + return FI_SUCCESS; +del_msg: + free(buf); +out: + _coll_destroy_buffers(coll_pte); + return ret; +} + +/**************************************************************************** + * Mathematical routines used for collective reductions. + */ + +/* Set RC only if new is higher priority than old */ +// TODO avoid branch: +// http://geeksforgeeks.org/ +// compute-the-minimum-or-maximum-of-two-integers-without-branching +#define SET_RED_RC(redrc, rc) do {if ((redrc)<(rc)) (redrc)=(rc);} while(0) + +static inline +bool cxip_is_snan64(double d) +{ + /* This detection is universal IEEE */ + return isnan(d) && !(_dbl2bits(d) & 0x0008000000000000); +} + +/* convert signalling NaN to quiet NaN */ +static inline +bool _quiesce_nan(double *d) +{ + if (!cxip_is_snan64(*d)) + return false; + *d = NAN; + return true; +} + +/** + * Implement NaN comparison in RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM + * + * Only associative mode is supported. 
The old IEEE mode is incorrect, and has + * been deprecated. + * + * Compares two doubles, replaces *d1 as appropriate, and indicates swap. + * + * If the values are normal doubles, less=true indicates we are looking for the + * lesser of the two values, while less=false indicates we are looking for the + * greater of the two values. The appropriate value will be swapped into *d1 if + * necessary. + * + * In general, this will give preference to real values over NaN, which is the + * opposite of swpnan1() above. This will only return NaN if BOTH values in + * the comparison are NaN. + * + * If either NaN is sNaN, this will set the CXIP_COLL_RC_FLT_INVALID error. + * + * The return value can be used when associating an index with the value. + * + * Note that since this quiets any signalling NaNs, we need set the + * CXIP_COLL_RC_FLT_INVALID error. + * + * - return 0 indicates the values are equivalent, so use the smallest index. + * - return +1 indicates the values were swapped, so use the second index. + * - return -1 indicates no swap, so use the first index. + */ +static int swpnan2(double *d1, double d2, bool less, cxip_coll_rc_t *rc) +{ + bool nan1, nan2, snan1, snan2; + + // isnan() does not distinguish sNaN from qNaN + nan1 = isnan(*d1); + nan2 = isnan(d2); + // Neither is NaN, so simple comparison + if (!nan1 && !nan2) { + if (*d1 == d2) + return 0; + if (less && (*d1 > d2)) { + *d1 = d2; + return 1; + } + if (!less && (*d1 < d2)) { + *d1 = d2; + return 1; + } + return -1; + } + + // ----- FLT_MINNUM and FLT_MAXNUM rules + // At least one is NaN, check for sNaN + snan1 = _quiesce_nan(d1); + snan2 = _quiesce_nan(&d2); + if (snan1 || snan2) + SET_RED_RC(*rc, CXIP_COLL_RC_FLT_INVALID); + + // return qNaN only if both are NaN + if (nan1 && nan2) + return 0; + + // Prefers number + if (nan1) { + *d1 = d2; + return 1; + } + // Prefers number + return -1; +} + +/* Companion to swpnan1() and swpnan2() to swap associated indices */ +static inline +void swpidx(uint64_t *i1, uint64_t i2, int swp) +{ + if (swp >= 0 && (swp > 0 || *i1 > i2)) + *i1 = i2; +} + +/* Determine if double precision sum is exact. This shifts the value with the + * lower exponent toward the MSBit by the amount of the bitwise overlap between + * the final sum and the value that resulted in that sum. If any non-zero bits + * remain in that smaller value, they were discarded during the summation, and + * the result is inexact. 
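+ *
+ * Worked example (IEEE-754 double, round-to-nearest): 1.0 + 0x1p-53 rounds
+ * back to 1.0, so the addend's only set bit is discarded and the sum is
+ * inexact; 1.0 + 0x1p-52 is exactly the next representable double, so that
+ * sum is exact.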
+ */ +static inline +bool exact(double rslt, double d) +{ + // TODO verify sign and shift + unsigned long m1, m2; + int s1, e1, s2, e2; + int shft, dlte; + bool ret; + + _decompose_dbl(rslt, &s1, &e1, &m1); + _decompose_dbl(d, &s2, &e2, &m2); + dlte = e1 - e2; + + if (dlte < 0) { + shft = MIN(52 + dlte, 0); + ret = !(m1 << shft); + } else { + shft= MIN(52 - dlte, 0); + ret = !(m2 << shft); + } + return ret; +} + +static inline +void _dump_coll_data(const char *tag, const struct cxip_coll_data *coll_data) +{ +#if __trc_data + int i; + + TRACE_PKT("=== Coll data: %s\n", tag); + TRACE_PKT(" init = %d\n", coll_data->initialized); + TRACE_PKT(" red_op = %d\n", coll_data->red_op); + TRACE_PKT(" rec_rc = %d\n", coll_data->red_rc); + TRACE_PKT(" red_cnt = %d\n", coll_data->red_cnt); + TRACE_PKT(" data:\n"); + for (i = 0; i < 4; i++) + TRACE_PKT(" %016lx\n", coll_data->intval.ival[i]); + TRACE_PKT("\n"); + TRACE_PKT("===================\n"); +#endif +} + +/* initialize coll_data structure from raw user data */ +static void _init_coll_data(struct cxip_coll_data *coll_data, int opcode, + const void *user_data, int bytcnt) +{ + double d; + int i; + + /* NOTE: snan can be directly injected here */ + memset(coll_data, 0, sizeof(*coll_data)); + if (user_data) + memcpy(coll_data->databuf, user_data, bytcnt); + coll_data->red_rc = 0; + coll_data->red_cnt = 1; + coll_data->red_op = opcode; + switch (coll_data->red_op) { + case COLL_OPCODE_FLT_MINNUM: + case COLL_OPCODE_FLT_MAXNUM: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND0: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND1: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND2: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND3: + case COLL_OPCODE_FLT_SUM_FTZ_RND0: + case COLL_OPCODE_FLT_SUM_FTZ_RND1: + case COLL_OPCODE_FLT_SUM_FTZ_RND2: + case COLL_OPCODE_FLT_SUM_FTZ_RND3: + /* evaluate all four doubles */ + for (i = 0; i < 4; i++) { + if (cxip_is_snan64(coll_data->fltval.fval[i])) + SET_RED_RC(coll_data->red_rc, + CXIP_COLL_RC_FLT_INVALID); + if (isnan(coll_data->fltval.fval[i]) || + isinf(coll_data->fltval.fval[i])) + SET_RED_RC(coll_data->red_rc, + CXIP_COLL_RC_FLT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_MINMAXNUMLOC: + /* evaluate the two doubles */ + for (i = 0; i < 4; i += 2) { + if (cxip_is_snan64(coll_data->fltval.fval[i])) + SET_RED_RC(coll_data->red_rc, + CXIP_COLL_RC_FLT_INVALID); + if (isinf(coll_data->fltval.fval[i])) + SET_RED_RC(coll_data->red_rc, + CXIP_COLL_RC_FLT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_REPSUM: + /* perform the conversion */ + d = coll_data->fltval.fval[0]; + cxip_dbl_to_rep(&coll_data->repsum, d); + break; + } + coll_data->initialized = true; +} + +/* reduce data into accumulator - can be used on uninitialized accumulator */ +static void _reduce(struct cxip_coll_data *accum, + const struct cxip_coll_data *coll_data, + bool pre_reduce) +{ + int i, swp; + + TRACE_DEBUG("%s entry\n", __func__); + /* Initialize with new data */ + if (!accum->initialized) { + memcpy(accum, coll_data, sizeof(*accum)); + return; + } + + /* copy new error (if any) to accumulator */ + SET_RED_RC(accum->red_rc, coll_data->red_rc); + + /* Real reduction (send or receive) must count contributions. 
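+ *
+ * A pre-reduce only folds in the operand data and error code; red_cnt is
+ * advanced below only when pre_reduce is false.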
+ */ + if (!pre_reduce) + accum->red_cnt += coll_data->red_cnt; + + /* ops must always match, else don't apply data */ + if (accum->red_op != coll_data->red_op) { + SET_RED_RC(accum->red_rc, CXIP_COLL_RC_OP_MISMATCH); + return; + } + + /* Perform the reduction in software */ + switch (accum->red_op) { + case COLL_OPCODE_BARRIER: + break; + case COLL_OPCODE_BIT_AND: + for (i = 0; i < 4; i++) + accum->intval.ival[i] &= coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_BIT_OR: + for (i = 0; i < 4; i++) + accum->intval.ival[i] |= coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_BIT_XOR: + for (i = 0; i < 4; i++) + accum->intval.ival[i] ^= coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_INT_MIN: + for (i = 0; i < 4; i++) + if (accum->intval.ival[i] > coll_data->intval.ival[i]) + accum->intval.ival[i] = coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_INT_MAX: + for (i = 0; i < 4; i++) + if (accum->intval.ival[i] < coll_data->intval.ival[i]) + accum->intval.ival[i] = coll_data->intval.ival[i]; + /* overflow not possible */ + break; + case COLL_OPCODE_INT_MINMAXLOC: + /* RSDG 4.5.9.2.2 MINMAXLOC */ + /* return smallest value and its index */ + if (accum->intminmax.iminval > coll_data->intminmax.iminval) { + accum->intminmax.iminval = coll_data->intminmax.iminval; + accum->intminmax.iminidx = coll_data->intminmax.iminidx; + } else + /* return smallest index if values equal */ + if (accum->intminmax.iminval == coll_data->intminmax.iminval && + accum->intminmax.iminidx > coll_data->intminmax.iminidx) { + accum->intminmax.iminidx = coll_data->intminmax.iminidx; + } + + /* return largest value and its index */ + if (accum->intminmax.imaxval < coll_data->intminmax.imaxval) { + accum->intminmax.imaxval = coll_data->intminmax.imaxval; + accum->intminmax.imaxidx = coll_data->intminmax.imaxidx; + } else + /* return smallest (yes) index if values equal */ + if (accum->intminmax.imaxval == coll_data->intminmax.imaxval && + accum->intminmax.imaxidx > coll_data->intminmax.imaxidx) { + accum->intminmax.imaxidx = coll_data->intminmax.imaxidx; + } + /* overflow not possible */ + break; + case COLL_OPCODE_INT_SUM: + for (i = 0; i < 4; i++) { + bool newneg = (coll_data->intval.ival[i] < 0); + bool oldneg = (accum->intval.ival[i] < 0); + bool sumneg; + accum->intval.ival[i] += coll_data->intval.ival[i]; + sumneg = (accum->intval.ival[i] < 0); + /* if sum changed sign, and doesn't match new sign */ + if (sumneg != oldneg && sumneg != newneg) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_INT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_MINNUM: + /* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */ + for (i = 0; i < 4; i++) { + swpnan2(&accum->fltval.fval[i], coll_data->fltval.fval[i], 1, + &accum->red_rc); + } + break; + case COLL_OPCODE_FLT_MAXNUM: + /* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */ + for (i = 0; i < 4; i++) { + swpnan2(&accum->fltval.fval[i], coll_data->fltval.fval[i], 0, + &accum->red_rc); + } + break; + case COLL_OPCODE_FLT_MINMAXNUMLOC: + /* RSDG 4.5.9.2.4 FLT_MINNUM and FLT_MAXNUM */ + swp = swpnan2(&accum->fltminmax.fminval, + coll_data->fltminmax.fminval, 1, &accum->red_rc); + swpidx(&accum->fltminmax.fminidx, coll_data->fltminmax.fminidx, swp); + swp = swpnan2(&accum->fltminmax.fmaxval, + coll_data->fltminmax.fmaxval, 0, &accum->red_rc); + swpidx(&accum->fltminmax.fmaxidx, coll_data->fltminmax.fmaxidx, swp); + break; + case COLL_OPCODE_FLT_SUM_NOFTZ_RND0: + 
case COLL_OPCODE_FLT_SUM_NOFTZ_RND1: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND2: + case COLL_OPCODE_FLT_SUM_NOFTZ_RND3: + /* Rosetta opcode has been chosen according to the current + * rounding mode for this application, so all we need to do is + * add the numbers. + */ + for (i = 0; i < 4; i++) { + /* NOTE: arithmetic operations will quiesce snan */ + accum->fltval.fval[i] += coll_data->fltval.fval[i]; + + if (!exact(accum->fltval.fval[i], + coll_data->fltval.fval[i])) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_FLT_INEXACT); + if (isinf(accum->fltval.fval[i])) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_FLT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_SUM_FTZ_RND0: + case COLL_OPCODE_FLT_SUM_FTZ_RND1: + case COLL_OPCODE_FLT_SUM_FTZ_RND2: + case COLL_OPCODE_FLT_SUM_FTZ_RND3: + /* Rosetta opcode has been chosen according to the current + * rounding mode for this application, so all we need to do is + * add the numbers. + */ + for (i = 0; i < 4; i++) { + /* NOTE: arithmetic operations will quiesce snan */ + accum->fltval.fval[i] += coll_data->fltval.fval[i]; + + if (!exact(accum->fltval.fval[i], + coll_data->fltval.fval[i])) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_FLT_INEXACT); + if (isinf(accum->fltval.fval[i])) + SET_RED_RC(accum->red_rc, + CXIP_COLL_RC_FLT_OVERFLOW); + } + break; + case COLL_OPCODE_FLT_REPSUM: + cxip_rep_add(&accum->repsum, &coll_data->repsum); + break; + } +} + +/**************************************************************************** + * Reduction packet management. + */ + +/** + * Prevent setting the ARM bit on a root packet. + * + * This is used in testing to suppress Rosetta collective operations, forcing + * all leaf packets to arrive at the root, creating an incast. + */ +int cxip_coll_arm_disable(struct fid_mc *mc, bool disable) +{ + struct cxip_coll_mc *mc_obj = (struct cxip_coll_mc *)mc; + int old = mc_obj->arm_disable; + + mc_obj->arm_disable = disable; + + return old; +} + +/** + * Limit the reduction ID values. + * + * Reduction ID values do round-robin over an adjustable range of values. This + * is useful in testing to force all reductions to use reduction id zero (set + * max_red_id to 1), but could be used in production to use only a subset of + * reduction IDs to limit fabric resource exhaustion when concurrent reductions + * are used. 
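+ *
+ * For example, a test harness can force every reduction onto red_id 0
+ * ('mc' being the fid_mc produced by the collective join):
+ *
+ *   cxip_coll_limit_red_id(mc, 1);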
+ */ +void cxip_coll_limit_red_id(struct fid_mc *mc, int max_red_id) +{ + struct cxip_coll_mc *mc_obj = (struct cxip_coll_mc *)mc; + + if (max_red_id < 1) + max_red_id = 1; + if (max_red_id > CXIP_COLL_MAX_CONCUR) + max_red_id = CXIP_COLL_MAX_CONCUR; + mc_obj->max_red_id = max_red_id; +} + +/* drop the next packet sent */ +void cxip_coll_drop_send(struct cxip_coll_reduction *reduction) +{ + reduction->drop_send = true; +} + +/* drop the next packet received */ +void cxip_coll_drop_recv(struct cxip_coll_reduction *reduction) +{ + reduction->drop_recv = true; +} + +/* Return true if this node is the hwroot node */ +static inline +bool is_hw_root(struct cxip_coll_mc *mc_obj) +{ + return (mc_obj->hwroot_idx == mc_obj->mynode_idx); +} + +/* Simulated unicast send of multiple packets as root node to leaf nodes */ +static inline +ssize_t _send_pkt_as_root(struct cxip_coll_reduction *reduction, bool retry) +{ + int i, ret, err; + + err = 0; + for (i = 0; i < reduction->mc_obj->av_set_obj->fi_addr_cnt; i++) { + if (i == reduction->mc_obj->mynode_idx && + reduction->mc_obj->av_set_obj->fi_addr_cnt > 1) { + TRACE_DEBUG("root: skip=%d\n", i); + continue; + } + ret = cxip_coll_send(reduction, i, + reduction->tx_msg, + sizeof(struct red_pkt), + reduction->mc_obj->reduction_md); + TRACE_DEBUG("root: send=%d ret=%d\n", i, ret); + if (!err) + err = ret; + } + return err; +} + +/* Simulated unicast send of single packet as leaf node to root node */ +static inline +ssize_t _send_pkt_as_leaf(struct cxip_coll_reduction *reduction, bool retry) +{ + int ret; + + ret = cxip_coll_send(reduction, reduction->mc_obj->hwroot_idx, + reduction->tx_msg, sizeof(struct red_pkt), + reduction->mc_obj->reduction_md); + TRACE_DEBUG("leaf: send=%d ret=%d\n", 1, ret); + return ret; +} + +/* Multicast send of single packet from root or leaf node */ +static inline +ssize_t _send_pkt_mc(struct cxip_coll_reduction *reduction, bool retry) +{ + return cxip_coll_send(reduction, 0, + reduction->tx_msg, + sizeof(struct red_pkt), + reduction->mc_obj->reduction_md); +} + +/* Send packet from root or leaf node as appropriate */ +static inline +ssize_t _send_pkt(struct cxip_coll_reduction *reduction, bool retry) +{ + int ret; + + if (reduction->mc_obj->av_set_obj->comm_key.keytype == + COMM_KEY_MULTICAST) { + ret = _send_pkt_mc(reduction, retry); + } else if (is_hw_root(reduction->mc_obj)) { + ret = _send_pkt_as_root(reduction, retry); + } else { + ret = _send_pkt_as_leaf(reduction, retry); + } + return ret; +} + +/* prepare and issue the reduction packet */ +int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, + const struct cxip_coll_data *coll_data, + bool arm, bool retry) +{ + struct red_pkt *pkt; + int ret; + + pkt = (struct red_pkt *)reduction->tx_msg; + + memset(&pkt->hdr, 0, sizeof(pkt->hdr)); + pkt->hdr.arm = arm; + pkt->hdr.seqno = reduction->seqno; + pkt->hdr.resno = reduction->resno; + pkt->hdr.cookie.mcast_id = reduction->mc_obj->mcast_addr; + pkt->hdr.cookie.red_id = reduction->red_id; + pkt->hdr.cookie.retry = retry; + pkt->hdr.cookie.magic = MAGIC; + + if (coll_data) { + pkt->hdr.redcnt = coll_data->red_cnt; + pkt->hdr.op = coll_data->red_op; + pkt->hdr.red_rc = coll_data->red_rc; + /* repsum has some additional information that must be set */ + if (_opcode_to_redtype(coll_data->red_op) == REDTYPE_REPSUM) { + pkt->hdr.repsum_m = coll_data->repsum.M; + pkt->hdr.repsum_ovflid = coll_data->repsum.overflow_id; + } + memcpy(pkt->data, &coll_data->databuf, CXIP_COLL_MAX_DATA_SIZE); + } else { + pkt->hdr.redcnt = 0; + 
pkt->hdr.op = 0; + pkt->hdr.red_rc = 0; + pkt->hdr.repsum_m = 0; + pkt->hdr.repsum_ovflid = 0; + memset(pkt->data, 0, CXIP_COLL_MAX_DATA_SIZE); + } + _dump_red_pkt(pkt, "send"); + _swappkt(pkt); + + /* -FI_EAGAIN means HW queue is full, should self-clear */ + do { + ret = _send_pkt(reduction, retry); + } while (ret == -FI_EAGAIN); + /* any other error is a serious config/hardware issue */ + if (ret) + CXIP_WARN("Fatal send error = %d\n", ret); + + return ret; +} + +/* Post a reduction completion request to the collective completion queue */ +static void _post_coll_complete(struct cxip_coll_reduction *reduction) +{ + struct cxip_req *req; + int ret; + + /* Indicates collective completion by writing to the endpoint TX CQ */ + req = reduction->op_inject_req; + if (!req) + return; + + if (reduction->accum.red_rc == CXIP_COLL_RC_SUCCESS) { + ret = cxip_cq_req_complete(req); + } else { + ret = cxip_cq_req_error(req, 0, + _cxip_rc_to_cxi_rc[reduction->accum.red_rc], + reduction->accum.red_rc, NULL, 0, FI_ADDR_UNSPEC); + } + if (ret) { + /* Is this possible? The only error is -FI_ENOMEM. It looks like + * send is blocked with -FI_EAGAIN until we are guaranteed EQ + * space in the queue. Display and ignore. + */ + CXIP_WARN("Attempt to post completion failed %s\n", + fi_strerror(-ret)); + } + + /* req structure no longer needed */ + cxip_evtq_req_free(req); + + /* restore reduction object to usable state */ + reduction->accum.initialized = false; + reduction->in_use = false; + reduction->completed = false; + reduction->pktsent = false; + reduction->accum.initialized = false; + reduction->accum.red_rc = CXIP_COLL_RC_SUCCESS; + reduction->op_inject_req = NULL; +} + +/* unpack reduction data from a reduction packet */ +static void _unpack_red_data(struct cxip_coll_data *coll_data, + const struct red_pkt *pkt) +{ + memcpy(coll_data->databuf, pkt->data, 32); + coll_data->repsum.M = pkt->hdr.repsum_m; + coll_data->repsum.overflow_id = pkt->hdr.repsum_ovflid; + coll_data->red_op = pkt->hdr.op; + coll_data->red_cnt = pkt->hdr.redcnt; + coll_data->red_rc = pkt->hdr.red_rc; + coll_data->initialized = true; +} + +/**************************************************************************** + * Collective State Machine + * + * The basic flow is: + * - all nodes reach a common reduction call (at different times) + * - leaf nodes send their data, to be reduced, and block, polling CQ + * - root node prepares for the reduction, and blocks, polling CQ + * - root node receives leaf packets and reduces them, until all received + * - root node sends Arm Packet with final result, and unblocks + * - leaf nodes receive Arm Packet with final result, and unblock + * + * The Rosetta acceleration comes from the Arm Packet, which speculatively arms + * the Rosetta tree for the NEXT operation. This persists until a timeout + * expires. The timeout is specified when the multicast tree is created by the + * Rosetta configuration service, and cannot be modified after join is complete. + * + * If the next collective operation occurs within the timeout, the leaf results + * will be reduced in reduction engines by Rosetta as they move up the tree, + * reducing the number of packets received by the root. + * + * If the reduction engine times out with partial results, it forwards the + * partial results, and all subsequent results are passed directly to the next + * Rosetta. + * + * The first leaf contribution to reach a reduction engine establishes the + * reduction operation. 
+ * reduction operation. All subsequent contributions must use the same
+ * operation, or Rosetta returns an error.
+ *
+ * There are eight reduction_id values, which can be used to acquire and use up
+ * to eight independent reduction engines (REs) at each upstream port of each
+ * Rosetta switch in the collective tree.
+ *
+ * We use a round-robin selection of reduction id values. There is a small race
+ * condition among the leaf nodes as the result is distributed from the root. If
+ * another reduction were to be initiated during this race, the leaf nodes would
+ * be in disagreement as to which reduction IDs were free for the new reduction.
+ * To avoid this, we use a deterministic algorithm (round-robin) so that the
+ * "next" reduction id is always predetermined for each reduction.
+ *
+ * Ordering of requests and responses will be the same on all nodes.
+ *
+ * Consistent ordering of requests is the responsibility of the application. If
+ * requests are ordered differently on different nodes, results are undefined,
+ * and it is considered an application error.
+ *
+ * Ordering of responses is guaranteed by the mc_obj->tail_red_id value, which
+ * is advanced after the reduction completes. This ordering is required to
+ * ensure that the round-robin is observed.
+ */
+
+/* modular increment/decrement */
+#define INCMOD(val, mod) do {(val)=((val)+1)%(mod);} while (0)
+#define DECMOD(val, mod) do {(val)=((val)+(mod)-1)%(mod);} while (0)
+
+/* MONOTONIC timestamp operations for timeouts/retries */
+static inline
+void _tsget(struct timespec *ts)
+{
+	clock_gettime(CLOCK_MONOTONIC, ts);
+}
+
+static inline
+void _tsadd(struct timespec *ts, const struct timespec *dt)
+{
+	ts->tv_sec += dt->tv_sec;
+	ts->tv_nsec += dt->tv_nsec;
+	if (ts->tv_nsec >= 1000000000L) {
+		ts->tv_sec += 1;
+		ts->tv_nsec -= 1000000000L;
+	}
+}
+
+/* Set a timespec at expiration time (future) */
+static inline
+void _tsset(struct cxip_coll_reduction *reduction)
+{
+	_tsget(&reduction->tv_expires);
+	_tsadd(&reduction->tv_expires, &reduction->mc_obj->timeout);
+}
+
+/* Used to prevent first-use incast */
+static inline
+bool _is_red_first_time(struct cxip_coll_reduction *reduction)
+{
+	return (reduction->tv_expires.tv_sec == 0L &&
+		reduction->tv_expires.tv_nsec == 0L);
+}
+
+/* Used to reduce incast congestion during run */
+static inline
+bool _is_red_timed_out(struct cxip_coll_reduction *reduction)
+{
+	struct timespec tsnow;
+
+	if (_is_red_first_time(reduction)) {
+		TRACE_DEBUG("=== root first time, retry\n");
+		return true;
+	}
+	_tsget(&tsnow);
+	if (tsnow.tv_sec < reduction->tv_expires.tv_sec)
+		return false;
+	if (tsnow.tv_sec == reduction->tv_expires.tv_sec &&
+	    tsnow.tv_nsec < reduction->tv_expires.tv_nsec)
+		return false;
+	TRACE_DEBUG("=== root timeout, retry\n");
+	return true;
+}
+
+/* Root node state machine progress.
+ * !pkt means this is progressing from injection call (e.g.
fi_reduce()) + * pkt means this is progressing from event callback (leaf packet) + */ +static void _progress_root(struct cxip_coll_reduction *reduction, + struct red_pkt *pkt) +{ + struct cxip_coll_mc *mc_obj = reduction->mc_obj; + struct cxip_coll_data coll_data; + ssize_t ret; + + /* State machine disabled for testing */ + if (reduction->coll_state != CXIP_COLL_STATE_READY) + return; + + /* Injection or packet arrival after root timeout initiates a retry */ + if (_is_red_timed_out(reduction)) { + /* reset reduction for retry send */ + reduction->seqno = mc_obj->seqno; + INCMOD(mc_obj->seqno, CXIP_COLL_MAX_SEQNO); + ofi_atomic_inc32(&mc_obj->tmout_cnt); + + ret = cxip_coll_send_red_pkt(reduction, NULL, + !mc_obj->arm_disable, true); + _tsset(reduction); + if (ret) { + SET_RED_RC(reduction->accum.red_rc, + CXIP_COLL_RC_TX_FAILURE); + reduction->completed = true; + goto post_complete; + } + return; + } + + /* Process received packet */ + if (pkt) { + /* Root has received a leaf packet */ + _dump_red_pkt(pkt, "Rrcv"); + + /* Drop out-of-date packets */ + if (pkt->hdr.resno != reduction->seqno) { + TRACE_DEBUG("bad seqno, exp=%d saw=%d\n", + reduction->seqno, pkt->hdr.resno); + ofi_atomic_inc32(&mc_obj->seq_err_cnt); + return; + } + + /* capture and reduce packet information */ + _unpack_red_data(&coll_data, pkt); + _reduce(&reduction->accum, &coll_data, false); + _dump_coll_data("after leaf contrib to root", &reduction->accum); + } + + /* check for reduction complete */ + if (reduction->accum.red_cnt == mc_obj->av_set_obj->fi_addr_cnt) { + /* copy reduction result to user result buffer */ + if (reduction->op_rslt_data && reduction->op_data_bytcnt) { + memcpy(reduction->op_rslt_data, + reduction->accum.databuf, + reduction->op_data_bytcnt); + } + + /* send reduction result to leaves, arm new seqno */ + reduction->seqno = mc_obj->seqno; + INCMOD(mc_obj->seqno, CXIP_COLL_MAX_SEQNO); + reduction->completed = true; + + ret = cxip_coll_send_red_pkt(reduction, &reduction->accum, + !mc_obj->arm_disable, false); + _tsset(reduction); + if (ret) + SET_RED_RC(reduction->accum.red_rc, + CXIP_COLL_RC_TX_FAILURE); + } + +post_complete: + /* Post completions in injection order */ + reduction = &mc_obj->reduction[mc_obj->tail_red_id]; + while (reduction->in_use && reduction->completed) { + /* Reduction completed on root */ + _post_coll_complete(reduction); + + /* Advance to the next reduction */ + INCMOD(mc_obj->tail_red_id, mc_obj->max_red_id); + reduction = &mc_obj->reduction[mc_obj->tail_red_id]; + } +} + +/* Leaf node state machine progress. + * !pkt means this is progressing from injection call (e.g. 
fi_reduce()) + * pkt means this is progressing from event callback (receipt of packet) + */ +static void _progress_leaf(struct cxip_coll_reduction *reduction, + struct red_pkt *pkt) +{ + struct cxip_coll_mc *mc_obj = reduction->mc_obj; + struct cxip_coll_data coll_data; + int ret; + + /* state machine disabled for testing */ + if (reduction->coll_state != CXIP_COLL_STATE_READY) + return; + + /* if reduction packet, reset timer, seqno, honor retry */ + if (pkt) { + _dump_red_pkt(pkt, "Lrcv"); + _tsset(reduction); + reduction->seqno = pkt->hdr.seqno; + reduction->resno = pkt->hdr.seqno; + if (pkt->hdr.cookie.retry) + reduction->pktsent = false; + } + + /* leaves lead with sending a packet */ + if (!reduction->pktsent) { + /* Avoid first-use incast, retry guaranteed */ + if (_is_red_first_time(reduction)) { + TRACE_DEBUG("=== leaf first time, wait\n"); + return; + } + + /* Don't send if nothing to send yet */ + if (!reduction->accum.initialized) + return; + + /* Send leaf data */ + ret = cxip_coll_send_red_pkt(reduction, &reduction->accum, + false, false); + if (ret) { + SET_RED_RC(reduction->accum.red_rc, + CXIP_COLL_RC_TX_FAILURE); + reduction->completed = true; + goto post_complete; + } + reduction->pktsent = true; + } + + /* If no incoming reduction packet, we are done */ + if (!pkt) + return; + + /* If packet has no reduction count (retry), done */ + if (!pkt->hdr.redcnt) + return; + + /* Capture final reduction data in user-pointer */ + SET_RED_RC(reduction->accum.red_rc, pkt->hdr.red_rc); + if (reduction->op_rslt_data) { + _unpack_red_data(&coll_data, pkt); + memcpy(reduction->op_rslt_data, + &coll_data.databuf, + reduction->op_data_bytcnt); + } + /* Reduction completed on leaf */ + reduction->completed = true; + +post_complete: + /* Post completions in injection order */ + reduction = &mc_obj->reduction[mc_obj->tail_red_id]; + while (reduction->in_use && reduction->completed) { + _post_coll_complete(reduction); + INCMOD(mc_obj->tail_red_id, mc_obj->max_red_id); + reduction = &mc_obj->reduction[mc_obj->tail_red_id]; + } +} + +/* Root or leaf progress state machine. + */ +static void _progress_coll(struct cxip_coll_reduction *reduction, + struct red_pkt *pkt) +{ + if (is_hw_root(reduction->mc_obj)) + _progress_root(reduction, pkt); + else + _progress_leaf(reduction, pkt); +} + +/* Debugging only */ +static int *_injected_red_id_buf; +void cxip_capture_red_id(int *red_id_buf) +{ + _injected_red_id_buf = red_id_buf; +} + +/* Generic collective pre-reduction into cxip_coll_data structure */ +static void +_cxip_coll_prereduce(int cxi_opcode, const void *op_send_data, + void *accum, size_t sendcnt, uint64_t flags) +{ + const struct cxip_coll_data *coll_data_ptr; + struct cxip_coll_data coll_data; + + /* Convert user data to local coll_data structure */ + if (flags & FI_CXI_PRE_REDUCED) { + coll_data_ptr = op_send_data; + } else { + _init_coll_data(&coll_data, cxi_opcode, op_send_data, + sendcnt); + coll_data_ptr = &coll_data; + } + _dump_coll_data("coll_data initialized pre", coll_data_ptr); + + /* pre-reduce data into accumulator */ + _reduce(accum, coll_data_ptr, true); +} + +/* Generic collective injection into fabric. + * + * Reduction ID is normally hidden. Can be exposed by calling _capture_red_id() + * just before calling a reduction operation. + * + * - Acquires next available reduction structure in MC, or returns -FI_EAGAIN. + * - Acquires evtq request, or return -FI_EAGAIN. + * - Marks reduction structure in-use. + * - Advances next available reduction pointer. 
+ * - Initializes: + * - result data pointer + * - source data (pre-reduced or raw) + * - data byte count + * - Reduces user data into reduction accumulator (may already contain data) + * - Progresses reduction (no packet supplied) + */ +static ssize_t +_cxip_coll_inject(struct cxip_coll_mc *mc_obj, int cxi_opcode, + const void *op_send_data, void *op_rslt_data, + size_t bytcnt, uint64_t flags, void *context) +{ + struct cxip_coll_reduction *reduction; + struct cxip_coll_data coll_data; + struct cxip_req *req; + int ret; + + TRACE_DEBUG("%s entry\n", __func__); + TRACE_DEBUG("%s bytecnt=%ld\n", __func__, bytcnt); + ofi_genlock_lock(&mc_obj->ep_obj->lock); + + /* must observe strict round-robin across all nodes */ + reduction = &mc_obj->reduction[mc_obj->next_red_id]; + if (reduction->in_use) { + ret = -FI_EAGAIN; + goto quit; + } + + /* acquire a request structure */ + req = cxip_evtq_req_alloc(mc_obj->ep_obj->coll.tx_evtq, 1, NULL); + if (!req) { + ret = -FI_EAGAIN; + goto quit; + } + + /* Used for debugging */ + if (_injected_red_id_buf) { + *_injected_red_id_buf = reduction->red_id; + _injected_red_id_buf = NULL; + } + + /* advance next_red_id, reserving this one for us */ + INCMOD(mc_obj->next_red_id, mc_obj->max_red_id); + reduction->in_use = true; + + /* Set up the reduction structure */ + reduction->op_rslt_data = op_rslt_data; + reduction->op_data_bytcnt = bytcnt; + reduction->op_context = context; + reduction->op_inject_req = req; + reduction->op_inject_req->context = (uint64_t)context; + + /* Convert user data to local coll_data structure */ + if (flags & FI_CXI_PRE_REDUCED) + memcpy(&coll_data, op_send_data, sizeof(coll_data)); + else + _init_coll_data(&coll_data, cxi_opcode, op_send_data, bytcnt); + + /* reduce data into accumulator */ + _reduce(&reduction->accum, &coll_data, false); + _dump_coll_data("coll_data initialized inj", &coll_data); + + /* Progress the collective */ + _progress_coll(reduction, NULL); + ret = FI_SUCCESS; + +quit: + ofi_genlock_unlock(&mc_obj->ep_obj->lock); + TRACE_DEBUG("%s return %d\n", __func__, ret); + return ret; +} + +/* Get the mc_obj from ep/coll_addr and check for consistency */ +static inline +ssize_t _get_mc_obj(struct fid_ep *ep, fi_addr_t coll_addr, + struct cxip_coll_mc **mc_obj) +{ + struct cxip_ep *cxi_ep; + + if (!ep) { + CXIP_WARN("Collective requires ep\n"); + return -FI_EINVAL; + } + + if (!coll_addr) { + CXIP_WARN("Collective requires coll_addr\n"); + return -FI_EINVAL; + } + + cxi_ep = container_of(ep, struct cxip_ep, ep.fid); + *mc_obj = (struct cxip_coll_mc *)((uintptr_t)coll_addr); + + if ((*mc_obj)->ep_obj != cxi_ep->ep_obj) { + CXIP_WARN("Multicast does not belong to ep\n"); + return -FI_EINVAL; + } + + if (!(*mc_obj)->is_joined) { + CXIP_WARN("Multicast collective not joined\n"); + return -FI_EOPBADSTATE; + } + + return FI_SUCCESS; +} + +/* get payload byte count and check for consistency */ +static inline +ssize_t _get_bytcnt(int cxi_opcode, enum fi_datatype datatype, + const void *buf, size_t count) +{ + ssize_t bytcnt; + + if (cxi_opcode < 0) { + CXIP_WARN("opcode not supported\n"); + return -FI_EINVAL; + } + + if (!buf || count <= 0L) { + CXIP_WARN("buffer required\n"); + return -FI_EINVAL; + } + + bytcnt = _get_cxi_data_bytcnt(cxi_opcode, datatype, count); + if (bytcnt < 0) + CXIP_WARN("opcode does not support datatype\n"); + + return bytcnt; +} + +ssize_t cxip_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context) +{ + struct cxip_coll_mc *mc_obj; + int cxi_opcode; + ssize_t ret; + + /* barrier requires mc_obj 
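+	 * (coll_addr is the fi_addr published in the fid_mc returned by
+	 * fi_join_collective(); a caller would pass mc->fi_addr, which
+	 * _get_mc_obj() converts back to the cxip_coll_mc pointer set up
+	 * in _initialize_mc())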
*/ + ret = _get_mc_obj(ep, coll_addr, &mc_obj); + if (ret) + return ret; + + cxi_opcode = COLL_OPCODE_BARRIER; + + return _cxip_coll_inject(mc_obj, cxi_opcode, NULL, NULL, 0, 0, context); +} + +ssize_t cxip_broadcast(struct fid_ep *ep, void *buf, size_t count, + void *desc, fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, + void *context) +{ + struct cxip_coll_mc *mc_obj; + int cxi_opcode, bytcnt; + ssize_t ret; + + if (flags & (FI_MORE|FI_CXI_PRE_REDUCED)) { + CXIP_WARN("Illegal flags for broadcast\n"); + return -FI_EINVAL; + } + + cxi_opcode = COLL_OPCODE_BIT_OR; + bytcnt = _get_bytcnt(cxi_opcode, datatype, buf, count); + if (bytcnt < 0) + return -FI_EINVAL; + + /* broadcast requires mc_obj */ + ret = _get_mc_obj(ep, coll_addr, &mc_obj); + if (ret) + return ret; + + /* only root node contributes data, others contribute 0 */ + if (root_addr != mc_obj->mynode_fiaddr) + memset(buf, 0, bytcnt); + + /* buf serves as source and result */ + return _cxip_coll_inject(mc_obj, cxi_opcode, buf, buf, bytcnt, + flags, context); +} + +ssize_t cxip_reduce(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, enum fi_op op, uint64_t flags, + void *context) +{ + struct cxip_coll_mc *mc_obj; + int cxi_opcode; + ssize_t bytcnt, ret; + + TRACE_DEBUG("%s entry\n", __func__); + cxi_opcode = cxip_fi2cxi_opcode(op, datatype); + bytcnt = _get_bytcnt(cxi_opcode, datatype, buf, count); + if (bytcnt < 0) + return (ssize_t)bytcnt; + + /* FI_MORE requires result buffer, succeeds immediately */ + if (flags & FI_MORE) { + if (!result) { + CXIP_WARN("result required with FI_MORE\n"); + return -FI_EINVAL; + } + _cxip_coll_prereduce(cxi_opcode, buf, result, bytcnt, flags); + return FI_SUCCESS; + } + + /* otherwise reduce requires mc_obj */ + ret = _get_mc_obj(ep, coll_addr, &mc_obj); + if (ret) + return ret; + + /* root requires a result buffer */ + if (!result && (mc_obj->mynode_fiaddr == root_addr)) { + CXIP_WARN("reduce root result required\n"); + return -FI_EINVAL; + } + + return _cxip_coll_inject(mc_obj, cxi_opcode, buf, result, bytcnt, + flags, context); +} + +ssize_t cxip_allreduce(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, enum fi_datatype datatype, + enum fi_op op, uint64_t flags, void *context) +{ + struct cxip_coll_mc *mc_obj; + int cxi_opcode, bytcnt; + ssize_t ret; + + TRACE_DEBUG("%s entry\n", __func__); + cxi_opcode = cxip_fi2cxi_opcode(op, datatype); + TRACE_DEBUG("%s cxi_opcode = %d\n", __func__, cxi_opcode); + bytcnt = _get_bytcnt(cxi_opcode, datatype, buf, count); + TRACE_DEBUG("%s bytcnt = %d\n", __func__, bytcnt); + if (bytcnt < 0) + return bytcnt; + + /* result required in all cases */ + if (!result) { + CXIP_WARN("result required with FI_MORE\n"); + return -FI_EINVAL; + } + + /* FI_MORE succeeds immediately */ + if (flags & FI_MORE) { + _cxip_coll_prereduce(cxi_opcode, buf, result, bytcnt, flags); + return FI_SUCCESS; + } + + /* otherwise reduce requires mc_obj */ + ret = _get_mc_obj(ep, coll_addr, &mc_obj); + if (ret) + return ret; + + return _cxip_coll_inject(mc_obj, cxi_opcode, buf, result, bytcnt, + flags, context); +} + +/**************************************************************************** + * JOIN COLLECTIVE STATE MACHINE + */ + +/* Packed structure to fit information into zbcoll broadcast payload */ +union pack_mcast { + uint64_t uint64; + struct { + uint64_t 
mcast_addr: 16;// maximum anticipated multicast + uint64_t hwroot_idx: 27;// 128M endpoints in tree + uint64_t valid: 1; // success flag + uint64_t pad: 20; // needed by zbcoll + } __attribute__((__packed__)); + struct { + uint64_t error_bits: 43;// up to 43 independent errors + uint64_t valid1: 1; // unused/reserved + uint64_t pad1: 20; // unused/reserved + + } __attribute__((__packed__)); +}; + +/* State structure for carrying data through the join sequence */ +struct cxip_join_state { + struct cxip_ep_obj *ep_obj; // ep object + struct cxip_av_set *av_set_obj; // av set for this collective + struct cxip_coll_mc *mc_obj; // mc object for this collective + struct cxip_zbcoll_obj *zb; // zb object associated with state + struct fid_mc **mc; // user pointer to return mc_obj + void *context; // user context for concurrent joins + uint64_t join_flags; // user-supplied libfabric join flags + union pack_mcast bcast_data; // packed multicast data + bool rx_discard; // set if RX events should be discarded + bool is_rank; // set if using COLL_RANK simulation model + bool is_mcast; // set if using Rosetta multicast tree + bool create_mcast; // set to create Rosetta multicast tree + bool creating_mcast; // set once CURL has been initiated + bool finished_mcast; // set once CURL has been completed + bool created_ptlte; // set once PtlTE is initialized + int mynode_idx; // index within the fi_addr[] list + int mynode_fiaddr; // fi_addr of this node + int simrank; // simulated rank of NIC + int pid_idx; // pid_idx used by ptl_te + int prov_errno; // collective provider error + int sched_state; // scheduled operation + int join_idx; // unique join index for diagnostics + struct dlist_entry sched_link; // link to scheduled actions +}; + +/* State structure for recovering data from CURL response */ +struct cxip_curl_mcast_usrptr { + struct cxip_join_state *jstate; // join state + int mcast_id; // multicast address + int hwroot_rank; // hardware root index +}; + +/* pack provider errors into AND bitmask - address data */ +void _proverr_to_bits(struct cxip_join_state *jstate) +{ + int bitno; + + /* record error as a bit for this endpoint */ + jstate->bcast_data.error_bits = 0L; + if (!jstate->bcast_data.valid) { + bitno = -jstate->prov_errno; + jstate->bcast_data.error_bits |= (1L << bitno); + } + /* invert bits, zbcoll reduce does AND */ + jstate->bcast_data.error_bits ^= -1L; +} + +/* unpack AND bitmask into dominant provider error */ +void _bits_to_proverr(struct cxip_join_state *jstate) +{ + int bitno; + + /* zbcoll reduce does AND, invert bits */ + jstate->bcast_data.error_bits ^= -1L; + + /* if data is valid, bits do not represent errors */ + if (jstate->bcast_data.valid) { + jstate->prov_errno = CXIP_PROV_ERRNO_OK; + return; + } + + /* bits set represent multiple errors from endpoints */ + for (bitno = -CXIP_PROV_ERRNO_OK; bitno < -CXIP_PROV_ERRNO_LAST; bitno++) { + if (jstate->bcast_data.error_bits & (1 << bitno)) { + jstate->prov_errno = -bitno; + CXIP_WARN("join error %d seen\n", jstate->prov_errno); + } + } + /* returns most significant of multiple errors as jstate->prov_errno */ +} + +/* Close collective pte object - ep_obj->lock must be held */ +static void _close_pte(struct cxip_coll_pte *coll_pte) +{ + int ret; + + if (!coll_pte) + return; + do { + ret = _coll_pte_disable(coll_pte); + } while (ret == -FI_EAGAIN); + _coll_destroy_buffers(coll_pte); + cxip_pte_free(coll_pte->pte); + free(coll_pte); +} + +/* pid_idx == CXIP_PTL_IDX_COLL+rank for NETSIM + * pid_idx == CXIP_PTL_IDX_COLL for all 
other cases + */ +static int _acquire_pte(struct cxip_ep_obj *ep_obj, int pid_idx, + bool is_mcast, struct cxip_coll_pte **coll_pte_ret) +{ + struct cxi_pt_alloc_opts pt_opts = { + .use_long_event = 1, + .do_space_check = 1, + .en_restricted_unicast_lm = 1, + }; + struct cxip_coll_pte *coll_pte; + int ret; + + *coll_pte_ret = NULL; + coll_pte = calloc(1, sizeof(*coll_pte)); + if (!coll_pte) + return -FI_ENOMEM; + + /* initialize coll_pte */ + coll_pte->ep_obj = ep_obj; + dlist_init(&coll_pte->buf_list); + ofi_atomic_initialize32(&coll_pte->buf_cnt, 0); + ofi_atomic_initialize32(&coll_pte->buf_swap_cnt, 0); + ofi_atomic_initialize32(&coll_pte->recv_cnt, 0); + + /* bind PTE to domain */ + ret = cxip_pte_alloc(ep_obj->ptable, ep_obj->coll.rx_evtq->eq, + pid_idx, is_mcast, &pt_opts, _coll_pte_cb, + coll_pte, &coll_pte->pte); + if (ret) + goto fail; + + /* enable the PTE */ + ret = _coll_pte_enable(coll_pte, CXIP_PTE_IGNORE_DROPS); + if (ret) + goto fail; + + /* add buffers to the PTE */ + ret = _coll_add_buffers(coll_pte, + ep_obj->coll.buffer_size, + ep_obj->coll.buffer_count); + if (ret) + goto fail; + + *coll_pte_ret = coll_pte; + return FI_SUCCESS; + +fail: + _close_pte(coll_pte); + return ret; +} + +/* Close multicast collective object */ +static void _close_mc(struct cxip_coll_mc *mc_obj) +{ + int count; + + if (!mc_obj) + return; + /* clear the mcast_addr -> mc_obj reference*/ + ofi_idm_clear(&mc_obj->ep_obj->coll.mcast_map, mc_obj->mcast_addr); + mc_obj->ep_obj->coll.is_hwroot = false; + + /* clear the avset alteration lockout */ + mc_obj->av_set_obj->mc_obj = NULL; + + /* unmap the reduction mem descriptor for DMA */ + if (mc_obj->reduction_md) + cxil_unmap(mc_obj->reduction_md); + + /* close any PTE associated with mc_obj (NETSIM) */ + if (mc_obj->coll_pte != mc_obj->ep_obj->coll.coll_pte) + _close_pte(mc_obj->coll_pte); + + /* decrement multicast count (real), close PTE if unused */ + count = ofi_atomic_dec32(&mc_obj->ep_obj->coll.num_mc); + count = ofi_atomic_get32(&mc_obj->ep_obj->coll.num_mc); + if (!count && mc_obj->ep_obj->coll.coll_pte) { + _close_pte(mc_obj->ep_obj->coll.coll_pte); + mc_obj->ep_obj->coll.coll_pte = NULL; + } + free(mc_obj); +} + +static int _fi_close_mc(struct fid *fid) +{ + struct cxip_coll_mc *mc_obj; + + mc_obj = container_of(fid, struct cxip_coll_mc, mc_fid.fid); + _close_mc(mc_obj); + return FI_SUCCESS; +} + +/* multicast object operational functions */ +static struct fi_ops mc_ops = { + .size = sizeof(struct fi_ops), + .close = _fi_close_mc, +}; + +/** + * Utility routine to set up the collective framework in response to calls to + * fi_join_collective(). + * + * If jstate->is_rank is true, this is a NETSIM model, which opens a PTE for + * each call to fi_join_collective() that is bound to the multicast object + * created by that call. This allows simulated multicast traffic through the + * NETSIM loopback port by using different pte_idx values for each PTE to + * disambiguate traffic intended for different simulated hardware endpoints. + * This model does not support multiple MC objects at an endpoint: there is + * exactly one MC address. Progressing the single endpoint will progress all + * of the simulated MC objects. Extending this model to support multiple MC + * objects is not a priority at this time. + * + * If jstate->is_rank is false, this is a multinode model. The first call to + * fi_join_collective() creates a single PTE which is bound to the EP, and + * creates the first multicast object for that endpoint. 
Every subsequent + * join will create an additional multicast object that shares the PTE for + * that endpoint. Multiple NICs on the node are represented by separate EP + * objects, which are functionally distinct: all endpoints must be progressed + * independently, and if any endpoint is not progressed, it will stall the + * collective. + * + * Caller must hold ep_obj->lock. + */ +static int _initialize_mc(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_ep_obj *ep_obj = jstate->ep_obj; + struct cxip_av_set *av_set_obj = jstate->av_set_obj; + struct cxip_coll_mc *mc_obj; + struct cxip_coll_pte *coll_pte; + struct cxip_cmdq *cmdq; + int red_id; + int ret; + + TRACE_JOIN("%s entry\n", __func__); + + mc_obj = calloc(1, sizeof(*mc_obj)); + if (!mc_obj) + return -FI_ENOMEM; + + /* COMM_KEY_RANK model needs a distinct PTE for every MC object. + * All other models share a single PTE for all MCs using an EP. + */ + coll_pte = ep_obj->coll.coll_pte; + if (!coll_pte) { + TRACE_DEBUG("acqiring PTE\n"); + ret = _acquire_pte(ep_obj, jstate->pid_idx, jstate->is_mcast, + &coll_pte); + if (ret) { + TRACE_DEBUG("acquiring PTE failed %d\n", ret); + free(mc_obj); + return ret; + } + if (!jstate->is_rank) { + TRACE_DEBUG("assigned PTE to ep_obj\n"); + ep_obj->coll.coll_pte = coll_pte; + } + /* else leave ep_obj->coll.coll_pte == NULL */ + } + /* copy coll_pte to mc_obj */ + mc_obj->coll_pte = coll_pte; + + /* if COMM_KEY_RANK model, PTE must know the mc_obj */ + coll_pte->mc_obj = (jstate->is_rank) ? mc_obj : NULL; + + /* link ep_obj to mc_obj (1 to many) */ + mc_obj->ep_obj = ep_obj; + ofi_atomic_inc32(&ep_obj->coll.num_mc); + + /* link av_set_obj to mc_obj (one to one) */ + av_set_obj->mc_obj = mc_obj; + mc_obj->av_set_obj = av_set_obj; + + /* initialize remainder of mc_obj */ + mc_obj->mc_fid.fid.fclass = FI_CLASS_MC; + mc_obj->mc_fid.fid.context = mc_obj; + mc_obj->mc_fid.fid.ops = &mc_ops; + mc_obj->mc_fid.fi_addr = (fi_addr_t)(uintptr_t)mc_obj; + mc_obj->hwroot_idx = jstate->bcast_data.hwroot_idx; + mc_obj->mcast_addr = jstate->bcast_data.mcast_addr; + mc_obj->mynode_idx = jstate->mynode_idx; + mc_obj->mynode_fiaddr = jstate->mynode_fiaddr; + mc_obj->max_red_id = CXIP_COLL_MAX_CONCUR; + mc_obj->arm_disable = false; + mc_obj->rx_discard = jstate->rx_discard; + mc_obj->timeout.tv_sec = + cxip_env.coll_retry_usec/1000000L; + mc_obj->timeout.tv_nsec = + (cxip_env.coll_retry_usec%1000000L)*1000L; + for (red_id = 0; red_id < CXIP_COLL_MAX_CONCUR; red_id++) { + struct cxip_coll_reduction *reduction; + + reduction = &mc_obj->reduction[red_id]; + reduction->coll_state = CXIP_COLL_STATE_READY; + reduction->mc_obj = mc_obj; + reduction->red_id = red_id; + reduction->in_use = false; + reduction->completed = false; + } + TRACE_DEBUG("Initializing mc_obj=%p counters\n", mc_obj); + ofi_spin_init(&mc_obj->lock); + ofi_atomic_initialize32(&mc_obj->send_cnt, 0); + ofi_atomic_initialize32(&mc_obj->recv_cnt, 0); + ofi_atomic_initialize32(&mc_obj->pkt_cnt, 0); + ofi_atomic_initialize32(&mc_obj->seq_err_cnt, 0); + ofi_atomic_initialize32(&mc_obj->tmout_cnt, 0); + + /* map entire reduction block if using DMA */ + if (cxip_env.coll_use_dma_put) { + /* EXPERIMENTAL */ + ret = cxil_map(ep_obj->domain->lni->lni, + mc_obj->reduction, + sizeof(mc_obj->reduction), + CXI_MAP_PIN | CXI_MAP_READ | CXI_MAP_WRITE, + NULL, &mc_obj->reduction_md); + if (ret) + goto fail; + } + + /* define the traffic class */ + // TODO revisit for LOW_LATENCY + if (is_netsim(ep_obj)) { + /* NETSIM RANK model */ + mc_obj->tc = 
CXI_TC_BEST_EFFORT; + mc_obj->tc_type = CXI_TC_TYPE_DEFAULT; + } else if (!jstate->is_mcast) { + /* UNICAST model */ + mc_obj->tc = CXI_TC_BEST_EFFORT; + mc_obj->tc_type = CXI_TC_TYPE_DEFAULT; + } else if (is_hw_root(mc_obj)) { + /* MULTICAST model, hw_root */ + mc_obj->tc = CXI_TC_BEST_EFFORT; + mc_obj->tc_type = CXI_TC_TYPE_DEFAULT; + } else { + /* MULTICAST model, leaves */ + mc_obj->tc = CXI_TC_LOW_LATENCY; + mc_obj->tc_type = CXI_TC_TYPE_COLL_LEAF; + } + /* Set this now to instantiate cmdq CP */ + cmdq = ep_obj->coll.tx_cmdq; + ret = cxip_txq_cp_set(cmdq, ep_obj->auth_key.vni, + mc_obj->tc, mc_obj->tc_type); + if (ret) { + TRACE_JOIN("%s: cxip_txq_cp_set() = %d\n", __func__, ret); + goto fail; + } + + /* index mc_obj by mcast_addr for fast lookup */ + TRACE_JOIN("%s: mc addr=%d obj=%p\n", __func__, mc_obj->mcast_addr, mc_obj); + ret = ofi_idm_set(&ep_obj->coll.mcast_map, + mc_obj->mcast_addr, mc_obj); + if (ret < 0) { + TRACE_JOIN("%s: idm set failed %d\n", __func__, ret); + goto fail; + } + /* lock out reuse of this endpoint as hw_root for any multicast addr */ + if (mc_obj->hwroot_idx == mc_obj->mynode_idx) { + TRACE_JOIN("%s: set is_hwroot\n", __func__); + ep_obj->coll.is_hwroot = true; + } +#if ENABLE_DEBUG + struct cxip_coll_mc *mc_obj_chk; + + mc_obj_chk = ofi_idm_lookup(&ep_obj->coll.mcast_map, + mc_obj->mcast_addr); + if (mc_obj_chk != mc_obj) { + TRACE_JOIN("%s: mcast set=%p get=%p\n", + __func__, mc_obj, mc_obj_chk); + } +#endif + /* Last field to set */ + mc_obj->is_joined = true; + + /* Return information to the caller */ + jstate->mc_obj = mc_obj; + *jstate->mc = &mc_obj->mc_fid; + TRACE_JOIN("%s: initialized mc[%d] to %p\n", + __func__, jstate->mynode_idx, *jstate->mc); + + return FI_SUCCESS; + +fail: + _close_mc(mc_obj); + return ret; +} + +/** + * CURL callback function upon completion of a request. + * + * This sets jstate->finished_mcast, even if the operation fails. + * This sets jstate->bcast_data.valid if the address is valid. 
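+ *
+ * Illustrative response body (example values; only the two fields parsed
+ * below are shown, the fabric manager response may carry more):
+ *
+ *   { "mcastID": 1234, "hwRoot": "0:00:2A" }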
+ */
+static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle)
+{
+	struct cxip_curl_mcast_usrptr *curl_usrptr = handle->usrptr;
+	struct cxip_join_state *jstate = curl_usrptr->jstate;
+	struct json_object *json_obj;
+	struct cxip_addr caddr;
+	const char *hwrootstr;
+	int mcaddr, hwroot;
+	uint32_t b2, b1, b0, n;
+	int i, ret;
+
+	/* Creation process is done */
+	TRACE_JOIN("CURL COMPLETED!\n");
+	jstate->finished_mcast = true;
+
+	switch (handle->status) {
+	case 200:
+	case 201:
+		/* CURL succeeded, parse response */
+		TRACE_JOIN("CURL PARSE RESPONSE:\n%s\n", handle->response);
+		if (!(json_obj = json_tokener_parse(handle->response)))
+			break;
+		if (cxip_json_int("mcastID", json_obj, &mcaddr))
+			break;
+		if (cxip_json_string("hwRoot", json_obj, &hwrootstr))
+			break;
+
+		n = sscanf(hwrootstr, "%x:%x:%x", &b2, &b1, &b0);
+		if (n < 3 || b2 > 0xf || b1 > 0xff || b0 > 0xff)
+			break;
+		hwroot = (b2 << 16) + (b1 << 8) + b0;
+
+		TRACE_JOIN("mcastID=%d hwRoot='%s'=%x\n", mcaddr, hwrootstr,
+			   hwroot);
+		for (i = 0; i < jstate->av_set_obj->fi_addr_cnt; i++) {
+			ret = cxip_av_lookup_addr(
+					jstate->av_set_obj->cxi_av,
+					jstate->av_set_obj->fi_addr_ary[i],
+					&caddr);
+			if (ret < 0)
+				continue;
+			TRACE_JOIN("test %d == %d\n", hwroot, caddr.nic);
+			if (hwroot == caddr.nic)
+				break;
+		}
+		TRACE_JOIN("final index=%d\n", i);
+		if (i >= jstate->av_set_obj->fi_addr_cnt) {
+			TRACE_JOIN("multicast HWroot not found in av_set\n");
+			jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INVALID;
+			break;
+		}
+		/* Production MCAST address */
+		jstate->bcast_data.valid = true;
+		jstate->bcast_data.hwroot_idx = i;
+		jstate->bcast_data.mcast_addr = (uint32_t)mcaddr;
+		jstate->is_mcast = true;
+		/* This succeeded */
+		TRACE_JOIN("curl: mcaddr =%08x\n",
+			   jstate->bcast_data.mcast_addr);
+		TRACE_JOIN("curl: hwrootidx=%d\n",
+			   jstate->bcast_data.hwroot_idx);
+		break;
+	default:
+		TRACE_JOIN("ERRMSK SET CURL error %ld!\n", handle->status);
+		if (handle->response)
+			TRACE_JOIN("ERROR RESPONSE:\n%s\n", handle->response);
+		// TODO finer error differentiation from CURL errors
+		jstate->prov_errno = CXIP_PROV_ERRNO_CURL;
+		break;
+	}
+	free(curl_usrptr);
+	TRACE_JOIN("CURL COMPLETED!\n");
+	jstate->finished_mcast = true;
+}
+
+/**
+ * Start a CURL request for a multicast address.
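+ *
+ * The POST body is built from json_fmt below and passed through
+ * single_to_double_quote(); with illustrative values it looks roughly like:
+ *
+ *   {"macs":["0:00:2A","0:00:2B"],"jobID":"<job>","jobStepID":"<step>",
+ *    "timeout":<coll_timeout_usec>}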
+ */ +static void _start_curl(void *ptr) +{ + struct cxip_curl_mcast_usrptr *curl_usrptr; + struct cxip_join_state *jstate = ptr; + static const char *json_fmt = + "{'macs':[%s],'jobID':'%s','jobStepID':'%s','timeout':%ld}"; + struct cxip_addr caddr; + char *jsonreq, *mac, *url, *p; + int i, ret; + + /* early exit will attempt to free these */ + curl_usrptr = NULL; + jsonreq = NULL; + mac = NULL; + url = NULL; + + /* acquire the environment variables needed */ + TRACE_JOIN("jobid = %s\n", cxip_env.coll_job_id); + TRACE_JOIN("stepid = %s\n", cxip_env.coll_job_step_id); + TRACE_JOIN("fmurl = %s\n", cxip_env.coll_fabric_mgr_url); + TRACE_JOIN("token = %s\n", cxip_env.coll_mcast_token); + TRACE_JOIN("maxadrs = %ld\n", cxip_env.hwcoll_addrs_per_job); + TRACE_JOIN("minnodes= %ld\n", cxip_env.hwcoll_min_nodes); + TRACE_JOIN("retry = %ld\n", cxip_env.coll_retry_usec); + TRACE_JOIN("tmout = %ld\n", cxip_env.coll_timeout_usec); + + /* Generic error for any preliminary failures */ + jstate->prov_errno = CXIP_PROV_ERRNO_CURL; + if (!cxip_env.coll_job_id || + !cxip_env.coll_fabric_mgr_url || + !cxip_env.coll_mcast_token) { + TRACE_JOIN("Check environment variables\n"); + ret = -FI_EINVAL; + goto quit; + } + + ret = asprintf(&url, "%s/fabric/collectives/multicast", + cxip_env.coll_fabric_mgr_url); + if (ret < 0) { + TRACE_JOIN("Failed to construct CURL address\n"); + ret = -FI_ENOMEM; + goto quit; + } + + /* five hex digits per mac, two colons, two quotes, comma */ + p = mac = malloc(10*jstate->av_set_obj->fi_addr_cnt + 1); + if (!mac) { + TRACE_JOIN("Failed to allocate mac list\n"); + ret = -FI_ENOMEM; + goto quit; + } + for (i = 0; i < jstate->av_set_obj->fi_addr_cnt; i++) { + ret = cxip_av_lookup_addr( + jstate->av_set_obj->cxi_av, + jstate->av_set_obj->fi_addr_ary[i], &caddr); + if (ret < 0) { + TRACE_JOIN("failed to find address[%d]=%ld\n", + i, jstate->av_set_obj->fi_addr_ary[i]); + goto quit; + } + p += sprintf(p, "'%01X:%02X:%02X',", + (caddr.nic >> 16) & 0xf, + (caddr.nic >> 8) & 0xff, + (caddr.nic) & 0xff); + + } + *(--p) = 0; + + /* generate the CURL JSON request */ + ret = asprintf(&jsonreq, json_fmt, mac, + cxip_env.coll_job_id, + cxip_env.coll_job_step_id, + cxip_env.coll_timeout_usec); + if (ret < 0) { + TRACE_JOIN("Creating JSON request = %d\n", ret); + ret = -FI_ENOMEM; + goto quit; + } + single_to_double_quote(jsonreq); + TRACE_JOIN("JSON = %s\n", jsonreq); + + /* create the mcast address */ + curl_usrptr = calloc(1, sizeof(*curl_usrptr)); + if (!curl_usrptr) { + TRACE_JOIN("curl_usrptr calloc() error\n"); + ret = -FI_ENOMEM; + goto quit; + } + /* dispatch CURL request */ + curl_usrptr->jstate = jstate; + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_CURLSND, &ret)) + goto quit; + ret = cxip_curl_perform(url, jsonreq, cxip_env.coll_mcast_token, 0, + CURL_POST, false, _cxip_create_mcast_cb, + curl_usrptr); +quit: + free(url); + free(mac); + free(jsonreq); + if (ret < 0) { + TRACE_JOIN("CURL execution failed\n"); + free(curl_usrptr); + jstate->finished_mcast = true; + } +} + +/**************************************************************************** + * State machine for performing fi_join_collective() + * + * The zbcoll operations use unrestricted packets, will re-route dynamically, + * and manage NAK retries automatically, so they are resistant to dropped + * packets and other transient errors. They will not (and should not) time out: + * a persistently unresponsive endpoint in the collective tree will cause the + * collective join to block indefinitely. 
+ * + * Each state operation returns without doing any retries. The state machine + * progress table will decide whether to retry the operation. + * + * Each state operation must set zb->error as follows: + * - FI_SUCCESS - continues the state machine + * - FI_EAGAIN - retries the same state + * - other - fails the join operation + * + * The bcast_data value is used to carry 64 bits of data. + * The prov_errno value records a local (speculative) error + * prov_errno is ignored if bcast_data.valid == true + * + * getgroup: + * acquires a group ID for zbcoll collectives + * broadcast (zbcoll rank 0): + * if appropriate, starts CURL request, evaluates return + * otherwise, assumes static initialization, sets return + * on broadcast completion + * - all endpoints share bcast_data from zbcoll rank 0 + * - prov_errno indicates an error if bcast_data.valid is false + * - if bcast_data.valid, initializes a new MC object, new PTE if needed + * - creation errors set bcast_data.valid false, set prov_errno + * reduce: + * converts this endpoint prov_errno to bitmask + * overwrites mcast_addr and hwcoll_idx in bcast_data with bitmask + * bcast_data.valid remains unchanged + * on reduce completion + * - bitmask is bitwise OR of all error bits and address valid bit + * - prov_errno is set to prioritized error code (0 if bcast_data.valid) + * - all endpoints report the same completion status and error + */ + +/** + * Join state machine. + * + * The state machine walks through the following functions top-to-bottom. + * If the return code is success, it advances to the next state. + * If the return code is -FI_EAGAIN, it repeats the current state. + * If the return code is anything else, the join operation fails. + */ + +/* append a jstate to the zbcoll scheduler */ +static void _append_sched(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct cxip_ep_coll_obj *coll_obj = &zb->ep_obj->coll; + struct cxip_join_state *jstate = usrptr; + + dlist_ts_insert_tail(&coll_obj->sched_list, &jstate->sched_link); +} + +static void _noop(void *ptr) +{ + TRACE_JOIN("%s: entry\n", __func__); +} + +/* get a zbcoll group identifier */ +static void _start_getgroup(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + TRACE_JOIN("%s on %d: entry\n", __func__, jstate->mynode_idx); + + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_GETGRP, &zb->error)) + goto quit; + /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ + zb->error = cxip_zbcoll_getgroup(zb); +quit: + TRACE_JOIN("getgroup error = %d\n", zb->error); + if (zb->error) + _append_sched(zb, jstate); +} + +static void _finish_getgroup(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + TRACE_JOIN("%s on %d: entry\n", __func__, jstate->mynode_idx); + _append_sched(zb, jstate); // _start_bcast +} + +/* Create a multicast address and broadcast it to all endpoints. + * If jstate->create_mcast is set, this will use CURL to get an address. + * Otherwise, this presumes static initialization, and sets bcast_data.valid. 
+ */ +static void _start_bcast(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + TRACE_JOIN("%s: entry\n", __func__); + + /* error will indicate that the multicast request fails */ + jstate->prov_errno = C_RC_INVALID_DFA_FORMAT; + /* rank 0 always does the work here */ + if (jstate->mynode_idx == 0) { + if (jstate->create_mcast) { + /* first call (only) initiates CURL request */ + if (!jstate->creating_mcast) { + jstate->creating_mcast = true; + _start_curl(jstate); + } + /* every retry call checks to see if CURL is complete */ + if (!jstate->finished_mcast) { + zb->error = -FI_EAGAIN; + goto quit; + } + /* bcast_data.valid is set by curl callback */ + } else { + /* static bcast data is presumed correct */ + jstate->bcast_data.valid = true; + } + } + /* speculative prov_errno for trap */ + jstate->prov_errno = CXIP_PROV_ERRNO_CURL; + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_BCAST, &zb->error)) + goto quit; + /* rank > 0 endpoints overwritten by rank = 0 data */ + /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ + zb->error = cxip_zbcoll_broadcast(zb, &jstate->bcast_data.uint64); +quit: + if (zb->error) + _append_sched(zb, jstate); +} + +/* Check broadcast validity, and if valid, set up the MC object */ +static void _finish_bcast(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + bool is_hwroot; + int ret; + + TRACE_JOIN("%s: mc addr=%d hw_root=%d valid=%d\n", __func__, + jstate->bcast_data.mcast_addr, + jstate->bcast_data.hwroot_idx, + jstate->bcast_data.valid); + /* all NICs now have same mc_addr data, if invalid, fail */ + /* jstate->prov_errno is presumed set if not valid */ + if (!jstate->bcast_data.valid) + goto quit; + /* error indicates that attempt to configure fails */ + + /* check for invalid hwroot index */ + TRACE_JOIN("check hwroot\n"); + if (jstate->bcast_data.hwroot_idx >= + jstate->av_set_obj->fi_addr_cnt) { + TRACE_JOIN("%s: reject invalid hwroot_idx\n", __func__); + jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INVALID; + ret = -FI_EINVAL; + goto quit; + } + + /* check for hwroot overlap on this node */ + is_hwroot = (jstate->bcast_data.hwroot_idx == jstate->mynode_idx); + if (is_hwroot && jstate->ep_obj->coll.is_hwroot) { + TRACE_JOIN("%s: reject join, hwroot in use\n", __func__); + jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INUSE; + ret = -FI_EINVAL; + goto quit; + + } + /* check for mcast_addr overlap */ + TRACE_JOIN("check mcast addr\n"); + if (!jstate->is_rank && + ofi_idm_lookup(&jstate->ep_obj->coll.mcast_map, + jstate->bcast_data.mcast_addr)) { + TRACE_JOIN("%s: reject join, mcast %d in use\n", __func__, + jstate->bcast_data.mcast_addr); + jstate->prov_errno = CXIP_PROV_ERRNO_MCAST_INUSE; + ret = -FI_EINVAL; + goto quit; + } + /* speculative prov_errno for trap */ + jstate->prov_errno = CXIP_PROV_ERRNO_PTE; + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_INITPTE, &ret)) + goto quit; + TRACE_JOIN("%s: continuing to configure\n", __func__); + ret = _initialize_mc(jstate); +quit: + /* if initialization fails, invalidate bcast_data */ + if (ret != FI_SUCCESS) + jstate->bcast_data.valid = false; + /* represent prov_errno values as inverted bitmask */ + _proverr_to_bits(jstate); + _append_sched(zb, jstate); // _start_reduce +} + +/* Accumulate composite errors from different endpoints */ +static void _start_reduce(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + /* reduce ANDs inverted bcast_data, if 
any invalid, all become invalid */ + if (cxip_trap_search(jstate->mynode_idx, CXIP_TRAP_REDUCE, &zb->error)) + goto quit; + /* zb->error == FI_SUCCESS, -FI_EAGAIN, -FI_EINVAL */ + zb->error = cxip_zbcoll_reduce(zb, &jstate->bcast_data.uint64); +quit: + if (zb->error) + _append_sched(zb, jstate); +} + +/* process error bits (if any) to produce an error condition */ +static void _finish_reduce(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct cxip_zbcoll_obj *zb = jstate->zb; + + TRACE_JOIN("%s: entry\n", __func__); + + /* re-invert bitmap, select common reported error */ + _bits_to_proverr(jstate); + + TRACE_JOIN("%s: prov_errno=0x%x\n", __func__, jstate->prov_errno); + _append_sched(zb, jstate); // _start_cleanup +} + +/* state machine cleanup */ +static void _start_cleanup(void *ptr) +{ + struct cxip_join_state *jstate = ptr; + struct fi_eq_err_entry entry = {}; + size_t size = sizeof(entry); + uint64_t flags = 0L; + int ret; + + TRACE_JOIN("%s: entry\n", __func__); + if (jstate) { + entry.fid = (jstate->mc_obj) ? + &jstate->mc_obj->mc_fid.fid : NULL; + entry.context = jstate->context; + + if (jstate->prov_errno != CXIP_PROV_ERRNO_OK) { + size = sizeof(struct fi_eq_err_entry); + entry.data = FI_JOIN_COMPLETE; + entry.err = -FI_EAVAIL; + entry.prov_errno = jstate->prov_errno; + flags |= UTIL_FLAG_ERROR; + } + ret = ofi_eq_write(&jstate->ep_obj->eq->util_eq.eq_fid, + FI_JOIN_COMPLETE, &entry, + size, flags); + if (ret < 0) + CXIP_INFO("FATAL ERROR: cannot post to EQ\n"); + cxip_zbcoll_free(jstate->zb); + jstate->ep_obj->coll.join_busy = false; + } + free(jstate); +} + +typedef void (*sched_func)(void *ptr); + +enum state_code { + state_init, + start_getgroup, + finish_getgroup, + start_bcast, + finish_bcast, + start_reduce, + finish_reduce, + start_cleanup, + state_done +}; + +const char *state_name[] = { + "state_init", + "start_getgroup", + "finish_getgroup", + "start_bcast", + "finish_bcast", + "start_reduce", + "finish_reduce", + "start_cleanup", + "state_done" +}; +sched_func state_func[] = { + _noop, + _start_getgroup, + _finish_getgroup, + _start_bcast, + _finish_bcast, + _start_reduce, + _finish_reduce, + _start_cleanup, + _noop, +}; + +/** + * State progression table + * + * Row is the current state. + * Col contains states reachable from this state on success/again/fail. 
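+ *
+ * Success path (read from the table below):
+ *   state_init -> start_getgroup -> finish_getgroup -> start_bcast ->
+ *   finish_bcast -> start_reduce -> finish_reduce -> start_cleanup ->
+ *   state_done
+ * -FI_EAGAIN/-FI_EBUSY re-enters the same start_* state; any other error
+ * routes to start_cleanup.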
+ */ +static enum state_code progress_state[][3] = { + /* STATE SUCCESS EAGAIN FAIL */ + /* state_init */ {start_getgroup, start_cleanup, start_cleanup}, + /* start_getgroup */ {finish_getgroup,start_getgroup,start_cleanup}, + /* finish_getgroup*/ {start_bcast, start_cleanup, start_cleanup}, + /* start_bcast */ {finish_bcast, start_bcast, start_cleanup}, + /* finish_bcast */ {start_reduce, start_cleanup, start_cleanup}, + /* start_reduce */ {finish_reduce, start_reduce, start_cleanup}, + /* finish_reduce */ {start_cleanup, start_cleanup, start_cleanup}, + /* start_cleanup */ {state_done, state_done, state_done}, + /* state_done */ {state_done, state_done, state_done}, +}; + +/* Advance the state and run scheduled operations */ +static void _progress_sched(struct cxip_join_state *jstate) +{ + struct cxip_zbcoll_obj *zb = jstate->zb; + enum state_code *codes; + + TRACE_JOIN("entry jstate[%d,%d]=%s, error=%d\n", + jstate->join_idx, jstate->mynode_idx, + state_name[jstate->sched_state], zb->error); + + /* acquire the success/again/fail state codes for current state */ + codes = progress_state[jstate->sched_state]; + switch (zb->error) { + case FI_SUCCESS: + /* last operation succeeded */ + TRACE_JOIN("%s: success\n", __func__); + jstate->sched_state = codes[0]; + break; + case -FI_EBUSY: + case -FI_EAGAIN: + /* last operation needs a retry */ + TRACE_JOIN("%s: busy retry\n", __func__); + jstate->sched_state = codes[1]; + break; + default: + /* last operation failed */ + TRACE_JOIN("%s: fail zberr=%d\n", __func__, zb->error); + jstate->sched_state = codes[2]; + break; + } + TRACE_JOIN("----> jstate[%d,%d]=%s\n", + jstate->join_idx, jstate->mynode_idx, + state_name[jstate->sched_state]); + + /* execute the new state function */ + state_func[jstate->sched_state](jstate); +} + +/* Process the schedule list and dispatch next scheduled operation */ +static void _progress_join(struct cxip_ep_obj *ep_obj) +{ + struct cxip_ep_coll_obj *coll_obj = &ep_obj->coll; + struct cxip_join_state *jstate = NULL; + + dlist_ts_pop_front(&coll_obj->sched_list, + struct cxip_join_state, + jstate, sched_link); + + if (jstate) + _progress_sched(jstate); +} + +/* During join, determine my index position in the av_set_obj */ +static unsigned int _caddr_to_idx(struct cxip_av_set *av_set_obj, + struct cxip_addr caddr) +{ + struct cxip_addr addr; + size_t size = sizeof(addr); + int i, ret; + + for (i = 0; i < av_set_obj->fi_addr_cnt; i++) { + ret = fi_av_lookup(&av_set_obj->cxi_av->av_fid, + av_set_obj->fi_addr_ary[i], + &addr, &size); + if (ret) + return ret; + if (CXIP_ADDR_EQUAL(addr, caddr)) + return i; + } + return -FI_EADDRNOTAVAIL; +} + +/** + * fi_join_collective() implementation. + * + * Calling syntax is defined by libfabric. + * + * This is a multi-stage collective operation, progressed by calling TX/RX CQs + * and the EQ for the endpoint. Upon completion of the state machine, the EQ + * will return an EQ event structure. + * + * We go through the following steps: + * + * 1) allocate a join state for this operation + * 2) allocate zbcoll object + * 3) get a collective group identifier + * 4) generate a multicast tree from NIC 0 + * 5) broadcast multicast address from NIC 0 + * 6) reduce error mask across all NICs + * 7) cleanup + * + * Joins are non-concurrent, and return FI_EAGAIN until any active join + * completes. The final return code of a join is not known to all nodes until + * the final state completes. 
+ * + * Joins are progressed by polling TX/RX CQs, and completion status is + * returned by polling the endpoint EQ. + * + * CPU errors like -FI_ENOMEM will likely occur on individual endpoints, + * and the correct response is to exit the application. There is no + * reasonable way to re-enter the state machine once any participant has + * unexpectedly failed. + * + * Internal errors, such as inability to acquire a multicast address, are + * are represented by a CXIP_PROV_ERRNO value, which is returned through the + * EQ polling with an error of -FI_EAVAIL, and the CXIP_PROV_ERRNO value. + * These values are ranked, and if multiple nodes show different errors, the + * returned error will be the most-significant (most-negative) value. + * + * There are four operational models, one for production, and three for testing. + * + * In all cases, there must be one join for every NIC address in the av_set_obj + * fi_addr_ary, and the collective proceeds among these joined endpoints. + * + * COMM_KEY_RANK tests using a single process on a single Cassini, which + * supplies the src/tgt, but different pid_idx values, representing different + * PTLTE objects, each with its own buffers. The zbcoll operations are performed + * using linked zb objects, which represent a single zbcoll collective, so each + * zb callback function is called only once for the entire set, yet must provide + * a unique mc return value and FI_COLL_COMPLETE event for each joined object. + * We manage this with the simstates array, which associates the simulated rank + * with the state pointer, so that upon completion, we can provide all of the + * return pointers and events. + * + * COMM_KEY_UNICAST tests on multiple nodes on a real network, but without any + * multicast support. It initializes one mc object on each node, and designates + * the first node in the multicast list, fiaddr[0], as the hardware root node. + * fiaddr[1..N] send directly to fiaddr[0], and fiaddr[0] sends to each of the + * other addresses in a simulated broadcast. This is not expected to be + * performant, but it does exercise a necessary incast edge case, and it fully + * exercises the collectives software across multiple nodes. + * + * COMM_KEY_MULTICAST is a fully-functioning model, but requires that an + * external application prepare the multicast address on the fabric before + * calling fi_join_collective() on any node. This information must be supplied + * through the av_set_obj->comm_key structure. + * + * COMM_KEY_NONE is the production model, in which fi_join_collective() creates + * the multicast address by making a CURL call to the fabric manager REST API. + * fiaddr[0] manages the CURL call, and broadcasts the results to all of the + * other objects across the collective group. + */ +int cxip_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, + const struct fid_av_set *coll_av_set, + uint64_t flags, struct fid_mc **mc, void *context) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_av_set *av_set_obj; + struct cxip_join_state *jstate; + struct cxip_zbcoll_obj *zb; + bool link_zb; + int ret; + + check_red_pkt(); + + TRACE_JOIN("%s: entry\n", __func__); + /* Validate arguments */ + if (!ep || !coll_av_set || !mc || coll_addr != FI_ADDR_NOTAVAIL) + return -FI_EINVAL; + /* flags are ignored, per util_coll.c example code + * Only FI_SCATTER is documented, and applies to fi_query_collective(). 
+ */ + + cxip_ep = container_of(ep, struct cxip_ep, ep.fid); + av_set_obj = container_of(coll_av_set, struct cxip_av_set, av_set_fid); + jstate = NULL; + zb = NULL; + *mc = NULL; + + ep_obj = cxip_ep->ep_obj; + + /* join must be serialized through to completion */ + ofi_genlock_lock(&ep_obj->lock); + if (ep_obj->coll.join_busy) { + ofi_genlock_unlock(&ep_obj->lock); + return -FI_EAGAIN; + } + /* SHORT-TERM HACK see NETCASSINI-5771 */ + if (av_set_obj->comm_key.keytype != COMM_KEY_RANK) + ep_obj->coll.join_busy = true; + ofi_genlock_unlock(&ep_obj->lock); + + /* allocate state to pass arguments through callbacks */ + jstate = calloc(1, sizeof(*jstate)); + if (!jstate) { + ret = -FI_ENOMEM; + goto fail; + } + + jstate->ep_obj = ep_obj; + jstate->av_set_obj = av_set_obj; + jstate->mc = mc; + jstate->context = context; + jstate->join_flags = flags; + jstate->sched_state = state_init; + jstate->join_idx = ofi_atomic_inc32(&ep_obj->coll.join_cnt); + + /* rank 0 (av_set_obj->fi_addr_cnt[0]) does zb broadcast, so all nodes + * will share whatever bcast_data rank 0 ends up with. + */ + + ret = -FI_EINVAL; + switch (av_set_obj->comm_key.keytype) { + case COMM_KEY_NONE: + /* Production case, acquire multicast from FM */ + if (is_netsim(ep_obj)) { + CXIP_INFO("NETSIM COMM_KEY_NONE not supported\n"); + goto fail; + } + TRACE_JOIN("%s: MULTICAST CURL model setup\n", __func__); + jstate->mynode_idx = + _caddr_to_idx(av_set_obj, ep_obj->src_addr); + jstate->mynode_fiaddr = + av_set_obj->fi_addr_ary[jstate->mynode_idx]; + jstate->simrank = ZB_NOSIM; + jstate->pid_idx = CXIP_PTL_IDX_COLL; + jstate->bcast_data.hwroot_idx = 0; + jstate->bcast_data.mcast_addr = 0; + jstate->bcast_data.valid = false; + jstate->is_rank = false; + jstate->is_mcast = true; + jstate->create_mcast = (jstate->mynode_idx == 0); + jstate->rx_discard = true; + link_zb = false; + break; + case COMM_KEY_MULTICAST: + /* Real network test with predefined multicast address */ + if (is_netsim(ep_obj)) { + CXIP_INFO("NETSIM COMM_KEY_MULTICAST not supported\n"); + goto fail; + } + TRACE_JOIN("%s: MULTICAST prefab model setup\n", __func__); + jstate->mynode_idx = + _caddr_to_idx(av_set_obj, ep_obj->src_addr); + jstate->mynode_fiaddr = + av_set_obj->fi_addr_ary[jstate->mynode_idx]; + jstate->simrank = ZB_NOSIM; + jstate->pid_idx = CXIP_PTL_IDX_COLL; + jstate->bcast_data.hwroot_idx = + av_set_obj->comm_key.mcast.hwroot_idx; + jstate->bcast_data.mcast_addr = + av_set_obj->comm_key.mcast.mcast_addr; + jstate->bcast_data.valid = true; + jstate->is_rank = false; + jstate->is_mcast = true; + jstate->create_mcast = false; + jstate->rx_discard = true; + link_zb = false; + break; + case COMM_KEY_UNICAST: + /* Real network test without multicast address */ + if (is_netsim(ep_obj)) { + CXIP_INFO("NETSIM COMM_KEY_UNICAST not supported\n"); + goto fail; + } + TRACE_JOIN("%s: UNICAST model setup\n", __func__); + jstate->mynode_idx = + _caddr_to_idx(av_set_obj, ep_obj->src_addr); + jstate->mynode_fiaddr = + av_set_obj->fi_addr_ary[jstate->mynode_idx]; + jstate->simrank = ZB_NOSIM; + jstate->pid_idx = CXIP_PTL_IDX_COLL; + jstate->bcast_data.hwroot_idx = + av_set_obj->comm_key.ucast.hwroot_idx; + jstate->bcast_data.mcast_addr = + av_set_obj->comm_key.ucast.mcast_addr; + jstate->bcast_data.valid = false; + jstate->is_rank = false; + jstate->is_mcast = false; + jstate->create_mcast = false; + jstate->rx_discard = true; + link_zb = false; + break; + case COMM_KEY_RANK: + /* Single process simulation, can run under NETSIM */ + TRACE_JOIN("%s: COMM_KEY_RANK model 
setup\n", __func__); + jstate->mynode_idx = av_set_obj->comm_key.rank.rank; + jstate->mynode_fiaddr = (fi_addr_t)jstate->mynode_idx; + jstate->simrank = jstate->mynode_idx; + jstate->pid_idx = CXIP_PTL_IDX_COLL + jstate->simrank; + jstate->bcast_data.hwroot_idx = 0; + jstate->bcast_data.mcast_addr = ep_obj->src_addr.nic; + jstate->bcast_data.valid = true; + jstate->is_rank = true; + jstate->is_mcast = false; + jstate->create_mcast = false; + jstate->rx_discard = av_set_obj->comm_key.rank.rx_discard; + link_zb = true; + break; + default: + CXIP_INFO("unexpected comm_key keytype: %d\n", + av_set_obj->comm_key.keytype); + goto fail; + } + + /* Reject if a rank tries to join a group it doesn't belong to */ + ret = jstate->mynode_idx; + if (ret < 0) { + TRACE_JOIN("May not participate\n"); + goto fail; + } + + /* Acquire a zbcoll identifier */ + TRACE_JOIN("%s: allocate zb\n", __func__); + ret = cxip_zbcoll_alloc(jstate->ep_obj, + jstate->av_set_obj->fi_addr_cnt, + jstate->av_set_obj->fi_addr_ary, + jstate->simrank, &zb); + TRACE_JOIN("%s: returned=%d\n", __func__, ret); + if (ret) + goto fail; + + /* Install the callback function for zb collectives */ + TRACE_JOIN("%s: cxip_zbcoll_set_user_cb\n", __func__); + cxip_zbcoll_set_user_cb(zb, _append_sched, jstate); + + /* If COMM_KEY_RANK, join is called for each rank */ + if (link_zb) { + static struct cxip_zbcoll_obj *zb0 = NULL; + static int zb0_count = 0; + int rank = av_set_obj->comm_key.rank.rank; + + /* first call sets the zb0 simulated endpoint */ + TRACE_JOIN("%s: rank = %d, zb0_count=%d\n", __func__, rank, zb0_count); + if (!zb0_count++) { + /* first must be rank 0 */ + if (rank != 0) { + TRACE_JOIN("%s: rank %d not 0\n", __func__, rank); + ret = -FI_EINVAL; + goto fail; + } + zb0 = zb; + TRACE_JOIN("%s: zb0=%p zb=%p\n", __func__, zb0, zb); + } + /* link this zb to zb0 */ + ret = cxip_zbcoll_simlink(zb0, zb); + if (ret) { + TRACE_JOIN("%s: return=%d\n", __func__, ret); + return ret; + } + /* after the last, we need to reset this */ + if (zb0_count == av_set_obj->fi_addr_cnt) { + zb0_count = 0; + zb0 = NULL; + } + } + + jstate->zb = zb; + _append_sched(zb, jstate); + + return FI_SUCCESS; + +fail: + /* this path returns error, does not post to EQ */ + TRACE_JOIN("cxip_join_collective, ret=%d\n", ret); + cxip_zbcoll_free(zb); + free(jstate); + ep_obj->coll.join_busy = false; + + return ret; +} + +/* Exported to be called by EQ read function */ +void cxip_coll_progress_join(struct cxip_ep_obj *ep_obj) +{ + ofi_genlock_lock(&ep_obj->lock); + + /* progress the work schedule */ + _progress_join(ep_obj); + + /* don't want handle returned, callback function manages it */ + cxip_curl_progress(NULL); + + /* progress the underlying zbcoll */ + cxip_ep_zbcoll_progress(ep_obj); + + ofi_genlock_unlock(&ep_obj->lock); +} + +/* Reset all of the diagnostic counters */ +void cxip_coll_reset_mc_ctrs(struct fid_mc *mc) +{ + struct cxip_coll_mc *mc_obj = (struct cxip_coll_mc *)mc; + + ofi_atomic_set32(&mc_obj->coll_pte->recv_cnt, 0); + ofi_atomic_set32(&mc_obj->send_cnt, 0); + ofi_atomic_set32(&mc_obj->recv_cnt, 0); + ofi_atomic_set32(&mc_obj->pkt_cnt, 0); + ofi_atomic_set32(&mc_obj->seq_err_cnt, 0); + ofi_atomic_set32(&mc_obj->tmout_cnt, 0); +} + +/**************************************************************************** + * Manage the static coll structure in the EP. Because of its specialized + * nature, it made sense to manage it here, rather than in the EP module. 
+ */ +struct fi_ops_collective cxip_collective_ops = { + .size = sizeof(struct fi_ops_collective), + .barrier = cxip_barrier, + .broadcast = cxip_broadcast, + .alltoall = fi_coll_no_alltoall, + .allreduce = cxip_allreduce, + .allgather = fi_coll_no_allgather, + .reduce_scatter = fi_coll_no_reduce_scatter, + .reduce = cxip_reduce, + .scatter = fi_coll_no_scatter, + .gather = fi_coll_no_gather, + .msg = fi_coll_no_msg, +}; + +struct fi_ops_collective cxip_collective_no_ops = { + .size = sizeof(struct fi_ops_collective), + .barrier = fi_coll_no_barrier, + .broadcast = fi_coll_no_broadcast, + .alltoall = fi_coll_no_alltoall, + .allreduce = fi_coll_no_allreduce, + .allgather = fi_coll_no_allgather, + .reduce_scatter = fi_coll_no_reduce_scatter, + .reduce = fi_coll_no_reduce, + .scatter = fi_coll_no_scatter, + .gather = fi_coll_no_gather, + .msg = fi_coll_no_msg, +}; + +/* Close collectives - call during EP close, ep_obj->lock is held */ +void cxip_coll_close(struct cxip_ep_obj *ep_obj) +{ + struct cxip_coll_mc *mc_obj; + + while (!dlist_empty(&ep_obj->coll.mc_list)) { + dlist_pop_front(&ep_obj->coll.mc_list, + struct cxip_coll_mc, mc_obj, entry); + _close_mc(mc_obj); + } +} + +/** + * Initialize collectives - call during EP init */ +void cxip_coll_init(struct cxip_ep_obj *ep_obj) +{ + cxip_coll_populate_opcodes(); + + memset(&ep_obj->coll.mcast_map, 0, sizeof(ep_obj->coll.mcast_map)); + dlist_ts_init(&ep_obj->coll.sched_list); + dlist_init(&ep_obj->coll.mc_list); + ep_obj->coll.rx_cmdq = NULL; + ep_obj->coll.tx_cmdq = NULL; + ep_obj->coll.rx_cntr = NULL; + ep_obj->coll.tx_cntr = NULL; + ep_obj->coll.rx_evtq = NULL; + ep_obj->coll.tx_evtq = NULL; + ep_obj->coll.min_multi_recv = CXIP_COLL_MIN_MULTI_RECV; + ep_obj->coll.buffer_count = CXIP_COLL_MIN_RX_BUFS; + ep_obj->coll.buffer_size = CXIP_COLL_MIN_RX_SIZE; + + ofi_atomic_initialize32(&ep_obj->coll.num_mc, 0); + ofi_atomic_initialize32(&ep_obj->coll.join_cnt, 0); +} + +/** + * Enable collectives - call from EP enable. 
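+ *
+ * The functional cxip_collective_ops table is only installed when the
+ * endpoint was opened with FI_COLLECTIVE in its capabilities; without that
+ * capability this call is effectively a no-op. A caller-side sketch
+ * (standard hints flow assumed):
+ *
+ *    hints->caps |= FI_COLLECTIVE;
+ *    // ... then the usual fi_getinfo() / fi_endpoint() / fi_enable() path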
+ */ +int cxip_coll_enable(struct cxip_ep *ep) +{ + struct cxip_ep_obj *ep_obj = ep->ep_obj; + + if (ep_obj->coll.enabled) + return FI_SUCCESS; + + if (!(ep_obj->caps & FI_COLLECTIVE)) { + CXIP_INFO("FI_COLLECTIVE not requested\n"); + return FI_SUCCESS; + } + + /* A read-only or write-only endpoint is legal */ + if (!(ofi_recv_allowed(ep_obj->rxc.attr.caps) && + ofi_send_allowed(ep_obj->txc.attr.caps))) { + CXIP_INFO("EP not recv/send, collectives not enabled\n"); + return FI_SUCCESS; + } + + /* Sanity checks */ + if (ep_obj->coll.buffer_size == 0) + return -FI_EINVAL; + if (ep_obj->coll.buffer_count == 0) + return -FI_EINVAL; + if (ep_obj->coll.min_multi_recv == 0) + return -FI_EINVAL; + if (ep_obj->coll.min_multi_recv >= ep_obj->coll.buffer_size) + return -FI_EINVAL; + + /* Bind all STD EP objects to the coll object */ + ep_obj->coll.rx_cmdq = ep_obj->rxc.rx_cmdq; + ep_obj->coll.tx_cmdq = ep_obj->txc.tx_cmdq; + ep_obj->coll.rx_cntr = ep_obj->rxc.recv_cntr; + ep_obj->coll.tx_cntr = ep_obj->txc.send_cntr; + ep_obj->coll.rx_evtq = &ep_obj->rxc.rx_evtq; + ep_obj->coll.tx_evtq = &ep_obj->txc.tx_evtq; + ep_obj->coll.eq = ep_obj->eq; + + ep->ep.collective = &cxip_collective_ops; + ep_obj->coll.enabled = true; + + return FI_SUCCESS; +} + +/* Disable collectives - call from EP disable */ +int cxip_coll_disable(struct cxip_ep_obj *ep_obj) +{ + if (!ep_obj->coll.enabled) + return FI_SUCCESS; + + ep_obj->coll.enabled = false; + ep_obj->coll.rx_cmdq = NULL; + ep_obj->coll.tx_cmdq = NULL; + ep_obj->coll.rx_cntr = NULL; + ep_obj->coll.tx_cntr = NULL; + ep_obj->coll.rx_evtq = NULL; + ep_obj->coll.tx_evtq = NULL; + ep_obj->coll.eq = NULL; + + return FI_SUCCESS; +} diff --git a/prov/cxi/src/cxip_cq.c b/prov/cxi/src/cxip_cq.c new file mode 100644 index 00000000000..675d91eeb56 --- /dev/null +++ b/prov/cxi/src/cxip_cq.c @@ -0,0 +1,436 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_CQ, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_CQ, __VA_ARGS__) + +/* + * cxip_cq_req_complete() - Generate a completion event for the request. + */ +int cxip_cq_req_complete(struct cxip_req *req) +{ + if (req->discard) { + CXIP_DBG("Event discarded: %p\n", req); + return FI_SUCCESS; + } + + return ofi_cq_write(&req->cq->util_cq, (void *)req->context, + req->flags, req->data_len, (void *)req->buf, + req->data, req->tag); +} + +/* + * cxip_cq_req_complete() - Generate a completion event with source address for + * the request. + */ +int cxip_cq_req_complete_addr(struct cxip_req *req, fi_addr_t src) +{ + if (req->discard) { + CXIP_DBG("Event discarded: %p\n", req); + return FI_SUCCESS; + } + + return ofi_cq_write_src(&req->cq->util_cq, (void *)req->context, + req->flags, req->data_len, (void *)req->buf, + req->data, req->tag, src); +} + +/* + * proverr2errno() - Match NIC errno to Linux errno. + */ +int proverr2errno(int err) +{ + if (err == C_RC_UNDELIVERABLE) + return FI_EHOSTUNREACH; + else if (err == C_RC_VNI_NOT_FOUND) + return FI_ENOTCONN; + return FI_EIO; +} + +/* + * cxip_cq_req_error() - Generate an error event for the request. 
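+ *
+ * On the application side the failure surfaces as -FI_EAVAIL from CQ reads;
+ * prov_errno can then be decoded with fi_cq_strerror(). A minimal consumer
+ * sketch (error handling trimmed):
+ *
+ *    struct fi_cq_tagged_entry cqe;
+ *    struct fi_cq_err_entry err = {};
+ *
+ *    if (fi_cq_read(cq, &cqe, 1) == -FI_EAVAIL) {
+ *            fi_cq_readerr(cq, &err, 0);
+ *            fprintf(stderr, "cq error %d: %s\n", err.err,
+ *                    fi_cq_strerror(cq, err.prov_errno, err.err_data,
+ *                                   NULL, 0));
+ *    }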
+ */ +int cxip_cq_req_error(struct cxip_req *req, size_t olen, + int err, int prov_errno, void *err_data, + size_t err_data_size, fi_addr_t src_addr) +{ + struct fi_cq_err_entry err_entry; + + if (req->discard) { + CXIP_DBG("Event discarded: %p\n", req); + return FI_SUCCESS; + } + + err_entry.err = err; + err_entry.olen = olen; + err_entry.err_data = err_data; + err_entry.err_data_size = err_data_size; + err_entry.len = req->data_len; + err_entry.prov_errno = prov_errno; + err_entry.flags = req->flags; + err_entry.data = req->data; + err_entry.tag = req->tag; + err_entry.op_context = (void *)(uintptr_t)req->context; + err_entry.buf = (void *)(uintptr_t)req->buf; + err_entry.src_addr = src_addr; + + return ofi_cq_write_error(&req->cq->util_cq, &err_entry); +} + +/* + * cxip_cq_progress() - Progress the CXI Completion Queue. + * + * The CQ lock must not be held and this function can not be + * called from within event queue callback processing. + */ +void cxip_cq_progress(struct cxip_cq *cq) +{ + cxip_util_cq_progress(&cq->util_cq); +} + +/* + * cxip_util_cq_progress() - Progress function wrapper for utility CQ. + */ +void cxip_util_cq_progress(struct util_cq *util_cq) +{ + struct cxip_cq *cq = container_of(util_cq, struct cxip_cq, util_cq); + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + + ofi_genlock_lock(&cq->ep_list_lock); + dlist_foreach(&util_cq->ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + cxip_ep_progress(fid_entry->fid); + } + ofi_genlock_unlock(&cq->ep_list_lock); +} + +/* + * cxip_cq_strerror() - Converts provider specific error information into a + * printable string. + */ +static const char *cxip_cq_strerror(struct fid_cq *cq, int prov_errno, + const void *err_data, char *buf, + size_t len) +{ + switch (prov_errno) { + case CXIP_PROV_ERRNO_OK: + return "CXIP_COLL_OK"; + case CXIP_PROV_ERRNO_PTE: + return "CXIP_COLL_PTE_ERROR"; + case CXIP_PROV_ERRNO_MCAST_INUSE: + return "CXIP_COLL_MCAST_IN_USE"; + case CXIP_PROV_ERRNO_HWROOT_INUSE: + return "CXIP_COLL_HWROOT_IN_USE"; + case CXIP_PROV_ERRNO_MCAST_INVALID: + return "CXIP_COLL_MCAST_INVALID"; + case CXIP_PROV_ERRNO_HWROOT_INVALID: + return "CXIP_COLL_HWROOT_INVALID"; + case CXIP_PROV_ERRNO_CURL: + return "CXIP_COLL_CURL_ERROR"; + } + return cxi_rc_to_str(prov_errno); +} + +/* + * cxip_cq_trywait - Return success if able to block waiting for CQ events. + */ +static int cxip_cq_trywait(void *arg) +{ + struct cxip_cq *cq = (struct cxip_cq *)arg; + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + + assert(cq->util_cq.wait); + + if (!cq->priv_wait) { + CXIP_WARN("No CXI wait object\n"); + return -FI_EINVAL; + } + + ofi_genlock_lock(&cq->ep_list_lock); + dlist_foreach(&cq->util_cq.ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + if (cxip_ep_peek(fid_entry->fid)) { + ofi_genlock_unlock(&cq->ep_list_lock); + + return -FI_EAGAIN; + } + } + + /* Clear wait, and check for any events */ + cxil_clear_wait_obj(cq->priv_wait); + dlist_foreach(&cq->util_cq.ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + if (cxip_ep_peek(fid_entry->fid)) { + ofi_genlock_unlock(&cq->ep_list_lock); + + return -FI_EAGAIN; + } + } + ofi_genlock_unlock(&cq->ep_list_lock); + + return FI_SUCCESS; +} + +/* + * cxip_cq_flush_trig_reqs() - Flush all triggered requests on the CQ. + * + * This function will free all triggered requests associated with the + * CQ. 
This should only be called after canceling triggered operations + * against all counters in use and verifying the cancellations have + * completed successfully. + */ +void cxip_cq_flush_trig_reqs(struct cxip_cq *cq) +{ + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + struct cxip_ep *ep; + + ofi_genlock_lock(&cq->ep_list_lock); + dlist_foreach(&cq->util_cq.ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + ep = container_of(fid_entry->fid, struct cxip_ep, ep.fid); + + cxip_ep_flush_trig_reqs(ep->ep_obj); + } + ofi_genlock_unlock(&cq->ep_list_lock); +} + +/* + * cxip_cq_close() - Destroy the Completion Queue object. + */ +static int cxip_cq_close(struct fid *fid) +{ + struct cxip_cq *cq = container_of(fid, struct cxip_cq, + util_cq.cq_fid.fid); + int ret; + + if (ofi_atomic_get32(&cq->util_cq.ref)) + return -FI_EBUSY; + + if (cq->priv_wait) { + ret = ofi_wait_del_fd(cq->util_cq.wait, + cxil_get_wait_obj_fd(cq->priv_wait)); + if (ret) + CXIP_WARN("Wait FD delete error: %d\n", ret); + + ret = cxil_destroy_wait_obj(cq->priv_wait); + if (ret) + CXIP_WARN("Release CXI wait object failed: %d\n", ret); + } + + ofi_cq_cleanup(&cq->util_cq); + ofi_genlock_destroy(&cq->ep_list_lock); + cxip_domain_remove_cq(cq->domain, cq); + + free(cq); + + return 0; +} + +static struct fi_ops cxip_cq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_cq_close, + .bind = fi_no_bind, + .control = ofi_cq_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_cq_attr cxip_cq_def_attr = { + .flags = 0, + .format = FI_CQ_FORMAT_CONTEXT, + .wait_obj = FI_WAIT_NONE, + .signaling_vector = 0, + .wait_cond = FI_CQ_COND_NONE, + .wait_set = NULL, +}; + +/* + * cxip_cq_verify_attr() - Verify input Completion Queue attributes. + */ +static int cxip_cq_verify_attr(struct fi_cq_attr *attr) +{ + if (!attr) + return FI_SUCCESS; + + switch (attr->format) { + case FI_CQ_FORMAT_CONTEXT: + case FI_CQ_FORMAT_MSG: + case FI_CQ_FORMAT_DATA: + case FI_CQ_FORMAT_TAGGED: + break; + case FI_CQ_FORMAT_UNSPEC: + attr->format = cxip_cq_def_attr.format; + break; + default: + CXIP_WARN("Unsupported CQ attribute format: %d\n", + attr->format); + return -FI_ENOSYS; + } + + /* Applications should set wait_obj == FI_WAIT_NONE for best + * performance. However, if a wait_obj is required and not + * specified, default to FI_WAIT_FD. + */ + switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: + attr->wait_obj = FI_WAIT_FD; + break; + case FI_WAIT_NONE: + case FI_WAIT_FD: + case FI_WAIT_POLLFD: + break; + default: + CXIP_WARN("Unsupported CQ wait object: %d\n", + attr->wait_obj); + return -FI_ENOSYS; + } + + /* Use environment variable to allow for dynamic setting of default CQ + * size. + */ + if (!attr->size) + attr->size = cxip_env.default_cq_size; + + return FI_SUCCESS; +} + +/* + * cxip_cq_alloc_priv_wait - Allocate an internal wait channel for the CQ. 
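+ *
+ * Only used when the CQ was opened with a wait object (FI_WAIT_FD,
+ * FI_WAIT_POLLFD, or FI_WAIT_UNSPEC, which defaults to FI_WAIT_FD above).
+ * A caller-side sketch of the blocking path this enables:
+ *
+ *    struct fi_cq_attr attr = {
+ *            .format   = FI_CQ_FORMAT_TAGGED,
+ *            .wait_obj = FI_WAIT_FD,
+ *    };
+ *    struct fi_cq_tagged_entry cqe;
+ *
+ *    fi_cq_open(domain, &attr, &cq, NULL);
+ *    fi_cq_sread(cq, &cqe, 1, NULL, -1);   // blocks on the wait FD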
+ */ +static int cxip_cq_alloc_priv_wait(struct cxip_cq *cq) +{ + int ret; + int wait_fd; + + assert(cq->domain); + + /* Not required or already created */ + if (!cq->util_cq.wait || cq->priv_wait) + return FI_SUCCESS; + + ret = cxil_alloc_wait_obj(cq->domain->lni->lni, &cq->priv_wait); + if (ret) { + CXIP_WARN("Allocation of internal wait object failed %d\n", + ret); + return ret; + } + + wait_fd = cxil_get_wait_obj_fd(cq->priv_wait); + ret = fi_fd_nonblock(wait_fd); + if (ret) { + CXIP_WARN("Unable to set CQ wait non-blocking mode: %d\n", ret); + goto destroy_wait; + } + + ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, POLLIN, + cxip_cq_trywait, cq, &cq->util_cq.cq_fid.fid); + if (ret) { + CXIP_WARN("Add FD of internal wait object failed: %d\n", ret); + goto destroy_wait; + } + + CXIP_DBG("Add CQ private wait object, CQ intr FD: %d\n", wait_fd); + + return FI_SUCCESS; + +destroy_wait: + cxil_destroy_wait_obj(cq->priv_wait); + cq->priv_wait = NULL; + + return ret; +} + +/* + * cxip_cq_open() - Allocate a new Completion Queue object. + */ +int cxip_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context) +{ + struct cxip_domain *cxi_dom; + struct cxip_cq *cxi_cq; + int ret; + + if (!domain || !cq) + return -FI_EINVAL; + + cxi_dom = container_of(domain, struct cxip_domain, + util_domain.domain_fid); + + ret = cxip_cq_verify_attr(attr); + if (ret != FI_SUCCESS) + return ret; + + cxi_cq = calloc(1, sizeof(*cxi_cq)); + if (!cxi_cq) + return -FI_ENOMEM; + + if (!attr) { + cxi_cq->attr = cxip_cq_def_attr; + cxi_cq->attr.size = cxip_env.default_cq_size; + } else { + cxi_cq->attr = *attr; + } + + ret = ofi_cq_init(&cxip_prov, domain, &cxi_cq->attr, &cxi_cq->util_cq, + cxip_util_cq_progress, context); + if (ret != FI_SUCCESS) { + CXIP_WARN("ofi_cq_init() failed: %d\n", ret); + goto err_util_cq; + } + + cxi_cq->util_cq.cq_fid.ops->strerror = &cxip_cq_strerror; + cxi_cq->util_cq.cq_fid.fid.ops = &cxip_cq_fi_ops; + + cxi_cq->domain = cxi_dom; + cxi_cq->ack_batch_size = cxip_env.eq_ack_batch_size; + + /* Optimize locking when possible */ + if (cxi_dom->util_domain.threading == FI_THREAD_DOMAIN || + cxi_dom->util_domain.threading == FI_THREAD_COMPLETION) + ofi_genlock_init(&cxi_cq->ep_list_lock, OFI_LOCK_NONE); + else + ofi_genlock_init(&cxi_cq->ep_list_lock, OFI_LOCK_SPINLOCK); + + if (cxi_cq->util_cq.wait) { + ret = cxip_cq_alloc_priv_wait(cxi_cq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate CXI wait obj: %d\n", + ret); + goto err_wait_alloc; + } + } + + cxip_domain_add_cq(cxi_dom, cxi_cq); + *cq = &cxi_cq->util_cq.cq_fid; + + return FI_SUCCESS; + +err_wait_alloc: + ofi_cq_cleanup(&cxi_cq->util_cq); +err_util_cq: + free(cxi_cq); + + return ret; +} diff --git a/prov/cxi/src/cxip_ctrl.c b/prov/cxi/src/cxip_ctrl.c new file mode 100644 index 00000000000..3d484dcdccd --- /dev/null +++ b/prov/cxi/src/cxip_ctrl.c @@ -0,0 +1,789 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2017 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include + +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* + * cxip_ctrl_msg_cb() - Process control message target events. 
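+ *
+ * Incoming C_EVENT_PUT events are dispatched on the ctrl_msg_type carried in
+ * the match bits:
+ *   - CXIP_CTRL_MSG_FC_NOTIFY: a peer reports dropped messages (flow control)
+ *   - CXIP_CTRL_MSG_FC_RESUME: a peer signals that sends may resume
+ *   - CXIP_CTRL_MSG_ZB_DATA:   zero-buffer collective payload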
+ */ +int cxip_ctrl_msg_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + uint32_t pid_bits = req->ep_obj->domain->iface->dev->info.pid_bits; + uint32_t nic_addr; + uint32_t pid; + union cxip_match_bits mb = { + .raw = event->tgt_long.match_bits, + }; + uint32_t init = event->tgt_long.initiator.initiator.process; + int ret __attribute__((unused)); + + switch (event->hdr.event_type) { + case C_EVENT_MATCH: + break; + case C_EVENT_PUT: + assert(cxi_event_rc(event) == C_RC_OK); + + nic_addr = CXI_MATCH_ID_EP(pid_bits, init); + pid = CXI_MATCH_ID_PID(pid_bits, init); + + switch (mb.ctrl_msg_type) { + case CXIP_CTRL_MSG_FC_NOTIFY: + ret = cxip_fc_process_drops(req->ep_obj, nic_addr, pid, + mb.drops); + assert(ret == FI_SUCCESS); + + break; + case CXIP_CTRL_MSG_FC_RESUME: + ret = cxip_fc_resume(req->ep_obj, nic_addr, pid); + assert(ret == FI_SUCCESS); + + break; + case CXIP_CTRL_MSG_ZB_DATA: + ret = cxip_zbcoll_recv_cb(req->ep_obj, nic_addr, pid, + mb.raw); + assert(ret == FI_SUCCESS); + break; + default: + CXIP_FATAL("Unexpected msg type: %d\n", + mb.ctrl_msg_type); + } + + break; + default: + CXIP_FATAL(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + CXIP_DBG("got event: %s rc: %s (req: %p)\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event)), + req); + + return FI_SUCCESS; +} + +/* + * cxip_ctrl_msg_send() - Send a control message. + * + * Caller should hold req->ep_obj->lock. + */ +int cxip_ctrl_msg_send(struct cxip_ctrl_req *req) +{ + struct cxip_cmdq *txq; + union c_fab_addr dfa; + uint8_t idx_ext; + uint32_t pid_bits; + union c_cmdu cmd = {}; + uint32_t match_id; + int ret; + + txq = req->ep_obj->ctrl_txq; + pid_bits = req->ep_obj->domain->iface->dev->info.pid_bits; + cxi_build_dfa(req->send.nic_addr, req->send.pid, pid_bits, + CXIP_PTL_IDX_CTRL, &dfa, &idx_ext); + match_id = CXI_MATCH_ID(pid_bits, req->ep_obj->src_addr.pid, + req->ep_obj->src_addr.nic); + + cmd.c_state.event_send_disable = 1; + cmd.c_state.index_ext = idx_ext; + cmd.c_state.eq = req->ep_obj->ctrl_tx_evtq->eqn; + cmd.c_state.initiator = match_id; + + if (!req->ep_obj->ctrl_tx_credits) { + CXIP_WARN("Control TX credits exhausted\n"); + return -FI_EAGAIN; + } + + req->ep_obj->ctrl_tx_credits--; + + ret = cxip_cmdq_emit_c_state(txq, &cmd.c_state); + if (ret) { + CXIP_DBG("Failed to issue C_STATE command: %d\n", ret); + goto err_return_credit; + } + + memset(&cmd.idc_msg, 0, sizeof(cmd.idc_msg)); + cmd.idc_msg.dfa = dfa; + cmd.idc_msg.match_bits = req->send.mb.raw; + cmd.idc_msg.user_ptr = (uint64_t)req; + + ret = cxi_cq_emit_idc_msg(txq->dev_cmdq, &cmd.idc_msg, NULL, 0); + if (ret) { + CXIP_DBG("Failed to write IDC: %d\n", ret); + + /* Return error according to Domain Resource Management + */ + ret = -FI_EAGAIN; + goto err_return_credit; + } + + cxi_cq_ring(txq->dev_cmdq); + + CXIP_DBG("Queued control message: %p\n", req); + + return FI_SUCCESS; + +err_return_credit: + req->ep_obj->ctrl_tx_credits++; + + return ret; +} + +/* + * cxip_ctrl_msg_init() - Initialize control messaging resources. + * + * Caller must hold ep_obj->lock. 
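+ *
+ * Appends a persistent LE for control messages to the control PtlTE and
+ * spins (sched_yield) until the matching C_EVENT_LINK arrives, so it is
+ * intended for the initialization path rather than the progress path.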
+ */ +int cxip_ctrl_msg_init(struct cxip_ep_obj *ep_obj) +{ + const union c_event *event; + int ret; + uint32_t le_flags; + union cxip_match_bits mb = { + .ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG, + }; + union cxip_match_bits ib = { + .raw = ~0, + }; + + ret = cxip_domain_ctrl_id_alloc(ep_obj->domain, &ep_obj->ctrl_msg_req); + if (ret) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", ret); + return -FI_ENOSPC; + } + ep_obj->ctrl_msg_req.ep_obj = ep_obj; + ep_obj->ctrl_msg_req.cb = cxip_ctrl_msg_cb; + + le_flags = C_LE_UNRESTRICTED_BODY_RO | C_LE_UNRESTRICTED_END_RO | + C_LE_OP_PUT; + + ib.ctrl_le_type = 0; + + ret = cxip_pte_append(ep_obj->ctrl_pte, 0, 0, 0, + C_PTL_LIST_PRIORITY, ep_obj->ctrl_msg_req.req_id, + mb.raw, ib.raw, CXI_MATCH_ID_ANY, 0, le_flags, + NULL, ep_obj->ctrl_tgq, true); + if (ret) { + CXIP_DBG("Failed to write Append command: %d\n", ret); + goto err_free_id; + } + + /* Wait for link EQ event */ + while (!(event = cxi_eq_get_event(ep_obj->ctrl_tgt_evtq))) + sched_yield(); + + if (event->hdr.event_type != C_EVENT_LINK || + event->tgt_long.buffer_id != ep_obj->ctrl_msg_req.req_id) { + /* This is a device malfunction */ + CXIP_WARN("Invalid Link EQE %u %u %u %u\n", + event->hdr.event_type, + event->tgt_long.return_code, + event->tgt_long.buffer_id, + ep_obj->ctrl_msg_req.req_id); + ret = -FI_EIO; + goto err_free_id; + } + + if (cxi_event_rc(event) != C_RC_OK) { + CXIP_WARN("Append failed: %s\n", + cxi_rc_to_str(cxi_event_rc(event))); + ret = -FI_ENOSPC; + goto err_free_id; + } + + cxi_eq_ack_events(ep_obj->ctrl_tgt_evtq); + + CXIP_DBG("Control messaging initialized: %p\n", ep_obj); + + return FI_SUCCESS; + +err_free_id: + cxip_domain_ctrl_id_free(ep_obj->domain, &ep_obj->ctrl_msg_req); + + return ret; +} + +/* + * cxip_ctrl_msg_fini() - Finalize control messaging resources. + * + * Caller must hold ep_obj->lock. + */ +void cxip_ctrl_msg_fini(struct cxip_ep_obj *ep_obj) +{ + cxip_domain_ctrl_id_free(ep_obj->domain, &ep_obj->ctrl_msg_req); + + CXIP_DBG("Control messaging finalized: %p\n", ep_obj); +} + +/* + * cxip_ep_ctrl_event_req() - Look up a control request using Cassini event. + */ +static struct cxip_ctrl_req *cxip_ep_ctrl_event_req(struct cxip_ep_obj *ep_obj, + const union c_event *event) +{ + struct cxip_ctrl_req *req; + int event_rc; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + req = (struct cxip_ctrl_req *)event->init_short.user_ptr; + break; + case C_EVENT_LINK: + case C_EVENT_UNLINK: + case C_EVENT_MATCH: + req = cxip_domain_ctrl_id_at(ep_obj->domain, + event->tgt_long.buffer_id); + if (!req) + CXIP_WARN("Invalid buffer_id: %d (%s)\n", + event->tgt_long.buffer_id, + cxi_event_to_str(event)); + break; + case C_EVENT_PUT: + case C_EVENT_GET: + case C_EVENT_ATOMIC: + case C_EVENT_FETCH_ATOMIC: + event_rc = cxi_event_rc(event); + + if (event_rc != C_RC_ENTRY_NOT_FOUND && + event_rc != C_RC_MST_CANCELLED) { + req = cxip_domain_ctrl_id_at(ep_obj->domain, + event->tgt_long.buffer_id); + if (!req) + CXIP_WARN("Invalid buffer_id: %d (%s)\n", + event->tgt_long.buffer_id, + cxi_event_to_str(event)); + break; + } + + req = NULL; + + /* Silently drop any invalidated LE events. Since the control + * PtlTE is used for non-optimized MRs, it is possible to + * trigger a target error event if an invalid MR key was + * specified. For such operations, it is safe to just log the + * bad access attempt and drop the EQ event, the error will be + * reported to the initiator. 
+ */ + if (event_rc != C_RC_MST_CANCELLED) + CXIP_WARN("Unexpected %s event rc: %s\n", + cxi_event_to_str(event), + cxi_rc_to_str(event_rc)); + + break; + case C_EVENT_STATE_CHANGE: + cxip_pte_state_change(ep_obj->domain->iface, event); + + req = NULL; + break; + case C_EVENT_COMMAND_FAILURE: + CXIP_FATAL("Command failure: cq=%u target=%u fail_loc=%u cmd_type=%u cmd_size=%u opcode=%u\n", + event->cmd_fail.cq_id, event->cmd_fail.is_target, + event->cmd_fail.fail_loc, + event->cmd_fail.fail_command.cmd_type, + event->cmd_fail.fail_command.cmd_size, + event->cmd_fail.fail_command.opcode); + default: + CXIP_FATAL("Invalid event type: %d\n", event->hdr.event_type); + } + + CXIP_DBG("got control event: %s rc: %s (req: %p)\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event)), + req); + + return req; +} + +/* Caller must hold ep_obj->lock. */ +static void cxip_ep_return_ctrl_tx_credits(struct cxip_ep_obj *ep_obj, + unsigned int credits) +{ + ep_obj->ctrl_tx_credits += credits; +} + +void cxip_ep_ctrl_eq_progress(struct cxip_ep_obj *ep_obj, + struct cxi_eq *ctrl_evtq, bool tx_evtq, + bool ep_obj_locked) +{ + const union c_event *event; + struct cxip_ctrl_req *req; + int ret; + + /* The Control EQ is shared by a SEP. Avoid locking. */ + if (!cxi_eq_peek_event(ctrl_evtq)) + return; + + if (!ep_obj_locked) + ofi_genlock_lock(&ep_obj->lock); + + while ((event = cxi_eq_peek_event(ctrl_evtq))) { + req = cxip_ep_ctrl_event_req(ep_obj, event); + if (req) { + ret = req->cb(req, event); + if (ret != FI_SUCCESS) + break; + } + + /* Consume and ack event. */ + cxi_eq_next_event(ctrl_evtq); + + cxi_eq_ack_events(ctrl_evtq); + + if (tx_evtq) + cxip_ep_return_ctrl_tx_credits(ep_obj, 1); + + } + + if (cxi_eq_get_drops(ctrl_evtq)) + CXIP_FATAL("Control EQ drops detected\n"); + + if (!ep_obj_locked) + ofi_genlock_unlock(&ep_obj->lock); +} + +void cxip_ep_tx_ctrl_progress(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tx_evtq, true, false); +} + +void cxip_ep_tx_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tx_evtq, true, true); +} + +/* + * cxip_ep_ctrl_progress() - Progress operations using the control EQ. + */ +void cxip_ep_ctrl_progress(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, false); + cxip_ep_tx_ctrl_progress(ep_obj); +} + +/* + * cxip_ep_ctrl_progress_locked() - Progress operations using the control EQ. + */ +void cxip_ep_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, true); + cxip_ep_tx_ctrl_progress_locked(ep_obj); +} + +/* + * cxip_ep_tgt_ctrl_progress() - Progress TGT operations using the control EQ. + */ +void cxip_ep_tgt_ctrl_progress(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, false); +} + +/* + * cxip_ep_tgt_ctrl_progress_locked() - Progress operations using the control + * EQ. + */ +void cxip_ep_tgt_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) +{ + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, true); +} + +/* + * cxip_ep_ctrl_trywait() - Return 0 if no events need to be progressed. 
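+ *
+ * Uses a check / clear-wait-object / re-check sequence: events that race
+ * with cxil_clear_wait_obj() are caught by the second peek, so the caller
+ * never blocks while work is pending.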
+ */ +int cxip_ep_ctrl_trywait(void *arg) +{ + struct cxip_ep_obj *ep_obj = (struct cxip_ep_obj *)arg; + + if (!ep_obj->ctrl_wait) { + CXIP_WARN("No CXI ep_obj wait object\n"); + return -FI_EINVAL; + } + + if (cxi_eq_peek_event(ep_obj->ctrl_tgt_evtq) || + cxi_eq_peek_event(ep_obj->ctrl_tx_evtq)) + return -FI_EAGAIN; + + ofi_genlock_lock(&ep_obj->lock); + cxil_clear_wait_obj(ep_obj->ctrl_wait); + + if (cxi_eq_peek_event(ep_obj->ctrl_tgt_evtq) || + cxi_eq_peek_event(ep_obj->ctrl_tx_evtq)) { + ofi_genlock_unlock(&ep_obj->lock); + + return -FI_EAGAIN; + } + ofi_genlock_unlock(&ep_obj->lock); + + return FI_SUCCESS; +} + +static void cxip_eq_ctrl_eq_free(void *eq_buf, struct cxi_md *eq_md, + struct cxi_eq *eq) +{ + int ret; + + ret = cxil_destroy_evtq(eq); + if (ret) + CXIP_WARN("Failed to free CXI EQ: ret=%d", ret); + + ret = cxil_unmap(eq_md); + if (ret) + CXIP_WARN("Failed to unmap EQ buffer: ret=%d", ret); + + free(eq_buf); +} + +static int cxip_ep_ctrl_eq_alloc(struct cxip_ep_obj *ep_obj, size_t len, + void **eq_buf, struct cxi_md **eq_md, + struct cxi_eq **eq) +{ + struct cxi_eq_attr eq_attr = { + .flags = CXI_EQ_TGT_LONG, + }; + int ret; + int unmap_ret __attribute__((unused)); + int page_size; + + page_size = ofi_get_page_size(); + if (page_size < 0) + return -ofi_syserr(); + + len = ofi_get_aligned_size(len, page_size); + *eq_buf = aligned_alloc(page_size, len); + if (!eq_buf) { + ret = -FI_ENOMEM; + goto err; + } + + ret = cxil_map(ep_obj->domain->lni->lni, *eq_buf, len, + CXIP_EQ_MAP_FLAGS, NULL, eq_md); + if (ret) + goto err_free_eq_buf; + + eq_attr.queue = *eq_buf; + eq_attr.queue_len = len; + + /* ep_obj->ctrl_wait will be NULL if not required */ + ret = cxil_alloc_evtq(ep_obj->domain->lni->lni, *eq_md, &eq_attr, + ep_obj->ctrl_wait, NULL, eq); + if (ret) + goto err_free_eq_md; + + return FI_SUCCESS; + +err_free_eq_md: + unmap_ret = cxil_unmap(*eq_md); + assert(unmap_ret == 0); + +err_free_eq_buf: + free(*eq_buf); +err: + return ret; +} + +/* + * cxip_ep_wait_required() - return true if base EP wait object is required. 
+ */ +static bool cxip_ctrl_wait_required(struct cxip_ep_obj *ep_obj) +{ + if (ep_obj->rxc.recv_cq && ep_obj->rxc.recv_cq->priv_wait) + return true; + + if (ep_obj->txc.send_cq && ep_obj->txc.send_cq->priv_wait) + return true; + + return false; +} + +/* + * cxip_ep_ctrl_del_wait() - Delete control FD object + */ +void cxip_ep_ctrl_del_wait(struct cxip_ep_obj *ep_obj) +{ + int wait_fd; + + wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl_wait); + + if (ep_obj->txc.send_cq) { + ofi_wait_del_fd(ep_obj->txc.send_cq->util_cq.wait, wait_fd); + CXIP_DBG("Deleted control HW EQ FD: %d from CQ: %p\n", + wait_fd, ep_obj->txc.send_cq); + } + + if (ep_obj->rxc.recv_cq && ep_obj->rxc.recv_cq != ep_obj->txc.send_cq) { + ofi_wait_del_fd(ep_obj->rxc.recv_cq->util_cq.wait, wait_fd); + CXIP_DBG("Deleted control HW EQ FD: %d from CQ %p\n", + wait_fd, ep_obj->rxc.recv_cq); + } +} + +/* + * cxip_ep_ctrl_add_wait() - Add control FD to CQ object + */ +int cxip_ep_ctrl_add_wait(struct cxip_ep_obj *ep_obj) +{ + struct cxip_cq *cq; + int wait_fd; + int ret; + + ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, + &ep_obj->ctrl_wait); + if (ret) { + CXIP_WARN("Control wait object allocation failed: %d\n", ret); + return -FI_ENOMEM; + } + + wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl_wait); + ret = fi_fd_nonblock(wait_fd); + if (ret) { + CXIP_WARN("Unable to set control wait non-blocking: %d, %s\n", + ret, fi_strerror(-ret)); + goto err; + } + + cq = ep_obj->txc.send_cq; + if (cq) { + ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, + POLLIN, cxip_ep_ctrl_trywait, ep_obj, + &cq->util_cq.cq_fid.fid); + if (ret) { + CXIP_WARN("TX CQ add FD failed: %d, %s\n", + ret, fi_strerror(-ret)); + goto err; + } + } + + if (ep_obj->rxc.recv_cq && ep_obj->rxc.recv_cq != cq) { + cq = ep_obj->rxc.recv_cq; + + ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, + POLLIN, cxip_ep_ctrl_trywait, ep_obj, + &cq->util_cq.cq_fid.fid); + if (ret) { + CXIP_WARN("RX CQ add FD failed: %d, %s\n", + ret, fi_strerror(-ret)); + goto err_add_fd; + } + } + + CXIP_DBG("Added control EQ private wait object, intr FD: %d\n", + wait_fd); + + return FI_SUCCESS; + +err_add_fd: + if (ep_obj->txc.send_cq) + ofi_wait_del_fd(ep_obj->txc.send_cq->util_cq.wait, wait_fd); +err: + cxil_destroy_wait_obj(ep_obj->ctrl_wait); + ep_obj->ctrl_wait = NULL; + + return ret; +} + +/* + * cxip_ep_ctrl_init() - Initialize endpoint control resources. + * + * Caller must hold ep_obj->lock. + */ +int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) +{ + struct cxi_pt_alloc_opts pt_opts = { + .use_long_event = 1, + .is_matching = 1, + }; + const union c_event *event; + int ret; + size_t rx_eq_size = MIN(cxip_env.ctrl_rx_eq_max_size, + ofi_universe_size * 64 + + ep_obj->domain->mr_match_events * 256 * 64); + + /* When MR event counting has been requested turn on + * delivery of match events. + */ + if (ep_obj->domain->mr_match_events) + pt_opts.en_event_match = 1; + + /* If CQ(s) are using a wait object, then control event + * queues need to unblock CQ poll as well. CQ will add the + * associated FD to the CQ FD list. 
+ */ + if (cxip_ctrl_wait_required(ep_obj)) { + ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, + &ep_obj->ctrl_wait); + if (ret) { + CXIP_WARN("EP ctrl wait object alloc failed: %d\n", + ret); + return ret; + } + } + + ret = cxip_ep_ctrl_eq_alloc(ep_obj, 4 * s_page_size, + &ep_obj->ctrl_tx_evtq_buf, + &ep_obj->ctrl_tx_evtq_buf_md, + &ep_obj->ctrl_tx_evtq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate TX EQ resources, ret: %d\n", ret); + goto err; + } + + ret = cxip_ep_ctrl_eq_alloc(ep_obj, rx_eq_size, + &ep_obj->ctrl_tgt_evtq_buf, + &ep_obj->ctrl_tgt_evtq_buf_md, + &ep_obj->ctrl_tgt_evtq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate TGT EQ resources, ret: %d\n", + ret); + goto free_tx_evtq; + } + + ret = cxip_ep_cmdq(ep_obj, true, ep_obj->domain->tclass, + ep_obj->ctrl_tx_evtq, &ep_obj->ctrl_txq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate control TXQ, ret: %d\n", ret); + ret = -FI_EDOMAIN; + goto free_tgt_evtq; + } + + ret = cxip_ep_cmdq(ep_obj, false, ep_obj->domain->tclass, + ep_obj->ctrl_tgt_evtq, &ep_obj->ctrl_tgq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate control TGQ, ret: %d\n", ret); + ret = -FI_EDOMAIN; + goto free_txq; + } + + ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl_tgt_evtq, + &pt_opts, NULL, NULL, &ep_obj->ctrl_pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate control PTE: %d\n", ret); + goto free_tgq; + } + + /* CXIP_PTL_IDX_WRITE_MR_STD is shared with CXIP_PTL_IDX_CTRL. */ + ret = cxip_pte_map(ep_obj->ctrl_pte, CXIP_PTL_IDX_WRITE_MR_STD, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write PTE: %d\n", ret); + goto free_pte; + } + + ret = cxip_pte_map(ep_obj->ctrl_pte, CXIP_PTL_IDX_READ_MR_STD, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map read PTE: %d\n", ret); + goto free_pte; + } + + ret = cxip_pte_set_state(ep_obj->ctrl_pte, ep_obj->ctrl_tgq, + C_PTLTE_ENABLED, 0); + if (ret) { + /* This is a bug, we have exclusive access to this CMDQ. */ + CXIP_WARN("Failed to enqueue command: %d\n", ret); + goto free_pte; + } + + /* Wait for Enable event */ + while (!(event = cxi_eq_get_event(ep_obj->ctrl_tgt_evtq))) + sched_yield(); + + switch (event->hdr.event_type) { + case C_EVENT_STATE_CHANGE: + if (event->tgt_long.return_code != C_RC_OK || + event->tgt_long.initiator.state_change.ptlte_state != + C_PTLTE_ENABLED || + event->tgt_long.ptlte_index != ep_obj->ctrl_pte->pte->ptn) + CXIP_FATAL("Invalid PtlTE enable event\n"); + break; + case C_EVENT_COMMAND_FAILURE: + CXIP_FATAL("Command failure: cq=%u target=%u fail_loc=%u cmd_type=%u cmd_size=%u opcode=%u\n", + event->cmd_fail.cq_id, event->cmd_fail.is_target, + event->cmd_fail.fail_loc, + event->cmd_fail.fail_command.cmd_type, + event->cmd_fail.fail_command.cmd_size, + event->cmd_fail.fail_command.opcode); + default: + CXIP_FATAL("Invalid event type: %d\n", event->hdr.event_type); + } + + cxi_eq_ack_events(ep_obj->ctrl_tgt_evtq); + + ret = cxip_ctrl_msg_init(ep_obj); + if (ret != FI_SUCCESS) + goto free_pte; + + /* Reserve 4 event queue slots to prevent EQ overrun. + * 1. One slot for EQ status writeback + * 2. One slot for default reserved_fc value + * 3. One slot for EQ overrun detection. + * 4. TODO: Determine why an additional slot needs to be reserved. 
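+ *
+ * Rough worked example of the credit computation below (the page size and
+ * C_EE_CFG_ECB_SIZE values are assumptions): with 4 KiB pages the TX EQ
+ * buffer is 4 * 4096 = 16384 bytes; with a 64-byte event slot that yields
+ * 16384 / 64 - 4 = 252 TX credits.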
+ */ + ep_obj->ctrl_tx_credits = + ep_obj->ctrl_tx_evtq->byte_size / C_EE_CFG_ECB_SIZE - 4; + + CXIP_DBG("EP control initialized: %p\n", ep_obj); + + return FI_SUCCESS; + +free_pte: + cxip_pte_free(ep_obj->ctrl_pte); +free_tgq: + cxip_ep_cmdq_put(ep_obj, false); +free_txq: + cxip_ep_cmdq_put(ep_obj, true); +free_tgt_evtq: + cxip_eq_ctrl_eq_free(ep_obj->ctrl_tgt_evtq_buf, + ep_obj->ctrl_tgt_evtq_buf_md, + ep_obj->ctrl_tgt_evtq); +free_tx_evtq: + cxip_eq_ctrl_eq_free(ep_obj->ctrl_tx_evtq_buf, + ep_obj->ctrl_tx_evtq_buf_md, ep_obj->ctrl_tx_evtq); +err: + if (ep_obj->ctrl_wait) { + cxil_destroy_wait_obj(ep_obj->ctrl_wait); + ep_obj->ctrl_wait = NULL; + } + + return ret; +} + +/* + * cxip_ep_ctrl_fini() - Finalize endpoint control resources. + * + * Caller must hold ep_obj->lock. + */ +void cxip_ep_ctrl_fini(struct cxip_ep_obj *ep_obj) +{ + cxip_ctrl_mr_cache_flush(ep_obj); + cxip_ctrl_msg_fini(ep_obj); + cxip_pte_free(ep_obj->ctrl_pte); + cxip_ep_cmdq_put(ep_obj, false); + cxip_ep_cmdq_put(ep_obj, true); + + cxip_eq_ctrl_eq_free(ep_obj->ctrl_tgt_evtq_buf, + ep_obj->ctrl_tgt_evtq_buf_md, + ep_obj->ctrl_tgt_evtq); + cxip_eq_ctrl_eq_free(ep_obj->ctrl_tx_evtq_buf, + ep_obj->ctrl_tx_evtq_buf_md, ep_obj->ctrl_tx_evtq); + + if (ep_obj->ctrl_wait) { + cxil_destroy_wait_obj(ep_obj->ctrl_wait); + ep_obj->ctrl_wait = NULL; + + CXIP_DBG("Deleted control EQ wait object\n"); + } + + CXIP_DBG("EP control finalized: %p\n", ep_obj); +} diff --git a/prov/cxi/src/cxip_curl.c b/prov/cxi/src/cxip_curl.c new file mode 100644 index 00000000000..225512dcaa8 --- /dev/null +++ b/prov/cxi/src/cxip_curl.c @@ -0,0 +1,599 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cxip.h" + +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_CURL, fmt, ##__VA_ARGS__) + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_FABRIC, __VA_ARGS__) + +#define CHUNK_SIZE 4096 +#define CHUNK_MASK (CHUNK_SIZE-1) + +/** + * Expandable buffer that can receive data in arbitrary-sized chunks. + */ +struct curl_buffer { + char *buffer; + size_t size; + size_t offset; +}; + +/** + * Allocate an expandable CURL buffer. + * + * This expands as necessary to accommodate the data, which may be delivered in + * chunks over the network. If you know in advance the approximate size of the + * return data on a large transfer, you can avoid repeated calls to realloc(). + * + * @param rsp_init_size : initial size of buffer area (> 0), default 4k + * + * @return struct curl_buffer* : returned CURL buffer + */ +static inline struct curl_buffer *init_curl_buffer(size_t rsp_init_size) +{ + struct curl_buffer *buf; + + if (rsp_init_size == 0) + rsp_init_size = 4096; + buf = calloc(1, sizeof(*buf)); + if (!buf) + return NULL; + + buf->buffer = malloc(rsp_init_size); + if (!buf->buffer) { + free(buf); + buf = NULL; + } + + return buf; +} + +/** + * Free a curl_buffer and all its data. + * + * @param buf : curl buffer to delete + */ +static inline void free_curl_buffer(struct curl_buffer *buf) +{ + if (buf) + free(buf->buffer); + free(buf); +} + +/** + * Curl is curl-centric, facing the application (not the server). For Curl, a + * "write" is a write to application memory. This is the function that fills + * the user's curl buffer with data returned by the server. + * + * Buffer expands as needed to accommodate data. 
Note that this means the buffer + * itself must be treated as uninitialized memory beyond buf->offset (see + * realloc()). + * + * If the return value does not match the number of bytes requested, it will + * abort the transfer and the curl function will return CURLE_WRITE_ERROR. + * + * @param curl_rcvd : poiter to data received from server + * @param size : size of member + * @param nmemb : number of members + * @param userp : (void *)-cast curl_buffer + * + * @return ssize_t : number of bytes added + */ +static size_t write_callback(void *curl_rcvd, size_t size, size_t nmemb, + void *userp) +{ + struct curl_buffer *curl_buf = (struct curl_buffer *)userp; + size_t sz = size * nmemb; + size_t need = curl_buf->offset + sz; + + if (need >= curl_buf->size) { + curl_buf->size = (need + CHUNK_MASK) & ~CHUNK_MASK; + curl_buf->buffer = realloc(curl_buf->buffer, curl_buf->size); + if (!curl_buf->buffer) + return 0; + } + memcpy(&curl_buf->buffer[curl_buf->offset], curl_rcvd, sz); + + curl_buf->offset += sz; + return sz; +} + +/* + * The CURL library must be explicitly initialized. It is application-global, + * and the initialization is not thread-safe, according to the documentation. We + * do not protect this call, because it is running under CXI_INIT (see + * cxip_info.c), which is single-threaded. The curl_global_init() call can be + * issued multiple times (non-concurrently) and has the same end result as + * calling it once. + */ +static CURLM *cxip_curlm; +static int cxip_curl_count; + +/** + * Initialize CURL globally for the application, enabling multi-curl + * (concurrent calls). + */ +int cxip_curl_init(void) +{ + int ret = FI_SUCCESS; + CURLcode res; + + if (!cxip_curlm) { + res = curl_global_init(CURL_GLOBAL_DEFAULT); + if (res == CURLE_OK) { + cxip_curlm = curl_multi_init(); + if (!cxip_curlm) { + curl_global_cleanup(); + ret = -FI_EINVAL; + } + } else + ret = -FI_EINVAL; + } + return ret; +} + +/** + * Globally terminate this module. + */ +void cxip_curl_fini(void) +{ + cxip_curl_count = 0; + if (cxip_curlm) { + curl_multi_cleanup(cxip_curlm); + curl_global_cleanup(); + cxip_curlm = NULL; + } +} + +/** + * Return a name for an opcode. + * + * @param op : curl operation + * @return const char* : printable name for curl operation + */ +const char *cxip_curl_opname(enum curl_ops op) +{ + static char * const curl_opnames[] = { + "GET", + "PUT", + "POST", + "PATCH", + "DELETE", + }; + return (op >= 0 && op < CURL_MAX) ? curl_opnames[op] : NULL; +} + +/** + * Free a handle created by cxip_curl_perform(). + * + * @param handle : handle created by cxip_curl_perform() + */ +void cxip_curl_free(struct cxip_curl_handle *handle) +{ + if (!handle) + return; + + free((void *)handle->endpoint); + free((void *)handle->request); + /* do not directly free handle->response (== handle->recv->buffer) */ + free_curl_buffer((struct curl_buffer *)handle->recv); + free(handle); + cxip_curl_count -= 1; +} + +/** + * Dispatch a CURL request. + * + * This is a general-purpose CURL multi (async) JSON format curl request. + * + * Note that this function only dispatches the request. cxip_curl_progress() + * must be called to progress the dispatched operations and retrieve data. + * + * The usrfunc is called in cxip_curl_progress() when the request completes, + * and receives the handle as its sole argument. The handle also contains an + * arbitrary usrptr supplied by the caller. This usrptr can contain specific + * information to identify which of multiple concurrent requests has completed. 
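+ *
+ * A minimal completion callback sketch (assumes curlcomplete_t takes only
+ * the handle and returns void; my_req, parse_response and mark_failed are
+ * illustrative):
+ *
+ *    static void my_done(struct cxip_curl_handle *handle)
+ *    {
+ *            struct my_req *req = handle->usrptr;
+ *
+ *            if (handle->status == 200)
+ *                    parse_response(req, handle->response);
+ *            else
+ *                    mark_failed(req, handle->status);
+ *    }
+ *
+ *    cxip_curl_perform(endpoint, request, NULL, 0, CURL_GET, false,
+ *                      my_done, req);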
+ * + * There are no "normal" REST errors from this call. REST errors are instead + * returned on attempts to progress the dispatched operation. + * + * @param endpoint : HTTP server endpoint address + * @param request : JSON-formatted request + * @param rsp_init_size : initial size of response buffer (can be 0) + * @param op : curl operation + * @param verbose : use to display sent HTTP headers + * @param userfunc : user-defined completion function + * @param usrptr : user-defined data pointer + * + * @return int : 0 on success, -1 on failure + */ +int cxip_curl_perform(const char *endpoint, const char *request, + const char *sessionToken, size_t rsp_init_size, + enum curl_ops op, bool verbose, + curlcomplete_t usrfunc, void *usrptr) +{ + struct cxip_curl_handle *handle; + struct curl_slist *headers; + char *token; + CURLMcode mres; + CURL *curl; + int running; + int ret; + + TRACE("%s: usrptr=%p\n", __func__, usrptr); + ret = -FI_ENOMEM; + handle = calloc(1, sizeof(*handle)); + if (!handle) + goto fail; + TRACE("%s: handle=%p\n", __func__, handle); + + /* libcurl is fussy about NULL requests */ + handle->endpoint = strdup(endpoint); + if (!handle->endpoint) + goto fail; + handle->request = strdup(request ? request : ""); + if (!handle->request) + goto fail; + handle->response = NULL; + handle->recv = (void *)init_curl_buffer(rsp_init_size); + if (!handle->recv) + goto fail; + /* add user completion function and pointer */ + handle->usrfunc = usrfunc; + handle->usrptr = usrptr; + TRACE("%s: handle->usrfnc=%p\n", __func__, handle->usrfunc); + TRACE("%s: handle->usrptr=%p\n", __func__, handle->usrptr); + + ret = -FI_EACCES; + curl = curl_easy_init(); + if (!curl) { + CXIP_WARN("curl_easy_init() failed\n"); + goto fail; + } + + /* HTTP 1.1 assumed */ + headers = NULL; + headers = curl_slist_append(headers, "Expect:"); + headers = curl_slist_append(headers, "Accept: application/json"); + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, "charset: utf-8"); + token = NULL; + if (sessionToken) { + ret = asprintf(&token, "x-xenon-auth-token: %s", sessionToken); + if (ret < 0) { + CXIP_WARN("x-xenon-auth-token create failed\n"); + goto fail; + } + headers = curl_slist_append(headers, token); + } + handle->headers = (void *)headers; + + curl_easy_setopt(curl, CURLOPT_URL, handle->endpoint); + if (op == CURL_GET) { + curl_easy_setopt(curl, CURLOPT_HTTPGET, 1L); + } else { + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, handle->request); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, + strlen(handle->request)); + } + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, handle->recv); + curl_easy_setopt(curl, CURLOPT_PRIVATE, (void *)handle); + curl_easy_setopt(curl, CURLOPT_VERBOSE, (long)verbose); + curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, cxip_curl_opname(op)); + + curl_multi_add_handle(cxip_curlm, curl); + mres = curl_multi_perform(cxip_curlm, &running); + if (mres != CURLM_OK) { + CXIP_WARN("curl_multi_perform() failed: %s\n", + curl_multi_strerror(mres)); + goto fail; + } + cxip_curl_count += 1; + return FI_SUCCESS; + +fail: + CXIP_WARN("%s failed %d\n", __func__, ret); + cxip_curl_free(handle); + return ret; +} + +/** + * Progress the CURL requests. 
+ * + * This progresses concurrent CURL requests, and returns the following: + * + * - 0 indicates an operation completed + * - -FI_EAGAIN indicates operations are pending, none completed + * - -FI_ENODATA indicates no operations are pending + * - -errorcode a fatal error + * + * Repeated calls will return additional completions, until there are no more + * pending and -FI_ENODATA is returned. + * + * Note that a CURL request will succeed if the server is not reachable. It will + * return a handle->status value of 0, which is an invalid HTTP status, and + * indicates that it could not connect to a server. + * + * For unit testing, it is useful for the test to be able to inspect the handle + * directly, and it can be obtained by specifying a non-null handleptr value. If + * handleptr is supplied, the caller is responsible for calling cxip_curl_free() + * on the returned handle. In normal usage, handleptr is NULL, and this routine + * will clean up the handle after the operation completes. + * + * The user should provide a callback routine to examine the final state of the + * CURL request, as well as any data it returns: see cxip_curl_perform(). This + * user callback is called after completion of the request, before the handle is + * destroyed. + * + * The callback routine has read-only access to the handle, and read-write + * access to its own data area, available as handle->usrptr. + * + * The handle contains the following documented fields: + * + * - status = HTTP status of the op, or 0 if the endpoint could not be reached + * - endpoint = copy of the endpoint address supplied for the post + * - request = copy of the JSON request data supplied for the post + * - response = pointer to the JSON response returned by the endpoint + * - usrptr = arbitrary user pointer supplied during CURL request + * + * @param handleptr : if not NULL, returns the request handle + * @return int : return code, see above + */ +int cxip_curl_progress(struct cxip_curl_handle **handleptr) +{ + struct cxip_curl_handle *handle; + struct CURLMsg *msg; + CURLMcode mres; + CURLcode res; + int running; + int messages; + long status; + struct curl_buffer *recv; + + + /* This needs to be quick if nothing is pending */ + if (!cxip_curl_count) + return -FI_ENODATA; + + handle = NULL; + + /* running returns the number of curls running */ + mres = curl_multi_perform(cxip_curlm, &running); + if (mres != CURLM_OK) { + CXIP_WARN("curl_multi_perform() failed: %s\n", + curl_multi_strerror(mres)); + return -FI_EOTHER; + } + + /* messages returns the number of additional curls finished */ + msg = curl_multi_info_read(cxip_curlm, &messages); + if (!msg || msg->msg != CURLMSG_DONE) { + return (running) ? 
-FI_EAGAIN : -FI_ENODATA; + } + + /* retrieve our handle from the private pointer */ + res = curl_easy_getinfo(msg->easy_handle, + CURLINFO_PRIVATE, (char **)&handle); + if (res != CURLE_OK) { + CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", + "CURLINFO_PRIVATE", + curl_easy_strerror(res)); + return -FI_EOTHER; + } + /* handle is now valid, must eventually be freed */ + TRACE("%s: handle=%p\n", __func__, handle); + + /* retrieve the status code, should not fail */ + res = curl_easy_getinfo(msg->easy_handle, + CURLINFO_RESPONSE_CODE, &status); + if (res != CURLE_OK) { + CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", + "CURLINFO_RESPONSE_CODE", + curl_easy_strerror(res)); + /* continue, handle->status should show zero */ + } + + /* we can recover resources now */ + curl_slist_free_all((struct curl_slist *)handle->headers); + curl_easy_cleanup(msg->easy_handle); + handle->headers = NULL; + + /* make sure response string is terminated */ + recv = (struct curl_buffer *)handle->recv; + recv->buffer[recv->offset] = 0; + handle->response = recv->buffer; + handle->status = status; + + /* call the user function */ + TRACE("%s: handle->usrfnc=%p\n", __func__, handle->usrfunc); + TRACE("%s: handle->usrptr=%p\n", __func__, handle->usrptr); + if (handle->usrfunc) + handle->usrfunc(handle); + TRACE("%s: returned from usrfnc\n", __func__); + + /* return the handle, or free it */ + if (handleptr) { + *handleptr = handle; + } else { + cxip_curl_free(handle); + } + return FI_SUCCESS; +} + +/** + * @brief Simplified search for JSON objects. + * + * Simplified object search using a descriptor like the following: + * Example: "firstkey.secondkey.arraykey[3].thirdkey" + * + * The first character is '.' or '['. If omitted, it is assumed to be '.'. + * + * The appearance of '.' indicates that the current object is expected to be + * a json_type_object, and the text that follows is a key within the object. + * + * The appearance of '[' must be part of a '[]' construction, and + * indicates that the current object is expected to be a json_type_array, and + * the specified integer value is an index into the array. + * + * The descriptor allows you to dive into the structure and return the endpoint + * of the dive in the returned jval pointer, and returns the type of this + * endpoint object. + * + * Note that this is a convenience method, primarily for testing. Results are + * undefined if the '.' or '[' or ']' characters appear in a key. + * + * Note that the returned jval is a json_object. You can use the following + * libjson functions to directly extract values: + * + * - json_object_get_boolean() + * - json_object_get_int() + * - json_object_get_int64() + * - json_object_get_uint64() + * - json_object_get_double() + * - json_object_get_string() + * + * Note also that these functions are used in the variants below. + * + * All memory is managed by json, so on 'put' of the head object, all memory is + * recovered. + * + * This returns json_type_null on any error. 
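+ *
+ * Usage sketch (json-c assumed; the JSON text and keys are illustrative):
+ *
+ *    struct json_object *jobj, *jval;
+ *    int port = 0;
+ *
+ *    jobj = json_tokener_parse("{\"svc\": {\"ports\": [8080, 8443]}}");
+ *    if (cxip_json_extract("svc.ports[1]", jobj, &jval) == json_type_int)
+ *            port = json_object_get_int(jval);   // 8443
+ *    json_object_put(jobj);                      // releases jval as well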
+ * + * @param desc - string descriptor of endpoint argument + * @param jobj - starting object + * @param jval - final endpoint object, or NULL + * @return enum json_type - type of the endpoint object + */ +enum json_type cxip_json_extract(const char *desc, struct json_object *jobj, + struct json_object **jval) +{ + const char *beg; + struct json_object *jo; + enum json_type jt; + + *jval = NULL; + + beg = desc; + jo = jobj; + jt = json_object_get_type(jo); + while (*beg) { + if (*beg == '[') { + /* expect "[]" */ + size_t idx = 0; + size_t len; + + if (jt != json_type_array) + return json_type_null; + /* skip '[' and ensure index is not empty */ + if (*(++beg) == ']') + return json_type_null; + idx = strtoul(beg, (char **)&beg, 10); + /* ensure strtol consumed index */ + if (*(beg++) != ']') + return json_type_null; + /* check index validity */ + len = json_object_array_length(jo); + if (idx >= len) + return json_type_null; + /* get the indexed object and continue */ + jo = json_object_array_get_idx(jo, idx); + jt = json_object_get_type(jo); + continue; + } + if (beg == desc || *beg == '.') { + /* expect ".key" */ + char key[256], *p = key; + size_t len = sizeof(key); + + if (jt != json_type_object) + return json_type_null; + /* skip leading '.' */ + if (*beg == '.') + beg++; + /* copy key from descriptor to local storage */ + while (*beg && *beg != '.' && *beg != '[' && --len > 0) + *p++ = *beg++; + *p = 0; + /* extract the associated value */ + if (!json_object_object_get_ex(jo, key, &jo)) + return json_type_null; + jt = json_object_get_type(jo); + continue; + } + } + + /* return the final object */ + *jval = jo; + return jt; +} + +/** + * @brief Simplified search for JSON terminal type values. + * + * @param desc : search descriptor for cxip_json_extract() + * @param jobj : starting object + * @param val : return value + * @return int : 0 on success, -EINVAL on error + */ +int cxip_json_bool(const char *desc, struct json_object *jobj, bool *val) +{ + struct json_object *jval; + if (json_type_boolean != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_boolean(jval); + return 0; +} + +int cxip_json_int(const char *desc, struct json_object *jobj, int *val) +{ + struct json_object *jval; + if (json_type_int != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_int(jval); + return 0; +} + +int cxip_json_int64(const char *desc, struct json_object *jobj, int64_t *val) +{ + struct json_object *jval; + if (json_type_int != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_int64(jval); + return 0; +} + +int cxip_json_double(const char *desc, struct json_object *jobj, double *val) +{ + struct json_object *jval; + if (json_type_double != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_double(jval); + return 0; +} + +int cxip_json_string(const char *desc, struct json_object *jobj, + const char **val) +{ + struct json_object *jval; + if (json_type_string != cxip_json_extract(desc, jobj, &jval)) + return -EINVAL; + *val = json_object_get_string(jval); + return 0; +} diff --git a/prov/cxi/src/cxip_dom.c b/prov/cxi/src/cxip_dom.c new file mode 100644 index 00000000000..8aa6831b0c8 --- /dev/null +++ b/prov/cxi/src/cxip_dom.c @@ -0,0 +1,1676 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. 
+ * Copyright (c) 2018,2020-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include + +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +extern struct fi_ops_mr cxip_dom_mr_ops; + +/* + * cxip_domain_req_alloc() - Allocate a domain control buffer ID + */ +int cxip_domain_ctrl_id_alloc(struct cxip_domain *dom, + struct cxip_ctrl_req *req) +{ + int buffer_id; + + ofi_spin_lock(&dom->ctrl_id_lock); + buffer_id = ofi_idx_insert(&dom->req_ids, req); + if (buffer_id < 0 || buffer_id >= CXIP_BUFFER_ID_MAX) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", + buffer_id); + ofi_spin_unlock(&dom->ctrl_id_lock); + return -FI_ENOSPC; + } + + ofi_spin_unlock(&dom->ctrl_id_lock); + req->req_id = buffer_id; + + return FI_SUCCESS; +} + +/* + * cxip_domain_ctrl_id_free() - Free a domain wide control buffer id. + */ +void cxip_domain_ctrl_id_free(struct cxip_domain *dom, + struct cxip_ctrl_req *req) +{ + /* Non-remote MR will not have a buffer ID assigned */ + if (req->req_id < 0) + return; + + ofi_spin_lock(&dom->ctrl_id_lock); + ofi_idx_remove(&dom->req_ids, req->req_id); + ofi_spin_unlock(&dom->ctrl_id_lock); +} + +/* + * cxip_domain_prov_mr_key_alloc() - Allocate a domain unique + * non-cached FI_MR_PROV_KEY key ID. + */ +int cxip_domain_prov_mr_id_alloc(struct cxip_domain *dom, + struct cxip_mr *mr) +{ + struct cxip_mr_key key = {}; + int buffer_id; + + /* Allocations favor optimized MR range (if enabled) */ + ofi_spin_lock(&dom->ctrl_id_lock); + buffer_id = ofi_idx_insert(&dom->mr_ids, mr); + if (buffer_id < 0 || buffer_id >= CXIP_BUFFER_ID_MAX) { + CXIP_WARN("Failed to allocate FI_MR_PROV_KEY MR ID: %d\n", + buffer_id); + ofi_spin_unlock(&dom->ctrl_id_lock); + return -FI_ENOSPC; + } + + /* IDX 0 is reserved and should never be returned */ + assert(buffer_id > 0); + buffer_id = buffer_id - 1; + + mr->mr_id = buffer_id; + key.is_prov = 1; + key.id = buffer_id; + key.seqnum = ++dom->prov_key_seqnum; + + /* Let the source know events are required and it should use + * unrestricted operations. + */ + key.events = mr->count_events || mr->rma_events || mr->cntr; + + key.opt = cxip_env.optimized_mrs && + key.id < CXIP_PTL_IDX_PROV_MR_OPT_CNT; + mr->key = key.raw; + ofi_spin_unlock(&dom->ctrl_id_lock); + + return FI_SUCCESS; +} + +/* + * cxip_domain_prov_mr_id_free() - Free a domain wide FI_MR_PROV_KEY MR id. + */ +void cxip_domain_prov_mr_id_free(struct cxip_domain *dom, + struct cxip_mr *mr) +{ + struct cxip_mr_key key = { + .raw = mr->key, + }; + + /* Only non-cached FI_MR_PROV_KEY MR require MR ID */ + if (mr->mr_id < 0) + return; + + ofi_spin_lock(&dom->ctrl_id_lock); + ofi_idx_remove(&dom->mr_ids, key.id + 1); + ofi_spin_unlock(&dom->ctrl_id_lock); +} + +#define TLE_RESERVED 8U + +/* + * cxip_domain_enable() - Enable an FI Domain for use. + * + * Allocate hardware resources and initialize software to prepare the Domain + * for use. 
+ */
+static int cxip_domain_enable(struct cxip_domain *dom)
+{
+	int ret = FI_SUCCESS;
+	struct cxi_svc_desc svc_desc;
+
+	ofi_spin_lock(&dom->lock);
+
+	if (dom->enabled)
+		goto unlock;
+
+	ret = cxip_get_if(dom->nic_addr, &dom->iface);
+	if (ret != FI_SUCCESS) {
+		CXIP_WARN("Unable to get IF\n");
+		ret = -FI_ENODEV;
+		goto unlock;
+	}
+
+	ret = cxil_get_svc(dom->iface->dev, dom->auth_key.svc_id, &svc_desc);
+	if (ret) {
+		CXIP_WARN("cxil_get_svc with %s and svc_id %d failed: %d:%s\n",
+			  dom->iface->dev->info.device_name,
+			  dom->auth_key.svc_id, ret, strerror(-ret));
+		ret = -FI_EINVAL;
+		goto put_if;
+	}
+
+	if (!svc_desc.restricted_members)
+		CXIP_WARN("Security Issue: Using unrestricted service ID %d for %s. "
+			  "Please provide a service ID via auth_key fields.\n",
+			  dom->auth_key.svc_id,
+			  dom->iface->dev->info.device_name);
+	if (!svc_desc.restricted_vnis)
+		CXIP_WARN("Security Issue: Using service ID %d with unrestricted VNI access on %s. "
+			  "Please provide a service ID via auth_key fields.\n",
+			  dom->auth_key.svc_id,
+			  dom->iface->dev->info.device_name);
+
+	/* Need to reserve TLEs to prevent stalling. */
+	dom->max_trig_op_in_use =
+		svc_desc.limits.type[CXI_RSRC_TYPE_TLE].res - TLE_RESERVED;
+
+	ret = cxip_alloc_lni(dom->iface, dom->auth_key.svc_id, &dom->lni);
+	if (ret) {
+		CXIP_WARN("cxip_alloc_lni returned: %d\n", ret);
+		ret = -FI_ENODEV;
+		goto put_if;
+	}
+
+	ret = cxip_iomm_init(dom);
+	if (ret != FI_SUCCESS) {
+		CXIP_WARN("Failed to initialize IOMM: %d\n", ret);
+		assert(ret == -FI_ENOMEM);
+		goto free_lni;
+	}
+
+	ret = cxil_get_amo_remap_to_pcie_fadd(dom->iface->dev,
+					      &dom->amo_remap_to_pcie_fadd);
+	if (ret) {
+		CXIP_WARN("Failed to get amo_remap_to_pcie_fadd value: %d\n",
+			  ret);
+		goto iomm_fini;
+	}
+
+	cxip_mr_domain_init(&dom->mr_domain);
+
+	dom->enabled = true;
+	ofi_spin_unlock(&dom->lock);
+
+	DOM_INFO(dom, "Domain enabled\n");
+
+	/* Telemetry is considered optional and will not stop domain
+	 * allocation.
+	 */
+	ret = cxip_telemetry_alloc(dom, &dom->telemetry);
+	if (ret == FI_SUCCESS)
+		DOM_INFO(dom, "Telemetry collection enabled\n");
+
+	return FI_SUCCESS;
+
+iomm_fini:
+	cxip_iomm_fini(dom);
+free_lni:
+	cxip_free_lni(dom->lni);
+	dom->lni = NULL;
+put_if:
+	cxip_put_if(dom->iface);
+	dom->iface = NULL;
+unlock:
+	ofi_spin_unlock(&dom->lock);
+
+	return ret;
+}
+
+/*
+ * cxip_domain_disable() - Disable an FI Domain.
+ */
+static void cxip_domain_disable(struct cxip_domain *dom)
+{
+	ofi_spin_lock(&dom->lock);
+
+	if (!dom->enabled)
+		goto unlock;
+
+	DOM_INFO(dom, "Domain disabled\n");
+
+	cxip_mr_domain_fini(&dom->mr_domain);
+	cxip_dom_cntr_disable(dom);
+	cxip_iomm_fini(dom);
+	cxip_free_lni(dom->lni);
+	cxip_put_if(dom->iface);
+
+	dom->enabled = false;
+
+unlock:
+	ofi_spin_unlock(&dom->lock);
+}
+
+/*
+ * cxip_dom_close() - Provider fi_close implementation for an FI Domain object.
+ */
+static int cxip_dom_close(struct fid *fid)
+{
+	struct cxip_domain *dom;
+
+	dom = container_of(fid, struct cxip_domain,
+			   util_domain.domain_fid.fid);
+	if (ofi_atomic_get32(&dom->ref))
+		return -FI_EBUSY;
+
+	if (dom->telemetry) {
+		cxip_telemetry_dump_delta(dom->telemetry);
+		cxip_telemetry_free(dom->telemetry);
+	}
+
+	cxip_domain_disable(dom);
+
+	ofi_spin_destroy(&dom->lock);
+	ofi_spin_destroy(&dom->ctrl_id_lock);
+	ofi_idx_reset(&dom->req_ids);
+	ofi_idx_reset(&dom->mr_ids);
+	ofi_domain_close(&dom->util_domain);
+	free(dom);
+
+	return 0;
+}
+
+/*
+ * cxip_dom_bind() - Provider fi_domain_bind implementation.
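+ *
+ * Illustrative application-side usage (a sketch of the generic libfabric
+ * call sequence, not provider code): binding an event queue so that memory
+ * registration events are reported through it:
+ *
+ *   struct fi_eq_attr eq_attr = { .wait_obj = FI_WAIT_NONE };
+ *   struct fid_eq *eq;
+ *
+ *   fi_eq_open(fabric, &eq_attr, &eq, NULL);
+ *   fi_domain_bind(domain, &eq->fid, FI_REG_MR);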
+ */ +static int cxip_dom_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct cxip_domain *dom; + struct cxip_eq *eq; + + dom = container_of(fid, struct cxip_domain, util_domain.domain_fid.fid); + eq = container_of(bfid, struct cxip_eq, util_eq.eq_fid.fid); + + if (dom->eq) + return -FI_EINVAL; + + dom->eq = eq; + if (flags & FI_REG_MR) + dom->mr_eq = eq; + + return 0; +} + +static int cxip_dom_dwq_op_send(struct cxip_domain *dom, struct fi_op_msg *msg, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(msg->ep, struct cxip_ep, ep); + const void *buf; + size_t len; + int ret; + + if (!msg || msg->msg.iov_count > 1) + return -FI_EINVAL; + + /* FI_INJECT is not supported for triggered sends */ + if (msg->flags & FI_INJECT) { + CXIP_WARN("FI_INJECT not supported for triggered op\n"); + return -FI_EINVAL; + } + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + msg->flags &= ~FI_MORE; + + buf = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_base : NULL; + len = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_len : 0; + + ret = cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + NULL, msg->msg.data, msg->msg.addr, 0, + msg->msg.context, msg->flags, false, true, + trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit message triggered op, ret=%d\n", ret); + else + CXIP_DBG("Queued triggered message op with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_tsend(struct cxip_domain *dom, + struct fi_op_tagged *tagged, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(tagged->ep, struct cxip_ep, ep); + const void *buf; + size_t len; + int ret; + + if (!tagged || tagged->msg.iov_count > 1) + return -FI_EINVAL; + + /* FI_INJECT is not supported for triggered tsends */ + if (tagged->flags & FI_INJECT) { + CXIP_WARN("FI_INJECT not supported for triggered op\n"); + return -FI_EINVAL; + } + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + tagged->flags &= ~FI_MORE; + + buf = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_base : NULL; + len = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_len : 0; + + ret = cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + NULL, tagged->msg.data, tagged->msg.addr, + tagged->msg.tag, tagged->msg.context, + tagged->flags, true, true, trig_thresh, + trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit tagged msg triggered op, ret=%d\n", + ret); + else + CXIP_DBG("Queued triggered tagged msg op with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_rma(struct cxip_domain *dom, struct fi_op_rma *rma, + enum fi_op_type op, struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(rma->ep, struct cxip_ep, ep); + const void *buf; + size_t len; + int ret; + + if (!rma || !rma->msg.msg_iov || rma->msg.iov_count > 1 || + !rma->msg.rma_iov || rma->msg.rma_iov_count != 1) + return -FI_EINVAL; + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + rma->flags &= ~FI_MORE; + + buf = rma->msg.iov_count ? rma->msg.msg_iov[0].iov_base : NULL; + len = rma->msg.iov_count ? 
rma->msg.msg_iov[0].iov_len : 0; + + ret = cxip_rma_common(op, &ep->ep_obj->txc, buf, len, NULL, + rma->msg.addr, rma->msg.rma_iov[0].addr, + rma->msg.rma_iov[0].key, rma->msg.data, + rma->flags, ep->tx_attr.tclass, + ep->tx_attr.msg_order, rma->msg.context, true, + trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit RMA triggered op, ret=%d\n", ret); + else + CXIP_DBG("Queued triggered RMA operation with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_atomic(struct cxip_domain *dom, + struct fi_op_atomic *amo, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(amo->ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + int ret; + + if (!amo) + return -FI_EINVAL; + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + amo->flags &= ~FI_MORE; + + ret = cxip_amo_common(CXIP_RQ_AMO, txc, txc->tclass, &amo->msg, + NULL, NULL, 0, NULL, NULL, 0, amo->flags, + true, trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit AMO triggered op, ret=%d\n", ret); + else + CXIP_DBG("Queued triggered AMO operation with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_fetch_atomic(struct cxip_domain *dom, + struct fi_op_fetch_atomic *fetch_amo, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(fetch_amo->ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + int ret; + + if (!fetch_amo) + return -FI_EINVAL; + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + fetch_amo->flags &= ~FI_MORE; + + ret = cxip_amo_common(CXIP_RQ_AMO_FETCH, txc, txc->tclass, + &fetch_amo->msg, NULL, NULL, 0, + fetch_amo->fetch.msg_iov, fetch_amo->fetch.desc, + fetch_amo->fetch.iov_count, fetch_amo->flags, + true, trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit fetching AMO triggered op, ret=%d\n", + ret); + else + CXIP_DBG("Queued triggered fetching AMO op with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_comp_atomic(struct cxip_domain *dom, + struct fi_op_compare_atomic *comp_amo, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(comp_amo->ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + int ret; + + if (!comp_amo) + return -FI_EINVAL; + + /* To prevent triggered operation exhaustion, FI_MORE cannot be + * supported. + */ + comp_amo->flags &= ~FI_MORE; + + ret = cxip_amo_common(CXIP_RQ_AMO_SWAP, txc, txc->tclass, + &comp_amo->msg, comp_amo->compare.msg_iov, + comp_amo->compare.desc, + comp_amo->compare.iov_count, + comp_amo->fetch.msg_iov, comp_amo->fetch.desc, + comp_amo->fetch.iov_count, comp_amo->flags, true, + trig_thresh, trig_cntr, comp_cntr); + if (ret) + CXIP_DBG("Failed to emit compare AMO triggered op, ret=%d\n", + ret); + else + CXIP_DBG("Queued triggered compare AMO op with threshold %lu\n", + trig_thresh); + + return ret; +} + +static int cxip_dom_dwq_op_cntr(struct cxip_domain *dom, + struct fi_op_cntr *cntr, enum fi_op_type op, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh, + bool cntr_wb) +{ + struct cxip_cntr *op_cntr; + int ret; + unsigned opcode; + struct c_ct_cmd cmd = {}; + + /* Completion counter must be NULL. 
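+	 *
+	 * For reference, this handler is reached through the deferred work
+	 * API. A minimal application-side sketch (assumed usage, not provider
+	 * code) that queues a triggered counter increment:
+	 *
+	 *   struct fi_op_cntr op = { .cntr = target_cntr, .value = 1 };
+	 *   struct fi_deferred_work work = {
+	 *       .op_type = FI_OP_CNTR_ADD,
+	 *       .triggering_cntr = trig_cntr,
+	 *       .threshold = 5,
+	 *       .op.cntr = &op,
+	 *   };
+	 *
+	 *   fi_control(&domain->fid, FI_QUEUE_WORK, &work);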
*/ + if (!cntr || !cntr->cntr || comp_cntr) + return -FI_EINVAL; + + if (cntr_wb) { + opcode = C_CMD_CT_TRIG_EVENT; + cmd.eq = C_EQ_NONE; + } else { + opcode = op == FI_OP_CNTR_SET ? + C_CMD_CT_TRIG_SET : C_CMD_CT_TRIG_INC; + } + + op_cntr = container_of(cntr->cntr, struct cxip_cntr, cntr_fid); + + cmd.trig_ct = trig_cntr->ct->ctn; + cmd.threshold = trig_thresh; + cmd.ct = op_cntr->ct->ctn; + cmd.set_ct_success = 1; + cmd.ct_success = cntr->value; + + ofi_genlock_lock(&dom->trig_cmdq_lock); + ret = cxi_cq_emit_ct(dom->trig_cmdq->dev_cmdq, opcode, &cmd); + if (ret) { + /* TODO: Handle this assert. */ + assert(!ret); + } + cxi_cq_ring(dom->trig_cmdq->dev_cmdq); + ofi_genlock_unlock(&dom->trig_cmdq_lock); + + return FI_SUCCESS; +} + +static int cxip_dom_dwq_op_recv(struct cxip_domain *dom, struct fi_op_msg *msg, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(msg->ep, struct cxip_ep, ep); + void *buf; + size_t len; + + /* Non-zero thresholds for triggered receives are not supported. */ + if (!msg || msg->msg.iov_count > 1 || trig_thresh) + return -FI_EINVAL; + + buf = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_base : NULL; + len = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_len : 0; + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, NULL, msg->msg.addr, + 0, 0, msg->msg.context, msg->flags, false, + comp_cntr); +} + +static int cxip_dom_dwq_op_trecv(struct cxip_domain *dom, + struct fi_op_tagged *tagged, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr, + uint64_t trig_thresh) +{ + struct cxip_ep *ep = container_of(tagged->ep, struct cxip_ep, ep); + void *buf; + size_t len; + + /* Non-zero thresholds for triggered receives are not supported. */ + if (!tagged || tagged->msg.iov_count > 1 || trig_thresh) + return -FI_EINVAL; + + buf = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_base : NULL; + len = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_len : 0; + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, tagged->msg.desc, + tagged->msg.addr, tagged->msg.tag, + tagged->msg.ignore, tagged->msg.context, + tagged->flags, true, comp_cntr); +} + +/* Must hold domain lock. 
*/ +static void cxip_dom_progress_all_cqs(struct cxip_domain *dom) +{ + struct cxip_cq *cq; + + dlist_foreach_container(&dom->cq_list, struct cxip_cq, cq, + dom_entry) + cxip_util_cq_progress(&cq->util_cq); +} + +static int cxip_dom_trig_op_get_in_use(struct cxip_domain *dom) +{ + struct cxi_rsrc_use in_use; + int ret; + + ret = cxil_get_svc_rsrc_use(dom->iface->dev, dom->auth_key.svc_id, + &in_use); + if (ret) + return ret; + + return in_use.in_use[CXI_RSRC_TYPE_TLE]; +} + +#define DWQ_SEMAPHORE_TIMEOUT 10U + +static int cxip_dom_dwq_queue_work_validate(struct cxip_domain *dom, + struct fi_deferred_work *work) +{ + struct cxip_ep *ep; + + if (!work->triggering_cntr) + return -FI_EINVAL; + + switch (work->op_type) { + case FI_OP_SEND: + case FI_OP_RECV: + ep = container_of(work->op.msg->ep, struct cxip_ep, ep); + break; + + case FI_OP_TSEND: + case FI_OP_TRECV: + ep = container_of(work->op.tagged->ep, struct cxip_ep, ep); + break; + + case FI_OP_READ: + case FI_OP_WRITE: + ep = container_of(work->op.rma->ep, struct cxip_ep, ep); + break; + + case FI_OP_ATOMIC: + ep = container_of(work->op.atomic->ep, struct cxip_ep, ep); + break; + + case FI_OP_FETCH_ATOMIC: + ep = container_of(work->op.fetch_atomic->ep, struct cxip_ep, + ep); + break; + + case FI_OP_COMPARE_ATOMIC: + ep = container_of(work->op.compare_atomic->ep, struct cxip_ep, + ep); + break; + + case FI_OP_CNTR_SET: + case FI_OP_CNTR_ADD: + return FI_SUCCESS; + + default: + return -FI_EINVAL; + } + + /* All EPs that share a Domain must use the same VNI. This is a + * simplification due to Cassini requiring triggered op TXQs to + * use CP 0. + */ + if (ep->ep_obj->auth_key.vni != dom->auth_key.vni) { + CXIP_WARN("Invalid VNI: %u\n", ep->ep_obj->auth_key.vni); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int cxip_dom_dwq_queue_work(struct cxip_domain *dom, + struct fi_deferred_work *work) +{ + struct cxip_cntr *trig_cntr; + struct cxip_cntr *comp_cntr; + bool queue_wb_work; + int ret; + int trig_op_count; + int trig_op_in_use; + struct timespec ts; + bool again; + + ret = cxip_dom_dwq_queue_work_validate(dom, work); + if (ret != FI_SUCCESS) + return ret; + + comp_cntr = work->completion_cntr ? 
+ container_of(work->completion_cntr, struct cxip_cntr, + cntr_fid) : NULL; + trig_cntr = container_of(work->triggering_cntr, struct cxip_cntr, + cntr_fid); + + switch (work->op_type) { + case FI_OP_SEND: + case FI_OP_RECV: + queue_wb_work = !!(work->op.msg->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_TSEND: + case FI_OP_TRECV: + queue_wb_work = !!(work->op.tagged->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_READ: + case FI_OP_WRITE: + queue_wb_work = !!(work->op.rma->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_ATOMIC: + queue_wb_work = !!(work->op.atomic->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_FETCH_ATOMIC: + queue_wb_work = !!(work->op.fetch_atomic->flags & FI_CXI_CNTR_WB); + break; + + case FI_OP_COMPARE_ATOMIC: + queue_wb_work = !!(work->op.compare_atomic->flags & FI_CXI_CNTR_WB); + break; + + default: + queue_wb_work = false; + } + + if (cxip_env.enable_trig_op_limit) { + if (queue_wb_work) + trig_op_count = 2; + else + trig_op_count = 1; + + if (clock_gettime(CLOCK_REALTIME, &ts) == -1) { + CXIP_WARN("clock_gettime failed: %d\n", -errno); + return -errno; + } + + ts.tv_sec += DWQ_SEMAPHORE_TIMEOUT; + + again = true; + do { + if (sem_timedwait(dom->trig_op_lock, &ts) == -1) { + if (errno == EINTR) { + CXIP_WARN("sem_timedwait failed: %d\n", + -errno); + return -errno; + } + } else { + again = false; + } + } while (again); + + ret = cxip_dom_trig_op_get_in_use(dom); + if (ret < 0) { + CXIP_WARN("cxip_dom_trig_op_get_in_use: %d\n", ret); + goto unlock; + } + + trig_op_in_use = ret; + + if ((trig_op_in_use + trig_op_count) > dom->max_trig_op_in_use) { + CXIP_WARN("Trig ops exhausted: in-use=%d\n", trig_op_in_use); + ret = -FI_ENOSPC; + goto unlock; + } + } + + switch (work->op_type) { + case FI_OP_SEND: + ret = cxip_dom_dwq_op_send(dom, work->op.msg, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_TSEND: + ret = cxip_dom_dwq_op_tsend(dom, work->op.tagged, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_RECV: + ret = cxip_dom_dwq_op_recv(dom, work->op.msg, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_TRECV: + ret = cxip_dom_dwq_op_trecv(dom, work->op.tagged, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_READ: + case FI_OP_WRITE: + ret = cxip_dom_dwq_op_rma(dom, work->op.rma, work->op_type, + trig_cntr, comp_cntr, + work->threshold); + break; + + case FI_OP_ATOMIC: + ret = cxip_dom_dwq_op_atomic(dom, work->op.atomic, trig_cntr, + comp_cntr, work->threshold); + break; + + case FI_OP_FETCH_ATOMIC: + ret = cxip_dom_dwq_op_fetch_atomic(dom, work->op.fetch_atomic, + trig_cntr, comp_cntr, + work->threshold); + break; + + case FI_OP_COMPARE_ATOMIC: + ret = cxip_dom_dwq_op_comp_atomic(dom, work->op.compare_atomic, + trig_cntr, comp_cntr, + work->threshold); + break; + + case FI_OP_CNTR_SET: + case FI_OP_CNTR_ADD: + ret = cxip_dom_dwq_op_cntr(dom, work->op.cntr, work->op_type, + trig_cntr, comp_cntr, + work->threshold, false); + break; + + default: + ret = -FI_EINVAL; + CXIP_WARN("Invalid FI_QUEUE_WORK op %s\n", + fi_tostr(&work->op_type, FI_TYPE_OP_TYPE)); + } + + if (ret) + goto unlock; + + if (queue_wb_work) { + struct fi_op_cntr op_cntr = { + .cntr = &trig_cntr->cntr_fid, + }; + + /* no op_type needed for counter writeback */ + ret = cxip_dom_dwq_op_cntr(dom, &op_cntr, 0, trig_cntr, NULL, + work->threshold + 1, true); + /* TODO: If cxip_dom_dwq_op_cntr fails we need to cancel the + * above work queue. + */ + } + + /* Wait until the command queue is empty. 
This is a sign that hardware + * has processed triggered operation commands. At this point, it is + * safe to release the trigger op pool lock. + */ + if (cxip_env.enable_trig_op_limit) { + ofi_genlock_lock(&dom->trig_cmdq_lock); + while (dom->trig_cmdq->dev_cmdq->status->rd_ptr != + (dom->trig_cmdq->dev_cmdq->hw_wp32 / 2)) {}; + ofi_genlock_unlock(&dom->trig_cmdq_lock); + } + +unlock: + if (cxip_env.enable_trig_op_limit) + sem_post(dom->trig_op_lock); + + return ret; +} + +static int cxip_dom_dwq_flush_work(struct cxip_domain *dom) +{ + struct cxip_cntr *trig_cntr; + struct cxip_txc *txc; + struct cxip_cq *cq; + int ret __attribute__ ((unused)); + + ofi_spin_lock(&dom->lock); + if (!dom->cntr_init) { + ofi_spin_unlock(&dom->lock); + return FI_SUCCESS; + } + + ofi_genlock_lock(&dom->trig_cmdq_lock); + + /* Issue cancels to all allocated counters. */ + dlist_foreach_container(&dom->cntr_list, struct cxip_cntr, + trig_cntr, dom_entry) { + struct c_ct_cmd ct_cmd = {}; + + if (!trig_cntr->ct) + continue; + + ct_cmd.ct = trig_cntr->ct->ctn; + ret = cxi_cq_emit_ct(dom->trig_cmdq->dev_cmdq, C_CMD_CT_CANCEL, + &ct_cmd); + + // TODO: Handle this assert. Multiple triggered CQs may + // be required. + assert(!ret); + cxi_cq_ring(dom->trig_cmdq->dev_cmdq); + }; + + /* Rely on the triggered CQ ack counter to know when there are no more + * pending triggered operations. In-between, progress CQs to cleanup + * internal transaction state. + */ + while (true) { + unsigned int ack_counter; + + ret = cxil_cmdq_ack_counter(dom->trig_cmdq->dev_cmdq, + &ack_counter); + assert(!ret); + + if (!ack_counter) + break; + + cxip_dom_progress_all_cqs(dom); + } + + /* It is possible that the ack counter is zero and there are completion + * events in-flight meaning that the above progression may have missed + * events. Perform a sleep to help ensure events have arrived and + * progress all CQs one more time. + * + * TODO: Investigate better way to resolve this race condition. + */ + sleep(1); + cxip_dom_progress_all_cqs(dom); + + /* At this point, all triggered operations should be cancelled or have + * completed. Due to special handling of message operations, flush any + * remaining message triggered requests from the TX context first. + */ + dlist_foreach_container(&dom->txc_list, struct cxip_txc, txc, + dom_entry) + cxip_txc_flush_msg_trig_reqs(txc); + + /* Flush all the CQs of any remaining non-message triggered operation + * requests. 
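+	 *
+	 * (For context: applications reach this path with
+	 * fi_control(&domain->fid, FI_FLUSH_WORK, NULL); queued operations
+	 * that have not yet triggered are cancelled and their requests freed.)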
+ */ + dlist_foreach_container(&dom->cq_list, struct cxip_cq, cq, dom_entry) + cxip_cq_flush_trig_reqs(cq); + + ofi_genlock_unlock(&dom->trig_cmdq_lock); + ofi_spin_unlock(&dom->lock); + + return FI_SUCCESS; +} + +static int cxip_domain_enable_mr_match_events(struct fid *fid, bool enable) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + dom->mr_match_events = enable; + + return FI_SUCCESS; +} + +static int cxip_domain_enable_optimized_mrs(struct fid *fid, bool enable) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + if (!dom->is_prov_key) { + CXIP_WARN("Requires FI_MR_PROV_KEY\n"); + return -FI_EINVAL; + } + + dom->optimized_mrs = enable; + + return FI_SUCCESS; +} + +static int cxip_domain_enable_prov_key_cache(struct fid *fid, bool enable) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + if (!dom->is_prov_key) { + CXIP_WARN("Requires FI_MR_PROV_KEY\n"); + return -FI_EINVAL; + } + + dom->prov_key_cache = enable; + + return FI_SUCCESS; +} + + +static int cxip_dom_control(struct fid *fid, int command, void *arg) +{ + struct cxip_domain *dom; + + dom = container_of(fid, struct cxip_domain, util_domain.domain_fid.fid); + + if (command != FI_FLUSH_WORK && !arg) { + CXIP_WARN("Required argument missing\n"); + return -FI_EINVAL; + } + + switch (command) { + case FI_QUEUE_WORK: + return cxip_dom_dwq_queue_work(dom, arg); + + case FI_FLUSH_WORK: + return cxip_dom_dwq_flush_work(dom); + + case FI_OPT_CXI_SET_OPTIMIZED_MRS: + return cxip_domain_enable_optimized_mrs(fid, *(bool *)arg); + + case FI_OPT_CXI_GET_OPTIMIZED_MRS: + *(bool *)arg = dom->optimized_mrs; + break; + + case FI_OPT_CXI_SET_MR_MATCH_EVENTS: + return cxip_domain_enable_mr_match_events(fid, *(bool *)arg); + + case FI_OPT_CXI_GET_MR_MATCH_EVENTS: + *(bool *)arg = dom->mr_match_events; + break; + + case FI_OPT_CXI_SET_PROV_KEY_CACHE: + return cxip_domain_enable_prov_key_cache(fid, *(bool *)arg); + + case FI_OPT_CXI_GET_PROV_KEY_CACHE: + *(bool *)arg = dom->prov_key_cache; + break; + + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int cxip_domain_cntr_read(struct fid *fid, unsigned int cntr, + uint64_t *value, struct timespec *ts) +{ + struct cxip_domain *dom; + int ret; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + + if (!dom->enabled) + return -FI_EOPBADSTATE; + + ret = cxil_read_cntr(dom->iface->dev, cntr, value, ts); + + return ret ? 
-FI_EINVAL : FI_SUCCESS; +} + +static int cxip_domain_topology(struct fid *fid, unsigned int *group_id, + unsigned int *switch_id, unsigned int *port_id) +{ + struct cxip_domain *dom; + struct cxip_topo_addr topo; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + topo.addr = dom->nic_addr; + + /* Only a dragonfly topology is supported at this time */ + if (group_id) + *group_id = topo.dragonfly.group_num; + if (switch_id) + *switch_id = topo.dragonfly.switch_num; + if (port_id) + *port_id = topo.dragonfly.port_num; + + return FI_SUCCESS; +} + +static int cxip_domain_enable_hybrid_mr_desc(struct fid *fid, bool enable) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + + dom->hybrid_mr_desc = enable; + + return FI_SUCCESS; +} + +static int cxip_domain_get_dwq_depth(struct fid *fid, size_t *depth) +{ + struct cxip_domain *dom; + + if (fid->fclass != FI_CLASS_DOMAIN) { + CXIP_WARN("Invalid FID: %p\n", fid); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + + *depth = dom->max_trig_op_in_use; + + return FI_SUCCESS; +} + +static struct fi_cxi_dom_ops cxip_dom_ops_ext = { + .cntr_read = cxip_domain_cntr_read, + .topology = cxip_domain_topology, + .enable_hybrid_mr_desc = cxip_domain_enable_hybrid_mr_desc, + .ep_get_unexp_msgs = cxip_ep_get_unexp_msgs, + .get_dwq_depth = cxip_domain_get_dwq_depth, + .enable_mr_match_events = cxip_domain_enable_mr_match_events, + .enable_optimized_mrs = cxip_domain_enable_optimized_mrs, +}; + +static int cxip_dom_ops_open(struct fid *fid, const char *ops_name, + uint64_t flags, void **ops, void *context) +{ + /* v5 only appended a new function */ + if (!strcmp(ops_name, FI_CXI_DOM_OPS_1) || + !strcmp(ops_name, FI_CXI_DOM_OPS_2) || + !strcmp(ops_name, FI_CXI_DOM_OPS_3) || + !strcmp(ops_name, FI_CXI_DOM_OPS_4) || + !strcmp(ops_name, FI_CXI_DOM_OPS_5) || + !strcmp(ops_name, FI_CXI_DOM_OPS_6)) { + *ops = &cxip_dom_ops_ext; + return FI_SUCCESS; + } + + return -FI_EINVAL; +} + +static int cxip_domain_ops_set(struct fid *fid, const char *name, + uint64_t flags, void *ops, void *context) +{ + struct cxip_domain *domain = + container_of(fid, struct cxip_domain, + util_domain.domain_fid.fid); + struct fi_hmem_override_ops *hmem_ops; + + if (strcmp(FI_SET_OPS_HMEM_OVERRIDE, name) == 0) { + hmem_ops = ops; + + if (!hmem_ops->copy_from_hmem_iov || + !hmem_ops->copy_to_hmem_iov) + return -FI_EINVAL; + + domain->hmem_ops = *hmem_ops; + + return FI_SUCCESS; + } + + return -FI_ENOSYS; +} + +static int cxip_query_atomic_flags_valid(uint64_t flags) +{ + /* FI_COMPARE_ATOMIC and FI_FETCH_ATOMIC are mutually exclusive. */ + if ((flags & FI_COMPARE_ATOMIC) && (flags & FI_FETCH_ATOMIC)) + return -FI_EINVAL; + + if (flags & FI_CXI_PCIE_AMO) { + /* Only FI_FETCH_ATOMIC is support with FI_CXI_PCIE_AMO. 
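+		 *
+		 * Illustrative application-side probe (assumed usage, not
+		 * provider code) for PCIe fetch-add support on FI_UINT64:
+		 *
+		 *   struct fi_atomic_attr attr;
+		 *   int rc = fi_query_atomic(domain, FI_UINT64, FI_SUM, &attr,
+		 *                            FI_FETCH_ATOMIC | FI_CXI_PCIE_AMO);
+		 *
+		 *   rc == 0 indicates support; attr.size reports the datatype
+		 *   size in bytes.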
*/ + if (!(flags & FI_FETCH_ATOMIC)) + return -FI_EOPNOTSUPP; + } + + return FI_SUCCESS; +} + +static int cxip_query_atomic(struct fid_domain *domain, + enum fi_datatype datatype, enum fi_op op, + struct fi_atomic_attr *attr, uint64_t flags) +{ + enum cxip_amo_req_type req_type; + int ret; + unsigned int datatype_len; + struct cxip_domain *dom; + + dom = container_of(domain, struct cxip_domain, + util_domain.domain_fid.fid); + + if (!attr) + return -FI_EINVAL; + + ret = cxip_query_atomic_flags_valid(flags); + if (ret) + return ret; + + if (flags & FI_COMPARE_ATOMIC) { + req_type = CXIP_RQ_AMO_SWAP; + } else if (flags & FI_FETCH_ATOMIC) { + if (flags & FI_CXI_PCIE_AMO) + req_type = CXIP_RQ_AMO_PCIE_FETCH; + else + req_type = CXIP_RQ_AMO_FETCH; + } else { + req_type = CXIP_RQ_AMO; + } + + ret = _cxip_atomic_opcode(req_type, datatype, op, + dom->amo_remap_to_pcie_fadd, NULL, NULL, NULL, + &datatype_len); + if (ret) + return ret; + + attr->count = 1; + attr->size = datatype_len; + + return FI_SUCCESS; +} + +static int cxip_query_collective(struct fid_domain *domain, + enum fi_collective_op coll, + struct fi_collective_attr *attr, + uint64_t flags) +{ + int ext_op; + + /* BARRIER does not require attr */ + if (coll == FI_BARRIER && !attr) + return FI_SUCCESS; + + /* Anything else requires attr */ + if (!attr) + return -FI_EINVAL; + + /* Flags are not supported */ + if (flags) + return -FI_EOPNOTSUPP; + + /* The limit to collective membership is the size of the multicast tree, + * which is limited by the maximum address space of addressable ports on + * the fabric. + */ + attr->max_members = (1L << C_DFA_NIC_BITS) - 1; + + /* supported collective operations */ + ext_op = (int)attr->op; + switch (coll) { + case FI_BARRIER: + /* ignore attr->op: barrier takes no operator */ + /* ignore attr->datatype: barrier takes no data */ + attr->datatype_attr.count = 0; + attr->datatype_attr.size = 0; + break; + case FI_BROADCAST: + /* ignore attr->op: barrier takes no operator */ + switch (attr->datatype) { + case FI_INT8: + case FI_UINT8: + attr->datatype_attr.count = 32; + attr->datatype_attr.size = 1; + break; + case FI_INT16: + case FI_UINT16: + attr->datatype_attr.count = 16; + attr->datatype_attr.size = 2; + break; + case FI_INT32: + case FI_UINT32: + case FI_FLOAT: + attr->datatype_attr.count = 8; + attr->datatype_attr.size = 4; + break; + case FI_INT64: + case FI_UINT64: + case FI_DOUBLE: + attr->datatype_attr.count = 4; + attr->datatype_attr.size = 8; + break; + default: + return -FI_EOPNOTSUPP; + } + break; + case FI_REDUCE: + case FI_ALLREDUCE: + switch (ext_op) { + case FI_BOR: + case FI_BAND: + case FI_BXOR: + switch (attr->datatype) { + case FI_INT8: + case FI_UINT8: + attr->datatype_attr.count = 32; + attr->datatype_attr.size = 1; + break; + case FI_INT16: + case FI_UINT16: + attr->datatype_attr.count = 16; + attr->datatype_attr.size = 2; + break; + case FI_INT32: + case FI_UINT32: + attr->datatype_attr.count = 8; + attr->datatype_attr.size = 4; + break; + case FI_INT64: + case FI_UINT64: + attr->datatype_attr.count = 4; + attr->datatype_attr.size = 8; + break; + default: + return -FI_EOPNOTSUPP; + } + break; + case FI_MIN: + case FI_MAX: + case FI_SUM: + if (attr->datatype != FI_INT64 && + attr->datatype != FI_DOUBLE) + return -FI_EOPNOTSUPP; + attr->datatype_attr.count = 4; + attr->datatype_attr.size = 8; + break; + case FI_CXI_MINMAXLOC: + attr->datatype_attr.count = 1; + attr->datatype_attr.size = 32; + break; + case FI_CXI_REPSUM: + attr->datatype_attr.count = 1; + 
attr->datatype_attr.size = 8; + break; + default: + return -FI_EOPNOTSUPP; + } + break; + default: + return -FI_EOPNOTSUPP; + } + return FI_SUCCESS; +} + +static struct fi_ops cxip_dom_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_dom_close, + .bind = cxip_dom_bind, + .control = cxip_dom_control, + .ops_open = cxip_dom_ops_open, + .ops_set = cxip_domain_ops_set, +}; + +static struct fi_ops_domain cxip_dom_ops = { + .size = sizeof(struct fi_ops_domain), + .av_open = cxip_av_open, + .cq_open = cxip_cq_open, + .endpoint = cxip_endpoint, + .scalable_ep = fi_no_scalable_ep, + .cntr_open = cxip_cntr_open, + .poll_open = fi_no_poll_open, + .stx_ctx = fi_no_stx_context, + .srx_ctx = fi_no_srx_context, + .query_atomic = cxip_query_atomic, + .query_collective = cxip_query_collective +}; + +/* + * cxip_domain() - Provider fi_domain() implementation. + */ +int cxip_domain(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **dom, void *context) +{ + struct cxip_domain *cxi_domain; + struct cxip_fabric *fab; + struct cxip_addr *src_addr; + uint32_t seed; + int ret; + + /* The OFI check_info function does not verify that rx/tx attribute + * capabilities are a subset of the info capabilities. Currently + * MPI removes the FI_HMEM cap from info->caps but not the rx/tx + * caps. To avoided breaking MPI, the capabilities are removed + * here as a temporary work around. + * TODO: Remove this code when no longer required. + */ + if (info->caps && !(info->caps & FI_HMEM)) { + if (info->tx_attr) + info->tx_attr->caps &= ~FI_HMEM; + if (info->rx_attr) + info->rx_attr->caps &= ~FI_HMEM; + } + + ret = ofi_prov_check_info(&cxip_util_prov, CXIP_FI_VERSION, info); + if (ret != FI_SUCCESS) + return -FI_EINVAL; + + ret = cxip_check_auth_key_info(info); + if (ret) + return ret; + + fab = container_of(fabric, struct cxip_fabric, util_fabric.fabric_fid); + + cxi_domain = calloc(1, sizeof(*cxi_domain)); + if (!cxi_domain) + return -FI_ENOMEM; + + ret = ofi_domain_init(&fab->util_fabric.fabric_fid, info, + &cxi_domain->util_domain, context, + OFI_LOCK_SPINLOCK); + if (ret) + goto free_dom; + + if (!info || !info->src_addr) { + CXIP_WARN("Invalid fi_info\n"); + goto close_util_dom; + } + src_addr = (struct cxip_addr *)info->src_addr; + cxi_domain->nic_addr = src_addr->nic; + + if (info->domain_attr->auth_key) { + /* Auth key size is verified in ofi_prov_check_info(). */ + assert(info->domain_attr->auth_key_size == + sizeof(struct cxi_auth_key)); + + memcpy(&cxi_domain->auth_key, info->domain_attr->auth_key, + sizeof(struct cxi_auth_key)); + } else { + ret = cxip_gen_auth_key(info, &cxi_domain->auth_key); + if (ret) { + CXIP_WARN("cxip_gen_auth_key failed: %d:%s", ret, + fi_strerror(-ret)); + return ret; + } + + /* If FI_AV_AUTH_KEY is used, the auth_key.vni value will never + * be used. Thus, set it to zero which is invalid. 
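+		 * (Applications opt into this mode by requesting
+		 * domain_attr->auth_key_size == FI_AV_AUTH_KEY via fi_getinfo;
+		 * per-peer VNIs are then supplied with fi_av_insert_auth_key
+		 * instead of through the domain auth_key.)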
+ */ + cxi_domain->av_auth_key = + info->domain_attr->auth_key_size == FI_AV_AUTH_KEY; + if (cxi_domain->av_auth_key) + cxi_domain->auth_key.vni = 0; + } + + if (info->domain_attr->tclass != FI_TC_UNSPEC) { + if (info->domain_attr->tclass >= FI_TC_LABEL && + info->domain_attr->tclass <= FI_TC_SCAVENGER) { + cxi_domain->tclass = info->domain_attr->tclass; + } else { + CXIP_WARN("Invalid tclass\n"); + goto close_util_dom; + } + } else { + /* Use default tclass */ + cxi_domain->tclass = FI_TC_BEST_EFFORT; + } + + cxi_domain->av_user_id = + !!(cxi_domain->util_domain.info_domain_caps & FI_AV_USER_ID); + cxi_domain->auth_key_entry_max = info->domain_attr->max_ep_auth_key; + cxi_domain->util_domain.domain_fid.fid.ops = &cxip_dom_fi_ops; + cxi_domain->util_domain.domain_fid.ops = &cxip_dom_ops; + cxi_domain->util_domain.domain_fid.mr = &cxip_dom_mr_ops; + + dlist_init(&cxi_domain->txc_list); + dlist_init(&cxi_domain->cntr_list); + dlist_init(&cxi_domain->cq_list); + ofi_spin_init(&cxi_domain->lock); + ofi_spin_init(&cxi_domain->ctrl_id_lock); + memset(&cxi_domain->req_ids, 0, sizeof(cxi_domain->req_ids)); + memset(&cxi_domain->mr_ids, 0, sizeof(cxi_domain->mr_ids)); + + ofi_atomic_initialize32(&cxi_domain->ref, 0); + cxi_domain->fab = fab; + + cxi_domain->hmem_ops.copy_from_hmem_iov = ofi_copy_from_hmem_iov; + cxi_domain->hmem_ops.copy_to_hmem_iov = ofi_copy_to_hmem_iov; + + /* Allocate/initialize domain hardware resources */ + ret = cxip_domain_enable(cxi_domain); + if (ret) { + CXIP_WARN("Resource allocation failed: %d: %s\n", + ret, fi_strerror(-ret)); + goto cleanup_dom; + } + + /* Handle client vs provider MR RKEY differences */ + if (cxi_domain->util_domain.mr_mode & FI_MR_PROV_KEY) { + cxi_domain->is_prov_key = true; + + seed = (uint32_t)ofi_gettime_ns(); + cxi_domain->prov_key_seqnum = ofi_xorshift_random(seed); + } + + cxi_domain->mr_match_events = cxip_env.mr_match_events; + cxi_domain->optimized_mrs = cxip_env.optimized_mrs; + cxi_domain->prov_key_cache = cxip_env.prov_key_cache; + *dom = &cxi_domain->util_domain.domain_fid; + + return 0; + +cleanup_dom: + ofi_spin_destroy(&cxi_domain->lock); +close_util_dom: + ofi_domain_close(&cxi_domain->util_domain); +free_dom: + free(cxi_domain); + return -FI_EINVAL; +} + +int cxip_domain_valid_vni(struct cxip_domain *dom, unsigned int vni) +{ + /* Currently the auth_key.svc_id field contains the resource group ID. 
+	 */
+	return cxip_if_valid_rgroup_vni(dom->iface, dom->auth_key.svc_id, vni);
+}
+
+#define SUPPORTED_DWQ_FLAGS (FI_MORE | FI_COMPLETION | FI_DELIVERY_COMPLETE | \
+			     FI_MATCH_COMPLETE | FI_TRANSMIT_COMPLETE | FI_CXI_CNTR_WB)
+
+static int cxip_domain_dwq_emit_validate(struct cxip_domain *dom, uint16_t vni,
+					 enum cxi_traffic_class tc,
+					 enum cxi_traffic_class_type tc_type,
+					 uint64_t flags)
+{
+	uint64_t unsupported_flags = flags & ~SUPPORTED_DWQ_FLAGS;
+
+	if (unsupported_flags) {
+		CXIP_WARN("Unsupported flags: %lx\n", unsupported_flags);
+		return -FI_EINVAL;
+	}
+
+	if (tc != dom->trig_cmdq->cur_cp->tc) {
+		CXIP_WARN("Invalid tc: %d\n", tc);
+		return -FI_EINVAL;
+	}
+
+	if (tc_type != dom->trig_cmdq->cur_cp->tc_type) {
+		CXIP_WARN("Invalid tc_type: %d\n", tc_type);
+		return -FI_EINVAL;
+	}
+
+	if (vni != dom->trig_cmdq->cur_cp->vni) {
+		CXIP_WARN("Invalid vni: %d\n", vni);
+		return -FI_EINVAL;
+	}
+
+	return FI_SUCCESS;
+}
+
+int cxip_domain_dwq_emit_dma(struct cxip_domain *dom, uint16_t vni,
+			     enum cxi_traffic_class tc,
+			     enum cxi_traffic_class_type tc_type,
+			     struct cxip_cntr *trig_cntr, size_t trig_thresh,
+			     struct c_full_dma_cmd *dma, uint64_t flags)
+{
+	struct c_ct_cmd ct_cmd = {
+		.trig_ct = trig_cntr->ct->ctn,
+		.threshold = trig_thresh,
+	};
+	int ret;
+
+	ret = cxip_domain_dwq_emit_validate(dom, vni, tc, tc_type, flags);
+	if (ret)
+		return ret;
+
+	ofi_genlock_lock(&dom->trig_cmdq_lock);
+
+	ret = cxi_cq_emit_trig_full_dma(dom->trig_cmdq->dev_cmdq, &ct_cmd, dma);
+	if (ret) {
+		CXIP_WARN("Failed to emit trigger dma command: %d:%s\n", ret,
+			  fi_strerror(-ret));
+		ret = -FI_EAGAIN;
+	} else {
+		cxip_txq_ring(dom->trig_cmdq, false, 1);
+	}
+
+	ofi_genlock_unlock(&dom->trig_cmdq_lock);
+
+	return ret;
+}
+
+int cxip_domain_dwq_emit_amo(struct cxip_domain *dom, uint16_t vni,
+			     enum cxi_traffic_class tc,
+			     enum cxi_traffic_class_type tc_type,
+			     struct cxip_cntr *trig_cntr, size_t trig_thresh,
+			     struct c_dma_amo_cmd *amo, uint64_t flags,
+			     bool fetching, bool flush)
+{
+	struct c_ct_cmd ct_cmd = {
+		.trig_ct = trig_cntr->ct->ctn,
+		.threshold = trig_thresh,
+	};
+	struct c_full_dma_cmd flush_cmd;
+	bool fetching_flush = fetching && flush;
+	int ret;
+
+	/* TODO: Need to ensure there are at least 2 TLEs free for the following
+	 * triggered commands.
+	 */
+
+	/* TODO: Support triggered operations with different VNIs. */
+
+	if (fetching_flush) {
+		memset(&flush_cmd, 0, sizeof(flush_cmd));
+		flush_cmd.command.opcode = C_CMD_PUT;
+		flush_cmd.index_ext = amo->index_ext;
+		flush_cmd.event_send_disable = 1;
+		flush_cmd.dfa = amo->dfa;
+		flush_cmd.remote_offset = amo->remote_offset;
+		flush_cmd.eq = amo->eq;
+		flush_cmd.user_ptr = amo->user_ptr;
+		flush_cmd.flush = 1;
+	}
+
+	ret = cxip_domain_dwq_emit_validate(dom, vni, tc, tc_type, flags);
+	if (ret)
+		return ret;
+
+	ofi_genlock_lock(&dom->trig_cmdq_lock);
+
+	if (fetching_flush &&
+	    __cxi_cq_free_slots(dom->trig_cmdq->dev_cmdq) < 16) {
+		CXIP_WARN("No space for FAMO with FI_DELIVERY_COMPLETE\n");
+		ret = -FI_EAGAIN;
+		goto out_unlock;
+	}
+
+	ret = cxi_cq_emit_trig_dma_amo(dom->trig_cmdq->dev_cmdq, &ct_cmd,
+				       amo, fetching);
+	if (ret) {
+		CXIP_WARN("Failed to emit trigger amo command: %d:%s\n", ret,
+			  fi_strerror(-ret));
+		ret = -FI_EAGAIN;
+		goto out_unlock;
+	}
+
+	if (fetching_flush) {
+		/* CQ space check already occurred. Thus, return code can be
+		 * ignored. 
+ */ + ret = cxi_cq_emit_trig_full_dma(dom->trig_cmdq->dev_cmdq, + &ct_cmd, &flush_cmd); + assert(ret == 0); + } + + cxip_txq_ring(dom->trig_cmdq, false, 1); + +out_unlock: + ofi_genlock_unlock(&dom->trig_cmdq_lock); + + return ret; +} diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c new file mode 100644 index 00000000000..4b579662002 --- /dev/null +++ b/prov/cxi/src/cxip_ep.c @@ -0,0 +1,1311 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2013-2014 Intel Corporation. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. + * Copyright (c) 2018,2020-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "ofi_util.h" +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +extern struct fi_ops_rma cxip_ep_rma_ops; +extern struct fi_ops_rma cxip_ep_rma_no_ops; + +extern struct fi_ops_msg cxip_ep_msg_ops; +extern struct fi_ops_msg cxip_ep_msg_no_ops; +extern struct fi_ops_msg cxip_ep_msg_no_tx_ops; +extern struct fi_ops_msg cxip_ep_msg_no_rx_ops; + +extern struct fi_ops_tagged cxip_ep_tagged_ops; +extern struct fi_ops_tagged cxip_ep_tagged_no_ops; +extern struct fi_ops_tagged cxip_ep_tagged_no_tx_ops; +extern struct fi_ops_tagged cxip_ep_tagged_no_rx_ops; + +extern struct fi_ops_atomic cxip_ep_atomic_ops; +extern struct fi_ops_atomic cxip_ep_atomic_no_ops; + +extern struct fi_ops_collective cxip_collective_ops; +extern struct fi_ops_collective cxip_collective_no_ops; + +extern struct fi_ops_cm cxip_ep_cm_ops; +extern struct fi_ops_ep cxip_ep_ops; +extern struct fi_ops cxip_ep_fi_ops; +extern struct fi_ops_ep cxip_ctx_ep_ops; + +/* + * cxip_ep_cmdq() - Open a shareable TX or Target command queue. + * + * Caller must hold ep_obj->lock + */ +int cxip_ep_cmdq(struct cxip_ep_obj *ep_obj, bool transmit, uint32_t tclass, + struct cxi_eq *evtq, struct cxip_cmdq **cmdq) +{ + struct cxi_cq_alloc_opts cq_opts = {}; + struct cxip_cmdq **ep_obj_cmdq; + ofi_atomic32_t *ep_obj_ref; + int ret; + size_t size; + + if (transmit) { + ep_obj_cmdq = &ep_obj->txq; + ep_obj_ref = &ep_obj->txq_ref; + size = ep_obj->txq_size; + } else { + ep_obj_cmdq = &ep_obj->tgq; + ep_obj_ref = &ep_obj->tgq_ref; + size = ep_obj->tgq_size; + } + + if (*ep_obj_cmdq) { + ofi_atomic_inc32(ep_obj_ref); + CXIP_DBG("Reusing %s base CMDQ: %p\n", + transmit ? "TX" : "RX", *ep_obj_cmdq); + *cmdq = *ep_obj_cmdq; + + return FI_SUCCESS; + } + + /* An IDC command can use up to 4 64 byte slots. */ + cq_opts.count = size * 4; + cq_opts.flags = transmit ? CXI_CQ_IS_TX : 0; + cq_opts.policy = cxip_env.cq_policy; + + ret = cxip_cmdq_alloc(ep_obj->domain->lni, evtq, &cq_opts, + ep_obj->auth_key.vni, cxip_ofi_to_cxi_tc(tclass), + CXI_TC_TYPE_DEFAULT, cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate CMDQ, ret: %d\n", ret); + return -FI_ENOSPC; + } + *ep_obj_cmdq = *cmdq; + ofi_atomic_inc32(ep_obj_ref); + + CXIP_DBG("Allocated %s CMDQ: %p CP: %u\n", + transmit ? "TX" : "RX", *cmdq, cq_opts.lcid); + return ret; +} + +/* + * cxip_ep_cmdq_put() - Release reference to shareable TX or Target command + * queue. + * + * Caller must hold ep_obj->lock. 
+ */ +void cxip_ep_cmdq_put(struct cxip_ep_obj *ep_obj, bool transmit) +{ + struct cxip_cmdq **ep_obj_cmdq; + ofi_atomic32_t *ep_obj_ref; + + if (transmit) { + ep_obj_cmdq = &ep_obj->txq; + ep_obj_ref = &ep_obj->txq_ref; + } else { + ep_obj_cmdq = &ep_obj->tgq; + ep_obj_ref = &ep_obj->tgq_ref; + } + + if (!ofi_atomic_dec32(ep_obj_ref)) { + cxip_cmdq_free(*ep_obj_cmdq); + + CXIP_DBG("Freed %s CMDQ: %p\n", + transmit ? "TX" : "RX", *ep_obj_cmdq); + *ep_obj_cmdq = NULL; + } else { + CXIP_DBG("Put %s CMDQ: %p\n", + transmit ? "TX" : "RX", *ep_obj_cmdq); + } +} + +static int cxip_ep_cm_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct cxip_ep *cxip_ep = container_of(fid, struct cxip_ep, ep.fid); + size_t len; + + len = MIN(*addrlen, sizeof(struct cxip_addr)); + + if (!cxip_ep->ep_obj->enabled) + return -FI_EOPBADSTATE; + + CXIP_DBG("NIC: 0x%x PID: %u\n", cxip_ep->ep_obj->src_addr.nic, + cxip_ep->ep_obj->src_addr.pid); + + memcpy(addr, &cxip_ep->ep_obj->src_addr, len); + *addrlen = sizeof(struct cxip_addr); + + return (len == sizeof(struct cxip_addr)) ? FI_SUCCESS : -FI_ETOOSMALL; +} + +static int _join_collective(struct fid_ep *ep, const void *addr, + uint64_t flags, struct fid_mc **mc, void *context) +{ + struct fi_collective_addr *arg = (struct fi_collective_addr *)addr; + + return cxip_join_collective(ep, arg->coll_addr, arg->set, + flags, mc, context); +} + +struct fi_ops_cm cxip_ep_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = fi_no_setname, + .getname = cxip_ep_cm_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, + .join = _join_collective, +}; + +/* + * cxip_ep_progress() - Progress an endpoint. + */ +void cxip_ep_progress(struct fid *fid) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + struct cxip_ep_obj *ep_obj = ep->ep_obj; + + if (ep_obj->enabled) { + + ofi_genlock_lock(&ep_obj->lock); + cxip_evtq_progress(&ep_obj->rxc.rx_evtq); + cxip_evtq_progress(&ep_obj->txc.tx_evtq); + cxip_ep_ctrl_progress_locked(ep_obj); + ofi_genlock_unlock(&ep_obj->lock); + } +} + +/* + * cxip_ep_peek() - Peek at EP event queues + * + * Return whether the associated EP event queues are empty. + */ +int cxip_ep_peek(struct fid *fid) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + struct cxip_ep_obj *ep_obj = ep->ep_obj; + + if (ep_obj->txc.tx_evtq.eq && cxi_eq_peek_event(ep_obj->txc.tx_evtq.eq)) + return -FI_EAGAIN; + if (ep_obj->rxc.rx_evtq.eq && cxi_eq_peek_event(ep_obj->rxc.rx_evtq.eq)) + return -FI_EAGAIN; + + return FI_SUCCESS; +} + +/* + * fi_ep_get_unexpected_msgs() - Get unexpected message information, exposed + * via domain open ops. + */ +size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, + struct fi_cq_tagged_entry *entry, size_t count, + fi_addr_t *src_addr, size_t *ux_count) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t ret_count = 0; + + /* Synchronous implementation to return a snapshot of the unexpected + * message queue for the endpoint. + */ + if (!ux_count) + return -FI_EINVAL; + + if (ep->ep_obj->rxc.state == RXC_DISABLED) + return -FI_EOPBADSTATE; + + if (!ofi_recv_allowed(ep->rx_attr.caps)) { + CXIP_WARN("FI_RECV not enabled\n"); + return -FI_EINVAL; + } + + /* If in flow control, let that complete since + * on-loading could be in progress. 
+ */ + if (ep->ep_obj->rxc.state != RXC_ENABLED && + ep->ep_obj->rxc.state != RXC_ENABLED_SOFTWARE) { + cxip_cq_progress(ep->ep_obj->rxc.recv_cq); + return -FI_EAGAIN; + } + + ofi_genlock_lock(&ep->ep_obj->lock); + if (cxip_evtq_saturated(&ep->ep_obj->rxc.rx_evtq)) { + RXC_DBG(&ep->ep_obj->rxc, "Target HW EQ saturated\n"); + ofi_genlock_unlock(&ep->ep_obj->lock); + + return -FI_EAGAIN; + } + + /* Fill in supplied memory with what can fit */ + ret_count = cxip_build_ux_entry_info(ep, entry, count, src_addr, + ux_count); + ofi_genlock_unlock(&ep->ep_obj->lock); + + return ret_count; +} + +/* + * cxip_ep_flush_trig_reqs() - Free triggered request for the EP. + */ +void cxip_ep_flush_trig_reqs(struct cxip_ep_obj *ep_obj) +{ + ofi_genlock_lock(&ep_obj->lock); + cxip_evtq_flush_trig_reqs(&ep_obj->txc.tx_evtq); + ofi_genlock_unlock(&ep_obj->lock); +} + +/* + * cxip_txc_close() - close the TX side of endpoint object. + */ +void cxip_txc_close(struct cxip_ep *ep) +{ + struct cxip_txc *txc = &ep->ep_obj->txc; + + if (txc->send_cq) { + ofi_genlock_lock(&txc->send_cq->ep_list_lock); + fid_list_remove2(&txc->send_cq->util_cq.ep_list, + &txc->send_cq->util_cq.ep_list_lock, + &ep->ep.fid); + ofi_genlock_unlock(&txc->send_cq->ep_list_lock); + + ofi_atomic_dec32(&txc->send_cq->util_cq.ref); + } + + if (txc->send_cntr) { + fid_list_remove(&txc->send_cntr->ctx_list, + &txc->send_cntr->lock, + &ep->ep.fid); + ofi_atomic_dec32(&txc->send_cntr->ref); + } + + if (txc->read_cntr) { + fid_list_remove(&txc->read_cntr->ctx_list, + &txc->read_cntr->lock, + &ep->ep.fid); + ofi_atomic_dec32(&txc->read_cntr->ref); + } + + if (txc->write_cntr) { + fid_list_remove(&txc->write_cntr->ctx_list, + &txc->write_cntr->lock, + &ep->ep.fid); + ofi_atomic_dec32(&txc->write_cntr->ref); + } + + cxip_domain_remove_txc(txc->domain, txc); + + cxip_txc_disable(txc); +} + +/* + * cxip_rxc_close() - close the RX side of the endpoint object. + */ +void cxip_rxc_close(struct cxip_ep *ep) +{ + struct cxip_rxc *rxc = &ep->ep_obj->rxc; + + if (rxc->recv_cq) { + /* EP FID may not be found in the list if recv_cq == send_cq, + * but we still need to decrement reference. + */ + ofi_genlock_lock(&rxc->recv_cq->ep_list_lock); + fid_list_remove2(&rxc->recv_cq->util_cq.ep_list, + &rxc->recv_cq->util_cq.ep_list_lock, + &ep->ep.fid); + ofi_genlock_unlock(&rxc->recv_cq->ep_list_lock); + + ofi_atomic_dec32(&rxc->recv_cq->util_cq.ref); + } + + if (rxc->recv_cntr) { + fid_list_remove(&rxc->recv_cntr->ctx_list, + &rxc->recv_cntr->lock, + &ep->ep.fid); + ofi_atomic_dec32(&rxc->recv_cntr->ref); + } + + cxip_rxc_disable(rxc); +} + +/** + * Get TX/RX option flags. + * + * Support TX/RX context control(FI_GETOPSFLAG). + * + * @param tx_attr : TX attributes, or NULL + * @param rx_attr : RX attributes, or NULL + * @param flags : storage for returned flags + * + * @return int : 0 on success, -errno on failure + */ +int cxip_getopflags(struct fi_tx_attr *tx_attr, struct fi_rx_attr *rx_attr, + uint64_t *flags) +{ + if ((*flags & FI_TRANSMIT) && (*flags & FI_RECV)) { + CXIP_WARN("Both Tx/Rx flags cannot be specified\n"); + return -FI_EINVAL; + } else if (tx_attr && (*flags & FI_TRANSMIT)) { + *flags = tx_attr->op_flags; + } else if (rx_attr && (*flags & FI_RECV)) { + *flags = rx_attr->op_flags; + } else { + CXIP_WARN("Tx/Rx flags not specified\n"); + return -FI_EINVAL; + } + + return 0; +} + +/** + * Set TX/RX option flags. + * + * Support TX/RX control(FI_SETOPSFLAG). 
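+ *
+ * Illustrative caller-side sketch (assumed application usage): selecting
+ * delivery-complete semantics for the transmit side of an endpoint:
+ *
+ *   uint64_t flags = FI_TRANSMIT | FI_DELIVERY_COMPLETE;
+ *
+ *   fi_control(&ep->fid, FI_SETOPSFLAG, &flags);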
+ * + * @param tx_attr : TX attributes, or NULL + * @param rx_attr : RX attributes, or NULL + * @param flags : flags to set + * + * @return int : 0 on success, -errno on failure + */ +int cxip_setopflags(struct fi_tx_attr *tx_attr, struct fi_rx_attr *rx_attr, + uint64_t flags) +{ + if ((flags & FI_TRANSMIT) && (flags & FI_RECV)) { + CXIP_WARN("Both Tx/Rx flags cannot be specified\n"); + return -FI_EINVAL; + } else if (tx_attr && (flags & FI_TRANSMIT)) { + tx_attr->op_flags = flags; + tx_attr->op_flags &= ~FI_TRANSMIT; + if (!(flags & (FI_INJECT_COMPLETE | FI_TRANSMIT_COMPLETE | + FI_DELIVERY_COMPLETE))) + tx_attr->op_flags |= FI_TRANSMIT_COMPLETE; + } else if (rx_attr && (flags & FI_RECV)) { + rx_attr->op_flags = flags; + rx_attr->op_flags &= ~FI_RECV; + } else { + CXIP_WARN("Tx/Rx flags not specified\n"); + return -FI_EINVAL; + } + + return 0; +} + +/** + * Cancel RX operation + * + * Support TX/RX context cancel(). + * + * Searches the RX queue for a pending async operation with the specified + * 'context', and cancels it if still pending. + * + * @param rxc : RX context to search + * @param context : user context pointer to search for + * + * @return ssize_t : 0 on success, -errno on failure + */ +ssize_t cxip_rxc_cancel(struct cxip_rxc *rxc, void *context) +{ + if (rxc->state == RXC_DISABLED) + return -FI_EOPBADSTATE; + + return cxip_evtq_req_cancel(&rxc->rx_evtq, rxc, context, true); +} + +/* + * cxip_ep_cancel() - Cancel TX/RX operation for EP. + */ +ssize_t cxip_ep_cancel(fid_t fid, void *context) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + + /* TODO: Remove this since it requires malicious programming to + * create this condition. + */ + if (fid->fclass != FI_CLASS_EP) + return -FI_EINVAL; + + if (!ofi_recv_allowed(ep->ep_obj->caps)) + return -FI_ENOENT; + + return cxip_rxc_cancel(&ep->ep_obj->rxc, context); +} + +/* + * cxip_ep_enable() - Enable standard EP. + */ +static int cxip_ep_enable(struct fid_ep *fid_ep) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_ep_obj *ep_obj = ep->ep_obj; + int ret = FI_SUCCESS; + + ofi_genlock_lock(&ep_obj->lock); + if (ep_obj->enabled) + goto unlock; + + if (!ep_obj->av) { + CXIP_WARN("Endpoint must be bound to an AV\n"); + ret = -FI_ENOAV; + goto unlock; + } + + assert(ep_obj->domain->enabled); + + /* src_addr.pid may be C_PID_ANY at this point. */ + if (ep_obj->av_auth_key) { + ret = cxip_av_auth_key_get_vnis(ep_obj->av, &ep_obj->vnis, + &ep_obj->vni_count); + if (ret) + goto unlock; + + ret = cxip_portals_table_alloc(ep_obj->domain->lni, + ep_obj->vnis, ep_obj->vni_count, + ep_obj->src_addr.pid, + &ep_obj->ptable); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate auth key ring portals table: %d\n", + ret); + goto free_vnis; + } + + /* This is unfortunately needed to allocate a command queue. + * But, this can be changed later. 
+ */ + ep_obj->auth_key.vni = ep_obj->vnis[0]; + } else { + ret = cxip_portals_table_alloc(ep_obj->domain->lni, + &ep_obj->auth_key.vni, 1, + ep_obj->src_addr.pid, + &ep_obj->ptable); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate portals table: %d\n", + ret); + goto unlock; + } + } + + ep_obj->src_addr.pid = ep_obj->ptable->pid; + + ret = cxip_ep_ctrl_init(ep_obj); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_ep_ctrl_init returned: %d\n", ret); + goto free_portals_table; + } + + ret = cxip_zbcoll_init(ep_obj); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_zbcoll_init returned: %d\n", ret); + goto free_ep_ctrl; + } + + CXIP_DBG("EP assigned NIC: %#x VNI: %u PID: %u\n", + ep_obj->src_addr.nic, + ep_obj->auth_key.vni, + ep_obj->src_addr.pid); + + ret = cxip_txc_enable(&ep_obj->txc); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_txc_enable returned: %d\n", ret); + goto unlock; + } + + ret = cxip_rxc_enable(&ep_obj->rxc); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_rxc_enable returned: %d\n", ret); + goto unlock; + } + + ret = cxip_coll_enable(ep); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_coll_enable returned: %d\n", ret); + /* collectives will not function, but EP will */ + } + + /* Enable only appropriate API functions based on primary/secondary + * capabilities. Send/Receive requires FI_MSG or FI_TAGGED. + */ + if (ofi_send_allowed(ep->tx_attr.caps & ~FI_MSG) && + ofi_recv_allowed(ep->rx_attr.caps & ~FI_MSG)) + ep->ep.tagged = &cxip_ep_tagged_ops; + else if (ofi_send_allowed(ep->tx_attr.caps & ~FI_MSG)) + ep->ep.tagged = &cxip_ep_tagged_no_rx_ops; + else if (ofi_recv_allowed(ep->rx_attr.caps & ~FI_MSG)) + ep->ep.tagged = &cxip_ep_tagged_no_tx_ops; + + if (ofi_send_allowed(ep->tx_attr.caps & ~FI_TAGGED) && + ofi_recv_allowed(ep->rx_attr.caps & ~FI_TAGGED)) + ep->ep.msg = &cxip_ep_msg_ops; + else if (ofi_send_allowed(ep->tx_attr.caps & ~FI_TAGGED)) + ep->ep.msg = &cxip_ep_msg_no_rx_ops; + else if (ofi_recv_allowed(ep->rx_attr.caps & ~FI_TAGGED)) + ep->ep.msg = &cxip_ep_msg_no_tx_ops; + + /* Initiate requires FI_RMA or FI_ATOMIC */ + if (ofi_rma_initiate_allowed(ep->tx_attr.caps & ~FI_RMA)) + ep->ep.atomic = &cxip_ep_atomic_ops; + + if (ofi_rma_initiate_allowed(ep->tx_attr.caps & ~FI_ATOMIC)) + ep->ep.rma = &cxip_ep_rma_ops; + + ep_obj->enabled = true; + ofi_genlock_unlock(&ep_obj->lock); + + return FI_SUCCESS; + +free_ep_ctrl: + cxip_ep_ctrl_fini(ep_obj); + +free_portals_table: + cxip_portals_table_free(ep_obj->ptable); + ep_obj->ptable = NULL; +free_vnis: + if (ep_obj->vnis) { + cxip_av_auth_key_put_vnis(ep_obj->av, ep_obj->vnis, + ep_obj->vni_count); + ep_obj->vnis = NULL; + } +unlock: + ofi_genlock_unlock(&ep_obj->lock); + + return ret; +} + +/* + * cxip_ep_disable() - Disable the base EP if enabled. + */ +static void cxip_ep_disable(struct cxip_ep_obj *ep_obj) +{ + if (ep_obj->enabled) { + cxip_coll_disable(ep_obj); + cxip_zbcoll_fini(ep_obj); + cxip_ep_ctrl_fini(ep_obj); + cxip_portals_table_free(ep_obj->ptable); + if (ep_obj->vnis) { + cxip_av_auth_key_put_vnis(ep_obj->av, ep_obj->vnis, + ep_obj->vni_count); + ep_obj->vnis = NULL; + } + ep_obj->ptable = NULL; + ep_obj->enabled = false; + } +} + +/* + * cxip_free_endpoint() - Release base EP object resources and free object. + */ +int cxip_free_endpoint(struct cxip_ep *ep) +{ + struct cxip_ep_obj *ep_obj = ep->ep_obj; + int count; + + /* Each bound MR increments ref, so MRs must be removed. 
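+	 * (i.e. every MR or alias endpoint bound to this EP must be closed
+	 * first; otherwise the non-zero reference count below fails the close
+	 * with -FI_EBUSY.)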
+ */ + count = ofi_atomic_get32(&ep_obj->ref); + if (count) { + CXIP_WARN("EP refcount non-zero: %d\n", count); + return -FI_EBUSY; + } + + count = ofi_atomic_get32(&ep_obj->coll.num_mc); + if (count) { + CXIP_WARN("EP num_mc non-zero: %d\n", count); + return -FI_EBUSY; + } + + if (ep_obj->av) + cxip_av_unbind_ep(ep_obj->av, ep); + + if (ep->ep_obj->eq) { + ofi_mutex_lock(&ep_obj->eq->list_lock); + dlist_remove(&ep_obj->eq_link); + ofi_mutex_unlock(&ep_obj->eq->list_lock); + ofi_atomic_dec32(&ep_obj->eq->util_eq.ref); + } + + ofi_genlock_lock(&ep_obj->lock); + cxip_coll_close(ep_obj); + cxip_txc_close(ep); + cxip_rxc_close(ep); + cxip_ep_disable(ep_obj); + ofi_genlock_unlock(&ep_obj->lock); + + ofi_atomic_dec32(&ep_obj->domain->ref); + ofi_genlock_destroy(&ep_obj->lock); + free(ep_obj); + ep->ep_obj = NULL; + + return FI_SUCCESS; +} + +/* + * cxip_ep_close() - Close (destroy) the base EP. + */ +static int cxip_ep_close(struct fid *fid) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + int ret; + int count; + + if (ep->is_alias) { + count = ofi_atomic_get32(&ep->ep_obj->ref); + + if (count > 0) { + ofi_atomic_dec32(&ep->ep_obj->ref); + free(ep); + return FI_SUCCESS; + } + + CXIP_WARN("EP alias %p, invalid EP object refcnt %d\n", + ep, count); + return -FI_EINVAL; + } + + ret = cxip_free_endpoint(ep); + if (ret) { + CXIP_WARN("Unable to free EP object %d : %s\n", + ret, fi_strerror(-ret)); + return ret; + } + free(ep); + + return FI_SUCCESS; +} + +/* + * cxip_ep_bind_cq() - Bind the EP to the CQ resource. + */ +static int cxip_ep_bind_cq(struct cxip_ep *ep, struct cxip_cq *cq, + uint64_t flags) +{ + struct cxip_txc *txc; + struct cxip_rxc *rxc; + int ret; + + if (ep->ep_obj->domain != cq->domain) { + CXIP_WARN("Invalid CQ domain for EP\n"); + return -FI_EINVAL; + } + + if ((flags | CXIP_EP_CQ_FLAGS) != CXIP_EP_CQ_FLAGS) { + CXIP_WARN("Invalid CQ flags\n"); + return -FI_EINVAL; + } + + if (flags & FI_TRANSMIT) { + txc = &ep->ep_obj->txc; + if (txc->send_cq) { + CXIP_WARN("SEND CQ previously bound\n"); + return -FI_EINVAL; + } + + ofi_atomic_inc32(&cq->util_cq.ref); + txc->send_cq = cq; + + if (flags & FI_SELECTIVE_COMPLETION) + txc->selective_completion = 1; + if (!txc->selective_completion) + txc->attr.op_flags |= FI_COMPLETION; + + ep->tx_attr.op_flags = txc->attr.op_flags; + + /* Use CXI ep_list_lock that can be selectively optimized */ + ofi_genlock_lock(&cq->ep_list_lock); + ret = fid_list_insert2(&cq->util_cq.ep_list, + &cq->util_cq.ep_list_lock, + &ep->ep.fid); + ofi_genlock_unlock(&cq->ep_list_lock); + + if (ret) { + CXIP_WARN("EP CQ fid insert failed %d\n", ret); + ofi_atomic_dec32(&cq->util_cq.ref); + txc->send_cq = NULL; + } + } + + if (flags & FI_RECV) { + rxc = &ep->ep_obj->rxc; + if (rxc->recv_cq) { + CXIP_WARN("RECV CQ previously bound\n"); + return -FI_EINVAL; + } + + ofi_atomic_inc32(&cq->util_cq.ref); + rxc->recv_cq = cq; + + if (flags & FI_SELECTIVE_COMPLETION) + rxc->selective_completion = 1; + if (!rxc->selective_completion) + rxc->attr.op_flags |= FI_COMPLETION; + + ep->rx_attr.op_flags = rxc->attr.op_flags; + + /* Use CXI ep_list_lock that can be selectively optimized */ + ofi_genlock_lock(&cq->ep_list_lock); + ret = fid_list_insert2(&cq->util_cq.ep_list, + &cq->util_cq.ep_list_lock, + &ep->ep.fid); + ofi_genlock_unlock(&cq->ep_list_lock); + + if (ret) { + CXIP_WARN("EP CQ fid insert failed %d\n", ret); + ofi_atomic_dec32(&cq->util_cq.ref); + rxc->recv_cq = NULL; + } + } + return FI_SUCCESS; +} + +/* + * cxip_ep_bind_cntr() - Bind EP to counter 
resource + */ +static int cxip_ep_bind_cntr(struct cxip_ep *ep, struct cxip_cntr *cntr, + uint64_t flags) +{ + int ret; + + if (ep->ep_obj->domain != cntr->domain) { + CXIP_WARN("Counter domain invalid for EP\n"); + return -FI_EINVAL; + } + + if (!(flags & CXIP_EP_CNTR_FLAGS)) + return FI_SUCCESS; + + if ((flags & FI_SEND && ep->ep_obj->txc.send_cntr) || + (flags & FI_READ && ep->ep_obj->txc.read_cntr) || + (flags & FI_WRITE && ep->ep_obj->txc.write_cntr) || + (flags & FI_RECV && ep->ep_obj->rxc.recv_cntr)) { + CXIP_WARN("EP previously bound to counter\n"); + return -FI_EINVAL; + } + + ret = fid_list_insert(&cntr->ctx_list, &cntr->lock, &ep->ep.fid); + if (ret) { + CXIP_WARN("Add of EP to cntr EP list failed: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + if (flags & FI_SEND) { + ep->ep_obj->txc.send_cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + } + if (flags & FI_READ) { + ep->ep_obj->txc.read_cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + } + if (flags & FI_WRITE) { + ep->ep_obj->txc.write_cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + } + if (flags & FI_RECV) { + ep->ep_obj->rxc.recv_cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + } + + return FI_SUCCESS; +} + +/* + * cxip_ep_bind() - Bind EP resources. + */ +int cxip_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int ret; + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + struct cxip_eq *eq; + struct cxip_cq *cq; + struct cxip_av *av; + struct cxip_cntr *cntr; + + /* TODO: Remove this since it requires malicious programming to + * create this condition. + */ + if (fid->fclass != FI_CLASS_EP) + return -FI_EINVAL; + + ret = ofi_ep_bind_valid(&cxip_prov, bfid, flags); + if (ret) + return ret; + + switch (bfid->fclass) { + case FI_CLASS_EQ: + eq = container_of(bfid, struct cxip_eq, util_eq.eq_fid.fid); + ofi_atomic_inc32(&eq->util_eq.ref); + ofi_mutex_lock(&eq->list_lock); + dlist_insert_tail(&ep->ep_obj->eq_link, &eq->ep_list); + ofi_mutex_unlock(&eq->list_lock); + ep->ep_obj->eq = eq; + break; + + case FI_CLASS_CQ: + cq = container_of(bfid, struct cxip_cq, util_cq.cq_fid.fid); + ret = cxip_ep_bind_cq(ep, cq, flags); + if (ret) + return ret; + break; + + case FI_CLASS_CNTR: + cntr = container_of(bfid, struct cxip_cntr, cntr_fid.fid); + ret = cxip_ep_bind_cntr(ep, cntr, flags); + if (ret) + return ret; + break; + + case FI_CLASS_AV: + av = container_of(bfid, struct cxip_av, av_fid.fid); + ret = cxip_av_bind_ep(av, ep); + if (ret) + return ret; + ep->ep_obj->av = av; + + break; + + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +/* + * cxip_set_tclass() + */ +int cxip_set_tclass(uint32_t desired_tc, uint32_t default_tc, uint32_t *new_tc) +{ + assert(new_tc != NULL); + + if (desired_tc != FI_TC_UNSPEC) { + if (desired_tc >= FI_TC_LABEL && + desired_tc <= FI_TC_SCAVENGER) { + *new_tc = desired_tc; + } else { + CXIP_WARN("Invalid tclass\n"); + return -FI_EINVAL; + } + } else { + *new_tc = default_tc; + } + + CXIP_DBG("Set tclass to %d\n", *new_tc); + return FI_SUCCESS; +} + +/** + * provider fi_set_val()/FI_SET_VAL implementation for EP + * + * @param fid : EP fid + * @param val : parameter structure for set value operations. 
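+ *
+ * Illustrative usage sketch (assumption, not part of the original
+ * comment): applications reach this through fi_control() on the EP
+ * fid, for example:
+ *
+ *   uint32_t tc = FI_TC_LOW_LATENCY;
+ *   struct fi_fid_var var = {
+ *       .name = FI_OPT_CXI_SET_TCLASS,
+ *       .val  = &tc,
+ *   };
+ *   fi_control(&ep->fid, FI_SET_VAL, &var);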
+ * + * @return int : 0 on success, -errno on failure + */ +static inline int cxip_ep_set_val(struct cxip_ep *cxi_ep, + struct fi_fid_var *val) +{ + uint32_t *req_tclass; + uint64_t *req_order; + uint32_t new_tclass; + + if (!val->val) + return -FI_EINVAL; + + switch (val->name) { + case FI_OPT_CXI_SET_TCLASS: + req_tclass = (uint32_t *) val->val; + + if (cxip_set_tclass(*req_tclass, cxi_ep->tx_attr.tclass, + &new_tclass)) + return -FI_EINVAL; + + cxi_ep->tx_attr.tclass = new_tclass; + break; + case FI_OPT_CXI_SET_MSG_ORDER: + req_order = (uint64_t *) val->val; + + if (*req_order & ~CXIP_MSG_ORDER) { + CXIP_WARN("Invalid message order 0x%" PRIx64 "\n", + *req_order); + return -FI_EINVAL; + } + + cxi_ep->tx_attr.msg_order = *req_order; + break; + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +/* + * cxip_ep_control() - Provider EP control implementation. + */ +static int cxip_ep_control(struct fid *fid, int command, void *arg) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + struct cxip_ep *new_ep; + struct fi_alias *alias; + int ret; + + /* TODO: Remove this since it requires malicious programming to + * create this condition. + */ + if (fid->fclass != FI_CLASS_EP) + return -FI_EINVAL; + + switch (command) { + case FI_ALIAS: + if (!arg) + return -FI_EINVAL; + alias = (struct fi_alias *)arg; + if (!alias->fid) + return -FI_EINVAL; + new_ep = calloc(1, sizeof(*new_ep)); + if (!new_ep) + return -FI_ENOMEM; + + memcpy(&new_ep->tx_attr, &ep->tx_attr, + sizeof(struct fi_tx_attr)); + memcpy(&new_ep->rx_attr, &ep->rx_attr, + sizeof(struct fi_rx_attr)); + ret = cxip_setopflags(&new_ep->tx_attr, &new_ep->rx_attr, + alias->flags); + if (ret) { + free(new_ep); + return -FI_EINVAL; + } + new_ep->ep_obj = ep->ep_obj; + new_ep->is_alias = 1; + memcpy(&new_ep->ep, &ep->ep, sizeof(struct fid_ep)); + *alias->fid = &new_ep->ep.fid; + ofi_atomic_inc32(&new_ep->ep_obj->ref); + break; + case FI_GETOPSFLAG: + if (!arg) + return -FI_EINVAL; + ret = cxip_getopflags(&ep->tx_attr, &ep->rx_attr, + (uint64_t *)arg); + if (ret) + return -FI_EINVAL; + break; + case FI_SETOPSFLAG: + if (!arg) + return -FI_EINVAL; + ret = cxip_setopflags(&ep->tx_attr, &ep->rx_attr, + *(uint64_t *)arg); + if (ret) + return -FI_EINVAL; + break; + case FI_ENABLE: + return cxip_ep_enable(&ep->ep); + case FI_SET_VAL: + if (!arg) + return -FI_EINVAL; + return cxip_ep_set_val(ep, (struct fi_fid_var *) arg); + default: + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +struct fi_ops cxip_ep_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_ep_close, + .bind = cxip_ep_bind, + .control = cxip_ep_control, + .ops_open = fi_no_ops_open, +}; + +int cxip_ep_getopt_priv(struct cxip_ep *ep, int level, int optname, + void *optval, size_t *optlen) +{ + if (level != FI_OPT_ENDPOINT) + return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + if (!optval || !optlen) + return -FI_EINVAL; + if (*optlen < sizeof(size_t)) + return -FI_ETOOSMALL; + + *(size_t *)optval = ep->ep_obj->rxc.min_multi_recv; + *optlen = sizeof(size_t); + break; + + default: + return -FI_ENOPROTOOPT; + } + + return FI_SUCCESS; +} + +/* + * cxip_ep_getopt() - Return endpoint option value if supported. 
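+ *
+ * Illustrative usage sketch (assumption, not part of the original
+ * comment):
+ *
+ *   size_t min = 0;
+ *   size_t len = sizeof(min);
+ *   fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV,
+ *             &min, &len);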
+ */ +static int cxip_ep_getopt(fid_t fid, int level, int optname, void *optval, + size_t *optlen) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + + return cxip_ep_getopt_priv(ep, level, optname, optval, optlen); +} + +int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname, + const void *optval, size_t optlen) +{ + size_t min_multi_recv; + + if (level != FI_OPT_ENDPOINT) + return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + if (!optval) + return -FI_EINVAL; + + min_multi_recv = *(size_t *)optval; + + if (min_multi_recv > CXIP_EP_MAX_MULTI_RECV) { + CXIP_WARN("Maximum min_multi_recv value is: %u\n", + CXIP_EP_MAX_MULTI_RECV); + return -FI_EINVAL; + } + ep->ep_obj->rxc.min_multi_recv = min_multi_recv; + break; + + default: + return -FI_ENOPROTOOPT; + } + + return FI_SUCCESS; +} + +/* + * cxip_ep_setopt() - Set endpoint option value if supported. + */ +static int cxip_ep_setopt(fid_t fid, int level, int optname, const void *optval, + size_t optlen) +{ + struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + + return cxip_ep_setopt_priv(ep, level, optname, optval, optlen); +} + +struct fi_ops_ep cxip_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = cxip_ep_cancel, + .getopt = cxip_ep_getopt, + .setopt = cxip_ep_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +/* + * cxip_alloc_endpoint() - Allocate and initialize base EP object. + */ +int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, + struct cxip_ep_obj **ep_base_obj, void *context) +{ + int ret; + struct cxip_ep_obj *ep_obj; + struct cxip_txc *txc; + struct cxip_rxc *rxc; + uint32_t nic; + uint32_t pid; + int i; + + if (!hints || !hints->ep_attr || !hints->tx_attr || !hints->rx_attr) + return -FI_EINVAL; + + ret = ofi_prov_check_info(&cxip_util_prov, CXIP_FI_VERSION, hints); + if (ret != FI_SUCCESS) + return -FI_EINVAL; + + if (cxip_dom->auth_key_entry_max > 1 && + ((hints->caps & FI_DIRECTED_RECV) || + (hints->rx_attr->caps & FI_DIRECTED_RECV))) { + CXIP_WARN("FI_DIRECTED_RECV not supported with multiple auth key per EP\n"); + return -FI_EINVAL; + } + + ret = cxip_check_auth_key_info(hints); + if (ret) + return ret; + + nic = cxip_dom->nic_addr; + if (hints->src_addr) { + struct cxip_addr *src = hints->src_addr; + if (src->nic != nic) { + CXIP_WARN("bad src_addr NIC value\n"); + return -FI_EINVAL; + } + pid = src->pid; + } else { + pid = C_PID_ANY; + } + + ep_obj = calloc(1, sizeof(struct cxip_ep_obj)); + if (!ep_obj) + return -FI_ENOMEM; + + txc = &ep_obj->txc; + rxc = &ep_obj->rxc; + + /* For faster access */ + ep_obj->asic_ver = cxip_dom->iface->info->cassini_version; + + /* Save EP attributes from hints */ + ep_obj->caps = hints->caps; + ep_obj->ep_attr = *hints->ep_attr; + ep_obj->txq_size = hints->tx_attr->size; + ep_obj->tgq_size = hints->rx_attr->size; + ep_obj->tx_attr = *hints->tx_attr; + ep_obj->rx_attr = *hints->rx_attr; + + if (hints->ep_attr->auth_key) { + /* Auth key size is verified in ofi_prov_check_info(). */ + assert(hints->ep_attr->auth_key_size == + sizeof(struct cxi_auth_key)); + + memcpy(&ep_obj->auth_key, hints->ep_attr->auth_key, + sizeof(struct cxi_auth_key)); + + /* All EPs that share a Domain must use the same Service ID. 
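+		 *
+		 * Illustrative sketch (assumption, not part of the original
+		 * comment): the key is supplied through the fi_getinfo()
+		 * hints, roughly
+		 *
+		 *   struct cxi_auth_key key = {
+		 *       .svc_id = my_svc_id,  // hypothetical values
+		 *       .vni    = my_vni,
+		 *   };
+		 *
+		 * with hints->ep_attr->auth_key pointing at a copy of the
+		 * key and auth_key_size set to sizeof(key).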
*/ + if (ep_obj->auth_key.svc_id != cxip_dom->auth_key.svc_id) { + CXIP_WARN("Invalid svc_id: %u\n", + ep_obj->auth_key.svc_id); + ret = -FI_EINVAL; + goto err; + } + } else { + if (cxip_dom->av_auth_key) { + ep_obj->av_auth_key = true; + } else { + /* Inherit auth_key from Domain. */ + ep_obj->auth_key = cxip_dom->auth_key; + CXIP_DBG("Inherited domain auth_key\n"); + } + } + + if (cxip_set_tclass(ep_obj->tx_attr.tclass, + cxip_dom->tclass, &ep_obj->txc.tclass)) { + CXIP_WARN("Invalid tclass\n"); + ret = -FI_EINVAL; + goto err; + } + ep_obj->tx_attr.tclass = ep_obj->txc.tclass; + + /* Initialize object */ + ofi_atomic_initialize32(&ep_obj->ref, 0); + + /* Allow FI_THREAD_DOMAIN optimizaiton */ + if (cxip_dom->util_domain.threading == FI_THREAD_DOMAIN || + cxip_dom->util_domain.threading == FI_THREAD_COMPLETION) + ofi_genlock_init(&ep_obj->lock, OFI_LOCK_NONE); + else + ofi_genlock_init(&ep_obj->lock, OFI_LOCK_SPINLOCK); + + ep_obj->domain = cxip_dom; + ep_obj->src_addr.nic = nic; + ep_obj->src_addr.pid = pid; + ep_obj->fi_addr = FI_ADDR_NOTAVAIL; + + ofi_atomic_initialize32(&ep_obj->txq_ref, 0); + ofi_atomic_initialize32(&ep_obj->tgq_ref, 0); + + for (i = 0; i < CXIP_NUM_CACHED_KEY_LE; i++) { + ofi_atomic_initialize32(&ep_obj->std_mr_cache[i].ref, 0); + ofi_atomic_initialize32(&ep_obj->opt_mr_cache[i].ref, 0); + } + + dlist_init(&ep_obj->mr_list); + ep_obj->ep_attr.tx_ctx_cnt = 1; + ep_obj->ep_attr.rx_ctx_cnt = 1; + txc->ep_obj = ep_obj; + rxc->ep_obj = ep_obj; + + cxip_txc_struct_init(txc, &ep_obj->tx_attr, context); + cxip_rxc_struct_init(rxc, &ep_obj->rx_attr, context); + + txc->domain = cxip_dom; + txc->hrp_war_req = txc->ep_obj->asic_ver < CASSINI_2_0; + + rxc->domain = cxip_dom; + rxc->min_multi_recv = CXIP_EP_MIN_MULTI_RECV; + ofi_atomic_inc32(&cxip_dom->ref); + + *ep_base_obj = ep_obj; + + return FI_SUCCESS; + +err: + free(ep_obj); + + return ret; +} + +/* + * cxip_endpoint() - Provider fi_endpoint() implementation. + */ +int cxip_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **fid_ep, void *context) +{ + int ret; + struct cxip_ep *ep; + struct cxip_domain *cxip_dom = container_of(domain, struct cxip_domain, + util_domain.domain_fid); + if (!fid_ep) + return -FI_EINVAL; + + ep = calloc(1, sizeof(*ep)); + if (!ep) + return -FI_ENOMEM; + + /* Allocate and initialize the base endpoint */ + ret = cxip_alloc_endpoint(cxip_dom, info, &ep->ep_obj, context); + if (ret) { + free(ep); + return ret; + } + + /* Store EP attribures with the wrapper since values can be + * overridden by alias EP that share the same EP object. 
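+	 *
+	 * Illustrative sketch (assumption, not part of the original
+	 * comment): given the application's struct fid_ep *uep, an
+	 * alias created with
+	 *
+	 *   struct fid *alias_fid;
+	 *   fi_alias(&uep->fid, &alias_fid,
+	 *            FI_TRANSMIT | FI_TRANSMIT_COMPLETE);
+	 *
+	 * shares this ep_obj but carries its own copy of the op flags.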
+ */ + ep->tx_attr = ep->ep_obj->tx_attr; + ep->rx_attr = ep->ep_obj->rx_attr; + + ep->ep.fid.fclass = FI_CLASS_EP; + ep->ep.fid.context = context; + ep->ep.fid.ops = &cxip_ep_fi_ops; + ep->ep.ops = &cxip_ep_ops; + ep->ep.cm = &cxip_ep_cm_ops; + + /* Initialize API to not supported until EP is enabled */ + ep->ep.msg = &cxip_ep_msg_no_ops; + ep->ep.tagged = &cxip_ep_tagged_no_ops; + ep->ep.rma = &cxip_ep_rma_no_ops; + ep->ep.atomic = &cxip_ep_atomic_no_ops; + ep->ep.collective = &cxip_collective_no_ops; + + *fid_ep = &ep->ep; + + cxip_coll_init(ep->ep_obj); + cxip_domain_add_txc(ep->ep_obj->domain, &ep->ep_obj->txc); + + return FI_SUCCESS; +} diff --git a/prov/cxi/src/cxip_eq.c b/prov/cxi/src/cxip_eq.c new file mode 100644 index 00000000000..61aad506663 --- /dev/null +++ b/prov/cxi/src/cxip_eq.c @@ -0,0 +1,134 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + */ + + /* + * Notes: + * + * Implemented as an extension of util_eq. + * + * At present, the cxip_wait objects are not implemented as extensions of the + * util_wait object, so we cannot currently fully implement the EQ with wait + * states. However, the non-blocking read() and peek() functions work. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +static int cxip_eq_close(struct fid *fid) +{ + struct cxip_eq *cxi_eq; + + cxi_eq = container_of(fid, struct cxip_eq, util_eq.eq_fid.fid); + + /* May not close until all bound EPs closed */ + if (ofi_atomic_get32(&cxi_eq->util_eq.ref)) + return -FI_EBUSY; + + ofi_mutex_destroy(&cxi_eq->list_lock); + ofi_eq_cleanup(&cxi_eq->util_eq.eq_fid.fid); + free(cxi_eq); + + return FI_SUCCESS; +} + +static void cxip_eq_progress(struct cxip_eq *eq) +{ + struct cxip_ep_obj *ep_obj; + + ofi_mutex_lock(&eq->list_lock); + dlist_foreach_container(&eq->ep_list, struct cxip_ep_obj, + ep_obj, eq_link) { + cxip_coll_progress_join(ep_obj); + } + ofi_mutex_unlock(&eq->list_lock); +} + +ssize_t cxip_eq_read(struct fid_eq *eq_fid, uint32_t *event, + void *buf, size_t len, uint64_t flags) +{ + struct cxip_eq *eq; + int ret; + + eq = container_of(eq_fid, struct cxip_eq, util_eq.eq_fid.fid); + + ret = ofi_eq_read(eq_fid, event, buf, len, flags); + if (ret == -FI_EAGAIN) + cxip_eq_progress(eq); + return ret; +} + +static struct fi_ops_eq cxi_eq_ops = { + .size = sizeof(struct fi_ops_eq), + .read = cxip_eq_read, // customized + .readerr = ofi_eq_readerr, + .sread = ofi_eq_sread, + .write = ofi_eq_write, + .strerror = ofi_eq_strerror, +}; + +static struct fi_ops cxi_eq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_eq_close, // customized + .bind = fi_no_bind, + .control = ofi_eq_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_eq_attr cxip_eq_def_attr = { + .size = CXIP_EQ_DEF_SZ, + .flags = 0, + .wait_obj = FI_WAIT_FD, + .signaling_vector = 0, + .wait_set = NULL +}; + +int cxip_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, + struct fid_eq **eq, void *context) +{ + struct cxip_eq *cxi_eq; + int ret; + + cxi_eq = calloc(1, sizeof(*cxi_eq)); + if (!cxi_eq) + return -FI_ENOMEM; + + if (!attr) + cxi_eq->attr = cxip_eq_def_attr; + else + cxi_eq->attr = *attr; + + ret = ofi_eq_init(fabric, &cxi_eq->attr, &cxi_eq->util_eq.eq_fid, + context); + if (ret != FI_SUCCESS) + goto err0; + + ofi_mutex_init(&cxi_eq->list_lock); + dlist_init(&cxi_eq->ep_list); + ofi_atomic_initialize32(&cxi_eq->util_eq.ref, 0); + + /* 
custom operations */ + cxi_eq->util_eq.eq_fid.fid.ops = &cxi_eq_fi_ops; + cxi_eq->util_eq.eq_fid.ops = &cxi_eq_ops; + + *eq = &cxi_eq->util_eq.eq_fid; + + return FI_SUCCESS; +err0: + free(cxi_eq); + return ret; +} diff --git a/prov/cxi/src/cxip_evtq.c b/prov/cxi/src/cxip_evtq.c new file mode 100644 index 00000000000..68b3a99d165 --- /dev/null +++ b/prov/cxi/src/cxip_evtq.c @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2018-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include "config.h" + +#include +#include +#include + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_CQ, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_CQ, __VA_ARGS__) + +bool cxip_evtq_saturated(struct cxip_evtq *evtq) +{ + if (evtq->eq_saturated) + return true; + + /* Hardware will automatically update the EQ status writeback area, + * which includes a timestamp, once the EQ reaches a certain fill + * percentage. The EQ status timestamp is compared against cached + * versions of the previous EQ status timestamp to determine if new + * writebacks have occurred. Each time a new writeback occurs, the EQ + * is treated as saturated. + * + * Note that the previous EQ status is always updated when the + * corresponding OFI completion queue is progressed. + */ + if (evtq->eq->status->timestamp_sec > + evtq->prev_eq_status.timestamp_sec || + evtq->eq->status->timestamp_ns > + evtq->prev_eq_status.timestamp_ns) { + evtq->eq_saturated = true; + return true; + } + + return false; +} + +int cxip_evtq_adjust_reserved_fc_event_slots(struct cxip_evtq *evtq, int value) +{ + int ret; + + ret = cxil_evtq_adjust_reserved_fc(evtq->eq, value); + if (ret >= 0) + ret = 0; + + return ret; +} + +/* + * cxip_evtq_req_cancel() - Cancel one request. + * + * Cancel one Receive request. If match is true, cancel the request with + * matching op_ctx. Only Receive requests should be in the request list. + * + * Caller must hold ep_obj->lock. + */ +int cxip_evtq_req_cancel(struct cxip_evtq *evtq, void *req_ctx, + void *op_ctx, bool match) +{ + int ret = -FI_ENOENT; + struct cxip_req *req; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(&evtq->req_list, struct cxip_req, req, + evtq_entry, tmp) { + if (req->req_ctx == req_ctx && + req->type == CXIP_REQ_RECV && + !req->recv.canceled && + !req->recv.parent && + (!match || (void *)req->context == op_ctx)) { + ret = cxip_recv_cancel(req); + break; + } + } + + return ret; +} + +static void cxip_evtq_req_free_no_lock(struct cxip_req *req) +{ + struct cxip_req *table_req; + + CXIP_DBG("Freeing req: %p (ID: %d)\n", req, req->req_id); + + dlist_remove(&req->evtq_entry); + + if (req->req_id >= 0) { + table_req = (struct cxip_req *)ofi_idx_remove( + &req->evtq->req_table, req->req_id); + if (table_req != req) + CXIP_WARN("Failed to unmap request: %p\n", req); + } + + ofi_buf_free(req); +} + +/* + * cxip_evtq_flush_trig_reqs() - Flush triggered TX requests + */ +void cxip_evtq_flush_trig_reqs(struct cxip_evtq *evtq) +{ + struct cxip_req *req; + struct dlist_entry *tmp; + struct cxip_txc *txc; + + dlist_foreach_container_safe(&evtq->req_list, struct cxip_req, req, + evtq_entry, tmp) { + + if (cxip_is_trig_req(req)) { + /* If a request is triggered, the context will only be + * a TX context (never a RX context). + */ + txc = req->req_ctx; + + /* Since an event will not arrive to progress the + * request, MDs must be cleaned up now. 
+ */ + switch (req->type) { + case CXIP_REQ_RMA: + if (req->rma.local_md) + cxip_unmap(req->rma.local_md); + if (req->rma.ibuf) + cxip_txc_ibuf_free(txc, + req->rma.ibuf); + break; + + case CXIP_REQ_AMO: + if (req->amo.oper1_md) + cxip_unmap(req->amo.oper1_md); + if (req->amo.result_md) + cxip_unmap(req->amo.result_md); + if (req->amo.ibuf) + cxip_txc_ibuf_free(txc, + req->amo.ibuf); + break; + + case CXIP_REQ_SEND: + if (req->send.send_md) + cxip_unmap(req->send.send_md); + if (req->send.ibuf) + cxip_txc_ibuf_free(txc, + req->send.ibuf); + break; + + default: + CXIP_WARN("Invalid trig req type: %d\n", + req->type); + } + + ofi_atomic_dec32(&txc->otx_reqs); + cxip_evtq_req_free_no_lock(req); + } + + } +} + +/* + * cxip_evtq_req_discard() - Discard all matching requests. + * + * Mark all requests on the Completion Queue to be discarded. When a marked + * request completes, it's completion event will be dropped. This is the + * behavior defined for requests belonging to a closed Endpoint. + * + * Caller must hold ep_obj->lock. + */ +void cxip_evtq_req_discard(struct cxip_evtq *evtq, void *req_ctx) +{ + struct cxip_req *req; + int discards = 0; + + dlist_foreach_container(&evtq->req_list, struct cxip_req, req, + evtq_entry) { + if (req->req_ctx == req_ctx) { + req->discard = true; + discards++; + } + } + + if (discards) + CXIP_DBG("Marked %d requests\n", discards); +} + +/* + * cxip_evtq_req_find() - Look up a request by ID (from an event). + */ +static struct cxip_req *cxip_evtq_req_find(struct cxip_evtq *evtq, int id) +{ + return ofi_idx_at(&evtq->req_table, id); +} + +/* + * cxip_evtq_req_alloc() - Allocate a request. + * + * If remap is set, allocate a 16-bit request ID and map it to the new + * request. + * + * Caller must hold ep_obj->lock of associated EP. + */ +struct cxip_req *cxip_evtq_req_alloc(struct cxip_evtq *evtq, int remap, + void *req_ctx) +{ + struct cxip_req *req; + + req = (struct cxip_req *)ofi_buf_alloc(evtq->req_pool); + if (!req) { + CXIP_DBG("Failed to allocate request\n"); + goto out; + } + memset(req, 0, sizeof(*req)); + + if (remap) { + req->req_id = ofi_idx_insert(&evtq->req_table, req); + + /* Target command buffer IDs are 16 bits wide. */ + if (req->req_id < 0 || req->req_id >= CXIP_BUFFER_ID_MAX) { + CXIP_WARN("Failed to map request: %d\n", + req->req_id); + if (req->req_id > 0) + ofi_idx_remove(&evtq->req_table, req->req_id); + ofi_buf_free(req); + req = NULL; + goto out; + } + } else { + req->req_id = -1; + } + + CXIP_DBG("Allocated req: %p (ID: %d)\n", req, req->req_id); + req->cq = evtq->cq; + req->evtq = evtq; + req->req_ctx = req_ctx; + req->discard = false; + dlist_init(&req->evtq_entry); + dlist_insert_tail(&req->evtq_entry, &evtq->req_list); + +out: + return req; +} + +/* + * cxip_evtq_req_free() - Free a request. + * + * Caller must hold ep_obj->lock. + */ +void cxip_evtq_req_free(struct cxip_req *req) +{ + cxip_evtq_req_free_no_lock(req); +} + +/* + * cxip_evtq_event_req() - Locate a request corresponding to the Cassini event. + */ +static struct cxip_req *cxip_evtq_event_req(struct cxip_evtq *evtq, + const union c_event *event) +{ + struct cxip_req *req; + int return_code; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + req = (struct cxip_req *)event->init_short.user_ptr; + break; + case C_EVENT_UNLINK: + switch (cxi_tgt_event_rc(event)) { + /* User issued unlink events can race with put events. Assume + * C_RC_ENTRY_NOT_FOUND is this case. 
+ */ + case C_RC_ENTRY_NOT_FOUND: + return NULL; + case C_RC_OK: + break; + default: + CXIP_FATAL("Unhandled unlink return code: %d\n", + cxi_tgt_event_rc(event)); + } + + /* Fall through. */ + case C_EVENT_LINK: + case C_EVENT_GET: + case C_EVENT_PUT: + case C_EVENT_PUT_OVERFLOW: + case C_EVENT_RENDEZVOUS: + case C_EVENT_SEARCH: + req = cxip_evtq_req_find(evtq, event->tgt_long.buffer_id); + if (req) + break; + /* HW error can return zero buffer_id */ + CXIP_WARN("Invalid buffer_id: %d (%s)\n", + event->tgt_long.buffer_id, cxi_event_to_str(event)); + return_code = cxi_tgt_event_rc(event); + if (return_code != C_RC_OK) + CXIP_WARN("Hardware return code: %s (%s)\n", + cxi_rc_to_str(return_code), + cxi_event_to_str(event)); + break; + case C_EVENT_REPLY: + case C_EVENT_SEND: + if (!event->init_short.rendezvous) { + req = (struct cxip_req *)event->init_short.user_ptr; + } else { + struct cxi_rdzv_user_ptr *up = + (struct cxi_rdzv_user_ptr *) + &event->init_short.user_ptr; + req = cxip_evtq_req_find(evtq, up->buffer_id); + if (req) + break; + /* HW error can return zero buffer_id */ + CXIP_WARN("Invalid buffer_id: %d (%s)\n", + event->tgt_long.buffer_id, + cxi_event_to_str(event)); + return_code = cxi_tgt_event_rc(event); + if (return_code != C_RC_OK) + CXIP_WARN("Hardware return code: %s (%s)\n", + cxi_rc_to_str(return_code), + cxi_event_to_str(event)); + } + break; + + + req = NULL; + break; + case C_EVENT_COMMAND_FAILURE: + CXIP_FATAL("Command failure: cq=%u target=%u fail_loc=%u cmd_type=%u cmd_size=%u opcode=%u\n", + event->cmd_fail.cq_id, event->cmd_fail.is_target, + event->cmd_fail.fail_loc, + event->cmd_fail.fail_command.cmd_type, + event->cmd_fail.fail_command.cmd_size, + event->cmd_fail.fail_command.opcode); + default: + CXIP_FATAL("Invalid event type: %d\n", event->hdr.event_type); + } + + CXIP_DBG("got event: %s rc: %s (req: %p)\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event)), + req); + + return req; +} + +/* + * cxip_evtq_progress() - Progress the CXI hardware EQ specified + * + * Caller must hold ep_obj->lock. + */ +void cxip_evtq_progress(struct cxip_evtq *evtq) +{ + const union c_event *event; + struct cxip_req *req; + int ret = FI_SUCCESS; + + if (!evtq->eq || !evtq->cq) + return; + + /* The EQ status needs to be cached on each poll to be able to properly + * determine if the OFI completion queue is saturated. + */ + evtq->prev_eq_status = *evtq->eq->status; + + while ((event = cxi_eq_peek_event(evtq->eq))) { + + /* State change events can be caused due to unacked events. When + * a state change event occurs, always ack EQ. 
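+		 *
+		 * Acking here also resets unacked_events, so a state
+		 * change is never left waiting on a partially filled
+		 * ack batch (see the batch logic below).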
+ */ + if (event->hdr.event_type == C_EVENT_STATE_CHANGE) { + cxi_eq_ack_events(evtq->eq); + evtq->unacked_events = 0; + cxip_pte_state_change(evtq->cq->domain->iface, event); + } else { + + req = cxip_evtq_event_req(evtq, event); + if (req) { + ret = req->cb(req, event); + if (ret != FI_SUCCESS) + break; + } + } + + cxi_eq_next_event(evtq->eq); + + evtq->unacked_events++; + if (evtq->unacked_events >= evtq->ack_batch_size) { + cxi_eq_ack_events(evtq->eq); + evtq->unacked_events = 0; + } + } + + if (cxi_eq_get_drops(evtq->eq)) { + CXIP_WARN("EQ %d dropped event, rsvd slots %u, free slots %u\n", + evtq->eq->eqn, + evtq->eq->status->event_slots_rsrvd, + evtq->eq->status->event_slots_free); + CXIP_FATAL("H/W Event Queue overflow detected.\n"); + } + + if (ret == FI_SUCCESS) + evtq->eq_saturated = false; +} + +void cxip_evtq_fini(struct cxip_evtq *evtq) +{ + if (!evtq->eq) + return; + + cxil_destroy_evtq(evtq->eq); + + if (evtq->md) + cxil_unmap(evtq->md); + else + madvise(evtq->buf, evtq->len, MADV_DOFORK); + + if (evtq->mmap) + munmap(evtq->buf, evtq->len); + else + free(evtq->buf); + + ofi_idx_reset(&evtq->req_table); + ofi_bufpool_destroy(evtq->req_pool); + evtq->eq = NULL; +} + +static size_t cxip_evtq_get_queue_size(struct cxip_cq *cq, size_t num_events) +{ + size_t num_slots = num_events + cq->ack_batch_size; + + /* One additional event slot is needed for full queue. */ + num_slots += 1; + + /* One additional event slot is needed for EQ status. */ + num_slots += 1; + + /* Users current expect libfabric CQ size to control sizing of HW EQs. + * Honor this by using CQ size to override CXI provider requested EQ + * slots. + */ + num_slots = MAX(num_slots, cq->attr.size); + + return num_slots * C_EE_CFG_ECB_SIZE; +} + +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, + size_t num_events, size_t num_fc_events) +{ + struct cxi_eq_attr eq_attr = { + .reserved_slots = num_fc_events, + }; + struct ofi_bufpool_attr bp_attr = { + .size = sizeof(struct cxip_req), + .alignment = 8, + .chunk_cnt = 64, + .flags = OFI_BUFPOOL_NO_TRACK, + }; + size_t len; + size_t eq_len; + bool eq_passthrough = false; + int ret; + int page_size; + + assert(cq->domain->enabled); + + len = cxip_evtq_get_queue_size(cq, num_events + num_fc_events); + + /* Note max_cnt == 0 is unlimited */ + ret = ofi_bufpool_create_attr(&bp_attr, &evtq->req_pool); + if (ret) { + CXIP_WARN("Failed to create req pool: %d, %s\n", + ret, fi_strerror(-ret)); + return ret; + } + memset(&evtq->req_table, 0, sizeof(evtq->req_table)); + dlist_init(&evtq->req_list); + + /* Attempt to use 2 MiB hugepages. */ + if (!cxip_env.disable_eq_hugetlb) { + eq_len = ofi_get_aligned_size(len, 1U << 21); + evtq->buf = mmap(NULL, eq_len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | + MAP_HUGE_2MB, -1, 0); + if (evtq->buf != MAP_FAILED) { + evtq->mmap = true; + + /* If a single hugepage is used, CXI_EQ_PASSTHROUGH can + * be used. + */ + if (eq_len <= (1U << 21)) + eq_passthrough = true; + goto mmap_success; + } + + CXIP_DBG("Unable to map hugepage for EQ\n"); + } + + page_size = ofi_get_page_size(); + if (page_size < 0) + return -ofi_syserr(); + + evtq->mmap = false; + eq_len = ofi_get_aligned_size(len, page_size); + evtq->buf = aligned_alloc(page_size, eq_len); + if (!evtq->buf) { + CXIP_WARN("Unable to allocate EQ buffer\n"); + ret = -FI_ENOMEM; + goto err_free_bp; + } + +mmap_success: + /* Buffer has been allocated. Only map if needed. 
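+	 *
+	 * Only the single 2 MiB hugepage case can use CXI_EQ_PASSTHROUGH
+	 * (no cxil_map(), just MADV_DONTFORK); every other allocation is
+	 * registered with cxil_map() below.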
*/ + evtq->len = eq_len; + if (eq_passthrough) { + evtq->md = NULL; + eq_attr.flags |= CXI_EQ_PASSTHROUGH; + + ret = madvise(evtq->buf, evtq->len, MADV_DONTFORK); + if (ret) { + ret = -errno; + CXIP_WARN("madvise failed: %d\n", ret); + goto err_free_eq_buf; + } + } else { + ret = cxil_map(cq->domain->lni->lni, evtq->buf, evtq->len, + CXIP_EQ_MAP_FLAGS, NULL, &evtq->md); + if (ret) { + CXIP_WARN("Unable to map EQ buffer: %d\n", ret); + goto err_free_eq_buf; + } + } + + /* Once the EQ is at CQ fill percentage full, a status event is + * generated. When a status event occurs, the CXIP CQ is considered + * saturated until the CXI EQ is drained. + */ + eq_attr.status_thresh_base = cxip_env.cq_fill_percent; + eq_attr.status_thresh_delta = 0; + eq_attr.status_thresh_count = 1; + + eq_attr.queue = evtq->buf; + eq_attr.queue_len = evtq->len; + eq_attr.flags |= CXI_EQ_TGT_LONG | CXI_EQ_EC_DISABLE; + + /* CPU number will be ignored if invalid */ + if (cq->attr.flags & FI_AFFINITY && cq->attr.signaling_vector > 0) + eq_attr.cpu_affinity = cq->attr.signaling_vector; + + /* cq->priv_wait is NULL if not backed by wait object */ + ret = cxil_alloc_evtq(cq->domain->lni->lni, evtq->md, &eq_attr, + cq->priv_wait, NULL, &evtq->eq); + if (ret) { + CXIP_WARN("Failed to allocated EQ: %d\n", ret); + goto err_unmap_eq_buf; + } + + /* Point back to the CQ bound to the TX or RX context */ + evtq->cq = cq; + evtq->ack_batch_size = cq->ack_batch_size; + + return FI_SUCCESS; + +err_unmap_eq_buf: + if (evtq->md) + cxil_unmap(evtq->md); + else + madvise(evtq->buf, evtq->len, MADV_DOFORK); +err_free_eq_buf: + if (evtq->mmap) + munmap(evtq->buf, evtq->len); + else + free(evtq->buf); + +err_free_bp: + ofi_idx_reset(&evtq->req_table); + ofi_bufpool_destroy(evtq->req_pool); + + return ret; +} diff --git a/prov/cxi/src/cxip_fabric.c b/prov/cxi/src/cxip_fabric.c new file mode 100644 index 00000000000..c8528cf829c --- /dev/null +++ b/prov/cxi/src/cxip_fabric.c @@ -0,0 +1,93 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 DataDirect Networks, Inc. All rights reserved. 
+ * Copyright (c) 2018,2020 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "ofi_prov.h" +#include "ofi_osd.h" + +#include "cxip.h" + +int cxip_cq_def_sz = CXIP_CQ_DEF_SZ; +int cxip_eq_def_sz = CXIP_EQ_DEF_SZ; + +static int read_default_params; + +static struct fi_ops_fabric cxip_fab_ops = { + .size = sizeof(struct fi_ops_fabric), + .domain = cxip_domain, + .passive_ep = fi_no_passive_ep, + .eq_open = cxip_eq_open, + .wait_open = ofi_wait_fd_open, + .trywait = ofi_trywait, +}; + +static int cxip_fabric_close(fid_t fid) +{ + struct cxip_fabric *fab; + + fab = container_of(fid, struct cxip_fabric, util_fabric.fabric_fid); + if (ofi_atomic_get32(&fab->ref)) + return -FI_EBUSY; + + ofi_fabric_close(&fab->util_fabric); + free(fab); + + return 0; +} + +static struct fi_ops cxip_fab_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_fabric_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static void cxip_read_default_params(void) +{ + if (!read_default_params) + read_default_params = 1; +} + +int cxip_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, + void *context) +{ + struct cxip_fabric *fab; + int ret; + + fab = calloc(1, sizeof(*fab)); + if (!fab) + return -FI_ENOMEM; + + ret = ofi_fabric_init(&cxip_prov, &cxip_fabric_attr, attr, + &fab->util_fabric, context); + if (ret != FI_SUCCESS) + goto free_fab; + + cxip_read_default_params(); + + ofi_atomic_initialize32(&fab->ref, 0); + + fab->util_fabric.fabric_fid.fid.ops = &cxip_fab_fi_ops; + fab->util_fabric.fabric_fid.ops = &cxip_fab_ops; + + *fabric = &fab->util_fabric.fabric_fid; + + return 0; + +free_fab: + free(fab); + return ret; +} diff --git a/prov/cxi/src/cxip_faults.c b/prov/cxi/src/cxip_faults.c new file mode 100644 index 00000000000..04564b1bd04 --- /dev/null +++ b/prov/cxi/src/cxip_faults.c @@ -0,0 +1,70 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + */ + +/* Fault injection. */ + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) 
_CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +#if ENABLE_DEBUG + +struct cxip_fault dma_fault = { .env = "DMA_FAULT_RATE" }; +struct cxip_fault malloc_fault = { .env = "MALLOC_FAULT_RATE" }; + +static void fault_init(struct cxip_fault *fault) +{ + char *var; + float rate; + int ret; + + var = getenv(fault->env); + if (var) { + ret = sscanf(var, "%f", &rate); + if (ret == 1) { + if (rate < 0) + rate = 0; + if (rate > 1) + rate = 1; + + fault->prop = rate * RAND_MAX; + CXIP_DBG("%s: %f\n", fault->env, rate); + } + } +} + +static void fault_fini(struct cxip_fault *fault) +{ + if (fault->prop) + CXIP_WARN("%s: %ld faults injected\n", + fault->env, fault->count); +} + +void cxip_fault_inject_init(void) +{ + time_t t = time(NULL); + + CXIP_DBG("Rand seed: %lu\n", t); + srand(t); + + fault_init(&dma_fault); + fault_init(&malloc_fault); +} + +void cxip_fault_inject_fini(void) +{ + fault_fini(&dma_fault); + fault_fini(&malloc_fault); +} + +#else +void cxip_fault_inject_init(void) {} +void cxip_fault_inject_fini(void) {} +#endif diff --git a/prov/cxi/src/cxip_if.c b/prov/cxi/src/cxip_if.c new file mode 100644 index 00000000000..d037bda13a5 --- /dev/null +++ b/prov/cxi/src/cxip_if.c @@ -0,0 +1,628 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018,2020-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include "ofi_prov.h" +#include "ofi_osd.h" + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +struct slist cxip_if_list; +static struct cxil_device_list *cxi_dev_list; + +/* + * cxip_if_lookup_addr() - Return a provider NIC interface descriptor + * associated with a specified NIC address, if available. + */ +struct cxip_if *cxip_if_lookup_addr(uint32_t nic_addr) +{ + struct slist_entry *entry, *prev __attribute__ ((unused)); + struct cxip_if *if_entry; + + slist_foreach(&cxip_if_list, entry, prev) { + if_entry = container_of(entry, struct cxip_if, if_entry); + if (if_entry->info->nic_addr == nic_addr) + return if_entry; + } + + return NULL; +} + +/* + * cxip_if_lookup() - Return a provider NIC interface descriptor associated + * with a specified NIC device name, if available. + */ +struct cxip_if *cxip_if_lookup_name(const char *name) +{ + struct slist_entry *entry, *prev __attribute__ ((unused)); + struct cxip_if *if_entry; + + slist_foreach(&cxip_if_list, entry, prev) { + if_entry = container_of(entry, struct cxip_if, if_entry); + if (!strcmp(if_entry->info->device_name, name)) + return if_entry; + } + + return NULL; +} + +/* + * cxip_lni_res_count() - Return usage information for LNI resource. + */ +int cxip_lni_res_cnt(struct cxip_lni *lni, char *res_str) +{ + struct dirent *de; + char path[100]; + uint32_t c = 0; + DIR *dr; + + sprintf(path, "/sys/kernel/debug/cxi/cxi%u/lni/%u/%s", + lni->iface->info->dev_id, lni->lni->id, res_str); + + dr = opendir(path); + if (!dr) + return 0; + + while ((de = readdir(dr))) { + if (strncmp(de->d_name, ".", 1)) + c++; + } + + closedir(dr); + + return c; +} + +/* + * cxip_lni_res_dump() - Dump resource usage information for an LNI. + */ +void cxip_lni_res_dump(struct cxip_lni *lni) +{ + DIR *dr; + uint32_t pt_count = 0; + uint32_t cq_count = 0; + uint32_t eq_count = 0; + uint32_t ct_count = 0; + uint32_t ac_count = 0; + + /* Check if debugfs is available. 
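+	 *
+	 * The per-resource counts below are read from debugfs entries of
+	 * the form /sys/kernel/debug/cxi/cxi<dev>/lni/<rgid>/<res>, where
+	 * <res> is one of cq, pt, eq, ct or ac (see cxip_lni_res_cnt()).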
*/ + dr = opendir("/sys/kernel/debug/cxi"); + if (!dr) { + CXIP_INFO("Resource usage info unavailable: %s RGID: %u.\n", + lni->iface->info->device_name, lni->lni->id); + return; + } + + closedir(dr); + + cq_count = cxip_lni_res_cnt(lni, "cq"); + pt_count = cxip_lni_res_cnt(lni, "pt"); + eq_count = cxip_lni_res_cnt(lni, "eq"); + ct_count = cxip_lni_res_cnt(lni, "ct"); + ac_count = cxip_lni_res_cnt(lni, "ac"); + + CXIP_INFO("Resource usage: %s RGID: %u CQ: %u PTE: %u EQ: %u CT: %u AC: %u\n", + lni->iface->info->device_name, lni->lni->id, cq_count, + pt_count, eq_count, ct_count, ac_count); +} + +/* + * cxip_get_if() - Get a reference to the device interface associated with a + * provided NIC address. A process can open each interface once to support many + * FI Domains. An IF is used to allocate the various device resources including + * CMDQs, EVTQs, and PtlTEs. + */ +int cxip_get_if(uint32_t nic_addr, struct cxip_if **iface) +{ + int ret; + struct cxip_if *if_entry; + + /* The IF list device info is static, no need to lock */ + if_entry = cxip_if_lookup_addr(nic_addr); + if (!if_entry) { + CXIP_DBG("interface not found\n"); + return -FI_ENODEV; + } + + if (!if_entry->link) { + CXIP_INFO("Interface %s link down.\n", + if_entry->info->device_name); + return -FI_ENODEV; + } + + /* Lock the IF to serialize opening the device */ + ofi_spin_lock(&if_entry->lock); + + if (!if_entry->dev) { + ret = cxil_open_device(if_entry->info->dev_id, &if_entry->dev); + if (ret) { + CXIP_WARN("Failed to open CXI Device, ret: %d\n", ret); + ret = -FI_ENODEV; + goto unlock; + } + + CXIP_DBG("Opened %s\n", if_entry->info->device_name); + } + + ofi_atomic_inc32(&if_entry->ref); + *iface = if_entry; + + ofi_spin_unlock(&if_entry->lock); + + return FI_SUCCESS; + +unlock: + ofi_spin_unlock(&if_entry->lock); + + return ret; +} + +/* + * cxip_put_if() - Drop a reference to the device interface. 
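+ *
+ * Pairs with cxip_get_if(); the underlying CXI device is closed once
+ * the last reference is dropped.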
+ */ +void cxip_put_if(struct cxip_if *iface) +{ + ofi_spin_lock(&iface->lock); + + if (!ofi_atomic_dec32(&iface->ref)) { + cxil_close_device(iface->dev); + iface->dev = NULL; + + CXIP_DBG("Closed %s\n", iface->info->device_name); + } + + ofi_spin_unlock(&iface->lock); +} + +int cxip_if_valid_rgroup_vni(struct cxip_if *iface, unsigned int rgroup_id, + unsigned int vni) +{ + struct cxi_svc_desc svc_desc; + bool vni_found = false; + int ret; + int i; + + ret = cxil_get_svc(iface->dev, rgroup_id, &svc_desc); + if (ret) { + CXIP_WARN("cxil_get_svc with %s and rgroup_id %d failed: %d:%s\n", + iface->dev->info.device_name, rgroup_id, ret, + strerror(-ret)); + return -FI_EINVAL; + } + + if (svc_desc.restricted_vnis) { + for (i = 0; i < svc_desc.num_vld_vnis; i++) { + if (vni == svc_desc.vnis[i]) { + vni_found = true; + break; + } + } + + if (!vni_found) { + CXIP_WARN("Invalid VNI %d for %s and svc_id %d\n", + vni, iface->dev->info.device_name, + rgroup_id); + return -FI_EINVAL; + } + } + + return FI_SUCCESS; +} + +/* + * cxip_alloc_lni() - Allocate an LNI + */ +int cxip_alloc_lni(struct cxip_if *iface, uint32_t svc_id, + struct cxip_lni **if_lni) +{ + struct cxip_lni *lni; + int ret; + + lni = calloc(1, sizeof(*lni)); + if (!lni) { + CXIP_WARN("Unable to allocate LNI\n"); + return -FI_ENOMEM; + } + + ret = cxil_alloc_lni(iface->dev, &lni->lni, svc_id); + if (ret) { + CXIP_WARN("Failed to allocate LNI, ret: %d\n", ret); + ret = -FI_ENOSPC; + goto free_lni; + } + + lni->iface = iface; + ofi_spin_init(&lni->lock); + dlist_init(&lni->remap_cps); + + CXIP_DBG("Allocated LNI, %s RGID: %u\n", + lni->iface->info->device_name, lni->lni->id); + + *if_lni = lni; + + return FI_SUCCESS; + +free_lni: + free(lni); + + return ret; +} + +/* + * cxip_free_lni() - Free an LNI + */ +void cxip_free_lni(struct cxip_lni *lni) +{ + int ret; + int i; + struct dlist_entry *tmp; + struct cxip_remap_cp *sw_cp; + + cxip_lni_res_dump(lni); + + CXIP_DBG("Freeing LNI, %s RGID: %u\n", + lni->iface->info->device_name, lni->lni->id); + + dlist_foreach_container_safe(&lni->remap_cps, struct cxip_remap_cp, + sw_cp, remap_entry, tmp) + free(sw_cp); + + for (i = 0; i < lni->n_cps; i++) { + ret = cxil_destroy_cp(lni->hw_cps[i]); + if (ret) + CXIP_WARN("Failed to destroy CP: %d\n", ret); + } + + ret = cxil_destroy_lni(lni->lni); + if (ret) + CXIP_WARN("Failed to destroy LNI: %d\n", ret); + + free(lni); +} + +/* + * netdev_ama_check - Return true if the netdev has an AMA installed. + */ +static bool netdev_ama_check(char *netdev) +{ + int rc; + char addr_path[FI_PATH_MAX]; + FILE *f; + int val; + + rc = snprintf(addr_path, FI_PATH_MAX, + "/sys/class/net/%s/addr_assign_type", + netdev); + if (rc < 0) + return false; + + f = fopen(addr_path, "r"); + if (!f) + return false; + + rc = fscanf(f, "%d", &val); + + fclose(f); + + if (rc != 1) + return false; + + /* Check for temporary address */ + if (val != 3) + return false; + + rc = snprintf(addr_path, FI_PATH_MAX, "/sys/class/net/%s/address", + netdev); + if (rc < 0) + return false; + + f = fopen(addr_path, "r"); + if (!f) + return false; + + rc = fscanf(f, "%x:%*x:%*x:%*x:%*x", &val); + + fclose(f); + + if (rc != 1) + return false; + + /* Check for locally administered unicast address */ + if ((val & 0x3) != 0x2) + return false; + + return true; +} + +/* + * netdev_link - Return netdev link state. 
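+ *
+ * Reads /sys/class/net/<netdev>/operstate and, when the state is
+ * reported as "unknown", falls back to /sys/class/net/<netdev>/carrier.
+ * Returns 0 with *link set to 1 (up) or 0 (down), or -1 on error.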
+ */ +static int netdev_link(char *netdev, int *link) +{ + int rc; + char path[FI_PATH_MAX]; + FILE *f; + char state[20]; + int carrier; + + rc = snprintf(path, FI_PATH_MAX, "/sys/class/net/%s/operstate", + netdev); + if (rc < 0) + return -1; + + f = fopen(path, "r"); + if (!f) + return -1; + + rc = fscanf(f, "%20s", state); + + fclose(f); + + if (!strncmp(state, "up", strlen("up"))) { + *link = 1; + return 0; + } + + if (strncmp(state, "unknown", strlen("unknown"))) { + /* State is not not up or unknown, link is down. */ + *link = 0; + return 0; + } + + /* operstate is unknown, must check carrier. */ + rc = snprintf(path, FI_PATH_MAX, "/sys/class/net/%s/carrier", + netdev); + if (rc < 0) + return -1; + + f = fopen(path, "r"); + if (!f) + return -1; + + rc = fscanf(f, "%d", &carrier); + + fclose(f); + + if (carrier) + *link = 1; + else + *link = 0; + + return 0; +} + +/* + * netdev_speed - Return netdev interface speed. + */ +static int netdev_speed(char *netdev, int *speed) +{ + int rc; + char path[FI_PATH_MAX]; + FILE *f; + int val; + + rc = snprintf(path, FI_PATH_MAX, "/sys/class/net/%s/speed", + netdev); + if (rc < 0) + return -1; + + f = fopen(path, "r"); + if (!f) + return -1; + + rc = fscanf(f, "%u", &val); + + fclose(f); + + if (rc != 1) + return -1; + + *speed = val; + + return 0; +} + +/* + * netdev_netdev - Look up the netdev associated with an RDMA device file. + */ +static int netdev_lookup(struct cxil_devinfo *info, char **netdev) +{ + glob_t globbuf; + int rc; + int count; + int i; + char if_path[FI_PATH_MAX]; + char addr_path[FI_PATH_MAX]; + char *addr; + unsigned int dom; + unsigned int bus; + unsigned int dev; + unsigned int func; + + rc = glob("/sys/class/net/*", 0, NULL, &globbuf); + if (rc) + return -1; + + count = globbuf.gl_pathc; + + for (i = 0; i < count; i++) { + rc = snprintf(if_path, FI_PATH_MAX, "%s/device", + globbuf.gl_pathv[i]); + if (rc < 0) + goto free_glob; + + rc = readlink(if_path, addr_path, FI_PATH_MAX-1); + if (rc < 0) { + /* A virtual device, like a bridge, doesn't have a + * device link. + */ + if (errno == ENOENT || errno == ENOTDIR) + continue; + + goto free_glob; + } + addr_path[rc] = '\0'; + + addr = basename(addr_path); + + rc = sscanf(addr, "%x:%x:%x.%x", &dom, &bus, &dev, &func); + if (rc != 4) + continue; + + if (info->pci_domain == dom && + info->pci_bus == bus && + info->pci_device == dev && + info->pci_function == func) { + *netdev = strdup(basename(globbuf.gl_pathv[i])); + if (!*netdev) + goto free_glob; + + globfree(&globbuf); + return 0; + } + } + +free_glob: + globfree(&globbuf); + + return -1; +} + +/* + * cxip_query_if_list() - Populate static IF data during initialization. + */ +static void cxip_query_if_list(struct slist *if_list) +{ + struct cxip_if *if_entry; + int ret; + int i; + char *netdev; + int speed = 0; + int link = 0; + + slist_init(if_list); + + /* The cxi_dev_list is freed in the provider IF destructor */ + ret = cxil_get_device_list(&cxi_dev_list); + if (ret) { + CXIP_WARN("cxil_get_device_list failed\n"); + return; + } + + if (cxi_dev_list->count == 0) { + CXIP_DBG("No IFs found\n"); + return; + } + + if (cxi_dev_list->info[0].min_free_shift) { + CXIP_WARN("Non-zero min_free_shift not supported\n"); + return; + } + + for (i = 0; i < cxi_dev_list->count; i++) { + /* Ignore cxi devices not included in device name string. 
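+		 *
+		 * cxip_env.device_name is assumed here to be populated from
+		 * the FI_CXI_DEVICE_NAME environment variable, e.g. a list
+		 * such as "cxi0,cxi1"; devices not named in it are skipped.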
*/ + if (cxip_env.device_name && + (strstr(cxip_env.device_name, + cxi_dev_list->info[i].device_name) == NULL)) + continue; + + if (!getenv("CXIP_SKIP_RH_CHECK") && + cxi_dev_list->info[i].device_platform == C_PLATFORM_ASIC && + !cxil_rh_running(&cxi_dev_list->info[i])) { + CXIP_LOG("CXI retry handler not running for device: %s\n", + cxi_dev_list->info[i].device_name); + continue; + } + + ret = netdev_lookup(&cxi_dev_list->info[i], &netdev); + if (ret) { + CXIP_LOG("CXI netdev not found for device: %s\n", + cxi_dev_list->info[i].device_name); + netdev = strdup("DNE"); + } else { + ret = netdev_link(netdev, &link); + if (ret) + CXIP_WARN("Failed to read netdev link: %s\n", + netdev); + + ret = netdev_speed(netdev, &speed); + if (ret) + CXIP_WARN("Failed to read netdev speed: %s\n", + netdev); + + CXIP_DBG("Device %s has netdev %s (link: %u speed: %u)\n", + cxi_dev_list->info[i].device_name, + netdev, link, speed); + } + + if (!getenv("CXIP_SKIP_AMA_CHECK") && + !netdev_ama_check(netdev)) { + CXIP_LOG("CXI device %s, netdev %s AMA not recognized\n", + cxi_dev_list->info[i].device_name, + netdev); + free(netdev); + continue; + } + + free(netdev); + + if_entry = calloc(1, sizeof(struct cxip_if)); + if_entry->info = &cxi_dev_list->info[i]; + if_entry->link = link; + if_entry->speed = speed; + + ofi_atomic_initialize32(&if_entry->ref, 0); + dlist_init(&if_entry->ptes); + ofi_spin_init(&if_entry->lock); + slist_insert_tail(&if_entry->if_entry, if_list); + } +} + +/* + * cxip_free_if_list() - Tears down static IF data. + */ +static void cxip_free_if_list(struct slist *if_list) +{ + struct slist_entry *entry; + struct cxip_if *if_entry; + + while (!slist_empty(if_list)) { + entry = slist_remove_head(if_list); + if_entry = container_of(entry, struct cxip_if, if_entry); + ofi_spin_destroy(&if_entry->lock); + free(if_entry); + } + + cxil_free_device_list(cxi_dev_list); +} + +/* + * cxip_if_init() - The provider IF constructor. Initializes static IF data. + */ +void cxip_if_init(void) +{ + cxip_query_if_list(&cxip_if_list); +} + +/* + * cxip_if_fini() - The provider IF destructor. Tears down IF data. + */ +void cxip_if_fini(void) +{ + cxip_free_if_list(&cxip_if_list); +} diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c new file mode 100644 index 00000000000..25c392fa6b2 --- /dev/null +++ b/prov/cxi/src/cxip_info.c @@ -0,0 +1,1898 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2019,2022 Hewlett Packard Enterprise Development LP + */ + +/* CXI fabric discovery implementation. */ + +#include "ofi_prov.h" +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_INFO(...) 
_CXIP_INFO(FI_LOG_FABRIC, __VA_ARGS__) + +char cxip_prov_name[] = "cxi"; + +struct fi_fabric_attr cxip_fabric_attr = { + .prov_version = CXIP_PROV_VERSION, + .name = cxip_prov_name, +}; + +/* No ODP, provider specified MR keys */ +struct fi_domain_attr cxip_prov_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_PROV_KEY | FI_MR_ALLOCATED | FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_PROV_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + .max_ep_auth_key = 1, +}; + +/* ODP, provider specified MR keys */ +struct fi_domain_attr cxip_odp_prov_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_PROV_KEY | FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_PROV_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + .max_ep_auth_key = 1, +}; + +/* No ODP, client specified MR keys */ +struct fi_domain_attr cxip_client_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED, + .mr_key_size = CXIP_MR_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + .max_ep_auth_key = 1, +}; + +/* ODP, client specified MR keys */ +struct fi_domain_attr cxip_odp_client_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + .max_ep_auth_key = 1, +}; + +/* No ODP, provider specified MR keys */ +struct fi_domain_attr cxip_prov_key_multi_auth_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + 
.control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_PROV_KEY | FI_MR_ALLOCATED | FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_PROV_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + + /* Set to the number of VNIs supported by a single CXI service. */ + .max_ep_auth_key = 4, +}; + +/* ODP, provider specified MR keys */ +struct fi_domain_attr cxip_odp_prov_key_multi_auth_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_PROV_KEY | FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_PROV_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + + /* Set to the number of VNIs supported by a single CXI service. */ + .max_ep_auth_key = 4, +}; + +/* No ODP, client specified MR keys */ +struct fi_domain_attr cxip_client_key_multi_auth_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED, + .mr_key_size = CXIP_MR_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + + /* Set to the number of VNIs supported by a single CXI service. */ + .max_ep_auth_key = 4, +}; + +/* ODP, client specified MR keys */ +struct fi_domain_attr cxip_odp_client_key_multi_auth_key_domain_attr = { + .name = NULL, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_MANUAL, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_ENDPOINT, + .mr_key_size = CXIP_MR_KEY_SIZE, + .cq_data_size = CXIP_REMOTE_CQ_DATA_SZ, + .cq_cnt = 32, + .ep_cnt = 128, + .tx_ctx_cnt = CXIP_EP_MAX_TX_CNT, + .rx_ctx_cnt = CXIP_EP_MAX_RX_CNT, + .max_ep_tx_ctx = CXIP_EP_MAX_TX_CNT, + .max_ep_rx_ctx = CXIP_EP_MAX_RX_CNT, + .max_ep_stx_ctx = 0, + .max_ep_srx_ctx = 0, + .cntr_cnt = 16, + .mr_iov_limit = 1, + .mr_cnt = 100, + .caps = CXIP_DOM_CAPS, + .auth_key_size = sizeof(struct cxi_auth_key), + + /* Set to the number of VNIs supported by a single CXI service. 
*/ + .max_ep_auth_key = 4, +}; + +struct fi_ep_attr cxip_ep_attr = { + .type = FI_EP_RDM, + .protocol = FI_PROTO_CXI, + .protocol_version = CXIP_WIRE_PROTO_VERSION, + .max_msg_size = CXIP_EP_MAX_MSG_SZ, + .max_order_raw_size = -1, + .max_order_war_size = -1, + .max_order_waw_size = -1, + .mem_tag_format = FI_TAG_GENERIC >> (64 - CXIP_TAG_WIDTH), + .auth_key_size = sizeof(struct cxi_auth_key), +}; + +struct fi_tx_attr cxip_tx_attr = { + .caps = CXIP_EP_CAPS & ~OFI_IGNORED_TX_CAPS, + .op_flags = CXIP_TX_OP_FLAGS, + .msg_order = CXIP_MSG_ORDER, + .inject_size = CXIP_INJECT_SIZE, + .size = CXIP_MAX_TX_SIZE, + .iov_limit = 1, + .rma_iov_limit = 1, +}; + +struct fi_rx_attr cxip_rx_attr = { + .caps = CXIP_EP_CAPS & ~OFI_IGNORED_RX_CAPS, + .op_flags = CXIP_RX_OP_FLAGS, + .msg_order = CXIP_MSG_ORDER, + .comp_order = FI_ORDER_NONE, + .total_buffered_recv = CXIP_UX_BUFFER_SIZE, + .size = CXIP_MAX_RX_SIZE, + .iov_limit = 1, +}; + +struct fi_tx_attr cxip_multi_auth_key_tx_attr = { + .caps = CXIP_EP_CAPS & ~OFI_IGNORED_TX_CAPS & ~FI_DIRECTED_RECV, + .op_flags = CXIP_TX_OP_FLAGS, + .msg_order = CXIP_MSG_ORDER, + .inject_size = CXIP_INJECT_SIZE, + .size = CXIP_MAX_TX_SIZE, + .iov_limit = 1, + .rma_iov_limit = 1, +}; + +struct fi_rx_attr cxip_multi_auth_key_rx_attr = { + .caps = CXIP_EP_CAPS & ~OFI_IGNORED_RX_CAPS & ~FI_DIRECTED_RECV, + .op_flags = CXIP_RX_OP_FLAGS, + .msg_order = CXIP_MSG_ORDER, + .comp_order = FI_ORDER_NONE, + .total_buffered_recv = CXIP_UX_BUFFER_SIZE, + .size = CXIP_MAX_RX_SIZE, + .iov_limit = 1, +}; + +/* The CXI provider supports multiple operating modes by exporting + * several fi_info structures. The application can filter the fi_info + * with hints, or choose the fi_info based on desired application + * behavior. Matched fi_info are returned in the order of highest + * to lowest provider performance.: + * + * 1. Pinned memory with provider MR Keys + * 2. Pinned memory with application provided MR Keys + * 3. On-Demand paging with provider MR Keys + * 4. 
On-Demand paging with application provided MR Keys + */ +struct fi_info cxip_infos[] = { + { + .caps = CXIP_CAPS, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_tx_attr, + .rx_attr = &cxip_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_prov_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_tx_attr, + .rx_attr = &cxip_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_client_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_tx_attr, + .rx_attr = &cxip_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_odp_prov_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_tx_attr, + .rx_attr = &cxip_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_odp_client_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS & ~FI_DIRECTED_RECV, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_multi_auth_key_tx_attr, + .rx_attr = &cxip_multi_auth_key_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_prov_key_multi_auth_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS & ~FI_DIRECTED_RECV, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_multi_auth_key_tx_attr, + .rx_attr = &cxip_multi_auth_key_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_client_key_multi_auth_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS & ~FI_DIRECTED_RECV, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_multi_auth_key_tx_attr, + .rx_attr = &cxip_multi_auth_key_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_odp_prov_key_multi_auth_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, + { + .caps = CXIP_CAPS & ~FI_DIRECTED_RECV, + .addr_format = FI_ADDR_CXI, + .tx_attr = &cxip_multi_auth_key_tx_attr, + .rx_attr = &cxip_multi_auth_key_rx_attr, + .ep_attr = &cxip_ep_attr, + .domain_attr = &cxip_odp_client_key_multi_auth_key_domain_attr, + .fabric_attr = &cxip_fabric_attr, + }, +}; + +struct fi_provider cxip_prov; + +struct util_prov cxip_util_prov = { + .prov = &cxip_prov, + .info = NULL, + .flags = 0, +}; + +int s_page_size; + +/* Get _SC_PAGESIZE */ +static void set_system_page_size(void) +{ + if (!s_page_size) + s_page_size = sysconf(_SC_PAGESIZE); +} + +/* + * cxip_info_alloc() - Create a fabric info structure for the CXI interface. + */ +static int cxip_info_alloc(struct cxip_if *nic_if, int info_index, + struct fi_info **info) +{ + int ret; + struct fi_info *fi; + struct cxip_addr addr = {}; + + /* If the forcing of ODP mode was requested, remove any info that + * supports FI_MR_ALLOCATED. + */ + if (cxip_env.force_odp && + cxip_infos[info_index].domain_attr->mr_mode & FI_MR_ALLOCATED) + return -FI_ENODATA; + + /* For now only expose ODP fi_info if ODP selection is enabled. + * TODO: When ODP is always available remove this filter. 
+ */ + if (!(cxip_infos[info_index].domain_attr->mr_mode & FI_MR_ALLOCATED) && + !cxip_env.odp) + return -FI_ENODATA; + + fi = fi_dupinfo(&cxip_infos[info_index]); + if (!fi) + return -FI_ENOMEM; + + fi->domain_attr->name = strdup(nic_if->info->device_name); + if (!fi->domain_attr->name) + return -ENOMEM; + + addr.nic = nic_if->info->nic_addr; + addr.pid = C_PID_ANY; + fi->src_addr = mem_dup(&addr, sizeof(addr)); + if (!fi->src_addr) { + ret = -ENOMEM; + goto err; + } + fi->src_addrlen = sizeof(addr); + + ret = cxip_nic_alloc(nic_if, &fi->nic); + if (ret != FI_SUCCESS) + goto err; + + *info = fi; + return FI_SUCCESS; + +err: + fi_freeinfo((void *)fi); + return ret; +} + +/* + * cxip_info_init() - Initialize fabric info for each CXI interface. + */ +static int cxip_info_init(void) +{ + struct slist_entry *entry, *prev __attribute__ ((unused)); + struct cxip_if *tmp; + struct cxip_if *nic_if; + struct fi_info **fi_list = (void *)&cxip_util_prov.info; + struct fi_info *fi; + int ndx; + int ret; + + slist_foreach(&cxip_if_list, entry, prev) { + /* Bit hacky... but use cxip_if list entry as input to + * cxip_get_if(). cxip_get_if() will init a cxil_dev which is + * used to build a NIC info. + */ + tmp = container_of(entry, struct cxip_if, if_entry); + ret = cxip_get_if(tmp->info->nic_addr, &nic_if); + if (ret != FI_SUCCESS) + continue; + + for (ndx = 0; ndx < ARRAY_SIZE(cxip_infos); ndx++) { + ret = cxip_info_alloc(nic_if, ndx, &fi); + if (ret == -FI_ENODATA) + continue;; + if (ret != FI_SUCCESS) { + cxip_put_if(nic_if); + goto free_info; + } + + CXIP_DBG("%s info created\n", + nic_if->info->device_name); + *fi_list = fi; + fi_list = &(fi->next); + } + } + + return FI_SUCCESS; + +free_info: + fi_freeinfo((void *)cxip_util_prov.info); + return ret; +} + +static bool cxip_env_validate_device_token(const char *device_token) +{ + unsigned int device_index; + unsigned int device_strlen; + + /* Only allow for device tokens of cxi0 - cxi99. */ + device_strlen = strlen(device_token); + if (device_strlen != 4 && device_strlen != 5) + return false; + + /* Ensure device token is of cxi## format. */ + if (sscanf(device_token, "cxi%u", &device_index) != 1) + return false; + + /* Ensure that a device string length of 5 chars is only true if the + * device index is greater than 9. 
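+ * For example, "cxi7" and "cxi42" are accepted, while "cxi07" is rejected.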
+ */ + if (device_strlen == 5 && device_index < 10) + return false; + + return true; +} + +static int cxip_env_validate_device_name(const char *device_name) +{ + const char *device_token; + char *device_name_copy; + int ret = FI_SUCCESS; + + device_name_copy = malloc(strlen(device_name) + 1); + if (!device_name_copy) + return -FI_ENOMEM; + + strcpy(device_name_copy, device_name); + + device_token = strtok(device_name_copy, ","); + while (device_token != NULL) { + if (!cxip_env_validate_device_token(device_token)) { + ret = -FI_EINVAL; + break; + } + + device_token = strtok(NULL, ","); + } + + free(device_name_copy); + + return ret; +} + +static int cxip_env_validate_url(const char *url) +{ + /* Trying to validate further is likely to generate false failures */ + if (url && strlen(url) > 7 && !strncasecmp(url, "http://", 7)) + return FI_SUCCESS; + if (url && strlen(url) > 8 && !strncasecmp(url, "https://", 8)) + return FI_SUCCESS; + return -FI_EINVAL; +} + +static const char * const cxip_rdzv_proto_strs[] = { + [CXIP_RDZV_PROTO_DEFAULT] = "default", + [CXIP_RDZV_PROTO_ALT_READ] = "alt_read", + [CXIP_RDZV_PROTO_ALT_WRITE] = "alt_write", +}; + +const char *cxip_rdzv_proto_to_str(enum cxip_rdzv_proto proto) +{ + if (proto > CXIP_RDZV_PROTO_ALT_WRITE) + return NULL; + + return cxip_rdzv_proto_strs[proto]; +} + +/* Provider environment variables are FI_CXI_{NAME} in all-caps */ +struct cxip_environment cxip_env = { + .odp = false, + .force_odp = false, + .ats = false, + .iotlb = true, + .ats_mlock_mode = CXIP_ATS_MLOCK_ALL, + .fork_safe_requested = false, + .rx_match_mode = CXIP_PTLTE_DEFAULT_MODE, + .rdzv_threshold = CXIP_RDZV_THRESHOLD, + .rdzv_get_min = 2049, /* Avoid single packet Gets */ + .rdzv_eager_size = CXIP_RDZV_THRESHOLD, + .rdzv_aligned_sw_rget = 1, + .disable_non_inject_msg_idc = 0, + .disable_host_register = 0, + .oflow_buf_size = CXIP_OFLOW_BUF_SIZE, + .oflow_buf_min_posted = CXIP_OFLOW_BUF_MIN_POSTED, + .oflow_buf_max_cached = CXIP_OFLOW_BUF_MAX_CACHED, + .safe_devmem_copy_threshold = CXIP_SAFE_DEVMEM_COPY_THRESH, + .optimized_mrs = true, + .mr_match_events = false, + .prov_key_cache = true, + .llring_mode = CXIP_LLRING_IDLE, + .cq_policy = CXI_CQ_UPDATE_LOW_FREQ_EMPTY, + .default_vni = 10, + .eq_ack_batch_size = 32, + .req_buf_size = CXIP_REQ_BUF_SIZE, + .req_buf_min_posted = CXIP_REQ_BUF_MIN_POSTED, + .req_buf_max_cached = CXIP_REQ_BUF_MAX_CACHED, + .msg_offload = 1, + .msg_lossless = 0, + .sw_rx_tx_init_max = CXIP_SW_RX_TX_INIT_MAX_DEFAULT, + .hybrid_preemptive = 0, + .hybrid_recv_preemptive = 0, + .hybrid_posted_recv_preemptive = 0, + .hybrid_unexpected_msg_preemptive = 0, + .fc_retry_usec_delay = 1000, + .ctrl_rx_eq_max_size = 67108864, + .default_cq_size = CXIP_CQ_DEF_SZ, + .default_tx_size = CXIP_DEFAULT_TX_SIZE, + .default_rx_size = CXIP_DEFAULT_RX_SIZE, + .disable_eq_hugetlb = false, + .zbcoll_radix = 2, + .cq_fill_percent = 50, + .enable_unrestricted_end_ro = true, + .rget_tc = FI_TC_UNSPEC, + .cacheline_size = CXIP_DEFAULT_CACHE_LINE_SIZE, + .coll_job_id = NULL, + .coll_job_step_id = NULL, + .coll_mcast_token = NULL, + .hwcoll_addrs_per_job = 0, + .hwcoll_min_nodes = -1, + .coll_fabric_mgr_url = NULL, + .coll_retry_usec = CXIP_COLL_MAX_RETRY_USEC, + .coll_timeout_usec = CXIP_COLL_MAX_TIMEOUT_USEC, + .coll_use_dma_put = false, + .telemetry_rgid = -1, + .disable_hmem_dev_register = 0, + .ze_hmem_supported = 0, + .rdzv_proto = CXIP_RDZV_PROTO_DEFAULT, + .enable_trig_op_limit = false, +}; + +static void cxip_env_init(void) +{ + char *param_str = NULL; + size_t min_free; + 
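+ + /* Usage note (illustrative): any of the parameters defined below can be + * set in the launch environment using the FI_CXI_ prefix described above, + * for example: + * + * FI_CXI_RX_MATCH_MODE=software FI_CXI_RDZV_THRESHOLD=16384 ./app + */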
int ret; + + gethostname(cxip_env.hostname, sizeof(cxip_env.hostname)); + + fi_param_define(&cxip_prov, "rget_tc", FI_PARAM_STRING, + "Traffic class used for software initiated rendezvous gets."); + fi_param_get_str(&cxip_prov, "rget_tc", &param_str); + + if (param_str) { + if (!strcmp(param_str, "BEST_EFFORT")) + cxip_env.rget_tc = FI_TC_BEST_EFFORT; + else if (!strcmp(param_str, "LOW_LATENCY")) + cxip_env.rget_tc = FI_TC_LOW_LATENCY; + else if (!strcmp(param_str, "DEDICATED_ACCESS")) + cxip_env.rget_tc = FI_TC_DEDICATED_ACCESS; + else if (!strcmp(param_str, "BULK_DATA")) + cxip_env.rget_tc = FI_TC_BULK_DATA; + else + CXIP_WARN("Unrecognized rget_tc: %s\n", param_str); + param_str = NULL; + } + + cxip_env.cacheline_size = cxip_cacheline_size(); + CXIP_DBG("Provider using cacheline size of %d\n", + cxip_env.cacheline_size); + + fi_param_define(&cxip_prov, "rdzv_aligned_sw_rget", FI_PARAM_BOOL, + "Enables SW RGet address alignment (default: %d).", + cxip_env.rdzv_aligned_sw_rget); + fi_param_get_bool(&cxip_prov, "rdzv_aligned_sw_rget", + &cxip_env.rdzv_aligned_sw_rget); + + fi_param_define(&cxip_prov, "enable_trig_op_limit", FI_PARAM_BOOL, + "Enable enforcement of the triggered operation limit. " + "Enabling this can degrade " + "fi_control(FI_QUEUE_WORK) performance but avoids " + "potential deadlock. If disabled, applications " + "must prevent deadlock by ensuring the triggered op " + "limit is not exceeded. Default: %d.", + cxip_env.enable_trig_op_limit); + fi_param_get_bool(&cxip_prov, "enable_trig_op_limit", + &cxip_env.enable_trig_op_limit); + + fi_param_define(&cxip_prov, "disable_non_inject_msg_idc", FI_PARAM_BOOL, + "Disables IDC for non-inject messages (default: %d).", + cxip_env.disable_non_inject_msg_idc); + fi_param_get_bool(&cxip_prov, "disable_non_inject_msg_idc", + &cxip_env.disable_non_inject_msg_idc); + + fi_param_define(&cxip_prov, "disable_host_register", FI_PARAM_BOOL, + "Disables host buffer GPU registration (default: %d).", + cxip_env.disable_host_register); + fi_param_get_bool(&cxip_prov, "disable_host_register", + &cxip_env.disable_host_register); + + fi_param_define(&cxip_prov, "enable_unrestricted_end_ro", FI_PARAM_BOOL, + "Default: %d", cxip_env.enable_unrestricted_end_ro); + fi_param_get_bool(&cxip_prov, "enable_unrestricted_end_ro", + &cxip_env.enable_unrestricted_end_ro); + + fi_param_define(&cxip_prov, "odp", FI_PARAM_BOOL, + "Enables on-demand paging (default %d).", cxip_env.odp); + fi_param_get_bool(&cxip_prov, "odp", &cxip_env.odp); + + fi_param_define(&cxip_prov, "force_odp", FI_PARAM_BOOL, + "Force use of on-demand paging (default %d).", + cxip_env.force_odp); + fi_param_get_bool(&cxip_prov, "force_odp", &cxip_env.force_odp); + if (cxip_env.force_odp && !cxip_env.odp) { + cxip_env.odp = true; + CXIP_INFO("Forcing ODP usage also enables ODP mode\n"); + } + + fi_param_define(&cxip_prov, "ats", FI_PARAM_BOOL, + "Enables PCIe ATS."); + fi_param_get_bool(&cxip_prov, "ats", &cxip_env.ats); + + fi_param_define(&cxip_prov, "iotlb", FI_PARAM_BOOL, + "Enables the NIC IOTLB (default %d).", cxip_env.iotlb); + fi_param_get_bool(&cxip_prov, "iotlb", &cxip_env.iotlb); + + fi_param_define(&cxip_prov, "ats_mlock_mode", FI_PARAM_STRING, + "Sets ATS mlock mode (off | all)."); + fi_param_get_str(&cxip_prov, "ats_mlock_mode", &param_str); + + if (param_str) { + if (!strcmp(param_str, "off")) + cxip_env.ats_mlock_mode = CXIP_ATS_MLOCK_OFF; + else if (!strcmp(param_str, "all")) + cxip_env.ats_mlock_mode = CXIP_ATS_MLOCK_ALL; + else + CXIP_WARN("Unrecognized ats_mlock_mode: %s\n", +
param_str); + param_str = NULL; + } + + fi_param_define(&cxip_prov, "device_name", FI_PARAM_STRING, + "Restrict CXI provider to specific CXI devices. Format is a comma separated list of CXI devices (e.g. cxi0,cxi1)."); + fi_param_get_str(&cxip_prov, "device_name", &cxip_env.device_name); + + if (cxip_env.device_name) { + ret = cxip_env_validate_device_name(cxip_env.device_name); + if (ret) { + CXIP_WARN("Failed to validate device name: name=%s rc=%d. Ignoring device name.\n", + cxip_env.device_name, ret); + cxip_env.device_name = NULL; + } + } + + /* Keep track of whether CXI_FORK_SAFE/CXI_FORK_SAFE_HP was requested. This + * is used to avoid mapping memory in some cases. + */ + if (getenv("CXI_FORK_SAFE") || getenv("CXI_FORK_SAFE_HP")) + cxip_env.fork_safe_requested = true; + + /* The counters env string is validated when the cxip_env.telemetry string + * is used. + */ + fi_param_define(&cxip_prov, "telemetry", FI_PARAM_STRING, + "Perform a telemetry delta capture between fi_domain open and close. " + "Format is a comma separated list of telemetry files as defined in /sys/class/cxi/cxi*/device/telemetry/. " + "Default is counter delta capture disabled."); + fi_param_get_str(&cxip_prov, "telemetry", &cxip_env.telemetry); + + fi_param_define(&cxip_prov, "telemetry_rgid", FI_PARAM_INT, + "Resource group ID (RGID) to restrict the telemetry collection to. " + "Value less than 0 is no restrictions. " + "Default is no restrictions."); + fi_param_get_int(&cxip_prov, "telemetry_rgid", + &cxip_env.telemetry_rgid); + + fi_param_define(&cxip_prov, "rx_match_mode", FI_PARAM_STRING, + "Sets RX message match mode (hardware | software | hybrid)."); + fi_param_get_str(&cxip_prov, "rx_match_mode", &param_str); + + if (param_str) { + if (!strcasecmp(param_str, "hardware")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } else if (!strcmp(param_str, "software")) { + cxip_env.rx_match_mode = CXIP_PTLTE_SOFTWARE_MODE; + cxip_env.msg_offload = false; + } else if (!strcmp(param_str, "hybrid")) { + cxip_env.rx_match_mode = CXIP_PTLTE_HYBRID_MODE; + cxip_env.msg_offload = true; + } else { + CXIP_WARN("Unrecognized rx_match_mode: %s\n", + param_str); + cxip_env.rx_match_mode = CXIP_PTLTE_HARDWARE_MODE; + cxip_env.msg_offload = true; + } + + param_str = NULL; + } + + fi_param_define(&cxip_prov, "rdzv_threshold", FI_PARAM_SIZE_T, + "Message size threshold for rendezvous protocol."); + fi_param_get_size_t(&cxip_prov, "rdzv_threshold", + &cxip_env.rdzv_threshold); + + /* The rendezvous protocol does not support FI_INJECT, so make sure + * an eager send message is selected for FI_INJECT sizes. + */ + if (cxip_env.rdzv_threshold < CXIP_INJECT_SIZE) { + cxip_env.rdzv_threshold = CXIP_INJECT_SIZE; + CXIP_WARN("Increased rdzv_threshold size to: %lu\n", + cxip_env.rdzv_threshold); + } + + /* If aligned SW Rget is enabled, rendezvous eager data must + * be greater than cache-line size.
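+ * (Illustrative: on a system with 128-byte cache lines, a requested + * 64-byte threshold would be raised to 128 bytes.)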
+ */ + if (cxip_env.rdzv_aligned_sw_rget && + cxip_env.rdzv_threshold < cxip_env.cacheline_size) { + cxip_env.rdzv_threshold = cxip_env.cacheline_size; + CXIP_WARN("Increased rdzv_threshold size to: %lu\n", + cxip_env.rdzv_threshold); + } + + fi_param_define(&cxip_prov, "rdzv_get_min", FI_PARAM_SIZE_T, + "Minimum rendezvous Get payload size."); + fi_param_get_size_t(&cxip_prov, "rdzv_get_min", + &cxip_env.rdzv_get_min); + + fi_param_define(&cxip_prov, "rdzv_eager_size", FI_PARAM_SIZE_T, + "Eager data size for rendezvous protocol."); + fi_param_get_size_t(&cxip_prov, "rdzv_eager_size", + &cxip_env.rdzv_eager_size); + + if (cxip_env.rdzv_eager_size > cxip_env.rdzv_threshold) { + cxip_env.rdzv_eager_size = cxip_env.rdzv_threshold; + CXIP_WARN("Invalid rdzv_eager_size, new size: %lu\n", + cxip_env.rdzv_eager_size); + } + + fi_param_define(&cxip_prov, "oflow_buf_size", FI_PARAM_SIZE_T, + "Overflow buffer size."); + fi_param_get_size_t(&cxip_prov, "oflow_buf_size", + &cxip_env.oflow_buf_size); + + if (cxip_env.rdzv_threshold > cxip_env.oflow_buf_size) { + CXIP_WARN("Invalid rdzv_threshold: %lu\n", + cxip_env.rdzv_threshold); + cxip_env.rdzv_threshold = CXIP_RDZV_THRESHOLD; + } + + if (cxip_env.rdzv_get_min > + (cxip_env.oflow_buf_size - cxip_env.rdzv_threshold)) { + CXIP_WARN("Invalid rdzv_get_min: %lu\n", + cxip_env.rdzv_get_min); + cxip_env.rdzv_get_min = 0; + } + + /* Allow either FI_CXI_OFLOW_BUF_COUNT or FI_CXI_FLOW_BUF_MIN_POSTED */ + fi_param_define(&cxip_prov, "oflow_buf_count", FI_PARAM_SIZE_T, + "Overflow buffer count/min posted."); + fi_param_get_size_t(&cxip_prov, "oflow_buf_count", + &cxip_env.oflow_buf_min_posted); + fi_param_define(&cxip_prov, "oflow_buf_min_posted", FI_PARAM_SIZE_T, + "Overflow buffer count/min posted."); + fi_param_get_size_t(&cxip_prov, "oflow_buf_min_posted", + &cxip_env.oflow_buf_min_posted); + cxip_env.oflow_buf_max_cached = cxip_env.oflow_buf_min_posted * 3; + + fi_param_define(&cxip_prov, "oflow_buf_max_cached", FI_PARAM_SIZE_T, + "Maximum number of overflow buffers cached."); + fi_param_get_size_t(&cxip_prov, "oflow_buf_max_cached", + &cxip_env.oflow_buf_max_cached); + if (cxip_env.oflow_buf_max_cached && cxip_env.oflow_buf_max_cached < + cxip_env.oflow_buf_min_posted) { + cxip_env.oflow_buf_max_cached = cxip_env.oflow_buf_min_posted; + CXIP_WARN("Adjusted oflow buffer max cached to %lu\n", + cxip_env.oflow_buf_max_cached); + } + + fi_param_define(&cxip_prov, "safe_devmem_copy_threshold", + FI_PARAM_SIZE_T, + "Max memcpy for load/store HMEM access (default %lu).", + cxip_env.safe_devmem_copy_threshold); + fi_param_get_size_t(&cxip_prov, "safe_devmem_copy_threshold", + &cxip_env.safe_devmem_copy_threshold); + + fi_param_define(&cxip_prov, "optimized_mrs", FI_PARAM_BOOL, + "Enables optimized memory regions."); + fi_param_get_bool(&cxip_prov, "optimized_mrs", + &cxip_env.optimized_mrs); + + fi_param_define(&cxip_prov, "mr_match_events", FI_PARAM_BOOL, + "Enable MR match counting (default %lu).", + &cxip_env.mr_match_events); + fi_param_get_bool(&cxip_prov, "mr_match_events", + &cxip_env.mr_match_events); + + fi_param_define(&cxip_prov, "prov_key_cache", FI_PARAM_BOOL, + "Disable caching of FI_MR_PROV_KEY (default %lu).", + &cxip_env.prov_key_cache); + fi_param_get_bool(&cxip_prov, "prov_key_cache", + &cxip_env.prov_key_cache); + + fi_param_define(&cxip_prov, "llring_mode", FI_PARAM_STRING, + "Set low-latency command queue ring mode."); + fi_param_get_str(&cxip_prov, "llring_mode", ¶m_str); + + if (param_str) { + if (!strcmp(param_str, "always")) + 
cxip_env.llring_mode = CXIP_LLRING_ALWAYS; + else if (!strcmp(param_str, "idle")) + cxip_env.llring_mode = CXIP_LLRING_IDLE; + else if (!strcmp(param_str, "never")) + cxip_env.llring_mode = CXIP_LLRING_NEVER; + else + CXIP_WARN("Unrecognized llring_mode: %s\n", + param_str); + + param_str = NULL; + } + + fi_param_define(&cxip_prov, "zbcoll_radix", FI_PARAM_INT, + "Set radix of the zero-byte barrier tree."); + fi_param_get_int(&cxip_prov, "zbcoll_radix", &cxip_env.zbcoll_radix); + if (cxip_env.zbcoll_radix < 2) { + CXIP_WARN("Invalid zbcoll_radix=%d, reset to 2\n", + cxip_env.zbcoll_radix); + cxip_env.zbcoll_radix = 2; + } + + fi_param_define(&cxip_prov, "cq_policy", FI_PARAM_STRING, + "Set Command Queue write-back policy."); + fi_param_get_str(&cxip_prov, "cq_policy", ¶m_str); + + if (param_str) { + if (!strcmp(param_str, "always")) + cxip_env.cq_policy = CXI_CQ_UPDATE_ALWAYS; + else if (!strcmp(param_str, "high_empty")) + cxip_env.cq_policy = CXI_CQ_UPDATE_HIGH_FREQ_EMPTY; + else if (!strcmp(param_str, "low_empty")) + cxip_env.cq_policy = CXI_CQ_UPDATE_LOW_FREQ_EMPTY; + else if (!strcmp(param_str, "low")) + cxip_env.cq_policy = CXI_CQ_UPDATE_LOW_FREQ; + else + CXIP_WARN("Unrecognized cq_policy: %s\n", + param_str); + + param_str = NULL; + } + + fi_param_define(&cxip_prov, "default_vni", FI_PARAM_SIZE_T, + "Default VNI value used only for service IDs where the VNI is not restricted."); + fi_param_get_size_t(&cxip_prov, "default_vni", &cxip_env.default_vni); + + fi_param_define(&cxip_prov, "eq_ack_batch_size", FI_PARAM_SIZE_T, + "Number of EQ events to process before acknowledgement"); + fi_param_get_size_t(&cxip_prov, "eq_ack_batch_size", + &cxip_env.eq_ack_batch_size); + + if (!cxip_env.eq_ack_batch_size) + cxip_env.eq_ack_batch_size = 1; + + fi_param_define(&cxip_prov, "msg_lossless", FI_PARAM_BOOL, + "Enable/Disable lossless message matching."); + fi_param_get_bool(&cxip_prov, "msg_lossless", &cxip_env.msg_lossless); + + fi_param_define(&cxip_prov, "req_buf_size", FI_PARAM_SIZE_T, + "Size of request buffer."); + fi_param_get_size_t(&cxip_prov, "req_buf_size", &cxip_env.req_buf_size); + + fi_param_define(&cxip_prov, "req_buf_min_posted", FI_PARAM_SIZE_T, + "Minimum number of request buffer posted."); + fi_param_get_size_t(&cxip_prov, "req_buf_min_posted", + &cxip_env.req_buf_min_posted); + + /* Allow either FI_CXI_REQ_BUF_MAX_CACHED or FI_CXI_REQ_BUF_MAX_COUNT */ + fi_param_define(&cxip_prov, "req_buf_max_count", FI_PARAM_SIZE_T, + "Maximum number of request buffer cached."); + fi_param_get_size_t(&cxip_prov, "req_buf_max_count", + &cxip_env.req_buf_max_cached); + fi_param_define(&cxip_prov, "req_buf_max_cached", FI_PARAM_SIZE_T, + "Maximum number of request buffer cached."); + fi_param_get_size_t(&cxip_prov, "req_buf_max_cached", + &cxip_env.req_buf_max_cached); + + /* Parameters to tailor hybrid hardware to software transitions + * that are initiated by software. 
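+ * These settings only apply when FI_CXI_RX_MATCH_MODE=hybrid; in any other + * match mode they are reset to 0 with a warning below.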
+ */ + fi_param_define(&cxip_prov, "hybrid_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive UX transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_preemptive", + &cxip_env.hybrid_preemptive); + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_preemptive) { + cxip_env.hybrid_preemptive = false; + CXIP_WARN("Not in hybrid mode, ignoring preemptive\n"); + } + + fi_param_define(&cxip_prov, "hybrid_recv_preemptive", FI_PARAM_BOOL, + "Enable/Disable low LE preemptive recv transitions."); + fi_param_get_bool(&cxip_prov, "hybrid_recv_preemptive", + &cxip_env.hybrid_recv_preemptive); + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_recv_preemptive) { + CXIP_WARN("Not in hybrid mode, ignore LE recv preemptive\n"); + cxip_env.hybrid_recv_preemptive = 0; + } + + fi_param_define(&cxip_prov, "hybrid_posted_recv_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of posted receives exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_posted_recv_preemptive", + &cxip_env.hybrid_posted_recv_preemptive); + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_posted_recv_preemptive) { + CXIP_WARN("Not in hybrid mode, ignore hybrid_posted_recv_preemptive\n"); + cxip_env.hybrid_posted_recv_preemptive = 0; + } + + fi_param_define(&cxip_prov, "hybrid_unexpected_msg_preemptive", + FI_PARAM_BOOL, + "Enable preemptive transition to software endpoint when number of hardware unexpected messages exceeds RX attribute size"); + fi_param_get_bool(&cxip_prov, "hybrid_unexpected_msg_preemptive", + &cxip_env.hybrid_unexpected_msg_preemptive); + + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_unexpected_msg_preemptive) { + CXIP_WARN("Not in hybrid mode, ignore hybrid_unexpected_msg_preemptive\n"); + cxip_env.hybrid_unexpected_msg_preemptive = 0; + } + + if (cxip_software_pte_allowed()) { + min_free = CXIP_REQ_BUF_HEADER_MAX_SIZE + + cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; + + if (cxip_env.req_buf_size < min_free) { + cxip_env.req_buf_size = min_free; + CXIP_WARN("Requested request buffer size to small. Setting to %lu bytes\n", + cxip_env.req_buf_size); + } + + if (cxip_env.req_buf_min_posted < 2) { + cxip_env.req_buf_min_posted = 2; + CXIP_WARN("Adjusted request buffer min posted to %lu\n", + cxip_env.req_buf_min_posted); + } + + /* Zero max count is unlimited */ + if (cxip_env.req_buf_max_cached && + cxip_env.req_buf_max_cached < cxip_env.req_buf_min_posted) { + cxip_env.req_buf_max_cached = + cxip_env.req_buf_min_posted; + CXIP_WARN("Adjusted request buffer max cached to %lu\n", + cxip_env.req_buf_max_cached); + } + } + + fi_param_define(&cxip_prov, "fc_retry_usec_delay", FI_PARAM_INT, + "Micro-second delay before retrying failed flow-control messages. Default: %d usecs", + cxip_env.fc_retry_usec_delay); + fi_param_get_int(&cxip_prov, "fc_retry_usec_delay", + &cxip_env.fc_retry_usec_delay); + if (cxip_env.fc_retry_usec_delay < 0) { + cxip_env.fc_retry_usec_delay = 0; + CXIP_WARN("FC retry delay invalid. Setting to %d usecs\n", + cxip_env.fc_retry_usec_delay); + } + + fi_param_define(&cxip_prov, "sw_rx_tx_init_max", FI_PARAM_INT, + "Max TX S/W RX processing will initiate. 
Default: %d", + cxip_env.sw_rx_tx_init_max); + fi_param_get_int(&cxip_prov, "sw_rx_tx_init_max", + &cxip_env.sw_rx_tx_init_max); + if (cxip_env.sw_rx_tx_init_max < CXIP_SW_RX_TX_INIT_MIN) { + cxip_env.sw_rx_tx_init_max = CXIP_SW_RX_TX_INIT_MIN; + CXIP_WARN("Max TX S/W RX processing initiates adjusted to: %d", + cxip_env.sw_rx_tx_init_max); + } + + fi_param_define(&cxip_prov, "ctrl_rx_eq_max_size", FI_PARAM_SIZE_T, + "Control receive event queue max size. Values are aligned up to 4KiB. Default: %lu bytes", + cxip_env.ctrl_rx_eq_max_size); + fi_param_get_size_t(&cxip_prov, "ctrl_rx_eq_max_size", + &cxip_env.ctrl_rx_eq_max_size); + + fi_param_define(&cxip_prov, "default_cq_size", FI_PARAM_SIZE_T, + "Default provider CQ size (default: %lu).", + cxip_env.default_cq_size); + fi_param_get_size_t(&cxip_prov, "default_cq_size", + &cxip_env.default_cq_size); + if (cxip_env.default_cq_size == 0) { + cxip_env.default_cq_size = CXIP_CQ_DEF_SZ; + CXIP_WARN("Default CQ size invalid. Setting to %lu\n", + cxip_env.default_cq_size); + } + + /* FI_CXI_DISABLE_EQ_HUGETLB will deprecate use of + * FI_CXI_DISABLE_CQ_HUGETLB, both are allowed for now. + */ + fi_param_define(&cxip_prov, "disable_cq_hugetlb", FI_PARAM_BOOL, + "Disable 2MiB hugetlb allocates for HW event queues (default: %u).", + cxip_env.disable_eq_hugetlb); + fi_param_get_bool(&cxip_prov, "disable_cq_hugetlb", + &cxip_env.disable_eq_hugetlb); + fi_param_define(&cxip_prov, "disable_eq_hugetlb", FI_PARAM_BOOL, + "Disable 2MiB hugetlb allocates for HW event queues (default: %u).", + cxip_env.disable_eq_hugetlb); + fi_param_get_bool(&cxip_prov, "disable_eq_hugetlb", + &cxip_env.disable_eq_hugetlb); + + fi_param_define(&cxip_prov, "cq_fill_percent", FI_PARAM_SIZE_T, + "Fill percent of underlying hardware event queue used to determine when completion queue is saturated (default: %lu).", + cxip_env.cq_fill_percent); + fi_param_get_size_t(&cxip_prov, "cq_fill_percent", + &cxip_env.cq_fill_percent); + + if (cxip_env.cq_fill_percent < 1 || + cxip_env.cq_fill_percent > 100) { + cxip_env.cq_fill_percent = 50; + CXIP_WARN("CQ fill percent invalid. Setting to %lu.\n", + cxip_env.cq_fill_percent); + } + + fi_param_define(&cxip_prov, "coll_job_id", FI_PARAM_STRING, + "Collective job identifier (default %s).", + cxip_env.coll_job_id); + fi_param_get_str(&cxip_prov, "coll_job_id", + &cxip_env.coll_job_id); + + fi_param_define(&cxip_prov, "coll_job_step_id", FI_PARAM_STRING, + "Collective job-step identifier (default %s).", + cxip_env.coll_job_step_id); + fi_param_get_str(&cxip_prov, "coll_job_step_id", + &cxip_env.coll_job_step_id); + + fi_param_define(&cxip_prov, "coll_fabric_mgr_url", FI_PARAM_STRING, + "Fabric multicast REST API URL (default %s).", + cxip_env.coll_fabric_mgr_url); + fi_param_get_str(&cxip_prov, "coll_fabric_mgr_url", + &cxip_env.coll_fabric_mgr_url); + if (cxip_env.coll_fabric_mgr_url) { + ret = cxip_env_validate_url(cxip_env.coll_fabric_mgr_url); + if (ret) { + CXIP_WARN("Failed to validate fabric multicast URL: name=%s rc=%d. 
Ignoring URL.\n", + cxip_env.coll_fabric_mgr_url, ret); + cxip_env.coll_fabric_mgr_url = NULL; + } + } + + fi_param_define(&cxip_prov, "coll_mcast_token", FI_PARAM_STRING, + "Fabric multicast REST API TOKEN (default none).", + cxip_env.coll_mcast_token); + fi_param_get_str(&cxip_prov, "coll_mcast_token", + &cxip_env.coll_mcast_token); + + fi_param_define(&cxip_prov, "coll_use_dma_put", FI_PARAM_BOOL, + "Use DMA Put for collectives (default: %d).", + cxip_env.coll_use_dma_put); + fi_param_get_bool(&cxip_prov, "coll_use_dma_put", + &cxip_env.coll_use_dma_put); + + fi_param_define(&cxip_prov, "hwcoll_addrs_per_job", FI_PARAM_SIZE_T, + "Maximum hardware collective addresses allowed."); + fi_param_get_size_t(&cxip_prov, "hwcoll_addrs_per_job", + &cxip_env.hwcoll_addrs_per_job); + + fi_param_define(&cxip_prov, "hwcoll_min_nodes", FI_PARAM_SIZE_T, + "Minimum number of nodes required for hwcoll."); + fi_param_get_size_t(&cxip_prov, "hwcoll_min_nodes", + &cxip_env.hwcoll_min_nodes); + + fi_param_define(&cxip_prov, "coll_retry_usec", FI_PARAM_SIZE_T, + "Retry period (usec) (default %d, min %d, max %d).", + cxip_env.coll_retry_usec, CXIP_COLL_MIN_RETRY_USEC, + CXIP_COLL_MAX_RETRY_USEC); + fi_param_get_size_t(&cxip_prov, "coll_retry_usec", + &cxip_env.coll_retry_usec); + if (cxip_env.coll_retry_usec < CXIP_COLL_MIN_RETRY_USEC) + cxip_env.coll_retry_usec = CXIP_COLL_MIN_RETRY_USEC; + if (cxip_env.coll_retry_usec > CXIP_COLL_MAX_RETRY_USEC) + cxip_env.coll_retry_usec = CXIP_COLL_MAX_RETRY_USEC; + + fi_param_define(&cxip_prov, "coll_timeout_usec", FI_PARAM_SIZE_T, + "Reduction tree timeout (usec) (default %d, min %d, max %d).", + cxip_env.coll_timeout_usec, CXIP_COLL_MIN_TIMEOUT_USEC, + CXIP_COLL_MAX_TIMEOUT_USEC); + fi_param_get_size_t(&cxip_prov, "coll_timeout_usec", + &cxip_env.coll_timeout_usec); + if (cxip_env.coll_timeout_usec < CXIP_COLL_MIN_TIMEOUT_USEC) + cxip_env.coll_timeout_usec = CXIP_COLL_MIN_TIMEOUT_USEC; + if (cxip_env.coll_timeout_usec > CXIP_COLL_MAX_TIMEOUT_USEC) + cxip_env.coll_timeout_usec = CXIP_COLL_MAX_TIMEOUT_USEC; + + fi_param_define(&cxip_prov, "default_tx_size", FI_PARAM_SIZE_T, + "Default provider tx_attr.size (default: %lu).", + cxip_env.default_tx_size); + fi_param_get_size_t(&cxip_prov, "default_tx_size", + &cxip_env.default_tx_size); + if (cxip_env.default_tx_size < 16 || + cxip_env.default_tx_size > CXIP_MAX_TX_SIZE) { + cxip_env.default_tx_size = CXIP_DEFAULT_TX_SIZE; + CXIP_WARN("Default TX size invalid. Setting to %lu\n", + cxip_env.default_tx_size); + } + + fi_param_define(&cxip_prov, "default_rx_size", FI_PARAM_SIZE_T, + "Default provider rx_attr.size (default: %lu).", + cxip_env.default_rx_size); + fi_param_get_size_t(&cxip_prov, "default_rx_size", + &cxip_env.default_rx_size); + if (cxip_env.default_rx_size < 16 || + cxip_env.default_rx_size > CXIP_MAX_RX_SIZE) { + cxip_env.default_rx_size = CXIP_DEFAULT_RX_SIZE; + CXIP_WARN("Default RX size invalid. Setting to %lu\n", + cxip_env.default_rx_size); + } + + fi_param_define(&cxip_prov, "disable_hmem_dev_register", FI_PARAM_BOOL, + "Disable registering HMEM device buffer for load/store access (default: %u).", + cxip_env.disable_hmem_dev_register); + fi_param_get_bool(&cxip_prov, "disable_hmem_dev_register", + &cxip_env.disable_hmem_dev_register); + + /* Check if ZE device memory can be supported. Provide env var to + * override just in case these checks become invalid. 
+ */ + fi_param_define(&cxip_prov, "force_ze_hmem_support", FI_PARAM_BOOL, + "Disable ZE implicit scaling and KDM checks and force ZE HMEM support."); + fi_param_get_bool(&cxip_prov, "force_ze_hmem_support", + &cxip_env.ze_hmem_supported); + + if (!cxip_env.ze_hmem_supported) { + param_str = getenv("EnableImplicitScaling"); + if (param_str && atoi(param_str) == 0) { + param_str = getenv("NEOReadDebugKeys"); + if (param_str && atoi(param_str) == 1) + cxip_env.ze_hmem_supported = 1; + } + param_str = NULL; + } + + fi_param_define(&cxip_prov, "rdzv_proto", FI_PARAM_STRING, + "Sets preferred rendezvous protocol [default | alt_read] (default %s).", + cxip_rdzv_proto_to_str(cxip_env.rdzv_proto)); + fi_param_get_str(&cxip_prov, "rdzv_proto", ¶m_str); + + if (param_str) { + char *ch = param_str; + int chars = 8; + + while (ch && chars) { + if (*ch == '-') + *ch = '_'; + ch++; + chars--; + } + + if (!strcmp(param_str, "default")) + cxip_env.rdzv_proto = CXIP_RDZV_PROTO_DEFAULT; + else if (!strcmp(param_str, "alt_read")) + cxip_env.rdzv_proto = CXIP_RDZV_PROTO_ALT_READ; + else { + CXIP_WARN("Unrecognized rendezvous protocol: %s\n", + param_str); + cxip_env.rdzv_proto = CXIP_RDZV_PROTO_DEFAULT; + } + + param_str = NULL; + } + + set_system_page_size(); +} + +/* + * CXI_INI - Provider constructor. + */ +CXI_INI +{ + cxip_env_init(); + + cxip_curl_init(); + + cxip_if_init(); + + cxip_info_init(); + + cxip_fault_inject_init(); + + return &cxip_prov; +} + +/* + * cxip_fini() - Provider destructor. + */ +static void cxip_fini(void) +{ + cxip_fault_inject_fini(); + + fi_freeinfo((void *)cxip_util_prov.info); + + cxip_if_fini(); + + cxip_curl_fini(); +} + +static void cxip_alter_caps(struct fi_info *info, const struct fi_info *hints) +{ + /* If FI_COLLECTIVE explicitly requested then must enable + * FI_MSG for send and receive if not already enabled. + */ + if (hints && hints->caps && (hints->caps & FI_COLLECTIVE)) { + if (!(info->caps & (FI_MSG | FI_TAGGED))) { + info->caps |= FI_MSG | FI_SEND | FI_RECV; + info->tx_attr->caps |= FI_MSG | FI_SEND; + info->rx_attr->caps |= FI_MSG | FI_RECV; + } + } +} + +static void cxip_alter_tx_attr(struct fi_tx_attr *attr, + const struct fi_tx_attr *hints, + uint64_t info_caps) +{ + if (!hints || hints->size == 0) + attr->size = cxip_env.default_tx_size; +} + +static void cxip_alter_rx_attr(struct fi_rx_attr *attr, + const struct fi_rx_attr *hints, + uint64_t info_caps) +{ + if (!hints || hints->size == 0) + attr->size = cxip_env.default_rx_size; +} + +static void cxip_alter_info(struct fi_info *info, const struct fi_info *hints, + uint32_t api_version) +{ + for (; info; info = info->next) { + fi_control(&info->nic->fid, FI_OPT_CXI_NIC_REFRESH_ATTR, NULL); + + cxip_alter_caps(info, hints); + cxip_alter_tx_attr(info->tx_attr, hints ? hints->tx_attr : NULL, + info->caps); + cxip_alter_rx_attr(info->rx_attr, hints ? hints->rx_attr : NULL, + info->caps); + + /* Remove secondary capabilities that impact performance if + * hints are not specified. They must be explicitly requested. + */ + if (!hints) { + info->caps &= ~(FI_SOURCE | FI_SOURCE_ERR); + info->rx_attr->caps &= ~(FI_SOURCE | FI_SOURCE_ERR); + } + } +} + +static int cxip_alter_auth_key_align_domain_ep(struct fi_info **info) +{ + struct fi_info *fi_ptr; + + /* CXI provider requires the endpoint to have the same service ID as the + * domain. Account for edge case where users only set endpoint auth_key + * and leave domain auth_key as NULL by duplicating the endpoint + * auth_key to the domain. 
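+ * The duplicated key is validated later by cxip_alter_auth_key_validate() + * via cxip_check_auth_key_info().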
+ */ + for (fi_ptr = *info; fi_ptr; fi_ptr = fi_ptr->next) { + if (!fi_ptr->domain_attr->auth_key && + fi_ptr->ep_attr->auth_key) { + fi_ptr->domain_attr->auth_key = + mem_dup(fi_ptr->ep_attr->auth_key, + fi_ptr->ep_attr->auth_key_size); + if (!fi_ptr->domain_attr->auth_key) + return -FI_ENOMEM; + + fi_ptr->domain_attr->auth_key_size = + fi_ptr->ep_attr->auth_key_size; + } + } + + return FI_SUCCESS; +} + +static void cxip_alter_auth_key_scrub_auth_key_size(const struct fi_info *hints, + struct fi_info **info) +{ + struct fi_info *fi_ptr; + bool av_auth_key = false; + + if (hints && hints->domain_attr) + av_auth_key = + hints->domain_attr->auth_key_size == FI_AV_AUTH_KEY; + + /* Zero the auth_key_size for any NULL auth_key. */ + for (fi_ptr = *info; fi_ptr; fi_ptr = fi_ptr->next) { + if (!fi_ptr->domain_attr->auth_key && !av_auth_key) + fi_ptr->domain_attr->auth_key_size = 0; + + if (!fi_ptr->ep_attr->auth_key) + fi_ptr->ep_attr->auth_key_size = 0; + } +} + +static int cxip_alter_auth_key_validate(struct fi_info **info) +{ + struct fi_info *fi_ptr; + struct fi_info *fi_ptr_tmp; + struct fi_info *fi_prev_ptr; + int ret; + + /* Core auth_key checks only verify auth_key_size. This check verifies + * that the user provided auth_key is valid. + */ + fi_ptr = *info; + *info = NULL; + fi_prev_ptr = NULL; + + while (fi_ptr) { + ret = cxip_check_auth_key_info(fi_ptr); + if (ret) { + /* discard entry */ + if (fi_prev_ptr) + fi_prev_ptr->next = fi_ptr->next; + + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + if (*info == NULL) + *info = fi_ptr; + + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + + return FI_SUCCESS; +} + +int cxip_gen_auth_key(struct fi_info *info, struct cxi_auth_key *key) +{ + struct cxip_nic_attr *nic_attr; + + memset(key, 0, sizeof(*key)); + + if (info->domain_attr->auth_key) { + CXIP_WARN("Domain auth_key not NULL\n"); + return -FI_EINVAL; + } + + if (!info->nic || !info->nic->prov_attr) { + CXIP_WARN("Missing NIC provider attributes\n"); + return -FI_EINVAL; + } + + nic_attr = (struct cxip_nic_attr *)info->nic->prov_attr; + if (nic_attr->default_rgroup_id == 0) + return -FI_ENOSYS; + + key->svc_id = nic_attr->default_rgroup_id; + key->vni = nic_attr->default_vni; + + return FI_SUCCESS; +} + +static int cxip_alter_auth_key(const struct fi_info *hints, + struct fi_info **info) +{ + int ret; + + ret = cxip_alter_auth_key_align_domain_ep(info); + if (ret) + return ret; + + cxip_alter_auth_key_scrub_auth_key_size(hints, info); + + return cxip_alter_auth_key_validate(info); +} + +static int cxip_validate_iface_auth_key(struct cxip_if *iface, + struct cxi_auth_key *auth_key) +{ + if (!auth_key) + return FI_SUCCESS; + + return cxip_if_valid_rgroup_vni(iface, auth_key->svc_id, auth_key->vni); +} + +int cxip_check_auth_key_info(struct fi_info *info) +{ + struct cxip_addr *src_addr; + struct cxip_if *iface; + int ret; + + src_addr = (struct cxip_addr *)info->src_addr; + if (!src_addr) { + CXIP_WARN("NULL src_addr in fi_info\n"); + return -FI_EINVAL; + } + + ret = cxip_get_if(src_addr->nic, &iface); + if (ret) { + CXIP_WARN("cxip_get_if with NIC %#x failed: %d:%s\n", + src_addr->nic, ret, fi_strerror(-ret)); + return ret; + } + + if (info->domain_attr) { + ret = cxip_validate_iface_auth_key(iface, + (struct cxi_auth_key *)info->domain_attr->auth_key); + if (ret) { + CXIP_WARN("Invalid domain auth_key\n"); + goto err_put_if; + } + } + + if (info->ep_attr) { + ret = cxip_validate_iface_auth_key(iface, + (struct 
cxi_auth_key *)info->ep_attr->auth_key); + if (ret) { + CXIP_WARN("Invalid endpoint auth_key\n"); + goto err_put_if; + } + } + + cxip_put_if(iface); + + return FI_SUCCESS; + +err_put_if: + cxip_put_if(iface); + + return ret; +} + +/* + * cxip_getinfo() - Provider fi_getinfo() implementation. + */ +static int +cxip_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, + struct fi_info **info) +{ + int ret; + struct fi_info *fi_ptr; + struct fi_info *fi_ptr_tmp; + struct fi_info *fi_prev_ptr; + struct ether_addr *mac; + uint32_t scan_nic = 0; + uint32_t scan_pid = 0; + struct cxip_addr *addr; + struct cxip_if *iface; + bool copy_dest = false; + struct fi_info *temp_hints = NULL; + + if (flags & FI_SOURCE) { + if (!node && !service) { + CXIP_WARN("FI_SOURCE set, but no node or service\n"); + return -FI_EINVAL; + } + } + + if (node) { + iface = cxip_if_lookup_name(node); + if (iface) { + scan_nic = iface->info->nic_addr; + } else if ((mac = ether_aton(node))) { + scan_nic = cxip_mac_to_nic(mac); + } else if (sscanf(node, "%i", &scan_nic) != 1) { + CXIP_WARN("Invalid node: %s\n", node); + return -FI_EINVAL; + } + + CXIP_DBG("Node NIC: %#x\n", scan_nic); + } + + if (service) { + if (sscanf(service, "%i", &scan_pid) != 1) { + CXIP_WARN("Invalid service: %s\n", service); + return -FI_EINVAL; + } + + if (scan_pid >= C_PID_ANY) { + CXIP_WARN("Service out of range [0-%d): %u\n", + C_PID_ANY, scan_pid); + return -FI_EINVAL; + } + + CXIP_DBG("Service PID: %u\n", scan_pid); + } + + /* Previously, when remote access ODP was not enabled, the provider + * did not indicate that it required FI_MR_ALLOCATED. To correct this + * while not breaking applications, when ODP is NOT enabled add + * FI_MR_ALLOCATED to the hints. Note that if the client sets + * FI_MR_UNSPEC in hints, the provider required mode bits that the + * application must support will be returned. + * + * TODO: When ODP is enabled by default, this should be removed + * and applications should use hints to pick the desired mode. + */ + if (!cxip_env.odp && hints && hints->domain_attr && + hints->domain_attr->mr_mode == FI_MR_ENDPOINT) { + temp_hints = fi_dupinfo(hints); + if (!temp_hints) + return -FI_ENOMEM; + + temp_hints->domain_attr->mr_mode |= FI_MR_ALLOCATED; + + CXIP_INFO("FI_MR_ALLOCATED added to hints MR mode\n"); + } + + /* Find all matching domains, ignoring addresses. */ + ret = util_getinfo(&cxip_util_prov, version, NULL, NULL, 0, + temp_hints ? temp_hints : hints, + info); + if (temp_hints) + fi_freeinfo(temp_hints); + + if (ret) + return ret; + + /* Remove any info that did not match the mr_mode requirements. + * Note that mr_mode FI_MR_ENDPOINT is only required if target + * RMA/ATOMIC access is required. + */ + if (hints) { + fi_ptr = *info; + *info = NULL; + fi_prev_ptr = NULL; + + while (fi_ptr) { + if (fi_ptr->caps & (FI_ATOMIC | FI_RMA) && + !fi_ptr->domain_attr->mr_mode) { + /* discard entry */ + if (fi_prev_ptr) + fi_prev_ptr->next = fi_ptr->next; + + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + /* Keep the matching info */ + if (*info == NULL) + *info = fi_ptr; + + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + } + + /* Search for a specific OFI Domain by node string. */ + if (flags & FI_SOURCE && node) { + iface = cxip_if_lookup_addr(scan_nic); + if (!iface) { + /* This shouldn't fail.
*/ + ret = -FI_EINVAL; + goto freeinfo; + } + + fi_ptr = *info; + *info = NULL; + fi_prev_ptr = NULL; + + while (fi_ptr) { + if (strcmp(fi_ptr->domain_attr->name, + iface->info->device_name)) { + /* discard entry */ + if (fi_prev_ptr) + fi_prev_ptr->next = fi_ptr->next; + + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + /* Keep the matching info */ + if (*info == NULL) + *info = fi_ptr; + + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + } + + /* Search for a specific OFI Domain by name. The CXI Domain name + * matches the NIC device file name (cxi[0-9]). + */ + if (hints && hints->domain_attr && hints->domain_attr->name) { + fi_ptr = *info; + *info = NULL; + fi_prev_ptr = NULL; + + while (fi_ptr) { + if (strcmp(fi_ptr->domain_attr->name, + hints->domain_attr->name)) { + /* discard entry */ + if (fi_prev_ptr) + fi_prev_ptr->next = fi_ptr->next; + + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + /* Keep the matching info */ + if (*info == NULL) + *info = fi_ptr; + + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + } + + cxip_alter_info(*info, hints, version); + + /* Check if any infos remain. */ + if (!*info) + return FI_SUCCESS; + + for (fi_ptr = *info; fi_ptr; fi_ptr = fi_ptr->next) { + if (flags & FI_SOURCE) { + /* Set client-assigned PID value in source address. */ + if (service) { + addr = (struct cxip_addr *)fi_ptr->src_addr; + addr->pid = scan_pid; + } + + copy_dest = (hints && hints->dest_addr); + } else { + if (node) { + struct cxip_addr addr = {}; + + addr.nic = scan_nic; + addr.pid = scan_pid; + + fi_ptr->dest_addr = mem_dup(&addr, + sizeof(addr)); + if (!fi_ptr->dest_addr) { + ret = -FI_ENOMEM; + goto freeinfo; + } + fi_ptr->dest_addrlen = sizeof(addr); + } else { + copy_dest = (hints && hints->dest_addr); + } + + if (hints && hints->src_addr) { + fi_ptr->src_addr = mem_dup(hints->src_addr, + hints->src_addrlen); + if (!fi_ptr->src_addr) { + ret = -FI_ENOMEM; + goto freeinfo; + } + fi_ptr->src_addrlen = hints->src_addrlen; + fi_ptr->addr_format = hints->addr_format; + } + } + + if (copy_dest) { + fi_ptr->dest_addr = mem_dup(hints->dest_addr, + hints->dest_addrlen); + if (!fi_ptr->dest_addr) { + ret = -FI_ENOMEM; + goto freeinfo; + } + fi_ptr->dest_addrlen = hints->dest_addrlen; + fi_ptr->addr_format = hints->addr_format; + } + } + + ret = cxip_alter_auth_key(hints, info); + if (ret) + goto freeinfo; + + /* Nothing left to do if hints weren't provided. */ + if (!hints) + return FI_SUCCESS; + + /* util_getinfo() returns a list of fi_info that match the MR mode + * for each nic. They are listed in provider preference order. + * Since hints were provided, keep only the most preferred fi_info for + * any given domain/interface using the same address format. We + * always keep the first one. + */ + fi_ptr = *info; + fi_prev_ptr = NULL; + + while (fi_ptr) { + if (fi_prev_ptr && + !strcmp(fi_ptr->domain_attr->name, + fi_prev_ptr->domain_attr->name) && + fi_ptr->addr_format == fi_prev_ptr->addr_format) { + /* discard entry */ + fi_prev_ptr->next = fi_ptr->next; + fi_ptr_tmp = fi_ptr; + fi_ptr = fi_ptr->next; + + fi_ptr_tmp->next = NULL; + fi_freeinfo(fi_ptr_tmp); + continue; + } + + /* Keep the preferred info for this domain */ + fi_prev_ptr = fi_ptr; + fi_ptr = fi_ptr->next; + } + + /* util_getinfo() returns a list of fi_info for each matching OFI + * Domain (physical CXI interface). 
+ * + * Perform fixups: + * -Use input ordering requirements. + * -Remove unrequested secondary caps that impact performance. + */ + for (fi_ptr = *info; fi_ptr; fi_ptr = fi_ptr->next) { + /* Ordering requirements prevent the use of restricted packets. + * If hints exist, copy msg_order settings directly. + */ + fi_ptr->tx_attr->msg_order = hints->tx_attr->msg_order; + + /* Requesting FI_RMA_EVENT prevents the use of restricted + * packets. Do not set FI_RMA_EVENT unless explicitly + * requested. + */ + if (hints->caps && !(hints->caps & FI_RMA_EVENT)) { + fi_ptr->caps &= ~FI_RMA_EVENT; + fi_ptr->rx_attr->caps &= ~FI_RMA_EVENT; + } + + /* FI_SOURCE_ERR requires that FI_SOURCE be set, it is + * an error if requested but can not be honored. + */ + if (hints->caps & FI_SOURCE_ERR && !(hints->caps & FI_SOURCE)) { + ret = -FI_ENODATA; + goto freeinfo; + } + + /* Requesting FI_SOURCE adds overhead to a receive operation. + * Do not set FI_SOURCE unless explicitly requested. + */ + if (!(hints->caps & FI_SOURCE)) { + fi_ptr->caps &= ~FI_SOURCE; + fi_ptr->rx_attr->caps &= ~FI_SOURCE; + } + + /* Requesting FI_SOURCE_ERR adds additional overhead to receive + * operations beyond FI_SOURCE, do not set if not explicitly + * asked. + */ + if (!(hints->caps & FI_SOURCE_ERR)) { + fi_ptr->caps &= ~FI_SOURCE_ERR; + fi_ptr->rx_attr->caps &= ~FI_SOURCE_ERR; + } + + /* Requesting FI_FENCE prevents the use PCIe RO for RMA. Do not + * set FI_FENCE unless explicitly requested. + */ + if (hints->caps && !(hints->caps & FI_FENCE)) { + fi_ptr->caps &= ~FI_FENCE; + fi_ptr->tx_attr->caps &= ~FI_FENCE; + } + + /* Requesting FI_HMEM requires use of device memory safe + * copy routines. Do not set FI_HMEM unless requested or + * all supported provider capabilities are requested. + */ + if (hints->caps && !(hints->caps & FI_HMEM)) { + fi_ptr->caps &= ~FI_HMEM; + fi_ptr->tx_attr->caps &= ~FI_HMEM; + fi_ptr->rx_attr->caps &= ~FI_HMEM; + } + } + + return FI_SUCCESS; + +freeinfo: + fi_freeinfo(*info); + + return ret; +} + +struct fi_provider cxip_prov = { + .name = cxip_prov_name, + .version = CXIP_PROV_VERSION, + .fi_version = CXIP_FI_VERSION, + .getinfo = cxip_getinfo, + .fabric = cxip_fabric, + .cleanup = cxip_fini, +}; diff --git a/prov/cxi/src/cxip_iomm.c b/prov/cxi/src/cxip_iomm.c new file mode 100644 index 00000000000..f6116f39b68 --- /dev/null +++ b/prov/cxi/src/cxip_iomm.c @@ -0,0 +1,618 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2019 Hewlett Packard Enterprise Development LP + */ + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_MR, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_MR, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_MR, __VA_ARGS__) + +#define MAP_FAIL_MSG "cxil_map lni: %d base: 0x%p len: %ld " \ + "map_flags: 0x%0X failure: %d, %s\n" + +/** + * cxip_do_map() - IO map a buffer. + */ +static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) +{ + int ret; + struct cxip_md *md = (struct cxip_md *)entry->data; + struct cxip_domain *dom; + uint32_t map_flags = CXI_MAP_READ | CXI_MAP_WRITE; + struct cxi_md_hints hints; + void *ze_handle; + void *ze_base_addr; + size_t ze_base_size; + uint64_t hmem_flags = entry->info.flags; + + dom = container_of(cache, struct cxip_domain, iomm); + + /* Prefer the ATS (scalable MD) whenever possible + * + * TODO: ATS (scalable MD) can only support CPU page sizes and should be + * avoided for non-standard page sizes. 
+ */ + if (dom->scalable_iomm && entry->info.iface == FI_HMEM_SYSTEM) { + md->md = dom->scalable_md.md; + md->dom = dom; + md->info = entry->info; + + return FI_SUCCESS; + } + + memset(&hints, 0, sizeof(hints)); + + if (entry->info.iface == FI_HMEM_SYSTEM) { + if (dom->ats) + map_flags |= CXI_MAP_ATS; + + if (!dom->odp) + map_flags |= CXI_MAP_PIN; + } else { + /* TODO: Remove PIN when DMA buf move_notify is supported. */ + map_flags |= CXI_MAP_DEVICE | CXI_MAP_PIN; + + /* ZE support requires the use of the DMA buf FD and offset + * hints fields. + */ + if (entry->info.iface == FI_HMEM_ZE) { + if (!cxip_env.ze_hmem_supported) { + CXIP_WARN("ZE device memory not supported. Try disabling implicit scaling (EnableImplicitScaling=0 NEOReadDebugKeys=1).\n"); + return -FI_ENOSYS; + } + + ret = ze_hmem_get_handle(entry->info.iov.iov_base, + entry->info.iov.iov_len, + &ze_handle); + if (ret) { + CXIP_WARN("ze_hmem_get_handle failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err; + } + + ret = ze_hmem_get_base_addr(entry->info.iov.iov_base, + entry->info.iov.iov_len, + &ze_base_addr, + &ze_base_size); + if (ret) { + CXIP_WARN("ze_hmem_get_base_addr failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err; + } + + hints.dmabuf_fd = (int)(uintptr_t)ze_handle; + hints.dmabuf_offset = + (uintptr_t)entry->info.iov.iov_base - + (uintptr_t)ze_base_addr; + hints.dmabuf_valid = true; + } + } + + if (!cxip_env.iotlb) + map_flags |= CXI_MAP_NOCACHE; + + ret = cxil_map(dom->lni->lni, entry->info.iov.iov_base, + entry->info.iov.iov_len, map_flags, &hints, &md->md); + if (ret) { + CXIP_WARN(MAP_FAIL_MSG, dom->lni->lni->id, + entry->info.iov.iov_base, entry->info.iov.iov_len, + map_flags, ret, fi_strerror(-ret)); + goto err; + } + + /* zeHostMalloc() returns FI_HMEM_ZE but this cannot currently be + * registered with ofi_hmem_dev_register(). Thus skip it. + */ + if (cxip_env.disable_hmem_dev_register || + ((entry->info.iface == FI_HMEM_ZE) && + (hmem_flags & FI_HMEM_HOST_ALLOC))) + ret = -FI_ENOSYS; + else + ret = ofi_hmem_dev_register(entry->info.iface, + entry->info.iov.iov_base, + entry->info.iov.iov_len, + &md->handle); + switch (ret) { + case FI_SUCCESS: + md->handle_valid = true; + break; + + case -FI_ENOSYS: + md->handle_valid = false; + break; + + default: + CXIP_WARN("ofi_hmem_dev_register %s failed: %d:%s\n", + fi_tostr(&entry->info.iface, FI_TYPE_HMEM_IFACE), ret, + fi_strerror(-ret)); + goto err_unmap; + } + + md->dom = dom; + md->info = entry->info; + md->cached = true; + CXIP_DBG("addr:%p end:%p len:0x%lx iova:%llx lac:%d device:%d\n", + entry->info.iov.iov_base, + (char *)entry->info.iov.iov_base + entry->info.iov.iov_len, + entry->info.iov.iov_len, md->md->iova, md->md->lac, + !!(map_flags & CXI_MAP_DEVICE)); + + return FI_SUCCESS; + +err_unmap: + cxil_unmap(md->md); +err: + md->dom = NULL; + return ret; +} + +/** + * cxip_do_unmap() - IO unmap a buffer. 
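+ * Mappings that reference the shared scalable MD are skipped here; only + * per-buffer MDs created by cxip_do_map() are unmapped and unregistered.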
+ */ +static void cxip_do_unmap(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + int ret; + struct cxip_md *md = (struct cxip_md *)entry->data; + + if (!md || !md->dom || md->md == md->dom->scalable_md.md) + return; + + if (md->handle_valid) + ofi_hmem_dev_unregister(entry->info.iface, md->handle); + + ret = cxil_unmap(md->md); + if (ret) + CXIP_WARN("cxil_unmap failed: %d\n", ret); + + CXIP_DBG("addr:%p end:%p len:0x%lx iova:%llx lac:%d\n", + entry->info.iov.iov_base, + (char *)entry->info.iov.iov_base + entry->info.iov.iov_len, + entry->info.iov.iov_len, md->md->iova, md->md->lac); +} + +static int cxip_scalable_iomm_init(struct cxip_domain *dom) +{ + int ret; + uint32_t map_flags = (CXI_MAP_READ | CXI_MAP_WRITE | CXI_MAP_ATS); + + if (!cxip_env.iotlb) + map_flags |= CXI_MAP_NOCACHE; + + ret = cxil_map(dom->lni->lni, 0, 0xfffffffffffff000, map_flags, NULL, + &dom->scalable_md.md); + if (!ret) { + dom->scalable_md.dom = dom; + dom->scalable_iomm = true; + + CXIP_DBG("Scalable IOMM enabled.\n"); + + if (cxip_env.ats_mlock_mode == CXIP_ATS_MLOCK_ALL) { + ret = mlockall(MCL_CURRENT | MCL_FUTURE); + if (ret) { + CXIP_WARN("mlockall(MCL_CURRENT | MCL_FUTURE) failed: %d\n", + -errno); + } + } + + ret = FI_SUCCESS; + } else { + ret = -FI_ENOSYS; + } + + return ret; +} + +static void cxip_scalable_iomm_fini(struct cxip_domain *dom) +{ + cxil_unmap(dom->scalable_md.md); +} + +static int cxip_ats_check(struct cxip_domain *dom) +{ + uint32_t map_flags = CXI_MAP_READ | CXI_MAP_WRITE | CXI_MAP_ATS | + CXI_MAP_PIN; + int stack_var; + struct cxi_md *md; + int ret; + + ret = cxil_map(dom->lni->lni, &stack_var, sizeof(stack_var), map_flags, + NULL, &md); + if (!ret) { + cxil_unmap(md); + CXIP_INFO("PCIe ATS supported.\n"); + return 1; + } + + CXIP_INFO("PCIe ATS not supported.\n"); + return 0; +} + +static void cxip_iomm_set_rocr_dev_mem_only(struct cxip_domain *dom) +{ + int dev_hmem_count = 0; + bool rocr_support = false; + int i; + + if (!dom->hmem) { + dom->rocr_dev_mem_only = false; + return; + } + + for (i = 0; i < OFI_HMEM_MAX; i++) { + if (i == FI_HMEM_SYSTEM) + continue; + + if (hmem_ops[i].initialized) { + dev_hmem_count++; + + if (i == FI_HMEM_ROCR) + rocr_support = true; + } + } + + /* If FI_HMEM_ROCR is the ONLY device supported by libfabric and the + * core ROCR memory monitor is used, cxip_map can be optimized to avoid + * pointer queries. + */ + if (dev_hmem_count == 1 && rocr_support && + default_rocr_monitor == rocr_monitor) + dom->rocr_dev_mem_only = true; + else + dom->rocr_dev_mem_only = false; +} + +/* + * cxip_iomm_init() - Initialize domain IO memory map. + */ +int cxip_iomm_init(struct cxip_domain *dom) +{ + struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = { + [FI_HMEM_SYSTEM] = default_monitor, + [FI_HMEM_CUDA] = default_cuda_monitor, + [FI_HMEM_ROCR] = default_rocr_monitor, + [FI_HMEM_ZE] = default_ze_monitor, + }; + enum fi_hmem_iface iface; + int ret; + bool scalable; + + /* Check if ATS is supported */ + if (cxip_env.ats && cxip_ats_check(dom)) + dom->ats = true; + + if (cxip_env.odp && !(dom->util_domain.mr_mode & FI_MR_ALLOCATED)) + dom->odp = true; + + if (dom->util_domain.info_domain_caps & FI_HMEM) + dom->hmem = true; + + scalable = dom->ats && dom->odp; + + CXIP_INFO("Domain ATS: %d ODP: %d HMEM: %d Scalable: %d\n", + dom->ats, dom->odp, dom->hmem, scalable); + + /* Unpinned ATS translation is scalable. A single MD covers all + * memory addresses and a cache isn't necessary. 
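+ * Scalable mode therefore requires both ATS and ODP (scalable = + * dom->ats && dom->odp above).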
+ */ + if (scalable) { + ret = cxip_scalable_iomm_init(dom); + if (ret) { + CXIP_WARN("cxip_scalable_iomm_init() returned: %d\n", + ret); + return ret; + } + } + + if (!scalable || dom->hmem) { + dom->iomm.entry_data_size = sizeof(struct cxip_md); + dom->iomm.add_region = cxip_do_map; + dom->iomm.delete_region = cxip_do_unmap; + ret = ofi_mr_cache_init(&dom->util_domain, memory_monitors, + &dom->iomm); + if (ret) { + CXIP_INFO("MR cache init failed: %s. MR caching disabled.\n", + fi_strerror(-ret)); + } else { + for (iface = 0; iface < OFI_HMEM_MAX; iface++) { + if (dom->iomm.monitors[iface]) + CXIP_INFO("MR cache enabled for %s memory\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); + } + } + } + + cxip_iomm_set_rocr_dev_mem_only(dom); + + return FI_SUCCESS; +} + +/* + * cxip_iomm_fini() - Finalize domain IO memory map. + */ +void cxip_iomm_fini(struct cxip_domain *dom) +{ + if (dom->scalable_iomm) + cxip_scalable_iomm_fini(dom); + + if (!dom->scalable_iomm || dom->hmem) + ofi_mr_cache_cleanup(&dom->iomm); +} + +static int cxip_map_cache(struct cxip_domain *dom, struct ofi_mr_info *info, + struct cxip_md **md) +{ + struct ofi_mr_entry *entry; + int ret; + + ret = ofi_mr_cache_search(&dom->iomm, info, &entry); + if (ret) { + CXIP_WARN("Failed to acquire mapping (%p, %lu): %d\n", + info->iov.iov_base, info->iov.iov_len, ret); + return ret; + } + + *md = (struct cxip_md *)entry->data; + + return FI_SUCCESS; +} + +static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr, + uint64_t hmem_flags, struct cxip_md **md) +{ + struct cxip_md *uncached_md; + uint32_t map_flags; + int ret; + struct cxi_md_hints hints; + void *ze_handle; + void *ze_base_addr; + size_t ze_base_size; + + /* Prefer the ATS (scalable MD) whenever possible + * + * TODO: ATS (scalable MD) can only support CPU page sizes and should be + * avoided for non-standard page sizes. + */ + if (dom->scalable_iomm && attr->iface == FI_HMEM_SYSTEM) { + *md = &dom->scalable_md; + return FI_SUCCESS; + } + + memset(&hints, 0, sizeof(hints)); + + uncached_md = calloc(1, sizeof(*uncached_md)); + if (!uncached_md) + return -FI_ENOMEM; + + map_flags = CXI_MAP_READ | CXI_MAP_WRITE; + if (attr->iface == FI_HMEM_SYSTEM) { + if (dom->ats) + map_flags |= CXI_MAP_ATS; + + if (!dom->odp) + map_flags |= CXI_MAP_PIN; + } else { + /* TODO: Remove PIN when DMA buf move_notify is supported. */ + map_flags |= CXI_MAP_DEVICE | CXI_MAP_PIN; + + /* ZE support requires the use of the DMA buf FD and offset + * hints fields. + */ + if (attr->iface == FI_HMEM_ZE) { + if (!cxip_env.ze_hmem_supported) { + CXIP_WARN("ZE device memory not supported. 
Try disabling implicit scaling (EnableImplicitScaling=0 NEOReadDebugKeys=1).\n"); + ret = -FI_ENOSYS; + goto err_free_uncached_md; + } + + ret = ze_hmem_get_handle(attr->mr_iov->iov_base, + attr->mr_iov->iov_len, + &ze_handle); + if (ret) { + CXIP_WARN("ze_hmem_get_handle failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_uncached_md; + } + + ret = ze_hmem_get_base_addr(attr->mr_iov->iov_base, + attr->mr_iov->iov_len, + &ze_base_addr, + &ze_base_size); + if (ret) { + CXIP_WARN("ze_hmem_get_base_addr failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_uncached_md; + } + + hints.dmabuf_fd = (int)(uintptr_t)ze_handle; + hints.dmabuf_offset = + (uintptr_t)attr->mr_iov->iov_base - + (uintptr_t)ze_base_addr; + hints.dmabuf_valid = true; + } + } + + if (!cxip_env.iotlb) + map_flags |= CXI_MAP_NOCACHE; + + ret = cxil_map(dom->lni->lni, attr->mr_iov->iov_base, + attr->mr_iov->iov_len, map_flags, &hints, + &uncached_md->md); + if (ret) { + CXIP_WARN("cxil_map failed: %d:%s\n", ret, fi_strerror(-ret)); + goto err_free_uncached_md; + } + + /* zeHostMalloc() returns FI_HMEM_ZE but this cannot currently be + * registered with ofi_hmem_dev_register(). Thus skip it. + */ + if (cxip_env.disable_hmem_dev_register || + ((attr->iface == FI_HMEM_ZE) && (hmem_flags & FI_HMEM_HOST_ALLOC))) + ret = -FI_ENOSYS; + else + ret = ofi_hmem_dev_register(attr->iface, + (const void *)uncached_md->md->va, + uncached_md->md->len, + &uncached_md->handle); + + switch (ret) { + case FI_SUCCESS: + uncached_md->handle_valid = true; + break; + + case -FI_ENOSYS: + uncached_md->handle_valid = false; + break; + + default: + CXIP_WARN("ofi_hmem_dev_register %s failed: %d:%s\n", + fi_tostr(&attr->iface, FI_TYPE_HMEM_IFACE), ret, + fi_strerror(-ret)); + goto err_unmap; + } + + uncached_md->dom = dom; + uncached_md->info.iov.iov_base = (void *)uncached_md->md->va; + uncached_md->info.iov.iov_len = uncached_md->md->len; + uncached_md->info.iface = attr->iface; + + *md = uncached_md; + + return FI_SUCCESS; + +err_unmap: + cxil_unmap(uncached_md->md); +err_free_uncached_md: + free(uncached_md); + + return ret; +} + +static void cxip_map_get_mem_region_size(const void *buf, unsigned long len, + enum fi_hmem_iface iface, + void **out_buf, unsigned long *out_len) +{ + int ret; + + ret = ofi_hmem_get_base_addr(iface, buf, len, out_buf, out_len); + if (ret) { + *out_buf = (void *)buf; + *out_len = len; + } + + CXIP_DBG("%s: User addr=%p User len=%lu Region addr=%p Region len=%lu\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE), buf, len, *out_buf, + *out_len); +} + +/* + * cxip_map() - Acquire IO mapping for buf. + * + * The IO memory map is searched for a IO mapping which covers buf. If no + * mapping has been established, create one and cache it. + */ +int cxip_map(struct cxip_domain *dom, const void *buf, unsigned long len, + uint64_t flags, struct cxip_md **md) +{ + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = len, + }; + struct fi_mr_attr attr = { + .iov_count = 1, + .mr_iov = &iov, + }; + struct ofi_mr_info mr_info = {}; + uint64_t hmem_flags = 0; + struct ofi_mr_entry *entry; + bool cache = !(flags & OFI_MR_NOCACHE); + + /* TODO: ATS (scalable MD) can only support CPU page sizes and should be + * avoided for non-standard page sizes. 
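+	 *
+	 * Fast paths: when the domain is scalable (ATS + ODP) and FI_HMEM is
+	 * not in use, the shared scalable MD is returned directly. Otherwise a
+	 * cached entry is looked up by virtual address before resorting to a
+	 * pointer query and a fresh mapping.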
+ */ + if (dom->scalable_iomm && !dom->hmem) { + *md = &dom->scalable_md; + return FI_SUCCESS; + } + + /* Since the MR cache find operates on virtual addresses and all device + * memory must support a unified virtual address space with system + * memory, the buffer pointer query can be avoided completely if the + * corresponding entry is in the cache. + */ + if (cache && cxip_domain_mr_cache_enabled(dom)) { + entry = ofi_mr_cache_find(&dom->iomm, &attr, 0); + if (entry) { + *md = (struct cxip_md *)entry->data; + return FI_SUCCESS; + } + } + + /* Since the MR cache search will allocate a new entry, the MR iface + * attribute must be defined for the proper MR cache memory monitor to + * be selected. + */ + if (dom->hmem) + attr.iface = ofi_get_hmem_iface(buf, NULL, &hmem_flags); + + if (cache && cxip_domain_mr_cache_iface_enabled(dom, attr.iface)) { + cxip_map_get_mem_region_size(iov.iov_base, iov.iov_len, + attr.iface, &iov.iov_base, + &iov.iov_len); + + mr_info.iface = attr.iface; + mr_info.iov = iov; + + /* Overload IPC addr to pass in HMEM flags. */ + mr_info.flags = hmem_flags; + + return cxip_map_cache(dom, &mr_info, md); + } + + return cxip_map_nocache(dom, &attr, flags, md); +} + +static void cxip_unmap_cache(struct cxip_md *md) +{ + struct ofi_mr_entry *entry = + container_of(md, struct ofi_mr_entry, data); + + ofi_mr_cache_delete(&md->dom->iomm, entry); +} + +static void cxip_unmap_nocache(struct cxip_md *md) +{ + int ret; + + if (md->handle_valid) + ofi_hmem_dev_unregister(md->info.iface, md->handle); + + ret = cxil_unmap(md->md); + if (ret) + CXIP_WARN("cxil_unmap failed: %d\n", ret); + + free(md); +} + +/* + * cxip_unmap() - Release an IO mapping. + * + * Drop a refernce to the IO mapping. If this was the last reference, the + * buffer may be unmapped. + */ +void cxip_unmap(struct cxip_md *md) +{ + /* Scalable MD is owned by the CXIP domain and thus will be freed when + * the domain is closed. + */ + if (md == &md->dom->scalable_md) + return; + + if (md->cached) + cxip_unmap_cache(md); + else + cxip_unmap_nocache(md); +} diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c new file mode 100644 index 00000000000..4ff81d5a448 --- /dev/null +++ b/prov/cxi/src/cxip_mr.c @@ -0,0 +1,1467 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2017 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2018,2020-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_MR, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_MR, __VA_ARGS__) + +static int cxip_mr_init(struct cxip_mr *mr, struct cxip_domain *dom, + const struct fi_mr_attr *attr, uint64_t flags); +static void cxip_mr_fini(struct cxip_mr *mr); +static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr); + +void cxip_mr_domain_fini(struct cxip_mr_domain *mr_domain) +{ + int i; + + /* Assumption is this is only called when a domain is freed and only a + * single thread should be freeing a domain. Thus, no lock is taken. 
+ */ + for (i = 0; i < CXIP_MR_DOMAIN_HT_BUCKETS; i++) { + if (!dlist_empty(&mr_domain->buckets[i])) + CXIP_WARN("MR domain bucket %d is not empty\n", i); + } + + ofi_spin_destroy(&mr_domain->lock); +} + +void cxip_mr_domain_init(struct cxip_mr_domain *mr_domain) +{ + int i; + + ofi_spin_init(&mr_domain->lock); + + for (i = 0; i < CXIP_MR_DOMAIN_HT_BUCKETS; i++) + dlist_init(&mr_domain->buckets[i]); +} + +/* + * cxip_ep_mr_insert() - Insert an MR key into the EP key space. + * + * Called during MR enable. The key space is a sparse 64 bits. + */ +static void cxip_ep_mr_insert(struct cxip_ep_obj *ep_obj, struct cxip_mr *mr) +{ + dlist_insert_tail(&mr->ep_entry, &ep_obj->mr_list); +} + +/* + * cxip_ep_mr_remove() - Remove an MR key from the EP key space. + */ +static void cxip_ep_mr_remove(struct cxip_mr *mr) +{ + dlist_remove(&mr->ep_entry); +} + +/* + * cxip_mr_cb() - Process MR LE events. + */ +int cxip_mr_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_mr *mr = req->mr.mr; + int evt_rc = cxi_event_rc(event); + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + if (mr->optimized) + assert(mr->mr_state == CXIP_MR_ENABLED); + else + assert(mr->mr_state == CXIP_MR_DISABLED); + + if (evt_rc == C_RC_OK) { + mr->mr_state = CXIP_MR_LINKED; + CXIP_DBG("MR PTE linked: %p\n", mr); + break; + } + + mr->mr_state = CXIP_MR_LINK_ERR; + CXIP_WARN("MR PTE link: %p failed %d\n", mr, evt_rc); + break; + case C_EVENT_UNLINK: + assert(evt_rc == C_RC_OK); + + assert(mr->mr_state == CXIP_MR_LINKED); + mr->mr_state = CXIP_MR_UNLINKED; + + CXIP_DBG("MR PTE unlinked: %p\n", mr); + break; + case C_EVENT_MATCH: + ofi_atomic_inc32(&mr->match_events); + + if (evt_rc != C_RC_OK) + goto log_err; + break; + case C_EVENT_PUT: + case C_EVENT_GET: + case C_EVENT_ATOMIC: + case C_EVENT_FETCH_ATOMIC: + if (mr->count_events) + ofi_atomic_inc32(&mr->access_events); + + if (evt_rc != C_RC_OK) + goto log_err; + + /* TODO handle fi_writedata/fi_inject_writedata */ + break; + default: +log_err: + CXIP_WARN(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), cxi_rc_to_str(evt_rc)); + } + + return FI_SUCCESS; +} + +static int cxip_mr_wait_append(struct cxip_ep_obj *ep_obj, + struct cxip_mr *mr) +{ + /* Wait for PTE LE append status update */ + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr->mr_state != CXIP_MR_LINKED && + mr->mr_state != CXIP_MR_LINK_ERR); + + if (mr->mr_state == CXIP_MR_LINK_ERR) + return -FI_ENOSPC; + + return FI_SUCCESS; +} + +/* + * cxip_mr_enable_std() - Assign HW resources to the standard MR. + * + * Standard MRs are implemented by linking an LE describing the registered + * buffer to a shared, matching PtlTE. The MR key is encoded in the LE match + * bits. One PtlTE supports many standard MRs. The number of standard MR + * supported is limited by the total number of NIC LEs. Because a matching LE + * is used, unrestricted commands must be used to target standard MRs. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. 
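+ *
+ * Initiator-side view (hedged sketch using only generic libfabric calls;
+ * "ep", "buf", "desc", "peer" and "remote_addr" are illustrative names, not
+ * part of this file): a peer that learned this MR's key, e.g. via
+ * fi_mr_key(), targets the LE appended below with something like
+ *
+ *   fi_write(ep, buf, len, desc, peer, remote_addr, mr_key, NULL);
+ *
+ * which the NIC matches against the key encoded in the LE match bits.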
+ */ +static int cxip_mr_enable_std(struct cxip_mr *mr) +{ + int ret; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + struct cxip_mr_key key = { + .raw = mr->key, + }; + uint32_t le_flags; + + mr->req.cb = cxip_mr_cb; + + le_flags = C_LE_UNRESTRICTED_BODY_RO; + if (mr->attr.access & FI_REMOTE_WRITE) + le_flags |= C_LE_OP_PUT; + if (mr->attr.access & FI_REMOTE_READ) + le_flags |= C_LE_OP_GET; + if (mr->cntr) + le_flags |= C_LE_EVENT_CT_COMM; + + /* TODO: to support fi_writedata(), we will want to leave + * success events enabled for mr->rma_events true too. + */ + if (!mr->count_events) + le_flags |= C_LE_EVENT_SUCCESS_DISABLE; + + ret = cxip_pte_append(ep_obj->ctrl_pte, + mr->len ? CXI_VA_TO_IOVA(mr->md->md, mr->buf) : 0, + mr->len, mr->len ? mr->md->md->lac : 0, + C_PTL_LIST_PRIORITY, mr->req.req_id, + key.key, 0, CXI_MATCH_ID_ANY, + 0, le_flags, mr->cntr, ep_obj->ctrl_tgq, true); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to write Append command: %d\n", ret); + return ret; + } + + ret = cxip_mr_wait_append(ep_obj, mr); + if (ret) + return ret; + + mr->enabled = true; + + CXIP_DBG("Standard MR enabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; +} + +/* + * cxip_mr_disable_std() - Free HW resources from the standard MR. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_disable_std(struct cxip_mr *mr) +{ + int ret; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + + /* TODO: Handle -FI_EAGAIN. */ + ret = cxip_pte_unlink(ep_obj->ctrl_pte, C_PTL_LIST_PRIORITY, + mr->req.req_id, ep_obj->ctrl_tgq); + assert(ret == FI_SUCCESS); + + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr->mr_state != CXIP_MR_UNLINKED); + + /* If MR event counts are recorded then we can check event counts + * to determine if invalidate can be skipped. + */ + if (!mr->count_events || ofi_atomic_get32(&mr->match_events) != + ofi_atomic_get32(&mr->access_events)) { + /* TODO: Temporary debug helper for DAOS to track if + * Match events detect a need to flush. + */ + if (mr->count_events) + CXIP_WARN("Match events required pte LE invalidate\n"); + + ret = cxil_invalidate_pte_le(ep_obj->ctrl_pte->pte, mr->key, + C_PTL_LIST_PRIORITY); + if (ret) + CXIP_WARN("MR %p key 0x%016lX invalidate failed %d\n", + mr, mr->key, ret); + } + + mr->enabled = false; + + CXIP_DBG("Standard MR disabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; +} + +/* + * cxip_mr_opt_pte_cb() - Process optimized MR state change events. + */ +void cxip_mr_opt_pte_cb(struct cxip_pte *pte, const union c_event *event) +{ + struct cxip_mr *mr = (struct cxip_mr *)pte->ctx; + + switch (pte->state) { + case C_PTLTE_ENABLED: + assert(mr->mr_state == CXIP_MR_DISABLED); + mr->mr_state = CXIP_MR_ENABLED; + + CXIP_DBG("MR PTE enabled: %p\n", mr); + break; + default: + CXIP_WARN("Unexpected state received: %u\n", pte->state); + } +} + +/* + * cxip_mr_enable_opt() - Assign HW resources to the optimized MR. + * + * Optimized MRs are implemented by allocating a dedicated, non-matching PtlTE + * and linking an LE describing the registered buffer. The MR key is used to + * derive the PtlTE index. One PtlTE and one LE is required for each optimized + * MR. Because a non-matching interface is used, optimized MRs can be targeted + * with restricted commands. This may result in better performance. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. 
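+ *
+ * Whether a client-managed key qualifies as optimized is a global provider
+ * setting (cxip_env.optimized_mrs, see cxip_mr_key_opt() below) combined
+ * with the key value being below the optimized PtlTE count.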
+ */ +static int cxip_mr_enable_opt(struct cxip_mr *mr) +{ + int ret; + struct cxi_pt_alloc_opts opts = {}; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + uint32_t le_flags; + uint64_t ib = 0; + int pid_idx; + + mr->req.cb = cxip_mr_cb; + + ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl_tgt_evtq, + &opts, cxip_mr_opt_pte_cb, mr, &mr->pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate PTE: %d\n", ret); + return ret; + } + + pid_idx = cxip_generic_mr_key_to_ptl_idx(mr->domain, mr->key, true); + ret = cxip_pte_map(mr->pte, pid_idx, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write pid_idx %d to PTE: %d\n", + pid_idx, ret); + goto err_pte_free; + } + + pid_idx = cxip_generic_mr_key_to_ptl_idx(mr->domain, mr->key, false); + ret = cxip_pte_map(mr->pte, pid_idx, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map read pid_idx %d to PTE: %d\n", + pid_idx, ret); + goto err_pte_free; + } + + ret = cxip_pte_set_state(mr->pte, ep_obj->ctrl_tgq, C_PTLTE_ENABLED, 0); + if (ret != FI_SUCCESS) { + /* This is a bug, we have exclusive access to this CMDQ. */ + CXIP_WARN("Failed to enqueue command: %d\n", ret); + goto err_pte_free; + } + + le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | + C_LE_UNRESTRICTED_BODY_RO; + if (mr->attr.access & FI_REMOTE_WRITE) + le_flags |= C_LE_OP_PUT; + if (mr->attr.access & FI_REMOTE_READ) + le_flags |= C_LE_OP_GET; + if (mr->cntr) + le_flags |= C_LE_EVENT_CT_COMM; + + /* When FI_FENCE is not requested, restricted operations can used PCIe + * relaxed ordering. Unrestricted operations PCIe relaxed ordering is + * controlled by an env for now. + */ + if (!(ep_obj->caps & FI_FENCE)) { + ib = 1; + + if (cxip_env.enable_unrestricted_end_ro) + le_flags |= C_LE_UNRESTRICTED_END_RO; + } + + ret = cxip_pte_append(mr->pte, + mr->len ? CXI_VA_TO_IOVA(mr->md->md, mr->buf) : 0, + mr->len, mr->len ? mr->md->md->lac : 0, + C_PTL_LIST_PRIORITY, mr->req.req_id, + 0, ib, CXI_MATCH_ID_ANY, + 0, le_flags, mr->cntr, ep_obj->ctrl_tgq, true); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to write Append command: %d\n", ret); + goto err_pte_free; + } + + ret = cxip_mr_wait_append(ep_obj, mr); + if (ret) + goto err_pte_free; + + mr->enabled = true; + + CXIP_DBG("Optimized MR enabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; + +err_pte_free: + cxip_pte_free(mr->pte); + + return ret; +} + +/* + * cxip_mr_disable_opt() - Free hardware resources for non-cached + * optimized MR. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_disable_opt(struct cxip_mr *mr) +{ + int ret; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + + ret = cxip_pte_unlink(mr->pte, C_PTL_LIST_PRIORITY, + mr->req.req_id, ep_obj->ctrl_tgq); + if (ret) { + CXIP_WARN("Failed to enqueue Unlink: %d\n", ret); + goto cleanup; + } + + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr->mr_state != CXIP_MR_UNLINKED); + +cleanup: + cxip_pte_free(mr->pte); + + mr->enabled = false; + + CXIP_DBG("Optimized MR disabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; +} + +static void cxip_mr_prov_opt_to_std(struct cxip_mr *mr) +{ + struct cxip_mr_key key = { + .raw = mr->mr_fid.key, + }; + + CXIP_WARN("Optimized MR unavailable, fallback to standard MR\n"); + + key.opt = false; + mr->mr_fid.key = key.raw; + mr->optimized = false; +} + +/* + * cxip_mr_prov_enable_opt() - Enable a provider key optimized + * MR, falling back to a standard MR if resources are not available. 
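+ * When the fallback is taken, the key's "opt" bit is cleared so that the
+ * value later reported through fi_mr_key() already describes a standard MR.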
+ * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_prov_enable_opt(struct cxip_mr *mr) +{ + int ret; + + ret = cxip_mr_enable_opt(mr); + if (!ret) + return ret; + + cxip_mr_prov_opt_to_std(mr); + + return cxip_mr_enable_std(mr); +} + +/* + * cxip_mr_prov_cache_enable_opt() - Enable a provider key optimized + * MR configuring hardware if not already cached. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) +{ + int ret; + int lac = mr->md->md->lac; + struct cxi_pt_alloc_opts opts = {}; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + struct cxip_mr_lac_cache *mr_cache; + struct cxip_mr *_mr; + uint32_t le_flags; + uint64_t ib = 0; + + mr_cache = &ep_obj->opt_mr_cache[lac]; + ofi_atomic_inc32(&mr_cache->ref); + + if (mr_cache->ctrl_req) + goto done; + + mr_cache->ctrl_req = calloc(1, sizeof(struct cxip_ctrl_req)); + if (!mr_cache->ctrl_req) { + ret = -FI_ENOMEM; + goto err; + } + + ret = cxip_domain_ctrl_id_alloc(ep_obj->domain, mr_cache->ctrl_req); + if (ret) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", ret); + goto err_free_req; + } + + mr_cache->ctrl_req->ep_obj = ep_obj; + mr_cache->ctrl_req->cb = cxip_mr_cb; + + /* Allocate a dummy MR used to maintain cache state for this + * LAC/enable RO state PTE. + */ + _mr = calloc(1, sizeof(struct cxip_mr)); + if (!_mr) { + ret = -FI_ENOMEM; + goto err_free_id; + } + + mr_cache->ctrl_req->mr.mr = _mr; + mr_cache->ctrl_req->mr.mr->domain = ep_obj->domain; + mr_cache->ctrl_req->mr.mr->optimized = true; + mr_cache->ctrl_req->mr.mr->mr_state = CXIP_MR_DISABLED; + + ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl_tgt_evtq, + &opts, cxip_mr_opt_pte_cb, + _mr, &_mr->pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate PTE: %d\n", ret); + goto err_free_mr; + } + + ret = cxip_pte_map(_mr->pte, CXIP_PTL_IDX_WRITE_PROV_CACHE_MR_OPT(lac), + false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write PTE: %d\n", ret); + goto err_pte_free; + } + + ret = cxip_pte_map(_mr->pte, CXIP_PTL_IDX_READ_PROV_CACHE_MR_OPT(lac), + false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write PTE: %d\n", ret); + goto err_pte_free; + } + + ret = cxip_pte_set_state(_mr->pte, ep_obj->ctrl_tgq, + C_PTLTE_ENABLED, 0); + if (ret != FI_SUCCESS) { + /* This is a bug, we have exclusive access to this CMDQ. */ + CXIP_WARN("Failed to enqueue command: %d\n", ret); + goto err_pte_free; + } + + le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | + C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_PUT | C_LE_OP_GET; + + /* When FI_FENCE is not requested, restricted operations can used PCIe + * relaxed ordering. Unrestricted operations PCIe relaxed ordering is + * controlled by an env for now. 
+ */ + if (!(ep_obj->caps & FI_FENCE)) { + ib = 1; + + if (cxip_env.enable_unrestricted_end_ro) + le_flags |= C_LE_UNRESTRICTED_END_RO; + } + + ret = cxip_pte_append(_mr->pte, 0, -1ULL, lac, + C_PTL_LIST_PRIORITY, + mr_cache->ctrl_req->req_id, + 0, ib, CXI_MATCH_ID_ANY, + 0, le_flags, NULL, ep_obj->ctrl_tgq, true); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to write Append command: %d\n", ret); + goto err_pte_free; + } + + ret = cxip_mr_wait_append(ep_obj, _mr); + if (ret) + goto err_pte_free; +done: + mr->enabled = true; + + CXIP_DBG("Optimized MR enabled: %p (key: 0x%016lX)\n", mr, mr->key); + + return FI_SUCCESS; + +err_pte_free: + cxip_pte_free(_mr->pte); +err_free_mr: + free(mr_cache->ctrl_req->mr.mr); +err_free_id: + cxip_domain_ctrl_id_free(ep_obj->domain, mr_cache->ctrl_req); +err_free_req: + free(mr_cache->ctrl_req); + mr_cache->ctrl_req = NULL; +err: + cxip_mr_prov_opt_to_std(mr); + + return cxip_mr_prov_cache_enable_std(mr); +} + +/* + * cxip_mr_prov_cache_disable_opt() - Disable a provider key + * optimized MR. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_prov_cache_disable_opt(struct cxip_mr *mr) +{ + struct cxip_mr_key key = { + .raw = mr->key, + }; + int lac = key.lac; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + + assert(key.opt); + + CXIP_DBG("Disable optimized cached MR: %p (key: 0x%016lX)\n", + mr, mr->key); + + if (ofi_atomic_get32(&ep_obj->opt_mr_cache[lac].ref) <= 0) { + CXIP_WARN("Cached optimized MR reference underflow\n"); + return -FI_EINVAL; + } + ofi_atomic_dec32(&ep_obj->opt_mr_cache[lac].ref); + mr->enabled = false; + + return FI_SUCCESS; +} + +/* + * cxip_mr_prov_cache_enable_std() - Enable a provider key standard + * MR configuring hardware if not already cached. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. 
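+ *
+ * Cached standard provider-key MRs share one LE per LAC: the first enable
+ * for a LAC allocates a control request and a dummy cxip_mr to track LE
+ * state, then appends a single all-address LE; later enables only take a
+ * reference. Hardware is torn down by cxip_ctrl_mr_cache_flush() once the
+ * reference count reaches zero.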
+ */ +static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr) +{ + int ret; + int lac = mr->md->md->lac; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + struct cxip_mr_lac_cache *mr_cache; + union cxip_match_bits mb; + union cxip_match_bits ib; + uint32_t le_flags; + + /* TODO: Handle enabling for each bound endpoint */ + mr_cache = &ep_obj->std_mr_cache[lac]; + ofi_atomic_inc32(&mr_cache->ref); + + if (mr_cache->ctrl_req) + goto done; + + mr_cache->ctrl_req = calloc(1, sizeof(struct cxip_ctrl_req)); + if (!mr_cache->ctrl_req) { + ret = -FI_ENOMEM; + goto err; + } + + ret = cxip_domain_ctrl_id_alloc(ep_obj->domain, mr_cache->ctrl_req); + if (ret) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", ret); + goto err_free_req; + } + + mr_cache->ctrl_req->ep_obj = ep_obj; + mr_cache->ctrl_req->cb = cxip_mr_cb; + + /* Allocate a dummy MR used to maintain cache state transitions */ + mr_cache->ctrl_req->mr.mr = calloc(1, sizeof(struct cxip_mr)); + if (!mr_cache->ctrl_req->mr.mr) { + ret = -FI_ENOMEM; + goto err_free_id; + } + + mr_cache->ctrl_req->mr.mr->domain = ep_obj->domain; + mr_cache->ctrl_req->mr.mr->optimized = false; + mr_cache->ctrl_req->mr.mr->mr_state = CXIP_MR_DISABLED; + + mb.raw = 0; + mb.mr_lac = mr->md->md->lac; + mb.mr_cached = 1; + + ib.raw = ~0; + ib.mr_lac = 0; + ib.mr_cached = 0; + + le_flags = C_LE_EVENT_SUCCESS_DISABLE | C_LE_UNRESTRICTED_BODY_RO | + C_LE_OP_PUT | C_LE_OP_GET; + + ret = cxip_pte_append(ep_obj->ctrl_pte, 0, -1ULL, + mb.mr_lac, C_PTL_LIST_PRIORITY, + mr_cache->ctrl_req->req_id, + mb.raw, ib.raw, CXI_MATCH_ID_ANY, + 0, le_flags, NULL, ep_obj->ctrl_tgq, true); + + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to write Append command: %d\n", ret); + goto err_free_mr; + } + + ret = cxip_mr_wait_append(ep_obj, mr_cache->ctrl_req->mr.mr); + if (ret) + goto err_free_mr; + +done: + mr->enabled = true; + + CXIP_DBG("Enable cached standard MR: %p (key: 0x%016lX\n", + mr, mr->key); + + return FI_SUCCESS; + +err_free_mr: + free(mr_cache->ctrl_req->mr.mr); +err_free_id: + cxip_domain_ctrl_id_free(ep_obj->domain, mr_cache->ctrl_req); +err_free_req: + free(mr_cache->ctrl_req); + mr_cache->ctrl_req = NULL; +err: + ofi_atomic_dec32(&mr_cache->ref); + + return ret; +} + +/* + * cxip_mr_prov_cache_disable_std() - Disable a provider standard + * cached MR. + * + * Caller must hold mr->lock, mr->ep->ep_obj->lock. + */ +static int cxip_mr_prov_cache_disable_std(struct cxip_mr *mr) +{ + struct cxip_mr_key key = { + .raw = mr->key, + }; + int lac = key.lac; + struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; + + CXIP_DBG("Disable standard cached MR: %p (key: 0x%016lX)\n", + mr, mr->key); + if (ofi_atomic_get32(&ep_obj->std_mr_cache[lac].ref) <= 0) { + CXIP_WARN("Cached standard MR reference underflow\n"); + return -FI_EINVAL; + } + ofi_atomic_dec32(&ep_obj->std_mr_cache[lac].ref); + mr->enabled = false; + + return FI_SUCCESS; +} + +/* + * cxip_mr_domain_remove() - Remove client key from domain hash. + */ +static void cxip_mr_domain_remove(struct cxip_mr *mr) +{ + if (mr->domain->is_prov_key) + return; + + /* Only remotely accessible MR were assigned an RKEY */ + if (!(mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE))) + return; + + ofi_spin_lock(&mr->domain->mr_domain.lock); + dlist_remove(&mr->mr_domain_entry); + ofi_spin_unlock(&mr->domain->mr_domain.lock); +} + +/* + * cxip_mr_domain_insert() - Validate uniqueness and insert + * client key in the domain hash table. 
+ */ +static int cxip_mr_domain_insert(struct cxip_mr *mr) +{ + struct cxip_mr_domain *mr_domain = &mr->domain->mr_domain; + int bucket; + struct cxip_mr *clash_mr; + + if (mr->domain->is_prov_key) + return FI_SUCCESS; + + /* Only remotely accessible MR are assigned an RKEY */ + if (!(mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE))) + return FI_SUCCESS; + + mr->key = mr->attr.requested_key; + + if (!cxip_generic_is_valid_mr_key(mr->key)) + return -FI_EKEYREJECTED; + + bucket = fasthash64(&mr->key, sizeof(mr->key), 0) % + CXIP_MR_DOMAIN_HT_BUCKETS; + + ofi_spin_lock(&mr_domain->lock); + + dlist_foreach_container(&mr_domain->buckets[bucket], struct cxip_mr, + clash_mr, mr_domain_entry) { + if (clash_mr->key == mr->key) { + ofi_spin_unlock(&mr_domain->lock); + return -FI_ENOKEY; + } + } + + dlist_insert_tail(&mr->mr_domain_entry, &mr_domain->buckets[bucket]); + + ofi_spin_unlock(&mr_domain->lock); + + return FI_SUCCESS; +} + +static int cxip_init_mr_key(struct cxip_mr *mr, uint64_t req_key) +{ + mr->key = req_key; + + return FI_SUCCESS; +} + +/* + * cxip_prov_init_mr_key() - Generate a provider key for + * a non-cached MR. + */ +static int cxip_prov_init_mr_key(struct cxip_mr *mr, uint64_t req_key) +{ + int ret; + + /* Non-cached FI_MR_PROV_KEY MR keys need to be unique. */ + ret = cxip_domain_prov_mr_id_alloc(mr->domain, mr); + if (ret) + return ret; + + CXIP_DBG("Init non-cached provider MR key 0x%016lX\n", mr->key); + + return FI_SUCCESS; +} + +/* + * cxip_prov_cache_init_mr_key() - Generate a provider key for + * a cached MR. + * + * Note cached MR do not support counters or target events. + */ +static int cxip_prov_cache_init_mr_key(struct cxip_mr *mr, + uint64_t req_key) +{ + struct cxip_mr_key key = {}; + struct cxi_md *md = mr->md->md; + + /* If optimized enabled it is preferred for caching */ + key.opt = mr->domain->optimized_mrs; + key.cached = true; + key.is_prov = 1; + key.lac = mr->len ? md->lac : 0; + key.lac_off = mr->len ? 
CXI_VA_TO_IOVA(md, mr->buf) : 0; + mr->key = key.raw; + + CXIP_DBG("Init cached MR key 0x%016lX, lac: %d, off:0x%016lX\n", + key.raw, key.lac, (uint64_t)key.lac_off); + + return FI_SUCCESS; +} + +static bool cxip_is_valid_mr_key(uint64_t key) +{ + if (key & ~CXIP_MR_KEY_MASK) + return false; + + return true; +} + +static bool cxip_is_valid_prov_mr_key(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.cached) + return cxip_key.is_prov == 1; + + if (cxip_key.opt) + return CXIP_MR_UNCACHED_KEY_TO_IDX(cxip_key.key) < + CXIP_PTL_IDX_PROV_MR_OPT_CNT; + + if (cxip_key.key & ~CXIP_MR_PROV_KEY_MASK) + return false; + + return true; +} + +bool cxip_generic_is_valid_mr_key(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.is_prov) + return cxip_is_valid_prov_mr_key(key); + + return cxip_is_valid_mr_key(key); +} + +static bool cxip_mr_key_opt(uint64_t key) +{ + /* Client key optimized MR controlled globally only */ + return cxip_env.optimized_mrs && key < CXIP_PTL_IDX_MR_OPT_CNT; +} + +static bool cxip_prov_mr_key_opt(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.opt) + return true; + + return false; +} + +bool cxip_generic_is_mr_key_opt(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.is_prov) + return cxip_prov_mr_key_opt(key); + + return cxip_mr_key_opt(key); +} + +static bool cxip_prov_mr_key_events(uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + /* Cached keys can not be bound to counters or require RMA events, + * the "events" field is not defined. + */ + if (cxip_key.cached) + return false; + + if (cxip_key.events) + return true; + + return false; +} + +/* If CAPs or MR Key indicate events are required at the target */ +bool cxip_generic_is_mr_key_events(uint64_t caps, uint64_t key) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.is_prov) + return cxip_prov_mr_key_events(key); + + /* Client keys cannot indicate if they require events and + * rely on FI_RMA_EVENT being set on source and target. + */ + return !!(caps & FI_RMA_EVENT); +} + +/* + * cxip_mr_key_to_ptl_idx() Maps a client generated key to the + * PtlTE index. + */ +static int cxip_mr_key_to_ptl_idx(struct cxip_domain *dom, + uint64_t key, bool write) +{ + if (cxip_generic_is_mr_key_opt(key)) + return write ? CXIP_PTL_IDX_WRITE_MR_OPT(key) : + CXIP_PTL_IDX_READ_MR_OPT(key); + + return write ? CXIP_PTL_IDX_WRITE_MR_STD : CXIP_PTL_IDX_READ_MR_STD; +} + +/* + * cxip_prov_mr_key_to_ptl_idx() - Maps a provider generated key + * to the PtlTE index. + */ +static int cxip_prov_mr_key_to_ptl_idx(struct cxip_domain *dom, + uint64_t key, bool write) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + int idx; + + if (cxip_generic_is_mr_key_opt(key)) { + idx = write ? CXIP_PTL_IDX_WRITE_MR_OPT_BASE : + CXIP_PTL_IDX_READ_MR_OPT_BASE; + + /* First 8 PTE are used for LAC cache entries */ + if (cxip_key.cached) { + idx += cxip_key.lac; + return idx; + } + + /* Verify within non-cached optimized range */ + assert(CXIP_MR_UNCACHED_KEY_TO_IDX(cxip_key.key) < + CXIP_PTL_IDX_PROV_MR_OPT_CNT); + + idx += CXIP_PTL_IDX_PROV_NUM_CACHE_IDX + + CXIP_MR_UNCACHED_KEY_TO_IDX(cxip_key.key); + return idx; + } + + return write ? CXIP_PTL_IDX_WRITE_MR_STD : CXIP_PTL_IDX_READ_MR_STD; +} + +/* + * cxip_generic_mr_key_to_ptl_idx() - Maps a MR RKEY to the PtlTE index. 
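+ *
+ * Summary of the mapping above: optimized client keys get a per-key
+ * read/write PtlTE index; cached provider keys get a per-LAC optimized
+ * index; non-cached optimized provider keys index past the cached range
+ * using the key's MR index; all remaining keys share the standard
+ * read/write PtlTE indices.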
+ */ +int cxip_generic_mr_key_to_ptl_idx(struct cxip_domain *dom, uint64_t key, + bool write) +{ + struct cxip_mr_key cxip_key = { + .raw = key, + }; + + if (cxip_key.is_prov) + return cxip_prov_mr_key_to_ptl_idx(dom, key, write); + + return cxip_mr_key_to_ptl_idx(dom, key, write); +} + +/* Caller should hold ep_obj->lock */ +void cxip_ctrl_mr_cache_flush(struct cxip_ep_obj *ep_obj) +{ + int lac; + struct cxip_mr_lac_cache *mr_cache; + int ret; + + /* Flush standard MR resources hardware resources not in use */ + for (lac = 0; lac < CXIP_NUM_CACHED_KEY_LE; lac++) { + mr_cache = &ep_obj->std_mr_cache[lac]; + + if (!mr_cache->ctrl_req || + ofi_atomic_get32(&mr_cache->ref)) + continue; + + ret = cxip_pte_unlink(ep_obj->ctrl_pte, C_PTL_LIST_PRIORITY, + mr_cache->ctrl_req->req_id, + ep_obj->ctrl_tgq); + assert(ret == FI_SUCCESS); + + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr_cache->ctrl_req->mr.mr->mr_state != + CXIP_MR_UNLINKED); + + ret = cxil_invalidate_pte_le(ep_obj->ctrl_pte->pte, + mr_cache->ctrl_req->req_id, + C_PTL_LIST_PRIORITY); + if (ret) + CXIP_WARN("Remote MR cache flush invalidate err: %d\n", + ret); + + free(mr_cache->ctrl_req->mr.mr); + cxip_domain_ctrl_id_free(ep_obj->domain, mr_cache->ctrl_req); + free(mr_cache->ctrl_req); + mr_cache->ctrl_req = NULL; + } + + /* Flush optimized MR resources hardware resources not in use */ + for (lac = 0; lac < CXIP_NUM_CACHED_KEY_LE; lac++) { + mr_cache = &ep_obj->opt_mr_cache[lac]; + + if (!mr_cache->ctrl_req || + ofi_atomic_get32(&mr_cache->ref)) + continue; + + ret = cxip_pte_unlink(mr_cache->ctrl_req->mr.mr->pte, + C_PTL_LIST_PRIORITY, + mr_cache->ctrl_req->req_id, + ep_obj->ctrl_tgq); + if (ret) { + CXIP_WARN("Failed to enqueue Unlink: %d\n", ret); + goto cleanup; + } + + do { + sched_yield(); + cxip_ep_tgt_ctrl_progress_locked(ep_obj); + } while (mr_cache->ctrl_req->mr.mr->mr_state != + CXIP_MR_UNLINKED); + +cleanup: + cxip_pte_free(mr_cache->ctrl_req->mr.mr->pte); + free(mr_cache->ctrl_req->mr.mr); + cxip_domain_ctrl_id_free(ep_obj->domain, mr_cache->ctrl_req); + free(mr_cache->ctrl_req); + mr_cache->ctrl_req = NULL; + } +} + +struct cxip_mr_util_ops cxip_client_key_mr_util_ops = { + .is_cached = false, + .init_key = cxip_init_mr_key, + .enable_opt = cxip_mr_enable_opt, + .disable_opt = cxip_mr_disable_opt, + .enable_std = cxip_mr_enable_std, + .disable_std = cxip_mr_disable_std, +}; + +struct cxip_mr_util_ops cxip_prov_key_mr_util_ops = { + .is_cached = false, + .init_key = cxip_prov_init_mr_key, + .enable_opt = cxip_mr_prov_enable_opt, + .disable_opt = cxip_mr_disable_opt, + .enable_std = cxip_mr_enable_std, + .disable_std = cxip_mr_disable_std, +}; + +struct cxip_mr_util_ops cxip_prov_key_cache_mr_util_ops = { + .is_cached = true, + .init_key = cxip_prov_cache_init_mr_key, + .enable_opt = cxip_mr_prov_cache_enable_opt, + .disable_opt = cxip_mr_prov_cache_disable_opt, + .enable_std = cxip_mr_prov_cache_enable_std, + .disable_std = cxip_mr_prov_cache_disable_std, +}; + +int cxip_mr_enable(struct cxip_mr *mr) +{ + int ret; + + /* MR which require remote access require additional resources. Locally + * access MRs only do not. Thus, return FI_SUCCESS. + */ + if (mr->enabled || + !(mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE))) + return FI_SUCCESS; + + /* Set MR operations based on key management and whether + * the MR is cache-able. 
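+	 *
+	 * Client-managed keys always use the non-cached client ops. Provider
+	 * keys use the cached ops only when the buffer's MD is cached, the
+	 * domain allows provider key caching, and the MR needs no counter,
+	 * match-count or RMA events; otherwise the non-cached provider ops
+	 * apply.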
+ */ + if (!mr->domain->is_prov_key) + mr->mr_util = &cxip_client_key_mr_util_ops; + else if (mr->md && mr->md->cached && mr->domain->prov_key_cache && + !mr->cntr && !mr->count_events && !mr->rma_events) + mr->mr_util = &cxip_prov_key_cache_mr_util_ops; + else + mr->mr_util = &cxip_prov_key_mr_util_ops; + + /* Officially set MR key */ + if (mr->domain->is_prov_key) { + ret = mr->mr_util->init_key(mr, mr->attr.requested_key); + if (ret) { + CXIP_WARN("Failed to initialize MR key: %d\n", ret); + return ret; + } + mr->mr_fid.key = mr->key; + } + mr->optimized = cxip_generic_is_mr_key_opt(mr->key); + + ofi_genlock_lock(&mr->ep->ep_obj->lock); + cxip_ep_mr_insert(mr->ep->ep_obj, mr); + + if (mr->optimized) + ret = mr->mr_util->enable_opt(mr); + else + ret = mr->mr_util->enable_std(mr); + ofi_genlock_unlock(&mr->ep->ep_obj->lock); + + if (ret != FI_SUCCESS) + goto err_remove_mr; + + return FI_SUCCESS; + +err_remove_mr: + cxip_ep_mr_remove(mr); + + return ret; +} + +int cxip_mr_disable(struct cxip_mr *mr) +{ + int ret; + + if (!mr->enabled || + !(mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE))) + return FI_SUCCESS; + + ofi_genlock_lock(&mr->ep->ep_obj->lock); + if (mr->optimized) + ret = mr->mr_util->disable_opt(mr); + else + ret = mr->mr_util->disable_std(mr); + + cxip_ep_mr_remove(mr); + ofi_genlock_unlock(&mr->ep->ep_obj->lock); + + return ret; +} + +/* + * cxip_mr_close() - fi_close implemented for MRs. + */ +static int cxip_mr_close(struct fid *fid) +{ + struct cxip_mr *mr; + int ret; + + if (!fid) + return -FI_EINVAL; + + mr = container_of(fid, struct cxip_mr, mr_fid.fid); + + ofi_spin_lock(&mr->lock); + + ret = cxip_mr_disable(mr); + if (ret != FI_SUCCESS) + CXIP_WARN("Failed to disable MR: %d\n", ret); + + if (mr->len) + cxip_unmap(mr->md); + + cxip_mr_domain_remove(mr); + + if (mr->ep) + ofi_atomic_dec32(&mr->ep->ep_obj->ref); + + if (mr->cntr) + ofi_atomic_dec32(&mr->cntr->ref); + + cxip_mr_fini(mr); + ofi_atomic_dec32(&mr->domain->ref); + + ofi_spin_unlock(&mr->lock); + + free(mr); + + return FI_SUCCESS; +} + +/* + * cxip_mr_bind() - fi_bind() implementation for MRs. + */ +static int cxip_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct cxip_mr *mr; + struct cxip_cntr *cntr; + struct cxip_ep *ep; + int ret = FI_SUCCESS; + + mr = container_of(fid, struct cxip_mr, mr_fid.fid); + + ofi_spin_lock(&mr->lock); + + switch (bfid->fclass) { + case FI_CLASS_CNTR: + cntr = container_of(bfid, struct cxip_cntr, cntr_fid.fid); + if (mr->domain != cntr->domain || mr->enabled) { + ret = -FI_EINVAL; + break; + } + + if (mr->cntr) { + ret = -FI_EINVAL; + break; + } + + if (!(flags & FI_REMOTE_WRITE)) { + ret = -FI_EINVAL; + break; + } + + mr->cntr = cntr; + ofi_atomic_inc32(&cntr->ref); + break; + + case FI_CLASS_EP: + ep = container_of(bfid, struct cxip_ep, ep.fid); + if (mr->domain != ep->ep_obj->domain || mr->enabled) { + ret = -FI_EINVAL; + break; + } + + if (mr->ep || !ep->ep_obj->enabled) { + ret = -FI_EINVAL; + break; + } + + if (mr->rma_events && !(ep->ep_obj->caps & FI_RMA_EVENT)) { + CXIP_WARN("MR requires FI_RMA_EVENT EP cap\n"); + ret = -FI_EINVAL; + break; + } + + mr->ep = ep; + ofi_atomic_inc32(&ep->ep_obj->ref); + break; + + default: + ret = -FI_EINVAL; + } + + ofi_spin_unlock(&mr->lock); + + return ret; +} + +/* + * cxip_mr_control() - fi_control() implementation for MRs. 
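+ *
+ * Application-side view (hedged sketch using only public libfabric calls;
+ * "domain", "ep", "buf", "len" and "requested_key" are illustrative names):
+ * with FI_MR_ENDPOINT the MR must be bound to an enabled endpoint before it
+ * is enabled, e.g.
+ *
+ *   struct fid_mr *mr;
+ *   fi_mr_reg(domain, buf, len, FI_REMOTE_WRITE | FI_REMOTE_READ,
+ *             0, requested_key, 0, &mr, NULL);
+ *   fi_mr_bind(mr, &ep->fid, 0);
+ *   fi_mr_enable(mr);
+ *
+ * fi_mr_enable() (fi_control() with FI_ENABLE) lands here and calls
+ * cxip_mr_enable(). With provider-generated keys, fi_mr_key() is only
+ * meaningful after this point.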
+ */ +static int cxip_mr_control(struct fid *fid, int command, void *arg) +{ + struct cxip_mr *mr; + int ret; + + mr = container_of(fid, struct cxip_mr, mr_fid.fid); + + ofi_spin_lock(&mr->lock); + + switch (command) { + case FI_ENABLE: + /* An MR must be bound to an EP before being enabled. */ + if (!mr->ep) { + ret = -FI_EINVAL; + break; + } + + ret = cxip_mr_enable(mr); + if (ret != FI_SUCCESS) + CXIP_WARN("Failed to enable MR: %d\n", ret); + + break; + + default: + ret = -FI_EINVAL; + } + + ofi_spin_unlock(&mr->lock); + + return ret; +} + +static struct fi_ops cxip_mr_fi_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_mr_close, + .bind = cxip_mr_bind, + .control = cxip_mr_control, + .ops_open = fi_no_ops_open, +}; + +static void cxip_mr_fini(struct cxip_mr *mr) +{ + cxip_domain_ctrl_id_free(mr->domain, &mr->req); + cxip_domain_prov_mr_id_free(mr->domain, mr); +} + +static int cxip_mr_init(struct cxip_mr *mr, struct cxip_domain *dom, + const struct fi_mr_attr *attr, uint64_t flags) +{ + int ret; + + ofi_spin_init(&mr->lock); + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = attr->context; + mr->mr_fid.fid.ops = &cxip_mr_fi_ops; + + /* Generation of the key for FI_MR_PROV_KEY can not be done + * until the MR has been bound and enabled to at least one + * endpoint. + */ + mr->mr_fid.key = FI_KEY_NOTAVAIL; + + mr->domain = dom; + mr->flags = flags; + mr->attr = *attr; + + mr->count_events = dom->mr_match_events; + ofi_atomic_initialize32(&mr->match_events, 0); + ofi_atomic_initialize32(&mr->access_events, 0); + mr->rma_events = flags & FI_RMA_EVENT; + + /* Support length 1 IOV only for now */ + mr->buf = mr->attr.mr_iov[0].iov_base; + mr->len = mr->attr.mr_iov[0].iov_len; + + /* Allocate unique MR buffer ID if remote access MR */ + if (mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE)) { + ret = cxip_domain_ctrl_id_alloc(dom, &mr->req); + if (ret) { + CXIP_WARN("Failed to allocate MR buffer ID: %d\n", ret); + ofi_spin_destroy(&mr->lock); + return -FI_ENOSPC; + } + } else { + mr->req.req_id = -1; + } + + mr->mr_id = -1; + mr->req.mr.mr = mr; + mr->mr_fid.mem_desc = (void *)mr; + mr->mr_state = CXIP_MR_DISABLED; + + return FI_SUCCESS; +} + +/* + * Libfabric MR creation APIs + */ + +static int cxip_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr) +{ + struct cxip_domain *dom; + struct cxip_mr *_mr; + int ret; + + if (fid->fclass != FI_CLASS_DOMAIN || !attr || attr->iov_count <= 0) + return -FI_EINVAL; + + if (attr->iov_count != 1) + return -FI_ENOSYS; + + if (flags & FI_DIRECTED_RECV) { + CXIP_WARN("FI_DIRECTED_RECV and MRs not supported\n"); + return -FI_EINVAL; + } + + dom = container_of(fid, struct cxip_domain, util_domain.domain_fid); + + _mr = calloc(1, sizeof(*_mr)); + if (!_mr) + return -FI_ENOMEM; + ret = cxip_mr_init(_mr, dom, attr, flags); + if (ret) + goto err_free_mr; + + ret = cxip_mr_domain_insert(_mr); + if (ret) + goto err_cleanup_mr; + + /* Client key can be set now and will be used to + * detect duplicate errors. Note only remote MR + * are assigned a RKEY. 
+ */ + if (!_mr->domain->is_prov_key && + _mr->attr.access & (FI_REMOTE_READ | FI_REMOTE_WRITE)) + _mr->mr_fid.key = _mr->key; + + if (_mr->len) { + ret = cxip_map(_mr->domain, (void *)_mr->buf, _mr->len, 0, + &_mr->md); + if (ret) { + CXIP_WARN("Failed to map MR buffer: %d\n", ret); + goto err_remove_mr; + } + } + + ofi_atomic_inc32(&dom->ref); + *mr = &_mr->mr_fid; + + return FI_SUCCESS; + +err_remove_mr: + cxip_mr_domain_remove(_mr); + +err_cleanup_mr: + cxip_mr_fini(_mr); +err_free_mr: + free(_mr); + + return ret; +} + +static int cxip_regv(struct fid *fid, const struct iovec *iov, size_t count, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context) +{ + struct fi_mr_attr attr; + + attr.mr_iov = iov; + attr.iov_count = count; + attr.access = access; + attr.offset = offset; + attr.requested_key = requested_key; + attr.context = context; + + return cxip_regattr(fid, &attr, flags, mr); +} + +static int cxip_reg(struct fid *fid, const void *buf, size_t len, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context) +{ + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + return cxip_regv(fid, &iov, 1, access, offset, requested_key, flags, mr, + context); +} + +struct fi_ops_mr cxip_dom_mr_ops = { + .size = sizeof(struct fi_ops_mr), + .reg = cxip_reg, + .regv = cxip_regv, + .regattr = cxip_regattr, +}; diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c new file mode 100644 index 00000000000..a9f3c0f6ec9 --- /dev/null +++ b/prov/cxi/src/cxip_msg.c @@ -0,0 +1,6148 @@ +/* + * Copyright (c) 2018,2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cxip.h" + +#define FC_SW_LE_MSG_FATAL "LE exhaustion during flow control, "\ + "FI_CXI_RX_MATCH_MODE=[hybrid|software] is required\n" +#define FC_SW_ONLOAD_MSG_FATAL "LE resources not recovered during "\ + "flow control. FI_CXI_RX_MATCH_MODE=[hybrid|software] is required\n" +#define FC_OFLOW_NO_MATCH_MSG "Flow control overflow no match, increasing "\ + "FI_CXI_OFLOW_BUF_SIZE (current is %ldB) may reduce occurrence\n" +#define FC_REQ_FULL_MSG "Flow control request list full, increasing"\ + " FI_CXI_REQ_BUF_SIZE value (current is %ldB) may reduce occurrence\n" +#define FC_DROP_COUNT_MSG "Re-enable Drop count mismatch, re-enable will "\ + "be retried on notify\n" + +#define WARN_RESTRICTED_DISABLED "Insufficient resources for %s "\ + "protocol, switching to %s protocol\n" + +/* Defines the posted receive interval for checking LE allocation if + * in hybrid RX match mode and preemptive transitions to software + * managed EP are requested. 
+ */ +#define CXIP_HYBRID_RECV_CHECK_INTERVAL (64-1) + +static int cxip_recv_cb(struct cxip_req *req, const union c_event *event); +static void cxip_ux_onload_complete(struct cxip_req *req); +static int cxip_ux_onload(struct cxip_rxc *rxc); +static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq); +static int cxip_recv_req_dropped(struct cxip_req *req); +static ssize_t _cxip_recv_req(struct cxip_req *req, bool restart_seq); + +static int cxip_send_req_dropped(struct cxip_txc *txc, struct cxip_req *req); +static int cxip_send_req_dequeue(struct cxip_txc *txc, struct cxip_req *req); + +static void cxip_fc_progress_ctrl(struct cxip_rxc *rxc); +static void cxip_send_buf_fini(struct cxip_req *req); + +/* + * match_put_event() - Find/add a matching event. + * + * For every Put Overflow event there is a matching Put event. These events can + * be generated in any order. Both events must be received before progress can + * be made. + * + * If the matching event exists in the mapping, matched is set to true and + * the deferred event is returned. If a match was not found, matched is set to + * false and the event is added to the deferred event mapping. + * + * The deferred match event is returned; unless it must be added to the + * deferred mapping and memory is insufficient. + * + * Caller must hold ep_obj->lock. + */ +static struct cxip_deferred_event * +match_put_event(struct cxip_rxc *rxc, struct cxip_req *req, + const union c_event *event, bool *matched) +{ + union cxip_def_event_key key = {}; + struct cxip_deferred_event *def_ev; + union cxip_match_bits mb; + int bucket; + enum c_event_type match_type = + event->tgt_long.event_type == C_EVENT_PUT ? C_EVENT_PUT_OVERFLOW : C_EVENT_PUT; + + if (event->tgt_long.rendezvous) { + key.initiator = event->tgt_long.initiator.initiator.process; + mb.raw = event->tgt_long.match_bits; + key.rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + key.rdzv = 1; + } else { + key.start_addr = event->tgt_long.start; + } + + bucket = fasthash64(&key.raw, sizeof(key.raw), 0) % + CXIP_DEF_EVENT_HT_BUCKETS; + dlist_foreach_container(&rxc->deferred_events.bh[bucket], + struct cxip_deferred_event, def_ev, + rxc_entry) { + if (def_ev->key.raw == key.raw && + def_ev->ev.tgt_long.event_type == match_type && + def_ev->ev.tgt_long.return_code == event->tgt_long.return_code && + def_ev->ev.tgt_long.initiator.initiator.process == event->tgt_long.initiator.initiator.process && + def_ev->ev.tgt_long.match_bits == event->tgt_long.match_bits) { + *matched = true; + return def_ev; + } + } + + /* Not found, add mapping to hash bucket */ + *matched = false; + + def_ev = calloc(1, sizeof(*def_ev)); + if (!def_ev) { + RXC_WARN(rxc, "Failed allocate to memory\n"); + return NULL; + } + + def_ev->key.raw = key.raw; + def_ev->req = req; + def_ev->ev = *event; + + dlist_insert_tail(&def_ev->rxc_entry, &rxc->deferred_events.bh[bucket]); + + return def_ev; +} + +/* + * free_put_event() - Free a deferred put event. + * + * Free an event previously allocated added with match_put_event(). + * + * Caller must hold ep_obj->lock. + */ +static void free_put_event(struct cxip_rxc *rxc, + struct cxip_deferred_event *def_ev) +{ + dlist_remove(&def_ev->rxc_entry); + free(def_ev); +} + +/* + * recv_req_src_addr() - Translate request source address to FI address. 
+ */ +static fi_addr_t recv_req_src_addr(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + + /* If the FI_SOURCE capability is enabled, convert the initiator's + * address to an FI address to be reported in a CQ event. If + * application AVs are symmetric, the match_id in the EQ event is + * logical and translation is not needed. Otherwise, translate the + * physical address in the EQ event to logical FI address. + */ + if (rxc->attr.caps & FI_SOURCE) { + struct cxip_addr addr = {}; + + if (rxc->ep_obj->av->symmetric) + return CXI_MATCH_ID_EP(rxc->pid_bits, + req->recv.initiator); + + addr.nic = CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator); + addr.pid = CXI_MATCH_ID_PID(rxc->pid_bits, req->recv.initiator); + addr.vni = req->recv.vni; + + return cxip_av_lookup_fi_addr(rxc->ep_obj->av, &addr); + } + + return FI_ADDR_NOTAVAIL; +} + +/* + * cxip_recv_req_alloc() - Allocate a request handle for a receive, + * mapping the associated buffer if required. + * + * Caller must hold ep->ep_obj->lock. + */ +static int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, + struct cxip_req **cxip_req) +{ + struct cxip_domain *dom = rxc->domain; + struct cxip_req *req; + struct cxip_md *recv_md = NULL; + int ret; + + /* Software EP only mode receives are not posted to hardware + * and are not constrained by hardware buffer ID limits. + * + * Note: cxip_evtq_req_alloc() zeros the req. + */ + req = cxip_evtq_req_alloc(&rxc->rx_evtq, !rxc->sw_ep_only, rxc); + if (!req) { + RXC_INFO(rxc, "Recv request unavailable: -FI_EAGAIN\n"); + return -FI_EAGAIN; + } + + if (len) { + ret = cxip_map(dom, (void *)buf, len, 0, &recv_md); + if (ret) { + RXC_WARN(rxc, "Map of recv buffer failed: %d, %s\n", + ret, fi_strerror(-ret)); + goto err_free_request; + } + } + + /* Initialize common receive request attributes. */ + req->type = CXIP_REQ_RECV; + req->cb = cxip_recv_cb; + req->recv.rxc = rxc; + req->recv.recv_buf = buf; + req->recv.recv_md = recv_md; + req->recv.ulen = len; + dlist_init(&req->recv.children); + dlist_init(&req->recv.rxc_entry); + + ofi_atomic_inc32(&rxc->orx_reqs); + *cxip_req = req; + + return FI_SUCCESS; + +err_free_request: + cxip_evtq_req_free(req); + + return ret; +} + +static void cxip_recv_req_free(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + + assert(req->type == CXIP_REQ_RECV); + assert(dlist_empty(&req->recv.children)); + assert(dlist_empty(&req->recv.rxc_entry)); + + ofi_atomic_dec32(&rxc->orx_reqs); + + if (req->recv.recv_md) + cxip_unmap(req->recv.recv_md); + + cxip_evtq_req_free(req); +} + +/* + * recv_req_event_success() - Generate successful receive event completions. 
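+ *
+ * With FI_SOURCE, the initiator is translated to an FI address for the CQ
+ * entry. If the lookup fails and FI_SOURCE_ERR is enabled, the completion
+ * is instead reported as an FI_EADDRNOTAVAIL error with the raw source
+ * address placed in err_data. A consumer would typically recover it along
+ * these lines (hedged sketch; "cq", "av", "addr_buf" and "fi_addr" are
+ * illustrative names):
+ *
+ *   struct fi_cq_err_entry e = { .err_data = addr_buf,
+ *                                .err_data_size = sizeof(addr_buf) };
+ *   fi_cq_readerr(cq, &e, 0);
+ *   if (e.err == FI_EADDRNOTAVAIL)
+ *           fi_av_insert(av, e.err_data, 1, &fi_addr, 0, NULL);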
+ */ +static inline int recv_req_event_success(struct cxip_rxc *rxc, + struct cxip_req *req) +{ + int ret; + fi_addr_t src_addr; + struct cxip_addr *addr; + + if (req->recv.rxc->attr.caps & FI_SOURCE) { + src_addr = recv_req_src_addr(req); + if (src_addr != FI_ADDR_NOTAVAIL || + !(rxc->attr.caps & FI_SOURCE_ERR)) + return cxip_cq_req_complete_addr(req, src_addr); + + addr = calloc(1, sizeof(*addr)); + if (!addr) + return -FI_ENOMEM; + + addr->nic = CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator); + addr->pid = CXI_MATCH_ID_PID(rxc->pid_bits, + req->recv.initiator); + + src_addr = cxip_av_lookup_auth_key_fi_addr(rxc->ep_obj->av, + req->recv.vni); + + ret = cxip_cq_req_error(req, 0, FI_EADDRNOTAVAIL, req->recv.rc, + addr, sizeof(*addr), src_addr); + + free(addr); + } else { + ret = cxip_cq_req_complete(req); + } + + return ret; +} + +/* + * recv_req_report() - Report the completion of a receive operation. + */ +static void recv_req_report(struct cxip_req *req) +{ + int ret; + int err; + int success_event = (req->flags & FI_COMPLETION); + struct cxip_rxc *rxc = req->recv.rxc; + ssize_t truncated = req->recv.rlen - req->data_len; + + /* data_len (i.e. mlength) should NEVER be greater than rlength. */ + assert(truncated >= 0); + + req->flags &= (FI_MSG | FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA); + + if (req->recv.parent) { + struct cxip_req *parent = req->recv.parent; + bool unlinked = false; + + parent->recv.mrecv_bytes += req->data_len; + RXC_DBG(rxc, + "Putting %lu mrecv bytes (req: %p consumed: %lu auto_unlinked: %u unlink_bytes: %lu addr: %#lx ulen=%u min_free=%lu hw_offloaded=%u)\n", + req->data_len, parent, parent->recv.mrecv_bytes, + parent->recv.auto_unlinked, parent->recv.mrecv_unlink_bytes, + req->buf, parent->recv.ulen, rxc->min_multi_recv, + parent->recv.hw_offloaded); + + /* Handle mrecv edge case. If all unexpected headers were + * onloaded, the entire mrecv buffer may be matched against the + * sw_ux_list list before being offloaded to HW. Detect this + * case. 
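+		 * If the buffer was offloaded to hardware, the parent is
+		 * freed (and FI_MULTI_RECV reported) once the auto-unlink
+		 * byte count has been consumed; otherwise once the remaining
+		 * space drops below min_multi_recv.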
+ */ + if (parent->recv.hw_offloaded) { + if (parent->recv.auto_unlinked && + parent->recv.mrecv_bytes == parent->recv.mrecv_unlink_bytes) + unlinked = true; + } else { + if ((parent->recv.ulen - parent->recv.mrecv_bytes) < rxc->min_multi_recv) + unlinked = true; + } + + if (unlinked) { + RXC_DBG(rxc, "Freeing parent: %p\n", req->recv.parent); + cxip_recv_req_free(req->recv.parent); + + req->flags |= FI_MULTI_RECV; + } + } + + if (req->recv.rc == C_RC_OK && !truncated) { + RXC_DBG(rxc, "Request success: %p\n", req); + + if (success_event) { + ret = recv_req_event_success(rxc, req); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, + "Failed to report completion: %d\n", + ret); + } + + if (req->recv.cntr) { + ret = cxip_cntr_mod(req->recv.cntr, 1, false, false); + if (ret) + RXC_WARN(rxc, "cxip_cntr_mod returned: %d\n", + ret); + } + } else { + if (req->recv.unlinked) { + err = FI_ECANCELED; + if (req->recv.multi_recv) + req->flags |= FI_MULTI_RECV; + RXC_DBG(rxc, "Request canceled: %p (err: %d)\n", req, + err); + } else if (truncated) { + err = FI_ETRUNC; + RXC_DBG(rxc, "Request truncated: %p (err: %d)\n", req, + err); + } else if (req->recv.flags & FI_PEEK) { + req->data_len = 0; + err = FI_ENOMSG; + RXC_DBG(rxc, "Peek request not found: %p (err: %d)\n", + req, err); + } else { + err = proverr2errno(req->recv.rc); + + RXC_WARN(rxc, "Request error: %p (err: %d, %s)\n", req, + err, cxi_rc_to_str(req->recv.rc)); + } + + ret = cxip_cq_req_error(req, truncated, err, req->recv.rc, + NULL, 0, FI_ADDR_UNSPEC); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Failed to report error: %d\n", ret); + + if (req->recv.cntr) { + ret = cxip_cntr_mod(req->recv.cntr, 1, false, true); + if (ret) + RXC_WARN(rxc, "cxip_cntr_mod returned: %d\n", + ret); + } + } +} + +/* + * recv_req_tgt_event() - Update common receive request fields + * + * Populate a receive request with information found in all receive event + * types. + */ +static void +recv_req_tgt_event(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->recv.rxc; + union cxip_match_bits mb = { + .raw = event->tgt_long.match_bits + }; + uint32_t init = event->tgt_long.initiator.initiator.process; + + assert(event->hdr.event_type == C_EVENT_PUT || + event->hdr.event_type == C_EVENT_PUT_OVERFLOW || + event->hdr.event_type == C_EVENT_RENDEZVOUS || + event->hdr.event_type == C_EVENT_SEARCH); + + /* Rendezvous events contain the wrong match bits and do not provide + * initiator context for symmetric AVs. + */ + if (event->hdr.event_type != C_EVENT_RENDEZVOUS) { + req->tag = mb.tag; + req->recv.initiator = init; + + if (mb.cq_data) + req->flags |= FI_REMOTE_CQ_DATA; + } + + /* remote_offset is not provided in Overflow events. */ + if (event->hdr.event_type != C_EVENT_PUT_OVERFLOW) + req->recv.src_offset = event->tgt_long.remote_offset; + + /* For rendezvous, initiator is the RGet DFA. */ + if (event->hdr.event_type == C_EVENT_RENDEZVOUS) { + init = cxi_dfa_to_init(init, rxc->pid_bits); + req->recv.rget_nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); + req->recv.rget_pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); + } + + /* Only need one event to set remaining fields. */ + if (req->recv.tgt_event) + return; + req->recv.tgt_event = true; + + /* VNI is needed to support FI_AV_AUTH_KEY. */ + req->recv.vni = event->tgt_long.vni; + + /* rlen is used to detect truncation. */ + req->recv.rlen = event->tgt_long.rlength; + + /* RC is used when generating completion events. 
*/ + req->recv.rc = cxi_tgt_event_rc(event); + + /* Header data is provided in all completion events. */ + req->data = event->tgt_long.header_data; + + /* rdzv_id is used to correlate Put and Put Overflow events when using + * offloaded RPut. Otherwise, Overflow buffer start address is used to + * correlate events. + */ + if (event->tgt_long.rendezvous) + req->recv.rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + else + req->recv.oflow_start = event->tgt_long.start; + + req->recv.rdzv_lac = mb.rdzv_lac; + req->recv.rdzv_proto = mb.rdzv_proto; + req->recv.rdzv_mlen = event->tgt_long.mlength; + + /* data_len must be set uniquely for each protocol! */ +} + +/* + * rdzv_mrecv_req_lookup() - Search for a matching rendezvous, multi-receive + * child request. + */ +static int rdzv_mrecv_req_lookup(struct cxip_req *req, + const union c_event *event, + uint32_t *initiator, uint32_t *rdzv_id, + bool perform_event_checks, + struct cxip_req **req_out) +{ + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_req *child_req; + union cxip_match_bits mb; + uint32_t ev_init; + uint32_t ev_rdzv_id; + struct cxip_addr caddr; + int ret; + int i; + + if (event->hdr.event_type == C_EVENT_REPLY) { + struct cxi_rdzv_user_ptr *user_ptr; + + /* Events for software-issued operations will return a + * reference to the correct request. + */ + if (!event->init_short.rendezvous) { + *req_out = req; + return FI_SUCCESS; + } + + user_ptr = (struct cxi_rdzv_user_ptr *) + &event->init_short.user_ptr; + + ev_init = CXI_MATCH_ID(rxc->pid_bits, user_ptr->src_pid, + user_ptr->src_nid); + ev_rdzv_id = user_ptr->rendezvous_id; + } else if (event->hdr.event_type == C_EVENT_RENDEZVOUS) { + struct cxip_rxc *rxc = req->recv.rxc; + uint32_t dfa = event->tgt_long.initiator.initiator.process; + + ev_init = cxi_dfa_to_init(dfa, rxc->pid_bits); + mb.raw = event->tgt_long.match_bits; + + ev_rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + } else { + ev_init = event->tgt_long.initiator.initiator.process; + mb.raw = event->tgt_long.match_bits; + + ev_rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + } + + if ((event->hdr.event_type == C_EVENT_PUT_OVERFLOW || + event->hdr.event_type == C_EVENT_PUT) && + rxc->ep_obj->av->symmetric) { + ret = cxip_av_lookup_addr(rxc->ep_obj->av, + CXI_MATCH_ID_EP(rxc->pid_bits, ev_init), + &caddr); + if (ret != FI_SUCCESS) + RXC_FATAL(rxc, "Lookup of FI addr 0x%x: failed %d\n", + ev_init, ret); + + ev_init = CXI_MATCH_ID(rxc->pid_bits, + CXI_MATCH_ID_PID(rxc->pid_bits, ev_init), + caddr.nic); + } + + *initiator = ev_init; + *rdzv_id = ev_rdzv_id; + + /* Events for hardware-issued operations will return a rendezvous_id + * and initiator data. Use these fields to find a matching child + * request. + */ + dlist_foreach_container(&req->recv.children, + struct cxip_req, child_req, + recv.children) { + if (child_req->recv.rdzv_id == ev_rdzv_id && + child_req->recv.rdzv_initiator == ev_init) { + + if (perform_event_checks) { + /* There is an edge case where source may reuse the + * same rendezvous ID before the target has had time to + * process the C_EVENT_REPLY. If this is the case, an + * incorrect child_req match would occur. To prevent + * this, the events seen are stored with the child_req. + * If a redundant event is seen, this is a sign + * C_EVENT_REPLY needs to be process. Thus, return + * -FI_EAGAIN to process TX EQ. 
+ */ + for (i = 0; i < child_req->recv.rdzv_events; i++) { + if (child_req->recv.rdzv_event_types[i] == event->hdr.event_type) { + assert(event->hdr.event_type != C_EVENT_REPLY); + return -FI_EAGAIN; + } + } + } + + *req_out = child_req; + return FI_SUCCESS; + } + } + + return -FI_ENOMSG; +} + +/* + * mrecv_req_dup() - Create a new request using an event targeting a + * multi-recv buffer. + * + * @mrecv_req: A previously posted multi-recv buffer request. + */ +static struct cxip_req *mrecv_req_dup(struct cxip_req *mrecv_req) +{ + struct cxip_rxc *rxc = mrecv_req->recv.rxc; + struct cxip_req *req; + + req = cxip_evtq_req_alloc(&rxc->rx_evtq, 0, rxc); + if (!req) + return NULL; + + /* Duplicate the parent request. */ + req->cb = mrecv_req->cb; + req->context = mrecv_req->context; + req->flags = mrecv_req->flags; + req->type = mrecv_req->type; + req->recv = mrecv_req->recv; + + /* Update fields specific to this Send */ + req->recv.parent = mrecv_req; + + /* Start pointer and data_len must be set elsewhere! */ + + return req; +} + +/* + * rdzv_mrecv_req_event() - Look up a multi-recieve child request using an + * event and multi-recv request. + * + * Each rendezvous Put transaction targeting a multi-receive buffer is tracked + * using a separate child request. A child request is uniquely identified by + * rendezvous ID and source address. Return a reference to a child request + * which matches the event. Allocate a new child request, if necessary. + */ +static struct cxip_req * +rdzv_mrecv_req_event(struct cxip_req *mrecv_req, const union c_event *event) +{ + uint32_t ev_init; + uint32_t ev_rdzv_id; + struct cxip_req *req; + struct cxip_rxc *rxc __attribute__((unused)) = mrecv_req->recv.rxc; + int ret; + + assert(event->hdr.event_type == C_EVENT_REPLY || + event->hdr.event_type == C_EVENT_PUT || + event->hdr.event_type == C_EVENT_PUT_OVERFLOW || + event->hdr.event_type == C_EVENT_RENDEZVOUS); + + ret = rdzv_mrecv_req_lookup(mrecv_req, event, &ev_init, &ev_rdzv_id, + true, &req); + switch (ret) { + case -FI_EAGAIN: + return NULL; + + case -FI_ENOMSG: + req = mrecv_req_dup(mrecv_req); + if (!req) + return NULL; + + /* Store event initiator and rdzv_id for matching. */ + req->recv.rdzv_id = ev_rdzv_id; + req->recv.rdzv_initiator = ev_init; + + dlist_insert_tail(&req->recv.children, + &mrecv_req->recv.children); + + RXC_DBG(rxc, "New child: %p parent: %p event: %s\n", req, + mrecv_req, cxi_event_to_str(event)); + return req; + + case FI_SUCCESS: + RXC_DBG(rxc, "Found child: %p parent: %p event: %s\n", req, + mrecv_req, cxi_event_to_str(event)); + return req; + + default: + RXC_FATAL(rxc, "Unhandled rdzv_mrecv_req_lookup %d\n", ret); + } +} + +/* + * rdzv_recv_req_event() - Count a rendezvous event. + * + * Call for each target rendezvous event generated on a user receive buffer. + * After three events, a rendezvous receive is complete. The three events could + * be either: + * -Put, Rendezvous, Reply -- or + * -Put Overflow, Rendezvous, Reply + * + * For a restricted Get there is a fourth event, the ACK of the notify. + * + * In either case, the events could be generated in any order. As soon as the + * events expected are processed, the request is complete. + */ +static void rdzv_recv_req_event(struct cxip_req *req, enum c_event_type type) +{ + int total_events = req->recv.done_notify ? 
4 : 3; + + req->recv.rdzv_event_types[req->recv.rdzv_events] = type; + + if (++req->recv.rdzv_events == total_events) { + if (req->recv.multi_recv) { + dlist_remove(&req->recv.children); + recv_req_report(req); + cxip_evtq_req_free(req); + } else { + recv_req_report(req); + cxip_recv_req_free(req); + } + } +} + +/* + * oflow_req_put_bytes() - Consume bytes in the Overflow buffer. + * + * An Overflow buffer is freed when all bytes are consumed by the NIC. + * + * Caller must hold ep_obj->lock. + */ +static void oflow_req_put_bytes(struct cxip_req *req, size_t bytes) +{ + struct cxip_ptelist_buf *oflow_buf = req->req_ctx; + + /* Non-zero length UX messages with 0 eager portion do not + * have a dependency on the oflow buffer. + */ + if (bytes == 0) + return; + + oflow_buf->cur_offset += bytes; + + RXC_DBG(oflow_buf->rxc, "Putting %lu bytes (%lu/%lu): %p\n", bytes, + oflow_buf->cur_offset, oflow_buf->unlink_length, req); + + if (oflow_buf->cur_offset == oflow_buf->unlink_length) + cxip_ptelist_buf_consumed(oflow_buf); +} + +/* + * issue_rdzv_get() - Perform a Get to pull source data from the Initiator of a + * Send operation. + */ +static int issue_rdzv_get(struct cxip_req *req) +{ + struct c_full_dma_cmd cmd = {}; + uint64_t local_addr; + uint64_t rem_offset; + uint32_t align_bytes; + uint32_t mlen; + struct cxip_rxc *rxc = req->recv.rxc; + uint32_t pid_idx = rxc->domain->iface->dev->info.rdzv_get_idx; + uint8_t idx_ext; + union cxip_match_bits mb = {}; + int ret; + union c_fab_addr dfa; + + if (req->recv.rdzv_proto == CXIP_RDZV_PROTO_ALT_WRITE) + RXC_WARN_ONCE(rxc, "Rendezvous protocol: %s not implemented\n", + cxip_rdzv_proto_to_str(req->recv.rdzv_proto)); + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_GET; + cmd.lac = req->recv.recv_md->md->lac; + cmd.event_send_disable = 1; + + /* Must deliver to TX event queue */ + cmd.eq = cxip_evtq_eqn(&rxc->ep_obj->txc.tx_evtq); + + if (req->recv.rdzv_proto == CXIP_RDZV_PROTO_ALT_READ) { + pid_idx = CXIP_PTL_IDX_RDZV_RESTRICTED(req->recv.rdzv_lac); + cmd.restricted = 1; + req->recv.done_notify = true; + } else { + pid_idx = rxc->domain->iface->dev->info.rdzv_get_idx; + mb.rdzv_lac = req->recv.rdzv_lac; + mb.rdzv_id_lo = req->recv.rdzv_id; + mb.rdzv_id_hi = req->recv.rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + } + cmd.match_bits = mb.raw; + + cmd.user_ptr = (uint64_t)req; + cxi_build_dfa(req->recv.rget_nic, req->recv.rget_pid, rxc->pid_bits, + pid_idx, &dfa, &idx_ext); + cmd.dfa = dfa; + cmd.index_ext = idx_ext; + + local_addr = CXI_VA_TO_IOVA(req->recv.recv_md->md, + req->recv.recv_buf); + local_addr += req->recv.rdzv_mlen; + + rem_offset = req->recv.src_offset; + mlen = req->recv.rdzv_mlen; + + RXC_DBG(rxc, "SW RGet addr: 0x%" PRIx64 " len %" PRId64 + " rem_off: %" PRId64 " restricted: %d\n", local_addr, + req->data_len - req->recv.rdzv_mlen, rem_offset, + cmd.restricted); + + /* Align mask will be non-zero if local DMA address cache-line + * alignment is desired. 
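+	 *
+	 * Worked example with hypothetical numbers: assume a 64-byte
+	 * requirement (rget_align_mask == 0x3f), local_addr == 0x10000048 and
+	 * an eager length (mlen) of 2048 bytes:
+	 *
+	 *   align_bytes = 0x10000048 & 0x3f  ->  0x08
+	 *   local_addr  = 0x10000048 - 0x08  ->  0x10000040  (64-byte aligned)
+	 *   rem_offset -= 0x08;   mlen -= 0x08;
+	 *
+	 * Backing the local address and the remote offset up together keeps
+	 * the transfer in step; the few bytes re-fetched at the front land
+	 * over the tail of the eager data that was already delivered.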
+ */ + if (mlen >= rxc->rget_align_mask) { + align_bytes = local_addr & rxc->rget_align_mask; + local_addr -= align_bytes; + rem_offset -= align_bytes; + mlen -= align_bytes; + } + + if (req->data_len < mlen) + cmd.request_len = 0; + else + cmd.request_len = req->data_len - mlen; + + cmd.local_addr = local_addr; + cmd.remote_offset = rem_offset; + + RXC_DBG(rxc, "Aligned addr: 0x%" PRIx64 " len %d rem_off %" PRId64 "\n", + (uint64_t)cmd.local_addr, cmd.request_len, + (uint64_t)cmd.remote_offset); + + /* Issue Rendezvous Get command */ + ret = cxi_cq_emit_dma(rxc->tx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_DBG(rxc, "Failed to queue GET command: %d\n", ret); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->tx_cmdq->dev_cmdq); + + return FI_SUCCESS; +} + +/* + * cxip_notify_match_cb() - Callback function for match complete notifiction + * Ack events. + */ +static int +cxip_notify_match_cb(struct cxip_req *req, const union c_event *event) +{ + RXC_DBG(req->recv.rxc, "Match complete: %p\n", req); + + recv_req_report(req); + + if (req->recv.multi_recv) + cxip_evtq_req_free(req); + else + cxip_recv_req_free(req); + + return FI_SUCCESS; +} + +/* + * cxip_notify_match() - Notify the initiator of a Send that the match is + * complete at the target. + * + * A transaction ID corresponding to the matched Send request is sent back to + * the initiator in the match_bits field of a zero-byte Put. + */ +static int cxip_notify_match(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->recv.rxc; + uint32_t pid_idx = rxc->domain->iface->dev->info.rdzv_get_idx; + uint32_t init = event->tgt_long.initiator.initiator.process; + uint32_t nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); + uint32_t pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); + union c_fab_addr dfa; + uint8_t idx_ext; + union cxip_match_bits mb = { + .le_type = CXIP_LE_TYPE_ZBP, + }; + union cxip_match_bits event_mb; + union c_cmdu cmd = {}; + int ret; + + event_mb.raw = event->tgt_long.match_bits; + mb.tx_id = event_mb.tx_id; + + cxi_build_dfa(nic, pid, rxc->pid_bits, pid_idx, &dfa, &idx_ext); + + cmd.c_state.event_send_disable = 1; + cmd.c_state.index_ext = idx_ext; + cmd.c_state.eq = cxip_evtq_eqn(&rxc->ep_obj->txc.tx_evtq); + + ret = cxip_cmdq_emit_c_state(rxc->tx_cmdq, &cmd.c_state); + if (ret) { + RXC_DBG(rxc, "Failed to issue C_STATE command: %d\n", ret); + return ret; + } + + memset(&cmd.idc_msg, 0, sizeof(cmd.idc_msg)); + cmd.idc_msg.dfa = dfa; + cmd.idc_msg.match_bits = mb.raw; + + cmd.idc_msg.user_ptr = (uint64_t)req; + + ret = cxi_cq_emit_idc_msg(rxc->tx_cmdq->dev_cmdq, &cmd.idc_msg, + NULL, 0); + if (ret) { + RXC_DBG(rxc, "Failed to write IDC: %d\n", ret); + + /* Return error according to Domain Resource Management + */ + return -FI_EAGAIN; + } + + req->cb = cxip_notify_match_cb; + + cxi_cq_ring(rxc->tx_cmdq->dev_cmdq); + + RXC_DBG(rxc, "Queued match completion message: %p\n", req); + + return FI_SUCCESS; +} + +/* + * mrecv_req_oflow_event() - Set start and length uniquely for an unexpected + * mrecv request. + * + * Overflow buffer events contain a start address representing the offset into + * the Overflow buffer where data was written. When a unexpected header is + * later matched to a multi-receive buffer in the priority list, The Put + * Overflow event does not contain the offset into the Priority list buffer + * where data should be copied. Software must track the the Priority list + * buffer offset using ordered Put Overflow events. 
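+ *
+ * Sketch of the bookkeeping below, with hypothetical sizes: a 16 KiB
+ * multi-recv buffer sees three unexpected Puts of 4 KiB, 4 KiB and 10 KiB.
+ * Processing their Put Overflow events in order gives:
+ *
+ *   start_offset:    0 -> 4096 -> 8192 -> 16384
+ *   bytes credited:     4096    4096    8192
+ *
+ * The third message is clamped to the 8 KiB that remain,
+ * MIN(mrecv_bytes_remaining, rlen), before start_offset advances.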
+ */ +static int mrecv_req_put_bytes(struct cxip_req *req, uint32_t rlen) +{ + uintptr_t mrecv_head; + uintptr_t mrecv_tail; + size_t mrecv_bytes_remaining; + + mrecv_head = (uintptr_t)req->recv.recv_buf + req->recv.start_offset; + mrecv_tail = (uintptr_t)req->recv.recv_buf + req->recv.ulen; + mrecv_bytes_remaining = mrecv_tail - mrecv_head; + + rlen = MIN(mrecv_bytes_remaining, rlen); + req->recv.start_offset += rlen; + + return rlen; +} + +/* cxip_recv_req_set_rget_info() - Set RGet NIC and PID fields. Used for + * messages where a rendezvous event will not be generated. Current usages are + * for the eager long protocol and rendezvous operations which have unexpected + * headers onloaded due to flow control. + */ +static void cxip_recv_req_set_rget_info(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + int ret; + + if (rxc->ep_obj->av->symmetric) { + struct cxip_addr caddr; + + RXC_DBG(rxc, "Translating initiator: %x, req: %p\n", + req->recv.initiator, req); + + ret = cxip_av_lookup_addr(rxc->ep_obj->av, + CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator), + &caddr); + if (ret != FI_SUCCESS) + RXC_FATAL(rxc, "Failed to look up FI addr: %d\n", ret); + + req->recv.rget_nic = caddr.nic; + } else { + req->recv.rget_nic = CXI_MATCH_ID_EP(rxc->pid_bits, + req->recv.initiator); + } + + req->recv.rget_pid = CXI_MATCH_ID_PID(rxc->pid_bits, + req->recv.initiator); +} + +/* + * cxip_ux_send() - Progress an unexpected Send after receiving matching Put + * and Put and Put Overflow events. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_ux_send(struct cxip_req *match_req, struct cxip_req *oflow_req, + const union c_event *put_event, uint64_t mrecv_start, + uint32_t mrecv_len, bool remove_recv_entry) +{ + struct cxip_ptelist_buf *buf; + void *oflow_va; + size_t oflow_bytes; + union cxip_match_bits mb; + ssize_t ret; + struct cxip_req *parent_req = match_req; + + assert(match_req->type == CXIP_REQ_RECV); + + if (match_req->recv.multi_recv) { + if (put_event->tgt_long.rendezvous) + match_req = rdzv_mrecv_req_event(match_req, put_event); + else + match_req = mrecv_req_dup(match_req); + if (!match_req) + return -FI_EAGAIN; + + /* Set start and length uniquely for an unexpected + * mrecv request. + */ + match_req->recv.recv_buf = (uint8_t *) + match_req->recv.parent->recv.recv_buf + + mrecv_start; + match_req->buf = (uint64_t)match_req->recv.recv_buf; + match_req->data_len = mrecv_len; + } else { + match_req->data_len = put_event->tgt_long.rlength; + if (match_req->data_len > match_req->recv.ulen) + match_req->data_len = match_req->recv.ulen; + } + + recv_req_tgt_event(match_req, put_event); + buf = oflow_req->req_ctx; + oflow_va = (void *)CXI_IOVA_TO_VA(buf->md->md, + put_event->tgt_long.start); + + /* Copy data out of overflow buffer. */ + oflow_bytes = MIN(put_event->tgt_long.mlength, match_req->data_len); + cxip_copy_to_md(match_req->recv.recv_md, match_req->recv.recv_buf, + oflow_va, oflow_bytes); + + if (oflow_req->type == CXIP_REQ_OFLOW) + oflow_req_put_bytes(oflow_req, put_event->tgt_long.mlength); + + /* Remaining unexpected rendezvous processing is deferred until RGet + * completes. + */ + if (put_event->tgt_long.rendezvous) { + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + rdzv_recv_req_event(match_req, put_event->hdr.event_type); + return FI_SUCCESS; + } + + mb.raw = put_event->tgt_long.match_bits; + + /* Check if the initiator requires match completion guarantees. + * If so, notify the initiator that the match is now complete. 
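+	 * (The request for this is carried in the match_comp bit of the Put's
+	 * match bits.  The notification itself is the zero-byte Put built by
+	 * cxip_notify_match(), which echoes the initiator's tx_id back so that
+	 * cxip_rdzv_pte_zbp_cb() on the sending side can look up and retire
+	 * the original Send.)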
+ * Delay the Receive event until the notification is complete. + */ + if (mb.match_comp) { + ret = cxip_notify_match(match_req, put_event); + if (ret != FI_SUCCESS) { + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + + return -FI_EAGAIN; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + return FI_SUCCESS; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + recv_req_report(match_req); + + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + else + cxip_recv_req_free(match_req); + + return FI_SUCCESS; +} + +/* + * cxip_ux_send_zb() - Progress an unexpected zero-byte Send after receiving + * a Put Overflow event. + * + * Zero-byte Put events for unexpected Sends are discarded. Progress the Send + * using only the Overflow event. There is no Send data to be copied out. + */ +static int cxip_ux_send_zb(struct cxip_req *match_req, + const union c_event *oflow_event, + uint64_t mrecv_start, bool remove_recv_entry) +{ + union cxip_match_bits mb; + int ret; + struct cxip_req *parent_req = match_req; + + assert(!oflow_event->tgt_long.rlength); + + if (match_req->recv.multi_recv) { + match_req = mrecv_req_dup(match_req); + if (!match_req) + return -FI_EAGAIN; + + match_req->buf = (uint64_t) + match_req->recv.parent->recv.recv_buf + + mrecv_start; + } + + recv_req_tgt_event(match_req, oflow_event); + + match_req->data_len = 0; + + mb.raw = oflow_event->tgt_long.match_bits; + + /* Check if the initiator requires match completion guarantees. + * If so, notify the initiator that the match is now complete. + * Delay the Receive event until the notification is complete. + */ + if (mb.match_comp) { + ret = cxip_notify_match(match_req, oflow_event); + if (ret != FI_SUCCESS) { + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + + return -FI_EAGAIN; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + return FI_SUCCESS; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + recv_req_report(match_req); + + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + else + cxip_recv_req_free(match_req); + + return FI_SUCCESS; +} + +static bool cxip_ux_is_onload_complete(struct cxip_req *req) +{ + return !req->search.puts_pending && req->search.complete; +} + +/* + * recv_req_peek_complete - FI_PEEK operation completed + */ +static void recv_req_peek_complete(struct cxip_req *req, + struct cxip_ux_send *ux_send) +{ + /* If no unexpected message match we need to return original + * tag in the completion. + */ + if (req->recv.rc != C_RC_OK) + req->tag = req->recv.tag; + else if (req->recv.flags & FI_CLAIM) + ((struct fi_context *)req->context)->internal[0] = ux_send; + + /* Avoid truncation processing, peek does not receive data */ + req->data_len = req->recv.rlen; + + recv_req_report(req); + + cxip_recv_req_free(req); +} + +/* Caller must hold ep_obj->lock. */ +static int cxip_oflow_process_put_event(struct cxip_rxc *rxc, + struct cxip_req *req, + const union c_event *event) +{ + int ret; + struct cxip_deferred_event *def_ev; + struct cxip_req *save_req; + bool matched; + + def_ev = match_put_event(rxc, req, event, &matched); + if (!matched) + return !def_ev ? 
-FI_EAGAIN : FI_SUCCESS; + + RXC_DBG(rxc, "Overflow beat Put event: %p\n", def_ev->req); + + if (def_ev->ux_send) { + /* UX Send was onloaded for one of these reasons: + * 1) Flow control + * 2) ULE was claimed by a FI_CLAIM + */ + save_req = def_ev->req; + def_ev->ux_send->req = req; + def_ev->ux_send->put_ev = *event; + + if (def_ev->ux_send->claimed) { + recv_req_tgt_event(save_req, &def_ev->ux_send->put_ev); + recv_req_peek_complete(save_req, def_ev->ux_send); + RXC_DBG(rxc, "FI_CLAIM put complete: %p, ux_send %p\n", + save_req, def_ev->ux_send); + goto done; + } else { + def_ev->req->search.puts_pending--; + RXC_DBG(rxc, "put complete: %p\n", def_ev->req); + } + + if (cxip_ux_is_onload_complete(def_ev->req)) + cxip_ux_onload_complete(def_ev->req); + + } else { + ret = cxip_ux_send(def_ev->req, req, event, def_ev->mrecv_start, + def_ev->mrecv_len, false); + if (ret != FI_SUCCESS) + return -FI_EAGAIN; + } + +done: + free_put_event(rxc, def_ev); + + return FI_SUCCESS; +} + +/* Caller must hold ep_obj->lock */ +static int cxip_recv_pending_ptlte_disable(struct cxip_rxc *rxc, + bool check_fc) +{ + int ret; + + assert(rxc->state == RXC_ENABLED || + rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_FLOW_CONTROL || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED || + rxc->state == RXC_PENDING_PTLTE_DISABLE); + + /* Having flow control triggered while in flow control is a sign of LE + * exhaustion. Software endpoint mode is required to scale past hardware + * LE limit. + */ + if (check_fc && rxc->state == RXC_FLOW_CONTROL) + RXC_FATAL(rxc, FC_SW_LE_MSG_FATAL); + + if (rxc->state != RXC_ENABLED) + return FI_SUCCESS; + + RXC_DBG(rxc, "Manual request PTLTE_DISABLED\n"); + + ret = cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, C_PTLTE_DISABLED, + 0); + if (ret == FI_SUCCESS) + rxc->state = RXC_PENDING_PTLTE_DISABLE; + + return ret; +} + +/* cxip_rxp_check_le_usage_hybrid_preempt() - Examines LE Pool usage and forces + * a preemptive hardware to software transition if needed. + * + * In cases where the LE pool entry reservation is insufficient to meet request + * list buffers (due to multiple EP sharing an LE Pool or insufficient LE Pool + * reservation value), then enabling the periodic checking of LE allocations + * can be used to force preemptive transitions to software match mode before + * resources are exhausted or so depleted they are starve software managed + * endpoint. The lpe_stat_2 is set to the number of LE pool entries allocated + * to the LE pool and lpe_stat_1 is the current allocation. Skid is required + * as stats are relative to hardware processing, not software processing of + * the event. + * + * Caller should hold ep_obj->lock. 
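+ *
+ * The test below reduces to "more than half of the pool is in use".  With a
+ * hypothetical pool of 2048 LEs (lpe_stat_2 == 2048), the preemptive
+ * transition is requested once the current allocation (lpe_stat_1) exceeds
+ * 1024:
+ *
+ *   lpe_stat_1 > (lpe_stat_2 >> 1)  ->  cxip_recv_pending_ptlte_disable()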
+ */ +static inline bool +cxip_rxp_check_le_usage_hybrid_preempt(struct cxip_rxc *rxc, + const union c_event *event) +{ + if (event->tgt_long.lpe_stat_1 > (event->tgt_long.lpe_stat_2 >> 1) && + rxc->state == RXC_ENABLED) { + if (cxip_recv_pending_ptlte_disable(rxc, false)) + RXC_WARN(rxc, "Force FC failed\n"); + return true; + } + return false; +} + +static int cxip_rxc_check_ule_hybrid_preempt(struct cxip_rxc *rxc) +{ + int ret; + int count; + + if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_unexpected_msg_preemptive == 1) { + count = ofi_atomic_get32(&rxc->orx_hw_ule_cnt); + + if (rxc->state == RXC_ENABLED && count > rxc->attr.size) { + ret = cxip_recv_pending_ptlte_disable(rxc, false); + if (ret == FI_SUCCESS) { + RXC_WARN(rxc, + "Transitioning to SW EP due to too many unexpected messages: posted_count=%u request_size=%lu\n", + ret, rxc->attr.size); + } else { + assert(ret == -FI_EAGAIN); + RXC_WARN(rxc, + "Failed to transition to SW EP: %d\n", + ret); + } + + return ret; + } + } + + return FI_SUCCESS; +} + +/* + * cxip_oflow_cb() - Process an Overflow buffer event. + * + * Overflow buffers are used to land unexpected Send data. Link, Unlink + * and Put events are expected from Overflow buffers. However, Link + * events will only be requested when running in hybrid RX match mode + * with FI_CXI_HYBRID_PREEMPTIVE=1. + * + * An Unlink event indicates that buffer space was exhausted. Overflow buffers + * are configured to use locally managed LEs. When enough Puts match in an + * Overflow buffer, consuming its space, the NIC automatically unlinks the LE. + * An automatic Unlink event is generated before the final Put which caused + * buffer space to become exhausted. + * + * An Unlink event is generated by an Unlink command. Overflow buffers are + * manually unlinked in this way during teardown. When an LE is manually + * unlinked the auto_unlinked field in the corresponding event is zero. In this + * case, the request is freed immediately. + * + * A Put event is generated for each Put that matches the Overflow buffer LE. + * This event indicates that data is available in the Overflow buffer. This + * event must be correlated to a Put Overflow event from a user receive buffer + * LE. The Put Overflow event may arrive before or after the Put event. + * + * When each Put event arrives, check for the existence of a previously posted + * receive buffer which generated a matching Put Overflow event. If such a + * buffer exists, copy data from the Overflow buffer to the user receive + * buffer. Otherwise, store a record of the Put event for matching once a user + * posts a new buffer that matches the unexpected Put. + * + * If data will remain in the Overflow buffer, take a reference to it to + * prevent it from being freed. If an Unlink-Put event is detected, drop a + * reference to the Overflow buffer so it is automatically freed once all user + * data is copied out. + */ +static int cxip_oflow_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_ptelist_buf *oflow_buf = req->req_ctx; + struct cxip_rxc *rxc = oflow_buf->rxc; + int ret = FI_SUCCESS; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* Success events only used with hybrid preemptive */ + if (cxi_event_rc(event) == C_RC_OK) { + + if (!cxip_env.hybrid_preemptive) + return FI_SUCCESS; + + /* Check for possible hybrid mode preemptive + * transitions to software managed mode. 
+ */ + if (cxip_rxp_check_le_usage_hybrid_preempt(rxc, event)) + RXC_WARN(rxc, + "Force preemptive switch to SW EP\n"); + return FI_SUCCESS; + } + + assert(cxi_event_rc(event) == C_RC_NO_SPACE); + + RXC_DBG(rxc, "Oflow LE append failed\n"); + + ret = cxip_recv_pending_ptlte_disable(rxc, true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Force disable failed %d %s\n", + ret, fi_strerror(-ret)); + cxip_ptelist_buf_link_err(oflow_buf, cxi_event_rc(event)); + return ret; + case C_EVENT_UNLINK: + assert(!event->tgt_long.auto_unlinked); + + cxip_ptelist_buf_unlink(oflow_buf); + return FI_SUCCESS; + case C_EVENT_PUT: + /* Put event handling is complicated. Handle below. */ + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); + + if (event->tgt_long.auto_unlinked) { + + oflow_buf->unlink_length = event->tgt_long.start - + CXI_VA_TO_IOVA(oflow_buf->md->md, oflow_buf->data) + + event->tgt_long.mlength; + + ofi_atomic_dec32(&oflow_buf->pool->bufs_linked); + + RXC_DBG(rxc, "Oflow auto unlink buf %p, linked %u\n", oflow_buf, + ofi_atomic_get32(&oflow_buf->pool->bufs_linked)); + + /* Replace the eager overflow buffer. */ + cxip_ptelist_buf_replenish(rxc->oflow_list_bufpool, false); + } + + ret = cxip_rxc_check_ule_hybrid_preempt(rxc); + if (ret) + goto err_dec_ule; + + /* Drop all unexpected 0-byte Put events. */ + if (!event->tgt_long.rlength) + return FI_SUCCESS; + + /* Handle Put events */ + ret = cxip_oflow_process_put_event(rxc, req, event); + if (ret) + goto err_dec_ule; + + return FI_SUCCESS; + +err_dec_ule: + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + return ret; +} + +static void report_send_completion(struct cxip_req *req, bool sw_cntr); +static void rdzv_send_req_event(struct cxip_req *req); + +/* + * cxip_rdzv_pte_zbp_cb() - Process zero-byte Put events. + * + * Zero-byte Puts (ZBP) are used to transfer small messages without consuming + * buffers outside of the EQ. ZBPs are currently only used for match complete + * messages. 
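+ *
+ * Summary of the match-complete round trip as implemented in this file: the
+ * initiator allocates a tx_id and sets match_comp in the Send's match bits;
+ * the target's cxip_notify_match() echoes that tx_id back in the match bits
+ * of a zero-byte Put; that Put arrives at this callback, where
+ * cxip_tx_id_lookup() recovers the original Send request and its completion
+ * is reported.  A ZBP with rdzv_done set is used the same way by
+ * cxip_rdzv_done_notify() to finish the restricted-get rendezvous protocol.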
+ */ +int cxip_rdzv_pte_zbp_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rdzv_pte *rdzv_pte = req->req_ctx; + struct cxip_txc *txc = rdzv_pte->txc; + struct cxip_req *put_req; + union cxip_match_bits mb; + int event_rc = cxi_event_rc(event); + int rdzv_id; + int ret; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + if (event_rc == C_RC_OK) + ofi_atomic_inc32(&rdzv_pte->le_linked_success_count); + else + ofi_atomic_inc32(&rdzv_pte->le_linked_failure_count); + return FI_SUCCESS; + + case C_EVENT_PUT: + mb.raw = event->tgt_long.match_bits; + + if (mb.rdzv_done) { + rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + mb.rdzv_id_lo; + put_req = cxip_rdzv_id_lookup(txc, rdzv_id); + if (!put_req) { + TXC_WARN(txc, "Failed to find RDZV ID: %d\n", + rdzv_id); + return FI_SUCCESS; + } + + if (event_rc != C_RC_OK) + TXC_WARN(txc, "RDZV Done error: %p rc: %s\n", + put_req, cxi_rc_to_str(event_rc)); + else + TXC_DBG(txc, "RDZV Done ACK: %p rc: %s\n", + put_req, cxi_rc_to_str(event_rc)); + + put_req->send.rc = event_rc; + rdzv_send_req_event(put_req); + + return FI_SUCCESS; + } + + /* Match complete */ + put_req = cxip_tx_id_lookup(txc, mb.tx_id); + if (!put_req) { + TXC_WARN(txc, "Failed to find TX ID: %d\n", mb.tx_id); + return FI_SUCCESS; + } + + event_rc = cxi_tgt_event_rc(event); + if (event_rc != C_RC_OK) + TXC_WARN(txc, "ZBP error: %p rc: %s\n", put_req, + cxi_rc_to_str(event_rc)); + else + TXC_DBG(txc, "ZBP received: %p rc: %s\n", put_req, + cxi_rc_to_str(event_rc)); + + ret = cxip_send_req_dequeue(put_req->send.txc, put_req); + if (ret != FI_SUCCESS) + return ret; + + cxip_tx_id_free(txc, mb.tx_id); + + /* The unexpected message has been matched. Generate a + * completion event. The ZBP event is guaranteed to arrive + * after the eager Send Ack, so the transfer is always done at + * this point. + * + * If MATCH_COMPLETE was requested, software must manage + * counters. + */ + report_send_completion(put_req, true); + + ofi_atomic_dec32(&put_req->send.txc->otx_reqs); + cxip_evtq_req_free(put_req); + + return FI_SUCCESS; + + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_oflow_bufpool_fini() - Finalize overflow buffers used for messaging. + * + * Must be called with the RX PtlTE disabled. + */ +void cxip_oflow_bufpool_fini(struct cxip_rxc *rxc) +{ + struct cxip_deferred_event *def_ev = NULL; + struct cxip_ptelist_buf *oflow_buf; + struct dlist_entry *tmp; + int i; + int def_events = 0; + + /* Clean up unexpected Put records. The PtlTE is disabled, so no more + * events can be expected. + */ + for (i = 0; i < CXIP_DEF_EVENT_HT_BUCKETS; i++) { + dlist_foreach_container_safe(&rxc->deferred_events.bh[i], + struct cxip_deferred_event, + def_ev, rxc_entry, tmp) { + /* Dropping the last reference will cause the + * oflow_buf to be removed from the RXC list and + * freed. 
+ */ + oflow_buf = def_ev->req->req_ctx; + + if (oflow_buf->le_type == CXIP_LE_TYPE_RX) + oflow_req_put_bytes(def_ev->req, + def_ev->ev.tgt_long.mlength); + + free_put_event(rxc, def_ev); + def_events++; + } + } + + if (def_events) + RXC_DBG(rxc, "Freed %d deferred event(s)\n", def_events); + + cxip_ptelist_bufpool_fini(rxc->oflow_list_bufpool); +} + +int cxip_oflow_bufpool_init(struct cxip_rxc *rxc) +{ + struct cxip_ptelist_bufpool_attr attr = { + .list_type = C_PTL_LIST_OVERFLOW, + .ptelist_cb = cxip_oflow_cb, + .buf_size = cxip_env.oflow_buf_size, + .min_posted = cxip_env.oflow_buf_min_posted, + .max_posted = cxip_env.oflow_buf_min_posted, /* min == max */ + .max_cached = cxip_env.oflow_buf_max_cached, + .min_space_avail = rxc->max_eager_size, + }; + + return cxip_ptelist_bufpool_init(rxc, &rxc->oflow_list_bufpool, &attr); +} + +/* + * cxip_rdzv_done_notify() - Sends a rendezvous complete from target to source + * + * Sends a zero byte matching notification to the source of rendezvous + * indicating completion of a rendezvous. This is used when restricted get + * DMA (CXIP_RDZV_PROTO_ALT_READ) is used to transfer non-eager data. + */ +static int cxip_rdzv_done_notify(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + union c_fab_addr dfa; + uint32_t pid_idx = CXIP_PTL_IDX_RDZV_DEST; + uint32_t match_id; + struct c_full_dma_cmd cmd = {}; + union cxip_match_bits mb = {}; + int ret; + uint8_t idx_ext; + + mb.rdzv_id_lo = req->recv.rdzv_id; + mb.rdzv_id_hi = req->recv.rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + mb.rdzv_done = 1; + mb.le_type = CXIP_LE_TYPE_ZBP; + + cxi_build_dfa(req->recv.rget_nic, req->recv.rget_pid, rxc->pid_bits, + pid_idx, &dfa, &idx_ext); + match_id = CXI_MATCH_ID(rxc->pid_bits, rxc->ep_obj->src_addr.pid, + rxc->ep_obj->src_addr.nic); + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_PUT; + cmd.index_ext = idx_ext; + cmd.event_send_disable = 1; + cmd.dfa = dfa; + cmd.eq = cxip_evtq_eqn(&rxc->ep_obj->txc.tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = match_id; + cmd.match_bits = mb.raw; + + ret = cxi_cq_emit_dma(rxc->tx_cmdq->dev_cmdq, &cmd); + if (ret != FI_SUCCESS) { + RXC_DBG(rxc, "Faile to write notify IDC: %d %s\n", + ret, fi_strerror(-ret)); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->tx_cmdq->dev_cmdq); + + RXC_DBG(rxc, "RDZV done notify send RDZV ID: %d\n", + req->recv.rdzv_id); + + return FI_SUCCESS; +} + +static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_deferred_event *def_ev; + int event_rc; + int ret; + bool matched; + + switch (event->hdr.event_type) { + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + case C_EVENT_SEND: + RXC_WARN(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + + case C_EVENT_PUT_OVERFLOW: + /* We matched an unexpected header */ + /* Check for a previously received unexpected Put event, + * if not found defer until it arrives. + */ + def_ev = match_put_event(rxc, req, event, &matched); + if (!def_ev) + return -FI_EAGAIN; + + /* For multi-recv, management of start_offset requires events + * manage_local related events to arrive in order. + * Only C_EVENT_PUT_OVERFLOW events meet this criteria. 
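+			 *
+			 * Worked example with hypothetical sizes: a 64 KiB
+			 * multi-recv buffer has already handed out 60 KiB
+			 * (start_offset == 61440) when this Put Overflow
+			 * arrives with rlength 8192 and auto_unlinked set.
+			 * mrecv_len is clamped to the 4096 bytes that remain,
+			 * so mrecv_unlink_bytes becomes 61440 + 4096 == 65536,
+			 * the full buffer; the parent is then freed once
+			 * recv_req_report() sees the consumed bytes reach that
+			 * total.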
+ */ + def_ev->mrecv_start = req->recv.start_offset; + def_ev->mrecv_len = + mrecv_req_put_bytes(req, event->tgt_long.rlength); + + if (req->recv.multi_recv && event->tgt_long.auto_unlinked) { + /* If a C_EVENT_PUT_OVERFLOW unlinks a multi-recv + * buffer, mrecv_start contains the number of bytes + * consumed before this C_EVENT_PUT_OVERFLOW. Adding in + * mrecv_len gets the total bytes consumed. + */ + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + def_ev->mrecv_start + def_ev->mrecv_len; + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + if (!matched) + return FI_SUCCESS; + + RXC_DBG(rxc, "Matched deferred event: %p\n", def_ev); + + ret = cxip_ux_send(req, def_ev->req, &def_ev->ev, + def_ev->mrecv_start, def_ev->mrecv_len, + false); + if (ret == FI_SUCCESS) { + free_put_event(rxc, def_ev); + } else { + /* undo mrecv_req_put_bytes() and orx_hw_ule_cnt dec */ + req->recv.start_offset -= def_ev->mrecv_len; + ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); + } + + return ret; + case C_EVENT_PUT: + /* Eager data was delivered directly to the user buffer. */ + if (req->recv.multi_recv) { + if (event->tgt_long.auto_unlinked) { + uintptr_t mrecv_head; + uintptr_t mrecv_tail; + size_t mrecv_bytes_remaining; + size_t rlen; + + /* For C_EVENT_PUT, need to calculate how much + * of the multi-recv buffer was consumed while + * factoring in any truncation. + */ + mrecv_head = + CXI_IOVA_TO_VA(req->recv.recv_md->md, + event->tgt_long.start); + mrecv_tail = (uintptr_t)req->recv.recv_buf + + req->recv.ulen; + mrecv_bytes_remaining = mrecv_tail - mrecv_head; + rlen = MIN(mrecv_bytes_remaining, + event->tgt_long.rlength); + + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + mrecv_head - + (uintptr_t)req->recv.recv_buf + rlen; + } + + req = rdzv_mrecv_req_event(req, event); + if (!req) + return -FI_EAGAIN; + + /* Set start pointer and data_len using Rendezvous or + * Put Overflow event (depending on if message was + * unexpected). + */ + } + + recv_req_tgt_event(req, event); + + /* Count the rendezvous event. */ + rdzv_recv_req_event(req, event->hdr.event_type); + return FI_SUCCESS; + case C_EVENT_RENDEZVOUS: + if (req->recv.multi_recv) { + req = rdzv_mrecv_req_event(req, event); + if (!req) + return -FI_EAGAIN; + + /* Use Rendezvous event to set start pointer and + * data_len for expected Sends. + */ + struct cxip_req *parent = req->recv.parent; + size_t mrecv_bytes_remaining; + + req->buf = CXI_IOVA_TO_VA( + parent->recv.recv_md->md, + event->tgt_long.start) - + event->tgt_long.mlength; + req->recv.recv_buf = (void *)req->buf; + + mrecv_bytes_remaining = + (uint64_t)parent->recv.recv_buf + + parent->recv.ulen - + (uint64_t)req->recv.recv_buf; + req->data_len = MIN(mrecv_bytes_remaining, + event->tgt_long.rlength); + } else { + req->data_len = MIN(req->recv.ulen, event->tgt_long.rlength); + } + + recv_req_tgt_event(req, event); + + if (!event->tgt_long.get_issued) { + if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > + rxc->max_tx || issue_rdzv_get(req)) { + + /* Could not issue get */ + ofi_atomic_dec32(&rxc->orx_tx_reqs); + + /* Undo multi-recv event processing. */ + if (req->recv.multi_recv && + !req->recv.rdzv_events) { + dlist_remove(&req->recv.children); + cxip_evtq_req_free(req); + } + return -FI_EAGAIN; + } + + RXC_DBG(rxc, "Software issued Get, req: %p\n", req); + } + + /* Count the rendezvous event. */ + rdzv_recv_req_event(req, event->hdr.event_type); + return FI_SUCCESS; + case C_EVENT_REPLY: + /* If mrecv, look up the correct child request. 
*/ + if (req->recv.multi_recv) { + req = rdzv_mrecv_req_event(req, event); + if (!req) + return -FI_EAGAIN; + } + + /* If a rendezvous operation requires a done notification + * send it. Must wait for the ACK from the notify to be returned + * before completing the target operation. + */ + if (req->recv.done_notify) { + if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > rxc->max_tx || + cxip_rdzv_done_notify(req)) { + + /* Could not issue notify, will be retried */ + ofi_atomic_dec32(&rxc->orx_tx_reqs); + return -FI_EAGAIN; + } + } + + /* Rendezvous Get completed, update event counts and + * complete if using unrestricted get protocol. + */ + req->recv.rc = cxi_init_event_rc(event); + rdzv_recv_req_event(req, event->hdr.event_type); + + /* If RGet initiated by software return the TX credit */ + if (!event->init_short.rendezvous) { + ofi_atomic_dec32(&rxc->orx_tx_reqs); + assert(ofi_atomic_get32(&rxc->orx_tx_reqs) >= 0); + } + + return FI_SUCCESS; + + case C_EVENT_ACK: + event_rc = cxi_init_event_rc(event); + if (event_rc != C_RC_OK) + RXC_WARN(rxc, "%#x:%u Bad RDZV notify ACK status %s\n", + req->recv.rget_nic, req->recv.rget_pid, + cxi_rc_to_str(event_rc)); + + /* Special case of the ZBP destination EQ being full and ZBP + * could not complete. This must be retried, we use the TX + * credit already allocated. + */ + if (event_rc == C_RC_ENTRY_NOT_FOUND) { + usleep(CXIP_DONE_NOTIFY_RETRY_DELAY_US); + + if (cxip_rdzv_done_notify(req)) + return -FI_EAGAIN; + + return FI_SUCCESS; + } + + /* Reflect the completion status of the ACK in the target + * side completion so that a failure will not go undetected. + */ + req->recv.rc = event_rc; + ofi_atomic_dec32(&req->recv.rxc->orx_tx_reqs); + rdzv_recv_req_event(req, event->hdr.event_type); + + return FI_SUCCESS; + + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +static void cxip_rxc_record_req_stat(struct cxip_rxc *rxc, enum c_ptl_list list, + size_t rlength, struct cxip_req *req) +{ + enum fi_hmem_iface iface = rlength ? req->recv.recv_md->info.iface : FI_HMEM_SYSTEM; + + cxip_msg_counters_msg_record(&rxc->cntrs, list, iface, rlength); +} + +/* + * cxip_recv_cb() - Process a user receive buffer event. + * + * A user receive buffer is described by an LE linked to the Priority list. + * Link, Unlink, Put, Put Overflow, and Reply events are expected from a user + * receive buffer. + * + * A Link event indicates that a new user buffer has been linked to the + * priority list. Successful Link events may be suppressed. + * + * An Unlink event indicates that a user buffer has been unlinked. Normally, a + * receive is used once and unlinked when it is matched with a Send. In this + * case, a successful Unlink event may be suppressed. + * + * For expected, eager Sends, a Put will be matched to a user receive buffer by + * the NIC. Send data is copied directly to the user buffer. A Put event is + * generated describing the match. + * + * For unexpected, eager Sends, a Put will first match a buffer in the Overflow + * list. See cxip_oflow_cb() for details on Overflow event handling. Once a + * matching user receive buffer is appended to the Priority list, a Put + * Overflow event is generated. Put and Put Overflow events for an unexpected, + * eager Send must be correlated. These events may arrive in any order. Once + * both events are accounted, data is copied from the Overflow buffer to the + * user receive buffer. 
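+ *
+ * Whichever half of the Put / Put Overflow pair arrives first is parked as a
+ * deferred event by match_put_event(); the second half completes the pair and
+ * cxip_ux_send() copies the payload and reports the receive.  As noted in
+ * recv_req_tgt_event(), offloaded rendezvous transfers correlate the pair by
+ * rendezvous ID, while eager transfers use the start address within the
+ * Overflow buffer.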
+ * + * Unexpected, eager Sends that are longer than the eager threshold have their + * data truncated to zero. This is to avoid long messages consuming too much + * Overflow buffer space at the target. Once a match is made with a user + * receive buffer, data is re-read from the initiator using a Get. + * + * Rendezvous receive events are handled by cxip_recv_rdzv_cb(). + */ +static int cxip_recv_cb(struct cxip_req *req, const union c_event *event) +{ + int ret; + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_deferred_event *def_ev; + bool rdzv = false; + bool matched; + + /* Common processing for rendezvous and non-rendezvous events. + * TODO: Avoid having two switch statements for event_type. + */ + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* In cases where the LE pool entry reservation is insufficient + * to meet priority list buffers (due to multiple EP sharing an + * LE Pool or insufficient LE Pool reservation value), then + * enabling the periodic checking of LE allocations can be + * used to force preemptive transitions to software match mode. + */ + if (cxi_tgt_event_rc(event) == C_RC_OK) { + + if (!cxip_env.hybrid_recv_preemptive) + return FI_SUCCESS; + + /* Check for possible hybrid mode preemptive + * transitions to software managed mode. + */ + if (cxip_rxp_check_le_usage_hybrid_preempt(rxc, event)) + RXC_WARN(rxc, + "Force preemptive switch to SW EP\n"); + + return FI_SUCCESS; + } + + /* If endpoint has been disabled and an append fails, free the + * user request without reporting any event. + */ + if (rxc->state == RXC_DISABLED) { + cxip_recv_req_free(req); + return FI_SUCCESS; + } + + /* Save append to repost, NIC will initiate transition to + * software managed EP. + */ + if (cxi_tgt_event_rc(event) == C_RC_PTLTE_SW_MANAGED) { + RXC_WARN(rxc, "Append err, transitioning to SW\n"); + cxip_recv_req_dropped(req); + + return FI_SUCCESS; + } + + /* Transition into onload and flow control if an append + * fails. + */ + if (cxi_tgt_event_rc(event) != C_RC_NO_SPACE) + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT_STS, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_tgt_event_rc(event))); + + RXC_WARN(rxc, "Append err, priority LE exhaustion\n"); + + /* Manually transition to DISABLED to initiate flow control + * and onload instead of waiting for eventual NIC no match + * transition. + */ + ret = cxip_recv_pending_ptlte_disable(rxc, true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Force disable failed %d %s\n", + ret, fi_strerror(-ret)); + + ret = FI_SUCCESS; + cxip_recv_req_dropped(req); + + return ret; + + case C_EVENT_UNLINK: + assert(!event->tgt_long.auto_unlinked); + + /* TODO: This is broken with multi-recv. The multi-recv request + * may be freed with pending child requests. + */ + req->recv.unlinked = true; + recv_req_report(req); + cxip_recv_req_free(req); + + return FI_SUCCESS; + + case C_EVENT_PUT_OVERFLOW: + cxip_rxc_record_req_stat(rxc, C_PTL_LIST_OVERFLOW, + event->tgt_long.rlength, req); + + /* ULE freed. Update RXC state to signal that the RXC should + * be reenabled. + */ + /* TODO: this is not atomic, there must be a better way */ + if (rxc->state == RXC_ONLOAD_FLOW_CONTROL) + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + break; + + case C_EVENT_PUT: + cxip_rxc_record_req_stat(rxc, C_PTL_LIST_PRIORITY, + event->tgt_long.rlength, req); + break; + default: + break; + } + + /* All events related to an offloaded rendezvous receive will be + * handled by cxip_recv_rdzv_cb(). Those events are identified by the + * event rendezvous field. 
Two exceptions are a Reply event generated + * from a SW-issued Get, and a Ack for a software done notification + * when using restricted eager get. When such an event is generated, + * the request will have already processed a Rendezvous event. If the + * rendezvous field is not set, but the rdzv_events count is elevated, + * this must be a SW-issued Reply or Ack event. + */ + if (event->hdr.event_type == C_EVENT_REPLY || + event->hdr.event_type == C_EVENT_ACK) + rdzv = (event->init_short.rendezvous || req->recv.rdzv_events); + else + rdzv = event->tgt_long.rendezvous; + + if (rdzv) + return cxip_recv_rdzv_cb(req, event); + + switch (event->hdr.event_type) { + case C_EVENT_SEND: + /* TODO Handle Send event errors. */ + assert(cxi_event_rc(event) == C_RC_OK); + return FI_SUCCESS; + case C_EVENT_PUT_OVERFLOW: + /* We matched an unexpected header */ + /* Unexpected 0-byte Put events are dropped. Skip matching. */ + if (!event->tgt_long.rlength) { + ret = cxip_ux_send_zb(req, event, + req->recv.start_offset, false); + if (ret == FI_SUCCESS) + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + return ret; + } + + /* Check for a previously received unexpected Put event, + * if not found defer until it arrives. + */ + def_ev = match_put_event(rxc, req, event, &matched); + if (!def_ev) + return -FI_EAGAIN; + + /* For multi-recv, management of start_offset requires events + * manage_local related events to arrive in order. + * Only C_EVENT_PUT_OVERFLOW events meet this criteria. + */ + def_ev->mrecv_start = req->recv.start_offset; + def_ev->mrecv_len = + mrecv_req_put_bytes(req, event->tgt_long.rlength); + + if (req->recv.multi_recv && event->tgt_long.auto_unlinked) { + /* If a C_EVENT_PUT_OVERFLOW unlinks a multi-recv + * buffer, mrecv_start contains the number of bytes + * consumed before this C_EVENT_PUT_OVERFLOW. Adding in + * mrecv_len gets the total bytes consumed. + */ + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + def_ev->mrecv_start + def_ev->mrecv_len; + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + if (!matched) + return FI_SUCCESS; + + ret = cxip_ux_send(req, def_ev->req, &def_ev->ev, + def_ev->mrecv_start, def_ev->mrecv_len, + false); + if (ret == FI_SUCCESS) { + free_put_event(rxc, def_ev); + } else { + /* undo mrecv_req_put_bytes() and orx_hw_ule_cnt dec */ + req->recv.start_offset -= def_ev->mrecv_len; + ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); + } + + return ret; + case C_EVENT_PUT: + /* Data was delivered directly to the user buffer. Complete the + * request. + */ + if (req->recv.multi_recv) { + if (event->tgt_long.auto_unlinked) { + uintptr_t mrecv_head; + + /* For C_EVENT_PUT, need to calculate how much + * of the multi-recv buffer was consumed while + * factoring in any truncation. + */ + mrecv_head = + CXI_IOVA_TO_VA(req->recv.recv_md->md, + event->tgt_long.start); + + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + mrecv_head - + (uintptr_t)req->recv.recv_buf + + event->tgt_long.mlength; + } + + req = mrecv_req_dup(req); + if (!req) + return -FI_EAGAIN; + recv_req_tgt_event(req, event); + + req->buf = (uint64_t)(CXI_IOVA_TO_VA( + req->recv.recv_md->md, + event->tgt_long.start)); + req->data_len = event->tgt_long.mlength; + + recv_req_report(req); + cxip_evtq_req_free(req); + } else { + req->data_len = event->tgt_long.mlength; + recv_req_tgt_event(req, event); + recv_req_report(req); + cxip_recv_req_free(req); + } + return FI_SUCCESS; + + case C_EVENT_REPLY: + /* Long-send Get completed. Complete the request. 
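+		 * (This is the unexpected long-send path described in the
+		 * function comment above: the eager data was truncated to zero
+		 * at the Overflow buffer and re-read from the initiator with a
+		 * Get, whose Reply lands here.)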
*/ + req->recv.rc = cxi_init_event_rc(event); + + recv_req_report(req); + if (req->recv.multi_recv) + cxip_evtq_req_free(req); + else + cxip_recv_req_free(req); + + return FI_SUCCESS; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_recv_cancel() - Cancel outstanding receive request. + */ +int cxip_recv_cancel(struct cxip_req *req) +{ + int ret = FI_SUCCESS; + struct cxip_rxc *rxc = req->recv.rxc; + + /* In hybrid mode requests could be on priority list + * or software receive list. + */ + if (req->recv.software_list) { + dlist_remove_init(&req->recv.rxc_entry); + req->recv.canceled = true; + req->recv.unlinked = true; + recv_req_report(req); + cxip_recv_req_free(req); + } else { + ret = cxip_pte_unlink(rxc->rx_pte, C_PTL_LIST_PRIORITY, + req->req_id, rxc->rx_cmdq); + if (ret == FI_SUCCESS) + req->recv.canceled = true; + } + return ret; +} + +/* + * cxip_recv_reenable() - Attempt to re-enable the RX queue. + * + * Called by disabled EP ready to re-enable. + * + * Determine if the RX queue can be re-enabled and perform a state change + * command if necessary. The Endpoint must receive dropped Send notifications + * from all peers who experienced drops before re-enabling the RX queue. + * + * Caller must hold ep_obj->lock. + */ +int cxip_recv_reenable(struct cxip_rxc *rxc) +{ + struct cxi_pte_status pte_status = {}; + int ret __attribute__((unused)); + + if (rxc->drop_count == -1) { + RXC_WARN(rxc, "Waiting for pending FC_NOTIFY messages\n"); + return -FI_EAGAIN; + } + + ret = cxil_pte_status(rxc->rx_pte->pte, &pte_status); + assert(!ret); + + if (rxc->drop_count != pte_status.drop_count) { + RXC_DBG(rxc, "Processed %d/%d drops\n", + rxc->drop_count, pte_status.drop_count); + return -FI_EAGAIN; + } + + RXC_WARN(rxc, "Re-enabling PTE, drop_count %d\n", + rxc->drop_count); + + do { + ret = cxip_rxc_msg_enable(rxc, rxc->drop_count); + if (ret == -FI_EAGAIN && + rxc->new_state == RXC_ENABLED_SOFTWARE) { + RXC_WARN(rxc, + "PTE disable->sm drop mismatch, will retry\n"); + break; + } + } while (ret == -FI_EAGAIN); + + if (ret != FI_SUCCESS && ret != -FI_EAGAIN) + RXC_FATAL(rxc, "cxip_rxc_msg_enable failed: %d\n", ret); + + return ret; +} + +/* + * cxip_fc_resume_cb() - Process FC resume completion events. + */ +int cxip_fc_resume_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_fc_drops *fc_drops = container_of(req, + struct cxip_fc_drops, req); + struct cxip_rxc *rxc = fc_drops->rxc; + int ret = FI_SUCCESS; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + switch (cxi_event_rc(event)) { + case C_RC_OK: + RXC_DBG(rxc, + "FC_RESUME to %#x:%u successfully sent: retry_count=%u\n", + fc_drops->nic_addr, fc_drops->pid, + fc_drops->retry_count); + free(fc_drops); + break; + + /* This error occurs when the target's control event queue has + * run out of space. Since the target should be processing the + * event queue, it is safe to replay messages until C_RC_OK is + * returned. 
+ */ + case C_RC_ENTRY_NOT_FOUND: + fc_drops->retry_count++; + RXC_WARN(rxc, + "%#x:%u dropped FC message: retry_delay_usecs=%d retry_count=%u\n", + fc_drops->nic_addr, fc_drops->pid, + cxip_env.fc_retry_usec_delay, + fc_drops->retry_count); + usleep(cxip_env.fc_retry_usec_delay); + ret = cxip_ctrl_msg_send(req); + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT_STS, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + return ret; +} + +/* + * cxip_fc_process_drops() - Process a dropped Send notification from a peer. + * + * Called by disabled EP waiting to re-enable. + * + * When a peer detects dropped Sends it follows up by sending a message to the + * disabled Endpoint indicating the number of drops experienced. The disabled + * Endpoint peer must count all drops before re-enabling its RX queue. + */ +int cxip_fc_process_drops(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, + uint32_t pid, uint16_t drops) +{ + struct cxip_rxc *rxc = &ep_obj->rxc; + struct cxip_fc_drops *fc_drops; + int ret __attribute__((unused)); + + fc_drops = calloc(1, sizeof(*fc_drops)); + if (!fc_drops) { + RXC_WARN(rxc, "Failed to allocate drops\n"); + return -FI_ENOMEM; + } + + /* TODO: Cleanup cxip_fc_drops fields. Many of the fields are redundant + * with the req structure. + */ + fc_drops->rxc = rxc; + fc_drops->nic_addr = nic_addr; + fc_drops->pid = pid; + fc_drops->drops = drops; + + fc_drops->req.send.nic_addr = nic_addr; + fc_drops->req.send.pid = pid; + fc_drops->req.send.mb.drops = drops; + + fc_drops->req.send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; + fc_drops->req.send.mb.ctrl_msg_type = CXIP_CTRL_MSG_FC_RESUME; + fc_drops->req.cb = cxip_fc_resume_cb; + fc_drops->req.ep_obj = rxc->ep_obj; + + dlist_insert_tail(&fc_drops->rxc_entry, &rxc->fc_drops); + + RXC_DBG(rxc, "Processed drops: %d NIC: %#x PID: %d\n", + drops, nic_addr, pid); + + rxc->drop_count += drops; + + /* Wait until search and delete completes before attempting to + * re-enable. + */ + if (rxc->state == RXC_FLOW_CONTROL) { + ret = cxip_recv_reenable(rxc); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + /* Disable to software managed transition is synchronous + * in order to handle drop count mismatches correctly. If + * successful the H/W transition completed, otherwise it + * will be retried when notified and count matches. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE && + ret == FI_SUCCESS) { + cxip_fc_progress_ctrl(rxc); + rxc->state = RXC_ENABLED_SOFTWARE; + RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); + } + } + + return FI_SUCCESS; +} + +/* + * cxip_recv_replay() - Replay dropped Receive requests. + * + * When no LE is available while processing an Append command, the command is + * dropped and future appends are disabled. After all outstanding commands are + * dropped and resources are recovered, replayed all Receive requests in order. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_recv_replay(struct cxip_rxc *rxc) +{ + struct cxip_req *req; + struct dlist_entry *tmp; + bool restart_seq = true; + int ret; + + dlist_foreach_container_safe(&rxc->replay_queue, + struct cxip_req, req, + recv.rxc_entry, tmp) { + dlist_remove_init(&req->recv.rxc_entry); + + /* Since the RXC and PtlTE are in a controlled state and no new + * user receives are being posted, it is safe to ignore the RXC + * state when replaying failed user posted receives. 
+ */ + ret = cxip_recv_req_queue(req, restart_seq); + + /* Match made in software? */ + if (ret == -FI_EALREADY) + continue; + + /* TODO: Low memory or full CQ during SW matching would cause + * -FI_EAGAIN to be seen here. + */ + assert(ret == FI_SUCCESS); + + restart_seq = false; + } + + return FI_SUCCESS; +} + +/* + * cxip_recv_resume() - Send a resume message to all peers who reported dropped + * Sends. + * + * Called by disabled EP after re-enable. + * + * After counting all dropped sends targeting a disabled RX queue and + * re-enabling the queue, notify all peers who experienced dropped Sends so + * they can be replayed. + * + * Caller must hold ep_obj->lock. + */ +int cxip_recv_resume(struct cxip_rxc *rxc) +{ + struct cxip_fc_drops *fc_drops; + struct dlist_entry *tmp; + int ret; + + dlist_foreach_container_safe(&rxc->fc_drops, + struct cxip_fc_drops, fc_drops, + rxc_entry, tmp) { + ret = cxip_ctrl_msg_send(&fc_drops->req); + if (ret) + return ret; + + dlist_remove(&fc_drops->rxc_entry); + } + + return FI_SUCCESS; +} + +/* + * cxip_fc_progress_ctrl() - Progress the control EP until all resume + * control messages can be queued. + * + * Caller must hold ep_obj->lock. + */ +static void cxip_fc_progress_ctrl(struct cxip_rxc *rxc) +{ + int ret __attribute__((unused)); + + assert(rxc->state == RXC_FLOW_CONTROL); + + /* Successful transition from disabled occurred, reset + * drop count. + */ + rxc->drop_count = rxc->ep_obj->asic_ver < CASSINI_2_0 ? -1 : 0; + + while ((ret = cxip_recv_resume(rxc)) == -FI_EAGAIN) + cxip_ep_tx_ctrl_progress_locked(rxc->ep_obj); + + assert(ret == FI_SUCCESS); +} + +/* + * cxip_post_ux_onload_sw() - Nic HW-to-SW EP post UX onload processing. + * + * PTE transitioned from enabled to software managed. Onloading + * was done and appends that failed need to be replayed. + */ +static void cxip_post_ux_onload_sw(struct cxip_rxc *rxc) +{ + int ret; + + assert(cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE); + assert(rxc->prev_state == RXC_ENABLED); + assert(rxc->new_state == RXC_ENABLED_SOFTWARE); + + ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, + true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Request list replenish failed %d %s\n", + ret, fi_strerror(-ret)); + + /* Priority list appends that failed during the transition can + * now be replayed. + */ + ret = cxip_recv_replay(rxc); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + if (rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) { + /* Transition from enabled to software managed is complete. + * Allow posting of receive operations. + */ + RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); + rxc->state = RXC_ENABLED_SOFTWARE; + } +} + +/* + * cxip_post_ux_onload_fc() - Flow control onload complete processing. + * + * PTE transitioned to disabled and UX onload has completed. + */ +static void cxip_post_ux_onload_fc(struct cxip_rxc *rxc) +{ + int ret; + + /* Disable RX matching offload if transitioning to + * software enabled EP. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE) { + RXC_DBG(rxc, "Transitioning to SW EP\n"); + rxc->msg_offload = 0; + } + + if (rxc->fc_reason == C_SC_FC_EQ_FULL) + goto replay; + + if (rxc->new_state == RXC_ENABLED_SOFTWARE) + ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, + true); + else + ret = cxip_ptelist_buf_replenish(rxc->oflow_list_bufpool, + true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "%s buffer replenish failed %d %s\n", + rxc->new_state == RXC_ENABLED_SOFTWARE ? 
+ "Request" : "Overflow", ret, fi_strerror(-ret)); + +replay: + /* Any priority list appends that failed during the transition + * can now be replayed. + */ + if (rxc->new_state == RXC_ENABLED) + rxc->msg_offload = 1; + + ret = cxip_recv_replay(rxc); + RXC_DBG(rxc, "Replay of failed receives ret: %d %s\n", + ret, fi_strerror(-ret)); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + if (rxc->state != RXC_ONLOAD_FLOW_CONTROL_REENABLE && + rxc->new_state != RXC_ENABLED_SOFTWARE) + RXC_FATAL(rxc, FC_SW_ONLOAD_MSG_FATAL); + + rxc->state = RXC_FLOW_CONTROL; + ret = cxip_recv_reenable(rxc); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + RXC_WARN(rxc, "Now in RXC_FLOW_CONTROL\n"); + + /* Disable to software managed transition is synchronous in order to + * handle drop count mismatches correctly. If successful the H/W + * transition completed, otherwise the transition will occur when + * additional drop notifies are received. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE && ret == FI_SUCCESS) { + cxip_fc_progress_ctrl(rxc); + rxc->state = RXC_ENABLED_SOFTWARE; + RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); + } +} + +/* + * cxip_ux_onload_complete() - Unexpected list entry onload complete. + * + * All unexpected message headers have been onloaded from hardware. + */ +static void cxip_ux_onload_complete(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->search.rxc; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + free(rxc->ule_offsets); + rxc->ule_offsets = 0; + + /* During a transition to software managed PtlTE, received + * request list entries resulting from hardware not matching + * the priority list on an incoming packet were added to a + * pending unexpected message list. We merge the two + * expected list here. + */ + RXC_DBG(rxc, "Req pending %d UX entries, SW list %d UX entries\n", + rxc->sw_pending_ux_list_len, rxc->sw_ux_list_len); + + dlist_splice_tail(&rxc->sw_ux_list, &rxc->sw_pending_ux_list); + rxc->sw_ux_list_len += rxc->sw_pending_ux_list_len; + rxc->sw_pending_ux_list_len = 0; + + RXC_WARN(rxc, "Software UX list updated, %d SW UX entries\n", + rxc->sw_ux_list_len); + + if (rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) + cxip_post_ux_onload_sw(rxc); + else + cxip_post_ux_onload_fc(rxc); + + ofi_atomic_dec32(&rxc->orx_reqs); + cxip_evtq_req_free(req); +} + +/* + * cxip_get_ule_offsets() - Initialize an in-order array of ULE offsets + * + * If snapshot is requested, no more than two passes at getting offsets + * will be made. This is intended to be used with FI_CLAIM processing, + * where the PtlTE is enabled. + */ +static int cxip_get_ule_offsets(struct cxip_rxc *rxc, uint64_t **ule_offsets, + unsigned int *num_ule_offsets, bool snapshot) +{ + struct cxi_pte_status pte_status = { + .ule_count = 512 + }; + size_t cur_ule_count = 0; + int ret; + int calls = 0; + + /* Get all the unexpected header remote offsets. 
*/ + *ule_offsets = NULL; + *num_ule_offsets = 0; + + do { + cur_ule_count = pte_status.ule_count; + *ule_offsets = reallocarray(*ule_offsets, cur_ule_count, + sizeof(*ule_offsets)); + if (*ule_offsets == NULL) { + RXC_WARN(rxc, "Failed allocate ule offset memory\n"); + ret = -FI_ENOMEM; + goto err; + } + + pte_status.ule_offsets = (void *)*ule_offsets; + ret = cxil_pte_status(rxc->rx_pte->pte, &pte_status); + assert(!ret); + } while (cur_ule_count < pte_status.ule_count && + !(snapshot && ++calls > 1)); + + *num_ule_offsets = pte_status.ule_count; + + return FI_SUCCESS; +err: + free(*ule_offsets); + + return ret; +} + +/* + * cxip_ux_onload_cb() - Process SEARCH_AND_DELETE command events. + */ +static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->search.rxc; + struct cxip_deferred_event *def_ev; + struct cxip_ux_send *ux_send; + bool matched; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + switch (event->hdr.event_type) { + case C_EVENT_PUT_OVERFLOW: + assert(cxi_event_rc(event) == C_RC_OK); + + ux_send = calloc(1, sizeof(*ux_send)); + if (!ux_send) { + RXC_WARN(rxc, "Failed allocate to memory\n"); + return -FI_EAGAIN; + } + + /* Zero-byte unexpected onloads require special handling since + * no deferred structure would be allocated. + */ + if (event->tgt_long.rlength) { + + def_ev = match_put_event(rxc, req, event, &matched); + if (!matched) { + if (!def_ev) { + free(ux_send); + return -FI_EAGAIN; + } + + /* Gather Put events later */ + def_ev->ux_send = ux_send; + req->search.puts_pending++; + } else { + ux_send->req = def_ev->req; + ux_send->put_ev = def_ev->ev; + + free_put_event(rxc, def_ev); + } + } else { + ux_send->put_ev = *event; + } + + /* For flow control transition if a ULE is freed, then + * set state so that re-enable will be attempted. + */ + if (rxc->state == RXC_ONLOAD_FLOW_CONTROL) + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + + /* Fixup event with the expected remote offset for an RGet. */ + if (event->tgt_long.rlength) { + ux_send->put_ev.tgt_long.remote_offset = + rxc->ule_offsets[rxc->cur_ule_offsets] + + event->tgt_long.mlength; + } + rxc->cur_ule_offsets++; + + dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); + rxc->sw_ux_list_len++; + + RXC_DBG(rxc, "Onloaded Send: %p\n", ux_send); + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + break; + case C_EVENT_SEARCH: + if (rxc->new_state == RXC_ENABLED_SOFTWARE && + rxc->state == RXC_ONLOAD_FLOW_CONTROL) + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + + if (rxc->state == RXC_ONLOAD_FLOW_CONTROL) + RXC_FATAL(rxc, FC_SW_ONLOAD_MSG_FATAL); + + req->search.complete = true; + rxc->rx_evtq.ack_batch_size = rxc->rx_evtq.cq->ack_batch_size; + + RXC_DBG(rxc, "UX Onload Search done\n"); + + if (cxip_ux_is_onload_complete(req)) + cxip_ux_onload_complete(req); + + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + return FI_SUCCESS; +} + +/* + * cxip_ux_onload() - Issue SEARCH_AND_DELETE command to on-load unexpected + * Send headers queued on the RXC message queue. + * + * Caller must hold ep_obj->lock. 
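+ *
+ * The search covers the entire unexpected list (match_id CXI_MATCH_ID_ANY,
+ * all ignore bits set). Each onloaded header is turned into a cxip_ux_send
+ * entry on rxc->sw_ux_list by cxip_ux_onload_cb(), with its remote offset
+ * taken from the cxip_get_ule_offsets() snapshot so a later RGet can use
+ * the correct initiator offset.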
+ */ +static int cxip_ux_onload(struct cxip_rxc *rxc) +{ + struct cxip_req *req; + union c_cmdu cmd = {}; + int ret; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + RXC_DBG(rxc, "Initiate hardware UX list onload\n"); + + /* Get all the unexpected header remote offsets. */ + rxc->ule_offsets = NULL; + rxc->num_ule_offsets = 0; + rxc->cur_ule_offsets = 0; + + ret = cxip_get_ule_offsets(rxc, &rxc->ule_offsets, + &rxc->num_ule_offsets, false); + if (ret) { + RXC_WARN(rxc, "Failed to read UX remote offsets: %d %s\n", + ret, fi_strerror(-ret)); + goto err; + } + + /* Populate request */ + req = cxip_evtq_req_alloc(&rxc->rx_evtq, 1, NULL); + if (!req) { + RXC_DBG(rxc, "Failed to allocate request\n"); + ret = -FI_EAGAIN; + goto err_free_onload_offset; + } + ofi_atomic_inc32(&rxc->orx_reqs); + + req->cb = cxip_ux_onload_cb; + req->type = CXIP_REQ_SEARCH; + req->search.rxc = rxc; + + cmd.command.opcode = C_CMD_TGT_SEARCH_AND_DELETE; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = -1UL; + cmd.target.match_id = CXI_MATCH_ID_ANY; + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + ret = -FI_EAGAIN; + goto err_dec_free_cq_req; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + return FI_SUCCESS; + +err_dec_free_cq_req: + ofi_atomic_dec32(&rxc->orx_reqs); + cxip_evtq_req_free(req); +err_free_onload_offset: + free(rxc->ule_offsets); +err: + RXC_WARN(rxc, "Hardware UX list onload initiation error, ret: %d\n", + ret); + return ret; +} + +static int cxip_flush_appends_cb(struct cxip_req *req, + const union c_event *event) +{ + struct cxip_rxc *rxc = req->req_ctx; + int ret; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + assert(event->hdr.event_type == C_EVENT_SEARCH); + assert(cxi_event_rc(event) == C_RC_NO_MATCH); + + ret = cxip_ux_onload(rxc); + if (ret == FI_SUCCESS) { + ofi_atomic_dec32(&rxc->orx_reqs); + cxip_evtq_req_free(req); + } + + return ret; +} + +/* + * cxip_flush_appends() - Flush all user appends for a RXC. + * + * Before cxip_ux_onload() can be called, all user appends in the command queue + * must be flushed. If not, this can cause cxip_ux_onload() to read incorrect + * remote offsets from cxil_pte_status(). The flush is implemented by issuing + * a search command which will match zero ULEs. When the search event is + * processed, all pending user appends will have been processed. Since the RXC + * is not enabled, new appends cannot occur during this time. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_flush_appends(struct cxip_rxc *rxc) +{ + struct cxip_req *req; + union c_cmdu cmd = {}; + int ret; + + assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + /* Populate request */ + req = cxip_evtq_req_alloc(&rxc->rx_evtq, 1, rxc); + if (!req) { + RXC_DBG(rxc, "Failed to allocate request\n"); + ret = -FI_EAGAIN; + goto err; + } + ofi_atomic_inc32(&rxc->orx_reqs); + + rxc->rx_evtq.ack_batch_size = 1; + + req->cb = cxip_flush_appends_cb; + req->type = CXIP_REQ_SEARCH; + + /* Search command which should match nothing. 
*/ + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.match_bits = -1UL; + cmd.target.length = 0; + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + ret = -FI_EAGAIN; + goto err_dec_free_cq_req; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + return FI_SUCCESS; + +err_dec_free_cq_req: + ofi_atomic_dec32(&rxc->orx_reqs); + cxip_evtq_req_free(req); +err: + return ret; +} + +/* + * cxip_recv_pte_cb() - Process receive PTE state change events. + */ +void cxip_recv_pte_cb(struct cxip_pte *pte, const union c_event *event) +{ + struct cxip_rxc *rxc = (struct cxip_rxc *)pte->ctx; + int fc_reason = cxip_fc_reason(event); + int ret __attribute__((unused)); + + switch (pte->state) { + case C_PTLTE_ENABLED: + assert(rxc->state == RXC_FLOW_CONTROL || + rxc->state == RXC_DISABLED || + rxc->state == RXC_PENDING_PTLTE_HARDWARE); + + /* Queue any flow control resume messages */ + if (rxc->state == RXC_FLOW_CONTROL) { + cxip_fc_progress_ctrl(rxc); + RXC_WARN(rxc, "Now in RXC_ENABLED\n"); + } + + rxc->state = RXC_ENABLED; + break; + + case C_PTLTE_DISABLED: + if (rxc->state == RXC_DISABLED) + break; + + if (fc_reason == C_SC_DIS_UNCOR) + RXC_FATAL(rxc, "Disabled, LE uncorrectable err\n"); + + /* An incorrect drop count was used during PTE enable. + * Another attempt will be made when a peer sends a side-band + * drop message. + */ + if (cxi_event_rc(event) == C_RC_NO_MATCH) { + assert(rxc->state == RXC_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == + RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + RXC_WARN(rxc, FC_DROP_COUNT_MSG); + break; + } + + /* Flow control occurred while transitioning from HW to SW + * managed PTE. Since onloading of all UX entries will have + * been initiated (i.e. no new ones will be added) and the + * PTE state change from RXC_PENDING_PTLTE_SOFTWARE_MANAGED + * to RXC_ENABLED_SOFTWARE following onload complete is + * protected by the ep_obj->lock, it is safe to indicate that + * SW managed EP must be re-enabled on onload complete. + * The request list will have been replenished. + */ + if (rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) { + RXC_WARN(rxc, + "Flow control during HW to SW transition\n"); + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + break; + } + + /* Check for flow control during flow control */ + if (rxc->state != RXC_ENABLED && + rxc->state != RXC_ENABLED_SOFTWARE && + rxc->state != RXC_PENDING_PTLTE_DISABLE) { + + /* There is race between SW disable on priority list + * and HW initiated LE flow control which can be + * ignored; otherwise it is a fatal error. + */ + if (fc_reason == CXIP_FC_SOFTWARE_INITIATED) + break; + RXC_FATAL(rxc, FC_SW_LE_MSG_FATAL); + } + + /* Starting flow control processing. The default is for + * flow control should re-enable in the previous + * hardware/software managed state. + */ + rxc->prev_state = rxc->state; + rxc->new_state = rxc->state; + rxc->state = RXC_ONLOAD_FLOW_CONTROL; + + RXC_DBG(rxc, "Flow control detected, H/W: %d reason: %d\n", + event->tgt_long.initiator.state_change.sc_nic_auto, + fc_reason); + + switch (fc_reason) { + case CXIP_FC_SOFTWARE_INITIATED: + /* Software initiated state change, drop count + * needs to start at zero instead of -1. Add 1 to + * account for this. 
Note this is only initiated + * from an hardware enabled PTE state. + */ + RXC_WARN(rxc, "SW initiated flow control\n"); + if (rxc->ep_obj->asic_ver < CASSINI_2_0) + rxc->drop_count++; + + /* If running in hybrid mode, resume operation as a + * software managed EP to reduce LE resource load. + */ + if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE) + rxc->new_state = RXC_ENABLED_SOFTWARE; + + rxc->num_fc_append_fail++; + break; + + case C_SC_FC_EQ_FULL: + /* EQ full does not require LE resources be recovered + * to re-enable. + */ + RXC_WARN(rxc, "Flow control EQ full\n"); + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + rxc->num_fc_eq_full++; + break; + + case C_SC_FC_NO_MATCH: + /* Overflow list buffers were full/could not be matched + * against. Must replenish buffers, but does not in + * itself require resources be recovered. + */ + RXC_WARN(rxc, FC_OFLOW_NO_MATCH_MSG, + cxip_env.oflow_buf_size); + + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + rxc->num_fc_no_match++; + break; + + case C_SC_FC_UNEXPECTED_FAIL: + /* Hybrid mode is not enabled and overflow matches, but + * LE resources prevent unexpected message allocation. + */ + RXC_WARN(rxc, "Flow control UX LE resources\n"); + rxc->num_fc_unexp++; + break; + + case C_SC_FC_REQUEST_FULL: + /* Running as software managed EP and request list + * buffers were full/could not be matched against. + * Must replenish buffers, but does not require that + * LE resources are recovered. + */ + RXC_WARN(rxc, FC_REQ_FULL_MSG, cxip_env.req_buf_size); + rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + rxc->num_fc_req_full++; + break; + + case C_SC_SM_APPEND_FAIL: + case C_SC_SM_UNEXPECTED_FAIL: + default: + RXC_FATAL(rxc, "Invalid disable PTE c_sc_reason: %d\n", + fc_reason); + } + rxc->fc_reason = fc_reason; + + do { + ret = cxip_flush_appends(rxc); + } while (ret == -FI_EAGAIN); + + if (ret != FI_SUCCESS) + RXC_FATAL(rxc, "cxip_flush_appends failed: %d\n", ret); + + break; + + case C_PTLTE_SOFTWARE_MANAGED: + /* There is an inherent race between hardware and software + * in setting the PtlTE state. If software requested to + * disable the PtlTE after hardware started a HW to SW + * transition; just wait for the disable event. + */ + if (rxc->state == RXC_PENDING_PTLTE_DISABLE) + break; + + RXC_DBG(rxc, "SW Managed: nic auto: %d, reason: %d\n", + event->tgt_long.initiator.state_change.sc_nic_auto, + event->tgt_long.initiator.state_change.sc_nic_auto ? + event->tgt_long.initiator.state_change.sc_reason : -1); + + /* We should not get a bad drop count status since the + * transition is synchronous but we will need this in + * the future. + */ + if (cxi_event_rc(event) == C_RC_NO_MATCH) { + RXC_WARN(rxc, "Bad drop count, ignored\n"); + break; + } + + /* Sanity check */ + if (rxc->state == RXC_FLOW_CONTROL) + RXC_FATAL(rxc, "FC to SW EP should be synchronous\n"); + + assert(rxc->state == RXC_DISABLED || + rxc->state == RXC_ENABLED || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + /* Hardware should only generate PTE software managed events + * in two cases: + * 1. Initial start in software mode: disabled->software. + * 2. NIC initiated software transition: enabled->software. + */ + switch (fc_reason) { + case CXIP_FC_SOFTWARE_INITIATED: + /* If messaging was initially offloaded then this + * state transition can only happen if the RXC has + * been disabled; it is safe to ignore this change. 
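+			 * When the provider starts without hardware message
+			 * matching (!cxip_env.msg_offload), this event is
+			 * what moves the newly enabled PtlTE into
+			 * RXC_ENABLED_SOFTWARE below.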
+			 */
+			assert(rxc->state == RXC_DISABLED);
+			if (!cxip_env.msg_offload) {
+				RXC_WARN(rxc, "Software managed EP enabled\n");
+				rxc->state = RXC_ENABLED_SOFTWARE;
+			}
+			break;
+
+		case C_SC_SM_APPEND_FAIL:
+		case C_SC_SM_UNEXPECTED_FAIL:
+			/* The NIC initiated the transition; priority list
+			 * appends that are in flight will fail and be added
+			 * to the receive replay list. Update state so that
+			 * no additional appends will be attempted until
+			 * onload completes and the failed appends are
+			 * replayed.
+			 */
+			RXC_WARN(rxc,
+				 "NIC transition to SW EP, c_sc_reason: %d\n",
+				 fc_reason);
+			rxc->fc_reason = fc_reason;
+			rxc->prev_state = rxc->state;
+			rxc->new_state = RXC_ENABLED_SOFTWARE;
+
+			if (rxc->fc_reason == C_SC_SM_UNEXPECTED_FAIL)
+				rxc->num_sc_nic_hw2sw_unexp++;
+			else if (rxc->fc_reason == C_SC_SM_APPEND_FAIL)
+				rxc->num_sc_nic_hw2sw_append_fail++;
+
+			rxc->msg_offload = 0;
+			rxc->state = RXC_PENDING_PTLTE_SOFTWARE_MANAGED;
+			do {
+				/* Flush and kick-off onloading of UX list */
+				ret = cxip_flush_appends(rxc);
+			} while (ret == -FI_EAGAIN);
+			if (ret != FI_SUCCESS)
+				RXC_WARN(rxc, "Flush/UX onload err: %d\n", ret);
+			break;
+		default:
+			RXC_FATAL(rxc, "Invalid PTE c_sc_reason: %d\n",
+				  fc_reason);
+		}
+
+		break;
+	default:
+		RXC_FATAL(rxc, "Unexpected state received: %u\n", pte->state);
+	}
+}
+
+/*
+ * tag_match() - Compare UX Send tag and Receive tags in SW.
+ */
+static bool tag_match(uint64_t init_mb, uint64_t mb, uint64_t ib)
+{
+	return !((init_mb ^ mb) & ~ib);
+}
+
+/*
+ * init_match() - Compare UX Send initiator and Receive initiator in SW.
+ */
+static bool init_match(struct cxip_rxc *rxc, uint32_t init, uint32_t match_id)
+{
+	if (match_id == CXI_MATCH_ID_ANY)
+		return true;
+
+	if (rxc->ep_obj->av->symmetric) {
+		init = CXI_MATCH_ID_EP(rxc->pid_bits, init);
+		match_id = CXI_MATCH_ID_EP(rxc->pid_bits, match_id);
+	}
+
+	return init == match_id;
+}
+
+/*
+ * cxip_claim_onload_cb() - Process SEARCH and DELETE of claimed UX message.
+ */
+static int cxip_claim_onload_cb(struct cxip_req *req,
+				const union c_event *evt)
+{
+	struct cxip_rxc *rxc = req->req_ctx;
+	struct cxip_deferred_event *def_ev;
+	struct cxip_ux_send *ux_send;
+	bool matched = false;
+
+	if (evt->hdr.event_type != C_EVENT_PUT_OVERFLOW)
+		RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT,
+			  cxi_event_to_str(evt),
+			  cxi_rc_to_str(cxi_event_rc(evt)));
+
+	/* Failed to onload UX message, return ENOMSG */
+	if (cxi_event_rc(evt) != C_RC_OK) {
+		RXC_WARN(rxc, "FI_CLAIM HW onload failed: %d\n",
+			 cxi_event_rc(evt));
+		recv_req_peek_complete(req, NULL);
+
+		return FI_SUCCESS;
+	}
+
+	ofi_atomic_dec32(&rxc->orx_hw_ule_cnt);
+
+	/* FI_CLAIM UX message onloaded from hardware */
+	ux_send = calloc(1, sizeof(*ux_send));
+	if (!ux_send) {
+		RXC_WARN(rxc, "Failed to allocate UX memory\n");
+		return -FI_EAGAIN;
+	}
+	ux_send->claimed = true;
+
+	/* Zero-byte unexpected onloads require special handling
+	 * since no deferred structure would be allocated.
+	 */
+	if (evt->tgt_long.rlength) {
+		def_ev = match_put_event(rxc, req, evt, &matched);
+		if (!matched) {
+			/* The EVENT_PUT to the overflow list has not been
+			 * processed. The FI_CLAIM operation will be completed
+			 * when the matching put is received.
+			 */
+			if (!def_ev) {
+				free(ux_send);
+				return -FI_EAGAIN;
+			}
+			def_ev->ux_send = ux_send;
+		} else {
+			ux_send->req = def_ev->req;
+			ux_send->put_ev = def_ev->ev;
+			free_put_event(rxc, def_ev);
+		}
+
+		/* Fixup event remote offset for an RGet. 
*/ + if (evt->tgt_long.rlength) + ux_send->put_ev.tgt_long.remote_offset = + req->recv.ule_offset + evt->tgt_long.mlength; + + } else { + matched = true; + ux_send->put_ev = *evt; + } + + /* Add to the sw UX list as a claimed entry, it will be ignored in + * recieve matching of UX list entries. Its order no longer matters. + */ + dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); + rxc->sw_ux_list_len++; + + RXC_DBG(rxc, "FI_CLAIM Onload req: %p ux_send %p\n", req, ux_send); + recv_req_tgt_event(req, &ux_send->put_ev); + + /* Put was already received, return FI_CLAIM completion */ + if (matched) { + recv_req_peek_complete(req, ux_send); + RXC_DBG(rxc, "FI_CLAIM onload complete, req %p, ux_send %p\n", + req, ux_send); + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + return FI_SUCCESS; +} + +/* + * cxip_claim_ux_onload() - Initiate SEARCH and DELETE of FI_CLAIM ux entry. + */ +static int cxip_claim_ux_onload(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->req_ctx; + int ret = FI_SUCCESS; + union c_cmdu cmd = {}; + union cxip_match_bits mb = {}; + union cxip_match_bits ib = {}; + + if (rxc->state != RXC_ENABLED) { + RXC_DBG(rxc, "FC inprogress, fail claim req %p\n", req); + goto err; + } + + /* Initiate a search to get the remote offset for the + * unexpected list entry we matched. + */ + req->cb = cxip_claim_onload_cb; + mb.tag = req->recv.tag; + mb.tagged = 1; + ib.tx_id = ~0; + ib.cq_data = ~0; + ib.match_comp = ~0; + ib.rdzv_done = ~0; + ib.le_type = ~0; + ib.tag = req->recv.ignore; + + cmd.command.opcode = C_CMD_TGT_SEARCH_AND_DELETE; + + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = ib.raw; + cmd.target.match_bits = mb.raw; + cmd.target.match_id = req->recv.match_id; + /* Delete first match */ + cmd.target.use_once = 1; + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + /* This condition should clear */ + RXC_WARN(rxc, + "Cannot emit of UX delete cmd, return -FI_EAGAIN\n"); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + /* Hardware handles the race between subsequent priority list + * appends to the search and delete command. Re-enable. + */ + rxc->hw_claim_in_progress = false; + RXC_DBG(rxc, "FI_CLAIM Search and Delete of UX entry initiated\n"); + + return FI_SUCCESS; + +err: + /* Unable to initiate FI_CLAIM, report as ENOMSG */ + rxc->hw_claim_in_progress = false; + recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; +} + +/* + * cxip_hw_claim_offset_cb() - Process SEARCH command events to get remote + * offset of entry to be deleted. 
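+ *
+ * Hardware reports one C_RC_OK search event per ULE, in the same order as
+ * the offsets snapshot taken by cxip_get_ule_offsets(). The first event
+ * whose match bits and initiator satisfy the FI_PEEK criteria supplies
+ * req->recv.ule_offset; the final C_RC_NO_MATCH event ends the walk, after
+ * which cxip_claim_ux_onload() issues the SEARCH_AND_DELETE or the peek
+ * completes with ENOMSG.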
+ */ +static int cxip_hw_claim_offset_cb(struct cxip_req *req, + const union c_event *evt) +{ + struct cxip_rxc *rxc = req->recv.rxc; + union cxip_match_bits ux_mb; + uint32_t ux_init; + int ret; + + switch (evt->hdr.event_type) { + case C_EVENT_SEARCH: + if (cxi_event_rc(evt) == C_RC_OK) { + RXC_DBG(rxc, "Claim UX offset search entry, req: %p\n", + req); + + if (req->recv.offset_found) + break; + + req->recv.cur_ule_offsets++; + + /* Not found in range of the offsets we have */ + if (req->recv.cur_ule_offsets > + req->recv.num_ule_offsets) { + RXC_DBG(rxc, "Claim UX offsets exceeded\n"); + break; + } + + /* Check for a match against the FI_PEEK */ + ux_mb.raw = evt->tgt_long.match_bits; + ux_init = evt->tgt_long.initiator.initiator.process; + + if (req->recv.tagged != ux_mb.tagged) + break; + if (ux_mb.tagged + && !tag_match(ux_mb.tag, req->recv.tag, + req->recv.ignore)) + break; + if (!init_match(rxc, ux_init, req->recv.match_id)) + break; + + /* Matched, update to ignore any future events */ + req->recv.offset_found = true; + req->recv.ule_offset = + req->recv.ule_offsets[req->recv.cur_ule_offsets - 1]; + + RXC_DBG(rxc, "Found offset for claim %p, %d : 0x%lX\n", + req, req->recv.cur_ule_offsets - 1, + req->recv.ule_offset); + break; + } + + assert(cxi_event_rc(evt) == C_RC_NO_MATCH); + + RXC_DBG(rxc, "FI_CLAIM remote offset search done, status %d\n", + cxi_event_rc(evt)); + + if (!req->recv.offset_found) { + RXC_DBG(rxc, "Req %p, FI_CLAIM UX not found\n", req); + goto err_not_found; + } + + ret = cxip_claim_ux_onload(req); + if (ret) { + /* Unable to initiate SEARCH and DELETE, this + * should clear. All other errors return ENOMSG. + */ + if (ret == -FI_EAGAIN) + return ret; + + RXC_WARN(rxc, "claim_ux_onload failed %d\n", ret); + goto err_not_found; + } + + RXC_DBG(rxc, "FI_CLAIM req %p remote offset 0x%lX\n", + req, req->recv.ule_offset); + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(evt), + cxi_rc_to_str(cxi_event_rc(evt))); + } + + return FI_SUCCESS; + +err_not_found: + /* Terminate FI_PEEK with FI_CLAIM with ENOMSG */ + rxc->hw_claim_in_progress = false; + free(req->recv.ule_offsets); + req->recv.ule_offsets = NULL; + recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; +} + +/* + * cxip_initiate_hw_claim() - Onload the specified peek, claiming it. + */ +static int cxip_initiate_hw_claim(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->req_ctx; + union c_cmdu cmd = {}; + int ret = FI_SUCCESS; + + if (rxc->state != RXC_ENABLED) { + RXC_DBG(rxc, "FC inprogress, unable to claim req %p\n", req); + goto err; + } + + /* UX entry exists in hardware, the initial search acts as a flush of + * the event queue for priority list appends. Get remote offset for + * the associated unexpected list entry. + */ + req->recv.cur_ule_offsets = 0; + ret = cxip_get_ule_offsets(rxc, &req->recv.ule_offsets, + &req->recv.num_ule_offsets, true); + if (ret) { + RXC_WARN(rxc, "Unable to get FI_CLAIM UX offsets\n"); + goto err; + } + + RXC_DBG(rxc, "ule_offsets %p, num offsets %d\n", + req->recv.ule_offsets, req->recv.num_ule_offsets); + + /* Initiate a search to get the remote offset for the + * unexpected list entry we matched. This requires going + * through the list. 
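+	 *
+	 * Note the offsets were captured above with snapshot=true, so at
+	 * most two passes are made over the ULE list; an unexpected message
+	 * that is not covered by the snapshot cannot be located and the
+	 * FI_CLAIM then completes with ENOMSG.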
+ */ + req->cb = cxip_hw_claim_offset_cb; + + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = -1UL; + cmd.target.match_id = CXI_MATCH_ID_ANY; + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + goto err_free_offsets; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + RXC_DBG(rxc, "Search for remote offsets initiated, req %p\n", req); + + return FI_SUCCESS; + +err_free_offsets: + free(req->recv.ule_offsets); + req->recv.ule_offsets = NULL; +err: + /* Unable to initiate FI_CLAIM, report as ENOMSG */ + rxc->hw_claim_in_progress = false; + recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; +} + +/* + * cxip_ux_peek_cb() - Process UX list SEARCH command events. + */ +static int cxip_ux_peek_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc *rxc = req->req_ctx; + + assert(req->recv.flags & FI_PEEK); + + switch (event->hdr.event_type) { + case C_EVENT_SEARCH: + /* Will receive event for only first match or failure */ + if (cxi_event_rc(event) == C_RC_OK) { + RXC_DBG(rxc, "Peek UX search req: %p matched\n", req); + if (req->recv.flags & FI_CLAIM) { + RXC_DBG(rxc, "req: %p UX must be claimed\n", + req); + return cxip_initiate_hw_claim(req); + } + + /* FI_PEEK only was found */ + recv_req_tgt_event(req, event); + } else { + RXC_DBG(rxc, "Peek UX search req: %p no match\n", req); + } + + recv_req_peek_complete(req, NULL); + break; + + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + return FI_SUCCESS; +} + +/* + * cxip_ux_peek() - Issue a SEARCH command to peek for a matching send + * on the RXC offloaded unexpected message list. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_ux_peek(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->req_ctx; + union c_cmdu cmd = {}; + union cxip_match_bits mb = {}; + union cxip_match_bits ib = {}; + int ret; + + assert(req->recv.flags & FI_PEEK); + + req->cb = cxip_ux_peek_cb; + + mb.tag = req->recv.tag; + mb.tagged = 1; + ib.tx_id = ~0; + ib.cq_data = ~0; + ib.match_comp = ~0; + ib.rdzv_done = ~0; + ib.le_type = ~0; + ib.tag = req->recv.ignore; + + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = ib.raw; + cmd.target.match_bits = mb.raw; + cmd.target.match_id = req->recv.match_id; + /* First match only */ + cmd.target.use_once = 1; + + if (cxip_evtq_saturated(&rxc->rx_evtq)) { + RXC_DBG(rxc, "Target HW EQ saturated\n"); + return -FI_EAGAIN; + } + + RXC_DBG(rxc, "Peek UX search req: %p mb.raw: 0x%" PRIx64 " match_id: 0x%x ignore: 0x%" PRIx64 "\n", + req, mb.raw, req->recv.match_id, req->recv.ignore); + + ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); + + /* If FI_CLAIM, we disable priority list appends so the + * search acts as a flush of outstanding appends. + */ + if (req->flags & FI_CLAIM) + rxc->hw_claim_in_progress = true; + + return FI_SUCCESS; +} + +/* cxip_set_ux_dump_entry() - initialize a CQ entry structure + * and/or source address with UX message info. 
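+ *
+ * Entries beyond the caller-provided maximum only bump the running
+ * ux_count, which lets the caller size a larger array and retry the dump.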
+ */
+static void cxip_set_ux_dump_entry(struct cxip_req *req,
+				   const union c_event *evt)
+{
+	struct cxip_ux_dump_state *ux_dump = req->recv.ux_dump;
+	union cxip_match_bits mb;
+	struct fi_cq_tagged_entry *cq_entry = NULL;
+	fi_addr_t *src_addr = NULL;
+
+	ux_dump->ux_count++;
+
+	/* If the caller-provided space is exceeded, updating the total
+	 * available UX message count is all that is required.
+	 */
+	if (ux_dump->ret_count >= ux_dump->max_count)
+		return;
+
+	if (ux_dump->entry)
+		cq_entry = &ux_dump->entry[ux_dump->ret_count];
+	if (ux_dump->src_addr)
+		src_addr = &ux_dump->src_addr[ux_dump->ret_count];
+
+	if (cq_entry || src_addr) {
+		ux_dump->ret_count++;
+
+		req->recv.tgt_event = false;
+		req->flags = 0;
+		recv_req_tgt_event(req, evt);
+
+		if (cq_entry) {
+			/* Need to add FI_TAGGED or FI_MSG directly */
+			mb.raw = evt->tgt_long.match_bits;
+			if (mb.tagged)
+				req->flags |= FI_TAGGED;
+			else
+				req->flags |= FI_MSG;
+			cq_entry->op_context = NULL;
+			cq_entry->flags = req->flags;
+			cq_entry->len = req->recv.rlen;
+			cq_entry->buf = NULL;
+			cq_entry->data = req->data;
+			cq_entry->tag = req->tag;
+		}
+
+		if (src_addr && req->recv.rxc->attr.caps & FI_SOURCE)
+			*src_addr = recv_req_src_addr(req);
+	}
+}
+
+/*
+ * cxip_unexp_msg_dump_cb() - Process search command dumping H/W UX entries.
+ */
+static int cxip_unexp_msg_dump_cb(struct cxip_req *req,
+				  const union c_event *evt)
+{
+	struct cxip_rxc *rxc = req->recv.rxc;
+
+	if (evt->hdr.event_type != C_EVENT_SEARCH)
+		RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT,
+			  cxi_event_to_str(evt),
+			  cxi_rc_to_str(cxi_event_rc(evt)));
+
+	if (cxi_event_rc(evt) == C_RC_NO_MATCH) {
+		req->recv.ux_dump->done = true;
+		return FI_SUCCESS;
+	}
+	assert(cxi_event_rc(evt) == C_RC_OK);
+
+	cxip_set_ux_dump_entry(req, evt);
+
+	return FI_SUCCESS;
+}
+
+/*
+ * cxip_build_ux_entry_info() - Initialize UX info array from ULE.
+ *
+ * It is expected that a debugger is utilizing this interface and is
+ * expecting synchronous behavior.
+ *
+ * Caller should hold ep_obj->lock.
+ */
+int cxip_build_ux_entry_info(struct cxip_ep *ep,
+			     struct fi_cq_tagged_entry *entry, size_t count,
+			     fi_addr_t *src_addr, size_t *ux_count)
+{
+	struct cxip_rxc *rxc = &ep->ep_obj->rxc;
+	struct cxip_ux_dump_state *ux_dump;
+	struct cxip_ux_send *ux_send;
+	struct dlist_entry *tmp;
+	struct cxip_req *req = NULL;
+	union c_cmdu cmd = {};
+	int ret_count;
+	int ret;
+
+	ret = cxip_recv_req_alloc(rxc, NULL, 0, &req);
+	if (ret)
+		return ret;
+
+	ux_dump = calloc(1, sizeof(struct cxip_ux_dump_state));
+	if (!ux_dump) {
+		RXC_WARN(rxc, "ENOMEM on allocation of UX state buffer\n");
+		cxip_recv_req_free(req);
+		return -FI_ENOMEM;
+	}
+
+	ux_dump->max_count = count;
+	ux_dump->entry = entry;
+	ux_dump->src_addr = src_addr;
+	req->recv.ux_dump = ux_dump;
+
+	/* Get entries from software UX list first */
+	dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send,
+				     ux_send, rxc_entry, tmp)
+		cxip_set_ux_dump_entry(req, &ux_send->put_ev);
+
+	if (!rxc->msg_offload)
+		goto done;
+
+	/* Read H/W UX list processing the request events synchronously
+	 * until we set "Done" in the request callback.
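+	 *
+	 * Progress is made inline: the loop below polls cxip_evtq_progress()
+	 * (yielding between polls) while holding ep_obj->lock until
+	 * cxip_unexp_msg_dump_cb() sees the terminating C_RC_NO_MATCH search
+	 * event and sets ux_dump->done.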
+	 */
+	req->cb = cxip_unexp_msg_dump_cb;
+	cmd.command.opcode = C_CMD_TGT_SEARCH;
+	cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED;
+	cmd.target.ptlte_index = rxc->rx_pte->pte->ptn;
+	cmd.target.buffer_id = req->req_id;
+	cmd.target.length = -1U;
+	cmd.target.ignore_bits = -1UL;
+	cmd.target.match_id = CXI_MATCH_ID_ANY;
+
+	ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd);
+	if (ret) {
+		RXC_WARN(rxc, "Failed to write ULE Search command: %d\n", ret);
+		ret_count = ret;
+		goto done;
+	}
+	cxi_cq_ring(rxc->rx_cmdq->dev_cmdq);
+
+	RXC_DBG(rxc, "Search for ULE dump initiated, req %p\n", req);
+	do {
+		cxip_evtq_progress(&rxc->rx_evtq);
+		sched_yield();
+	} while (!ux_dump->done);
+
+	RXC_DBG(rxc, "Search ULE dump done, req %p, count %ld\n",
+		req, ux_dump->ret_count);
+done:
+	ret_count = ux_dump->ret_count;
+	*ux_count = ux_dump->ux_count;
+
+	free(ux_dump);
+	cxip_recv_req_free(req);
+
+	return ret_count;
+}
+
+/*
+ * cxip_recv_sw_matched() - Progress the SW Receive match.
+ *
+ * Progress the operation which matched in SW.
+ */
+static int cxip_recv_sw_matched(struct cxip_req *req,
+				struct cxip_ux_send *ux_send)
+{
+	int ret;
+	uint64_t mrecv_start;
+	uint32_t mrecv_len;
+	bool req_done = true;
+	uint32_t ev_init;
+	uint32_t ev_rdzv_id;
+	struct cxip_req *rdzv_req;
+	struct cxip_rxc *rxc = req->recv.rxc;
+
+	assert(req->type == CXIP_REQ_RECV);
+
+	mrecv_start = req->recv.start_offset;
+	mrecv_len = mrecv_req_put_bytes(req, ux_send->put_ev.tgt_long.rlength);
+
+	if (req->recv.multi_recv &&
+	    (req->recv.ulen - req->recv.start_offset) >=
+	    req->recv.rxc->min_multi_recv)
+		req_done = false;
+
+	if (ux_send->put_ev.tgt_long.rendezvous) {
+
+		/* Make sure we can issue the RGet; if not we stall
+		 * and TX event queue progress will free up credits.
+		 */
+		if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > rxc->max_tx) {
+			ofi_atomic_dec32(&rxc->orx_tx_reqs);
+			return -FI_EAGAIN;
+		}
+
+		ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev,
+				   mrecv_start, mrecv_len, req_done);
+		if (ret != FI_SUCCESS) {
+			req->recv.start_offset -= mrecv_len;
+			ofi_atomic_dec32(&rxc->orx_tx_reqs);
+
+			return ret;
+		}
+
+		/* If multi-recv, a child request was created from
+		 * cxip_ux_send(). Need to look up this request.
+		 *
+		 * NOTE: Since the same event will be used, the event checks
+		 * must NOT be performed. The event checks are only needed
+		 * when hardware is generating put and put overflow events for
+		 * an mrecv buffer. If we have reached here, we know a put
+		 * overflow event will never occur since the mrecv buffer has
+		 * not been offloaded to hardware.
+		 */
+		if (req->recv.multi_recv) {
+			ret = rdzv_mrecv_req_lookup(req, &ux_send->put_ev,
+						    &ev_init, &ev_rdzv_id,
+						    false, &rdzv_req);
+
+			/* If the previous cxip_ux_send() returned FI_SUCCESS,
+			 * a matching rdzv mrecv req will always exist.
+			 */
+			assert(ret == FI_SUCCESS);
+		} else {
+			rdzv_req = req;
+		}
+
+		/* The Rendezvous event will not happen, so ack the rendezvous
+		 * event now.
+		 */
+		rdzv_recv_req_event(rdzv_req, ux_send->put_ev.hdr.event_type);
+
+		cxip_recv_req_set_rget_info(rdzv_req);
+
+		/* A TX credit has been reserved and the user receive request
+		 * may have been removed from the ordered SW queue. If the
+		 * command queue is backed up the condition will clear and the
+		 * RGet must get sent out, so wait for it.
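+		 *
+		 * The loop below therefore retries issue_rdzv_get() on
+		 * -FI_EAGAIN instead of returning the error to the caller.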
+ */ + do { + ret = issue_rdzv_get(rdzv_req); + } while (ret == -FI_EAGAIN); + assert(ret == FI_SUCCESS); + } else { + if (ux_send->put_ev.tgt_long.rlength) + ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, + mrecv_start, mrecv_len, req_done); + else + ret = cxip_ux_send_zb(req, &ux_send->put_ev, + mrecv_start, req_done); + + if (ret != FI_SUCCESS) { + /* undo mrecv_req_put_bytes() */ + req->recv.start_offset -= mrecv_len; + return ret; + } + } + + /* If this is a multi-receive request and there is still space, return + * a special code to indicate SW should keep matching messages to it. + */ + if (ret == FI_SUCCESS && !req_done) + return -FI_EINPROGRESS; + + return ret; +} + +static bool cxip_match_recv_sw(struct cxip_rxc *rxc, struct cxip_req *req, + struct cxip_ux_send *ux, bool claimed) +{ + union cxip_match_bits ux_mb; + uint32_t ux_init; + + if (claimed != ux->claimed) + return false; + + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + + if (req->recv.tagged != ux_mb.tagged) + return false; + + if (ux_mb.tagged && + !tag_match(ux_mb.tag, req->recv.tag, req->recv.ignore)) + return false; + + if (!init_match(rxc, ux_init, req->recv.match_id)) + return false; + + return true; +} + +static int cxip_recv_sw_matcher(struct cxip_rxc *rxc, struct cxip_req *req, + struct cxip_ux_send *ux, bool claimed) +{ + int ret; + + if (!cxip_match_recv_sw(rxc, req, ux, claimed)) + return -FI_ENOMSG; + + ret = cxip_recv_sw_matched(req, ux); + if (ret == -FI_EAGAIN) + return -FI_EAGAIN; + + /* FI_EINPROGRESS is return for a multi-recv match. */ + assert(ret == FI_SUCCESS || ret == -FI_EINPROGRESS); + + /* TODO: Manage freeing of UX entries better. */ + dlist_remove(&ux->rxc_entry); + if (ux->req && ux->req->type == CXIP_REQ_RBUF) { + cxip_req_buf_ux_free(ux); + rxc->sw_ux_list_len--; + } else { + free(ux); + rxc->sw_ux_list_len--; + } + + RXC_DBG(rxc, + "Software match, req: %p ux_send: %p (sw_ux_list_len: %u)\n", + req, ux, req->recv.rxc->sw_ux_list_len); + + return ret; +} + +/* + * cxip_recv_ux_sw_matcher() - Attempt to match an unexpected message to a user + * posted receive. + * + * User must hold the ep_obj->lock. + */ +int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux) +{ + struct cxip_ptelist_buf *rbuf = ux->req->req_ctx; + struct cxip_rxc *rxc = rbuf->rxc; + struct cxip_req *req; + struct dlist_entry *tmp; + int ret; + + if (dlist_empty(&rxc->sw_recv_queue)) + return -FI_ENOMSG; + + dlist_foreach_container_safe(&rxc->sw_recv_queue, struct cxip_req, req, + recv.rxc_entry, tmp) { + /* Only matches against unclaimed UX messages */ + ret = cxip_recv_sw_matcher(rxc, req, ux, false); + + /* Unexpected message found match but unable to progress */ + if (ret == -FI_EAGAIN) + return ret; + + /* Unexpected message found a match. */ + if (ret == FI_SUCCESS || ret == -FI_EINPROGRESS) + return FI_SUCCESS; + } + + return -FI_ENOMSG; +} + +/* + * cxip_recv_req_sw_matcher() - Attempt to match the receive request in SW. + * + * Loop through all onloaded UX Sends looking for a match for the Receive + * request. If a match is found, progress the operation. + * + * Caller must hold ep_obj->lock. 
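+ *
+ * Returns FI_SUCCESS once the request is fully matched and consumed,
+ * -FI_EAGAIN if a match was found but could not be progressed, and
+ * -FI_ENOMSG if the request was not consumed (no match, or a multi-recv
+ * buffer that matched but still has space and must still be queued or
+ * appended).
+ *
+ * Software matching mirrors the hardware match/ignore semantics (see
+ * tag_match()): bits covered by the ignore mask are excluded from the
+ * comparison. For example, with ignore 0xFF a receive tag of 0x1234
+ * matches a send tag of 0x12FF (only ignored bits differ) but not 0x1334.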
+ */ +int cxip_recv_req_sw_matcher(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + int ret; + + if (dlist_empty(&rxc->sw_ux_list)) + return -FI_ENOMSG; + + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + /* Only match against unclaimed UX messages */ + ret = cxip_recv_sw_matcher(rxc, req, ux_send, false); + switch (ret) { + /* On successful multi-recv or no match, keep matching. */ + case -FI_EINPROGRESS: + case -FI_ENOMSG: + break; + + /* Stop matching. */ + default: + return ret; + } + } + + return -FI_ENOMSG; +} + +/* + * cxip_recv_req_dropped() - Mark the Received request dropped. + * + * If HW does not have sufficient LEs to perform an append, the command is + * dropped. Queue the request for replay. When all outstanding append commands + * complete, replay all Receives. + * + * Caller must hold ep_obj->lock + */ +static int cxip_recv_req_dropped(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + int ret __attribute__((unused)); + + assert(dlist_empty(&req->recv.rxc_entry)); + dlist_insert_tail(&req->recv.rxc_entry, &rxc->replay_queue); + + RXC_DBG(rxc, "Receive dropped: %p\n", req); + + return FI_SUCCESS; +} + +/* + * cxip_recv_req_peek() - Peek for matching unexpected message on RXC. + * + * Examine onloaded UX sends, if not found there and HW offload is enabled, + * initiate check of HW UX list. In either case the operation will not + * consume the UX send, but only report the results of the peek to the CQ. + * + * Caller must hold the ep_obj->lock. + */ +static int cxip_recv_req_peek(struct cxip_req *req, bool check_rxc_state) +{ + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + int ret; + + if (check_rxc_state && rxc->state != RXC_ENABLED && + rxc->state != RXC_ENABLED_SOFTWARE) + return -FI_EAGAIN; + + /* Attempt to match the onloaded UX list first */ + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + if (cxip_match_recv_sw(rxc, req, ux_send, false)) { + if (req->recv.flags & FI_CLAIM) + ux_send->claimed = true; + + recv_req_tgt_event(req, &ux_send->put_ev); + recv_req_peek_complete(req, ux_send); + return FI_SUCCESS; + } + } + + if (rxc->msg_offload) { + /* Must serialize H/W FI_CLAIM due to getting remote offsets */ + if (rxc->hw_claim_in_progress) + return -FI_EAGAIN; + + ret = cxip_ux_peek(req); + } else { + req->recv.rc = C_RC_NO_MATCH; + recv_req_peek_complete(req, NULL); + ret = FI_SUCCESS; + } + + return ret; +} + +/* + * cxip_recv_req_queue() - Queue Receive request on RXC. + * + * Before appending a new Receive request to a HW list, attempt to match the + * Receive to any onloaded UX Sends. + * + * Caller must hold the RXC lock and ensure correct RXC state if required. + */ +static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq) +{ + struct cxip_rxc *rxc = req->recv.rxc; + int ret; + + /* Try to match against onloaded Sends first. 
*/
+	ret = cxip_recv_req_sw_matcher(req);
+	if (ret == FI_SUCCESS)
+		return -FI_EALREADY;
+	else if (ret == -FI_EAGAIN)
+		return -FI_EAGAIN;
+	else if (ret != -FI_ENOMSG)
+		RXC_FATAL(rxc, "SW matching failed: %d\n", ret);
+
+	if (rxc->msg_offload) {
+		/* Cannot append to priority list if claiming UX */
+		if (rxc->hw_claim_in_progress)
+			goto err_dequeue_req;
+
+		ret = _cxip_recv_req(req, restart_seq);
+		if (ret)
+			goto err_dequeue_req;
+	} else {
+		req->recv.software_list = true;
+		dlist_insert_tail(&req->recv.rxc_entry, &rxc->sw_recv_queue);
+	}
+
+	return FI_SUCCESS;
+
+err_dequeue_req:
+	dlist_remove_init(&req->recv.rxc_entry);
+
+	return -FI_EAGAIN;
+}
+
+static int cxip_rxc_check_recv_count_hybrid_preempt(struct cxip_rxc *rxc)
+{
+	int ret;
+	int count;
+
+	if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE &&
+	    cxip_env.hybrid_posted_recv_preemptive == 1) {
+		count = ofi_atomic_get32(&rxc->orx_reqs);
+
+		if (count > rxc->attr.size) {
+			assert(rxc->state == RXC_ENABLED);
+
+			/* On success, need to return -FI_EAGAIN which will
+			 * propagate back to the user. In addition, RXC state
+			 * will have transitioned to RXC_PENDING_PTLTE_DISABLE.
+			 */
+			ret = cxip_recv_pending_ptlte_disable(rxc, false);
+			if (ret == FI_SUCCESS) {
+				RXC_WARN(rxc,
+					 "Transitioning to SW EP due to too many posted recvs: posted_count=%u request_size=%lu\n",
+					 count, rxc->attr.size);
+				return -FI_EAGAIN;
+			}
+
+			RXC_WARN(rxc, "Failed to transition to SW EP: %d\n",
+				 ret);
+			return ret;
+		}
+	}
+
+	return FI_SUCCESS;
+}
+
+/*
+ * _cxip_recv_req() - Submit Receive request to hardware.
+ */
+static ssize_t _cxip_recv_req(struct cxip_req *req, bool restart_seq)
+{
+	struct cxip_rxc *rxc = req->recv.rxc;
+	uint32_t le_flags = 0;
+	union cxip_match_bits mb = {};
+	union cxip_match_bits ib = {
+		.tx_id = ~0,
+		.match_comp = 1,
+		.cq_data = 1,
+		.rdzv_done = 1,
+		.le_type = ~0,
+	};
+	int ret;
+	struct cxip_md *recv_md = req->recv.recv_md;
+	uint64_t recv_iova = 0;
+
+	ret = cxip_rxc_check_recv_count_hybrid_preempt(rxc);
+	if (ret != FI_SUCCESS)
+		return ret;
+
+	if (req->recv.tagged) {
+		mb.tagged = 1;
+		mb.tag = req->recv.tag;
+		ib.tag = req->recv.ignore;
+	}
+
+	/* For poorly written applications, a periodic check of LE pool
+	 * resources can be requested to force transitions to software mode.
+	 * For this to occur, the code must be executing in hybrid mode,
+	 * still matching in hardware, and FI_CXI_HYBRID_RECV_PREEMPTIVE
+	 * explicitly set by the application.
+	 */
+	if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE ||
+	    ++rxc->recv_appends & CXIP_HYBRID_RECV_CHECK_INTERVAL)
+		le_flags = C_LE_EVENT_LINK_DISABLE;
+
+	/* Always set manage_local in Receive LEs. This makes Cassini ignore
+	 * initiator remote_offset in all Puts. With this, remote_offset in Put
+	 * events can be used by the initiator for protocol data. The behavior
+	 * of use_once is not impacted by manage_local.
+	 */
+	le_flags |= C_LE_EVENT_UNLINK_DISABLE | C_LE_MANAGE_LOCAL |
+		    C_LE_UNRESTRICTED_BODY_RO | C_LE_UNRESTRICTED_END_RO |
+		    C_LE_OP_PUT;
+
+	if (!req->recv.multi_recv)
+		le_flags |= C_LE_USE_ONCE;
+	if (restart_seq)
+		le_flags |= C_LE_RESTART_SEQ;
+
+	if (recv_md)
+		recv_iova = CXI_VA_TO_IOVA(recv_md->md,
+					   (uint64_t)req->recv.recv_buf +
+					   req->recv.start_offset);
+
+	req->recv.hw_offloaded = true;
+
+	/* Issue Append command */
+	ret = cxip_pte_append(rxc->rx_pte, recv_iova,
+			      req->recv.ulen - req->recv.start_offset,
+			      recv_md ? recv_md->md->lac : 0,
+			      C_PTL_LIST_PRIORITY, req->req_id,
+			      mb.raw, ib.raw, req->recv.match_id,
+			      req->recv.multi_recv ? 
rxc->min_multi_recv : 0, + le_flags, NULL, rxc->rx_cmdq, + !(req->recv.flags & FI_MORE)); + if (ret != FI_SUCCESS) { + RXC_WARN(rxc, "Failed to write Append command: %d\n", ret); + return ret; + } + + return FI_SUCCESS; +} + +/* + * cxip_recv_common() - Common message receive function. Used for tagged and + * untagged sends of all sizes. + */ +ssize_t cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags, + bool tagged, struct cxip_cntr *comp_cntr) +{ + int ret; + struct cxip_req *req; + struct cxip_addr caddr; + struct cxip_ux_send *ux_msg; + uint32_t match_id; + + if (len && !buf) + return -FI_EINVAL; + + if (rxc->state == RXC_DISABLED) + return -FI_EOPBADSTATE; + + /* HW to SW PtlTE transition, ensure progress is made */ + if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { + cxip_cq_progress(rxc->recv_cq); + return -FI_EAGAIN; + } + + if (tagged) { + if (tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { + RXC_WARN(rxc, + "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", + tag, ignore, CXIP_TAG_MASK); + return -FI_EINVAL; + } + flags &= ~FI_MULTI_RECV; + } + + /* If FI_DIRECTED_RECV and a src_addr is specified, encode the address + * in the LE for matching. If application AVs are symmetric, use + * logical FI address for matching. Otherwise, use physical address. + */ + if (rxc->attr.caps & FI_DIRECTED_RECV && + src_addr != FI_ADDR_UNSPEC) { + if (rxc->ep_obj->av->symmetric) { + /* PID is not used for matching */ + match_id = CXI_MATCH_ID(rxc->pid_bits, C_PID_ANY, + src_addr); + } else { + ret = cxip_av_lookup_addr(rxc->ep_obj->av, src_addr, + &caddr); + if (ret != FI_SUCCESS) { + RXC_WARN(rxc, "Failed to look up FI addr: %d\n", + ret); + return -FI_EINVAL; + } + + match_id = CXI_MATCH_ID(rxc->pid_bits, caddr.pid, + caddr.nic); + } + } else { + match_id = CXI_MATCH_ID_ANY; + } + + ofi_genlock_lock(&rxc->ep_obj->lock); + ret = cxip_recv_req_alloc(rxc, buf, len, &req); + if (ret) + goto err; + + /* req->data_len, req->tag, req->data must be set later. req->buf may + * be overwritten later. + */ + req->context = (uint64_t)context; + + req->flags = FI_RECV | (flags & FI_COMPLETION); + if (tagged) + req->flags |= FI_TAGGED; + else + req->flags |= FI_MSG; + + req->recv.cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; + req->recv.match_id = match_id; + req->recv.tag = tag; + req->recv.ignore = ignore; + req->recv.flags = flags; + req->recv.tagged = tagged; + req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); + + if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { + ret = -FI_EAGAIN; + goto err_free_request; + } + + if (!(req->recv.flags & (FI_PEEK | FI_CLAIM))) { + + ret = cxip_recv_req_queue(req, false); + /* Match made in software? */ + if (ret == -FI_EALREADY) { + ofi_genlock_unlock(&rxc->ep_obj->lock); + + return FI_SUCCESS; + } + + /* RXC busy (onloading Sends or full CQ)? */ + if (ret != FI_SUCCESS) + goto err_free_request; + + ofi_genlock_unlock(&rxc->ep_obj->lock); + + RXC_DBG(rxc, + "req: %p buf: %p len: %lu src_addr: %ld tag(%c):" + " 0x%lx ignore: 0x%lx context: %p\n", + req, buf, len, src_addr, tagged ? 
'*' : '-', tag,
+		ignore, context);
+
+		return FI_SUCCESS;
+	}
+
+	/* FI_PEEK with/without FI_CLAIM */
+	if (req->recv.flags & FI_PEEK) {
+		if (req->recv.flags & FI_CLAIM && !req->context) {
+			RXC_WARN(rxc, "FI_CLAIM requires fi_context\n");
+			ret = -FI_EINVAL;
+			goto err_free_request;
+		}
+		ret = cxip_recv_req_peek(req, true);
+		if (ret == FI_SUCCESS) {
+			ofi_genlock_unlock(&rxc->ep_obj->lock);
+
+			return ret;
+		}
+
+		goto err_free_request;
+	}
+
+	/* FI_CLAIM without FI_PEEK */
+	ux_msg = ((struct fi_context *)req->context)->internal[0];
+	if (!ux_msg->claimed) {
+		RXC_WARN(rxc, "Bad fi_context specified with FI_CLAIM\n");
+		ret = -FI_EINVAL;
+		goto err_free_request;
+	}
+
+	RXC_DBG(rxc, "FI_CLAIM invoke sw matcher %p\n", ux_msg);
+	ret = cxip_recv_sw_matcher(rxc, req, ux_msg, true);
+	if (ret == FI_SUCCESS || ret == -FI_EINPROGRESS) {
+		ofi_genlock_unlock(&rxc->ep_obj->lock);
+
+		return FI_SUCCESS;
+	}
+
+err_free_request:
+	cxip_recv_req_free(req);
+err:
+	ofi_genlock_unlock(&rxc->ep_obj->lock);
+
+	return ret;
+}
+
+/*
+ * _txc_fi_addr() - Return the FI address of the TXC.
+ */
+static fi_addr_t _txc_fi_addr(struct cxip_txc *txc)
+{
+	if (txc->ep_obj->fi_addr == FI_ADDR_NOTAVAIL) {
+		txc->ep_obj->fi_addr =
+				cxip_av_lookup_fi_addr(txc->ep_obj->av,
+						       &txc->ep_obj->src_addr);
+		TXC_DBG(txc, "Found EP FI Addr: %lu\n", txc->ep_obj->fi_addr);
+	}
+
+	return txc->ep_obj->fi_addr;
+}
+
+/*
+ * cxip_msg_match_id() - Return the TXC's initiator address used to transmit a
+ * message.
+ *
+ * By default, the physical address of the TXC is returned. This address is
+ * sent along with message data and is used for source address matching at the
+ * target. When the target receives a message, the physical ID is translated to
+ * a logical FI address. Translation adds overhead to the receive path.
+ *
+ * As an optimization, if rendezvous offload is not being used and the process
+ * is part of a job with symmetric AVs, a logical FI address is returned. This
+ * way, there is no source address translation overhead involved in the
+ * receive.
+ */
+static uint32_t cxip_msg_match_id(struct cxip_txc *txc)
+{
+	/* PID is not used for logical matching, but is used for rendezvous. */
+	if (txc->ep_obj->av->symmetric)
+		return CXI_MATCH_ID(txc->pid_bits, txc->ep_obj->src_addr.pid,
+				    _txc_fi_addr(txc));
+
+	return CXI_MATCH_ID(txc->pid_bits, txc->ep_obj->src_addr.pid,
+			    txc->ep_obj->src_addr.nic);
+}
+
+/*
+ * report_send_completion() - Report the completion of a send operation.
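+ *
+ * A successful Send generates a CQ entry only if FI_COMPLETION was
+ * requested; failures are always reported through cxip_cq_req_error() with
+ * the Cassini return code translated by proverr2errno(). When sw_cntr is
+ * set, the send counter is updated here in software; this is used when the
+ * hardware counter update was not armed (e.g. the FI_MATCH_COMPLETE and
+ * rendezvous paths).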
+ */ +static void report_send_completion(struct cxip_req *req, bool sw_cntr) +{ + int ret; + int ret_err; + int success_event = (req->flags & FI_COMPLETION); + struct cxip_txc *txc = req->send.txc; + + req->flags &= (FI_MSG | FI_TAGGED | FI_SEND); + + if (req->send.rc == C_RC_OK) { + TXC_DBG(txc, "Request success: %p\n", req); + + if (success_event) { + ret = cxip_cq_req_complete(req); + if (ret != FI_SUCCESS) + TXC_WARN(txc, + "Failed to report completion: %d\n", + ret); + } + + if (sw_cntr && req->send.cntr) { + ret = cxip_cntr_mod(req->send.cntr, 1, false, false); + if (ret) + TXC_WARN(txc, "cxip_cntr_mod returned: %d\n", + ret); + } + } else { + ret_err = proverr2errno(req->send.rc); + TXC_WARN(txc, "Request dest_addr: %ld caddr.nic: %#X caddr.pid: %u error: %p (err: %d, %s)\n", + req->send.dest_addr, req->send.caddr.nic, + req->send.caddr.pid, req, ret_err, + cxi_rc_to_str(req->send.rc)); + + ret = cxip_cq_req_error(req, 0, ret_err, + req->send.rc, NULL, 0, + FI_ADDR_UNSPEC); + if (ret != FI_SUCCESS) + TXC_WARN(txc, "Failed to report error: %d\n", ret); + + if (sw_cntr && req->send.cntr) { + ret = cxip_cntr_mod(req->send.cntr, 1, false, true); + if (ret) + TXC_WARN(txc, "cxip_cntr_mod returned: %d\n", + ret); + } + } +} + +/* + * rdzv_send_req_complete() - Complete long send request. + */ +static void rdzv_send_req_complete(struct cxip_req *req) +{ + cxip_rdzv_id_free(req->send.txc, req->send.rdzv_id); + + cxip_send_buf_fini(req); + + report_send_completion(req, true); + + ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_evtq_req_free(req); +} + +/* + * rdzv_send_req_event() - Count a rendezvous send event. + * + * Call for each initiator event. The events could be generated in any order. + * Once all expected events are received, complete the request. + * + * A successful rendezvous Send generates two events: Ack and Get. + */ +static void rdzv_send_req_event(struct cxip_req *req) +{ + if (++req->send.rdzv_send_events == 2) + rdzv_send_req_complete(req); +} + +/* + * cxip_send_rdzv_put_cb() - Long send callback. + * + * Progress a long send operation to completion. + */ +static int cxip_send_rdzv_put_cb(struct cxip_req *req, + const union c_event *event) +{ + int event_rc; + int ret; + struct cxip_txc *txc = req->send.txc; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + /* The source Put completed. */ + event_rc = cxi_init_event_rc(event); + + TXC_DBG(txc, "Acked: %p (rc: %s list: %s)\n", req, + cxi_rc_to_str(event_rc), + cxi_ptl_list_to_str(event->init_short.ptl_list)); + + /* If the message was dropped, mark the peer as disabled. Do + * not generate a completion. Free associated resources. Do not + * free the request (it will be used to replay the Send). + */ + if (event_rc == C_RC_PT_DISABLED) { + ret = cxip_send_req_dropped(req->send.txc, req); + if (ret == FI_SUCCESS) + cxip_rdzv_id_free(req->send.txc, + req->send.rdzv_id); + else + ret = -FI_EAGAIN; + + return ret; + } + + /* Message was accepted by the peer. Match order is preserved. + * The request can be dequeued from the SW message queue. This + * allows flow-control recovery to be performed before + * outstanding long Send operations have completed. + */ + ret = cxip_send_req_dequeue(req->send.txc, req); + if (ret != FI_SUCCESS) + return ret; + + /* The transaction is complete if the put failed */ + if (event_rc != C_RC_OK) { + req->send.rc = event_rc; + rdzv_send_req_complete(req); + } else { + /* Count the event, another may be expected. 
*/ + rdzv_send_req_event(req); + } + return FI_SUCCESS; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + case C_EVENT_SEND: + { + struct cxi_md *md = req->send.send_md->md; + + TXC_WARN(txc, "Unexpected %s event: rc:%s buf:%p len:0x%lx iova:0x%llx md.va:0x%llx lac:%d\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event)), req->send.buf, + req->send.len, CXI_VA_TO_IOVA(md, req->send.buf), + md->iova, md->lac); + } + return FI_SUCCESS; + + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_rdzv_pte_src_cb() - Process rendezvous source buffer events. + * + * A Get event is generated for each rendezvous Send indicating Send completion. + */ +int cxip_rdzv_pte_src_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rdzv_pte *rdzv_pte = req->req_ctx; + struct cxip_txc *txc = rdzv_pte->txc; + struct cxip_req *get_req; + union cxip_match_bits mb; + int event_rc = cxi_event_rc(event); + int rdzv_id; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + if (event_rc == C_RC_OK) + ofi_atomic_inc32(&rdzv_pte->le_linked_success_count); + else + ofi_atomic_inc32(&rdzv_pte->le_linked_failure_count); + return FI_SUCCESS; + + case C_EVENT_GET: + mb.raw = event->tgt_long.match_bits; + rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + mb.rdzv_id_lo; + get_req = cxip_rdzv_id_lookup(txc, rdzv_id); + if (!get_req) { + TXC_WARN(txc, "Failed to find RDZV ID: %d\n", + mb.rdzv_id_lo); + return FI_SUCCESS; + } + + if (event_rc != C_RC_OK) + TXC_WARN(txc, "Get error: %p rc: %s\n", get_req, + cxi_rc_to_str(event_rc)); + else + TXC_DBG(txc, "Get received: %p rc: %s\n", get_req, + cxi_rc_to_str(event_rc)); + + get_req->send.rc = event_rc; + + /* Count the event, another may be expected. */ + rdzv_send_req_event(get_req); + + return FI_SUCCESS; + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +static inline int cxip_send_prep_cmdq(struct cxip_cmdq *cmdq, + struct cxip_req *req, + uint32_t tclass) +{ + struct cxip_txc *txc = req->send.txc; + int ret; + uint16_t vni; + + if (!req->triggered) { + if (txc->ep_obj->av_auth_key) + vni = req->send.caddr.vni; + else + vni = txc->ep_obj->auth_key.vni; + + ret = cxip_txq_cp_set(cmdq, vni, + cxip_ofi_to_cxi_tc(txc->tclass), + CXI_TC_TYPE_DEFAULT); + if (ret != FI_SUCCESS) + return ret; + } + + if (req->send.flags & FI_FENCE) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + TXC_DBG(txc, "Failed to issue CQ_FENCE command: %d\n", + ret); + return -FI_EAGAIN; + } + } + + return FI_SUCCESS; +} + +/* + * _cxip_send_rdzv_put() - Initiate a send rendezvous put operation. + * + * The rendezvous protocol works as follows: + * + * 1. The Initiator performs a Rendezvous Put command which includes a portion + * of the source buffer data. + * 2. Once the Put is matched to a user receive buffer (in the Priority list), + * a Get of the remaining source data is performed. + */ +static ssize_t _cxip_send_rdzv_put(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + union c_fab_addr dfa; + uint8_t idx_ext; + struct c_full_dma_cmd cmd = {}; + union cxip_match_bits put_mb = {}; + int rdzv_id; + int lac = req->send.send_md->md->lac; + int ret; + struct cxip_cmdq *cmdq = + req->triggered ? txc->domain->trig_cmdq : txc->tx_cmdq; + + /* Zero length rendezvous not supported. 
*/ + assert(req->send.send_md); + assert(req->send.len); + + /* Allocate rendezvous ID */ + rdzv_id = cxip_rdzv_id_alloc(txc, req); + if (rdzv_id < 0) + return -FI_EAGAIN; + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, + CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); + + /* Allocate a source request for the given LAC. This makes the source + * memory accessible for rendezvous. + */ + ret = cxip_rdzv_pte_src_req_alloc(txc->rdzv_pte, lac); + if (ret) { + TXC_WARN(txc, "Failed to prepare source window: %d\n", ret); + goto err_free_rdzv_id; + } + + + /* Allocate restricted source window. If resources can not be allocated + * discontinue use of the restricted protocol, falling back + * to unrestricted. TODO: keep track and only switch for LAC that + * failed. + */ + if (txc->rdzv_proto == CXIP_RDZV_PROTO_ALT_READ && + !txc->rdzv_nomatch_pte[lac]) { + TXC_DBG(txc, "allocate restricted PTE lac %d\n", lac); + + ret = cxip_rdzv_nomatch_pte_alloc(txc, lac, + &txc->rdzv_nomatch_pte[lac]); + if (ret) { + TXC_WARN(txc, WARN_RESTRICTED_DISABLED, + cxip_rdzv_proto_to_str(txc->rdzv_proto), + cxip_rdzv_proto_to_str(CXIP_RDZV_PROTO_DEFAULT)); + txc->rdzv_proto = CXIP_RDZV_PROTO_DEFAULT; + } + } + + /* Build match bits */ + if (req->send.tagged) { + put_mb.tagged = 1; + put_mb.tag = req->send.tag; + } + + if (req->send.flags & FI_REMOTE_CQ_DATA) + put_mb.cq_data = 1; + + put_mb.rdzv_proto = txc->rdzv_proto; + + req->send.rdzv_id = rdzv_id; + req->cb = cxip_send_rdzv_put_cb; + req->send.rdzv_send_events = 0; + + /* Build Put command descriptor */ + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.index_ext = idx_ext; + cmd.lac = req->send.send_md->md->lac; + cmd.event_send_disable = 1; + cmd.restricted = 0; + cmd.dfa = dfa; + cmd.local_addr = CXI_VA_TO_IOVA(req->send.send_md->md, req->send.buf); + cmd.request_len = req->send.len; + cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = cxip_msg_match_id(txc); + cmd.header_data = req->send.data; + cmd.remote_offset = + CXI_VA_TO_IOVA(req->send.send_md->md, req->send.buf); + cmd.command.opcode = C_CMD_RENDEZVOUS_PUT; + cmd.eager_length = txc->rdzv_eager_size; + cmd.use_offset_for_get = 1; + + put_mb.rdzv_id_hi = rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + put_mb.rdzv_lac = req->send.send_md->md->lac; + put_mb.le_type = CXIP_LE_TYPE_RX; + cmd.match_bits = put_mb.raw; + cmd.rendezvous_id = rdzv_id; + + if (req->triggered) { + const struct c_ct_cmd ct_cmd = { + .trig_ct = req->trig_cntr->ct->ctn, + .threshold = req->trig_thresh, + }; + + /* Triggered command queue is domain resource, lock. */ + ofi_genlock_lock(&txc->domain->trig_cmdq_lock); + + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) { + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + goto err_free_rdzv_id; + } + + /* Clear the triggered flag to prevent retrying of operation, + * due to flow control, from using the triggered path. 
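+		 *
+		 * If the peer drops the Put (C_RC_PT_DISABLED seen in
+		 * cxip_send_rdzv_put_cb()), the same request is replayed;
+		 * with req->triggered cleared the replay no longer goes
+		 * through the triggered command path.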
+ */ + req->triggered = false; + + ret = cxi_cq_emit_trig_full_dma(cmdq->dev_cmdq, &ct_cmd, + &cmd); + if (ret) { + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + goto err_enqueue; + } + + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + } else { + + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) + goto err_free_rdzv_id; + + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &cmd); + if (ret) + goto err_enqueue; + + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + } + + return FI_SUCCESS; + +err_enqueue: + TXC_DBG(txc, "Failed to enqueue Put: %d, return -FI_EAGAIN\n", ret); +err_free_rdzv_id: + cxip_rdzv_id_free(txc, rdzv_id); + + return -FI_EAGAIN; +} + +/* + * cxip_send_eager_cb() - Eager send callback. Used for both tagged and + * untagged messages. + */ +static int cxip_send_eager_cb(struct cxip_req *req, + const union c_event *event) +{ + int match_complete = req->flags & FI_MATCH_COMPLETE; + int ret; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + if (event->hdr.event_type == C_EVENT_SEND) { + TXC_WARN(req->send.txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + assert(event->hdr.event_type == C_EVENT_ACK); + + req->send.rc = cxi_init_event_rc(event); + + /* If the message was dropped, mark the peer as disabled. Do not + * generate a completion. Free associated resources. Do not free the + * request (it will be used to replay the Send). + */ + if (req->send.rc == C_RC_PT_DISABLED) { + + ret = cxip_send_req_dropped(req->send.txc, req); + if (ret != FI_SUCCESS) + return -FI_EAGAIN; + + if (match_complete) + cxip_tx_id_free(req->send.txc, req->send.tx_id); + + return FI_SUCCESS; + } + + ret = cxip_send_req_dequeue(req->send.txc, req); + if (ret != FI_SUCCESS) + return ret; + + cxip_send_buf_fini(req); + + /* If MATCH_COMPLETE was requested and the the Put did not match a user + * buffer, do not generate a completion event until the target notifies + * the initiator that the match is complete. + */ + if (match_complete) { + if (req->send.rc == C_RC_OK && + event->init_short.ptl_list != C_PTL_LIST_PRIORITY) { + TXC_DBG(req->send.txc, + "Waiting for match complete: %p\n", req); + return FI_SUCCESS; + } + + TXC_DBG(req->send.txc, "Match complete with Ack: %p\n", req); + cxip_tx_id_free(req->send.txc, req->send.tx_id); + } + + /* If MATCH_COMPLETE was requested, software must manage counters. 
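+ *
+ * Hardware ct_ack counting is disabled whenever match_comp is set (see
+ * cxip_set_eager_mb() and the command setup), so the send counter is
+ * updated in software by report_send_completion() below once the final
+ * completion is known.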
*/ + report_send_completion(req, match_complete); + + ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_evtq_req_free(req); + + return FI_SUCCESS; +} + +static inline int cxip_set_eager_mb(struct cxip_req *req, + union cxip_match_bits *mb) +{ + int tx_id; + + mb->raw = 0; + mb->le_type = CXIP_LE_TYPE_RX; + mb->tagged = req->send.tagged; + mb->tag = req->send.tag; + mb->cq_data = !!(req->send.flags & FI_REMOTE_CQ_DATA); + + /* Allocate a TX ID if match completion guarantees are required */ + if (req->send.flags & FI_MATCH_COMPLETE) { + + tx_id = cxip_tx_id_alloc(req->send.txc, req); + if (tx_id < 0) { + TXC_DBG(req->send.txc, + "Failed to allocate TX ID: %d\n", tx_id); + return -FI_EAGAIN; + } + + req->send.tx_id = tx_id; + mb->match_comp = 1; + mb->tx_id = tx_id; + } + + return FI_SUCCESS; +} + +/* + * _cxip_send_eager_idc() - Enqueue eager IDC message + */ +static ssize_t _cxip_send_eager_idc(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + union c_fab_addr dfa; + uint8_t idx_ext; + union cxip_match_bits mb; + ssize_t ret; + struct cxip_cmdq *cmdq = txc->tx_cmdq; + const void *buf; + struct c_cstate_cmd cstate_cmd = {}; + struct c_idc_msg_hdr idc_cmd; + + assert(req->send.len > 0); + +#if ENABLE_DEBUG + if (req->send.flags & FI_INJECT) + assert(req->send.ibuf); + + /* ibuf and send_md are mutually exclusive. */ + if (req->send.ibuf) { + assert(req->send.send_md == NULL); + } else if (req->send.send_md) { + assert(req->send.ibuf == NULL); + + /* All non FI_HMEM_SYSTEM buffers require an ibuf. */ + assert(req->send.send_md->info.iface == FI_HMEM_SYSTEM); + } +#endif + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, + CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); + + /* Favor bounce buffer if allocated. */ + if (req->send.ibuf) + buf = req->send.ibuf; + else + buf = req->send.buf; + + ret = cxip_set_eager_mb(req, &mb); + if (ret) + goto err; + + req->cb = cxip_send_eager_cb; + + /* Build commands before taking lock */ + cstate_cmd.event_send_disable = 1; + cstate_cmd.index_ext = idx_ext; + cstate_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cstate_cmd.initiator = cxip_msg_match_id(txc); + + /* If MATCH_COMPLETE was requested, software must manage + * counters. + */ + if (req->send.cntr && !mb.match_comp) { + cstate_cmd.event_ct_ack = 1; + cstate_cmd.ct = req->send.cntr->ct->ctn; + } + + /* Note: IDC command completely filled in */ + idc_cmd.unused_0 = 0; + idc_cmd.dfa = dfa; + idc_cmd.match_bits = mb.raw; + idc_cmd.header_data = req->send.data; + idc_cmd.user_ptr = (uint64_t)req; + + /* Submit command */ + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) + goto err_cleanup; + + ret = cxip_cmdq_emit_c_state(cmdq, &cstate_cmd); + if (ret) { + TXC_DBG(txc, "Failed to issue C_STATE command: %ld\n", ret); + goto err_cleanup; + } + + ret = cxi_cq_emit_idc_msg(cmdq->dev_cmdq, &idc_cmd, buf, req->send.len); + if (ret) { + TXC_DBG(txc, "Failed to write IDC: %ld\n", ret); + + /* Return error according to Domain Resource Management */ + ret = -FI_EAGAIN; + goto err_cleanup; + } + + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + + return FI_SUCCESS; + +err_cleanup: + if (mb.match_comp) + cxip_tx_id_free(txc, req->send.tx_id); +err: + return ret; +} + +/* + * _cxip_send_eager() - Enqueue eager send command. 
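+ *
+ * Used for payloads up to txc->max_eager_size that cannot take the IDC
+ * path (for example triggered operations or zero-byte sends); a full DMA
+ * C_CMD_PUT is issued instead of an IDC message.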
+ */ +static ssize_t _cxip_send_eager(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + union c_fab_addr dfa; + uint8_t idx_ext; + union cxip_match_bits mb; + ssize_t ret; + struct cxip_cmdq *cmdq = + req->triggered ? txc->domain->trig_cmdq : txc->tx_cmdq; + bool trig = req->triggered; + struct c_full_dma_cmd cmd = {}; + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, + CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); + + ret = cxip_set_eager_mb(req, &mb); + if (ret) + goto err; + + req->cb = cxip_send_eager_cb; + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_PUT; + cmd.index_ext = idx_ext; + cmd.event_send_disable = 1; + cmd.dfa = dfa; + cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = cxip_msg_match_id(txc); + cmd.match_bits = mb.raw; + cmd.header_data = req->send.data; + + /* Triggered ops could result in 0 length DMA */ + if (req->send.send_md) { + cmd.lac = req->send.send_md->md->lac; + cmd.local_addr = CXI_VA_TO_IOVA(req->send.send_md->md, + req->send.buf); + cmd.request_len = req->send.len; + } + + /* If MATCH_COMPLETE was requested, software must manage + * counters. + */ + if (req->send.cntr && !mb.match_comp) { + cmd.event_ct_ack = 1; + cmd.ct = req->send.cntr->ct->ctn; + } + + /* Issue Eager Put command */ + if (trig) { + const struct c_ct_cmd ct_cmd = { + .trig_ct = req->trig_cntr->ct->ctn, + .threshold = req->trig_thresh, + }; + + /* Triggered command queue is domain resource, lock. */ + ofi_genlock_lock(&txc->domain->trig_cmdq_lock); + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) { + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + goto err; + } + + /* Clear the triggered flag to prevent retrying of + * operation, due to flow control, from using the + * triggered path. + */ + req->triggered = false; + + ret = cxi_cq_emit_trig_full_dma(cmdq->dev_cmdq, &ct_cmd, + &cmd); + if (ret) { + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + goto err_enqueue; + } + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + + } else { + ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); + if (ret) + goto err; + + ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &cmd); + if (ret) + goto err_enqueue; + + cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), + ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + } + + return FI_SUCCESS; + +err_enqueue: + TXC_DBG(txc, "Failed to write DMA command: %ld\n", ret); + ret = -FI_EAGAIN; + + if (mb.match_comp) + cxip_tx_id_free(txc, req->send.tx_id); +err: + return ret; +} + +static bool cxip_send_eager_idc(struct cxip_req *req) +{ + return (req->send.len <= CXIP_INJECT_SIZE) && + !cxip_env.disable_non_inject_msg_idc; +} + +static ssize_t _cxip_send_req(struct cxip_req *req) +{ + /* Force all zero-byte operations to use the eager path. This utilizes + * a smaller command format. + */ + if (req->send.len == 0) + return _cxip_send_eager(req); + + /* IDC commands are not supported with triggered operations. */ + if (!req->triggered && + ((req->send.flags & FI_INJECT) || cxip_send_eager_idc(req))) + return _cxip_send_eager_idc(req); + + if (req->send.len <= req->send.txc->max_eager_size) + return _cxip_send_eager(req); + + return _cxip_send_rdzv_put(req); +} + +/* + * cxip_fc_peer_lookup() - Check if a peer is disabled. + * + * Look up disabled peer state and return it, if available. + * + * Caller must hold ep_obj->lock. 
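+ *
+ * Peers are keyed by destination CXI address (NIC address plus PID) and
+ * kept on the txc->fc_peers list; the list only holds peers that are
+ * currently flow controlled, so the linear scan is expected to stay short.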
+ */ +static struct cxip_fc_peer *cxip_fc_peer_lookup(struct cxip_txc *txc, + struct cxip_addr caddr) +{ + struct cxip_fc_peer *peer; + + dlist_foreach_container(&txc->fc_peers, struct cxip_fc_peer, + peer, txc_entry) { + if (CXIP_ADDR_EQUAL(peer->caddr, caddr)) + return peer; + } + + return NULL; +} + +/* + * cxip_fc_peer_put() - Account for completion of an outstanding Send targeting + * a disabled peer. + * + * Drop a reference to a disabled peer. When the last reference is dropped, + * attempt flow-control recovery. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_fc_peer_put(struct cxip_fc_peer *peer) +{ + int ret; + + assert(peer->pending > 0); + + /* Account for the completed Send */ + if (!--peer->pending) { + peer->req.send.mb.drops = peer->dropped; + + ret = cxip_ctrl_msg_send(&peer->req); + if (ret != FI_SUCCESS) { + peer->pending++; + return ret; + } + + peer->pending_acks++; + + TXC_DBG(peer->txc, + "Notified disabled peer NIC: %#x PID: %u dropped: %u\n", + peer->caddr.nic, peer->caddr.pid, peer->dropped); + } + + return FI_SUCCESS; +} + +/* + * cxip_fc_peer_fini() - Remove disabled peer state. + * + * Caller must hold ep_obj->lock. + */ +static void cxip_fc_peer_fini(struct cxip_fc_peer *peer) +{ + assert(dlist_empty(&peer->msg_queue)); + dlist_remove(&peer->txc_entry); + free(peer); +} + +/* + * cxip_fc_notify_cb() - Process FC notify completion events. + */ +int cxip_fc_notify_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_fc_peer *peer = container_of(req, struct cxip_fc_peer, req); + struct cxip_txc *txc = peer->txc; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + switch (cxi_event_rc(event)) { + case C_RC_OK: + TXC_DBG(txc, + "FC_NOTIFY to %#x:%u successfully sent: retry_count=%u\n", + peer->caddr.nic, peer->caddr.pid, + peer->retry_count); + + /* Peer flow control structure can only be freed if + * replay is complete and all acks accounted for. + */ + peer->pending_acks--; + if (!peer->pending_acks && peer->replayed) + cxip_fc_peer_fini(peer); + + return FI_SUCCESS; + + /* This error occurs when the target's control event queue has + * run out of space. Since the target should be processing the + * event queue, it is safe to replay messages until C_RC_OK is + * returned. + */ + case C_RC_ENTRY_NOT_FOUND: + peer->retry_count++; + TXC_WARN(txc, + "%#x:%u dropped FC message: retry_delay_usecs=%d retry_count=%u\n", + peer->caddr.nic, peer->caddr.pid, + cxip_env.fc_retry_usec_delay, + peer->retry_count); + usleep(cxip_env.fc_retry_usec_delay); + return cxip_ctrl_msg_send(req); + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT_STS, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_fc_peer_init() - Mark a peer as disabled. + * + * Called by sending EP after experiencing first dropped Send to a peer. + * + * Allocate state to track the disabled peer. Locate all outstanding Sends + * targeting the peer. + * + * Caller must hold ep_obj->lock. 
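+ *
+ * Recovery sequence: once every outstanding Send to the peer has
+ * completed, cxip_fc_peer_put() sends an FC_NOTIFY control message
+ * carrying the drop count; when the peer side has recovered,
+ * cxip_fc_resume() replays the queued Sends in order and finally frees
+ * this state.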
+ */ +static int cxip_fc_peer_init(struct cxip_txc *txc, struct cxip_addr caddr, + struct cxip_fc_peer **peer) +{ + struct cxip_fc_peer *p; + struct cxip_req *req; + struct dlist_entry *tmp; + + p = calloc(1, sizeof(*p)); + if (!p) { + TXC_WARN(txc, "Failed to allocate FC Peer\n"); + return -FI_ENOMEM; + } + + p->caddr = caddr; + p->txc = txc; + dlist_init(&p->msg_queue); + dlist_insert_tail(&p->txc_entry, &txc->fc_peers); + + p->req.send.nic_addr = caddr.nic; + p->req.send.pid = caddr.pid; + /* TODO: remove */ + p->req.send.mb.txc_id = 0; + p->req.send.mb.rxc_id = 0; + + p->req.send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; + p->req.send.mb.ctrl_msg_type = CXIP_CTRL_MSG_FC_NOTIFY; + p->req.cb = cxip_fc_notify_cb; + p->req.ep_obj = txc->ep_obj; + + /* Queue all Sends to the FC'ed peer */ + dlist_foreach_container_safe(&txc->msg_queue, struct cxip_req, + req, send.txc_entry, tmp) { + if (CXIP_ADDR_EQUAL(req->send.caddr, caddr)) { + dlist_remove(&req->send.txc_entry); + dlist_insert_tail(&req->send.txc_entry, &p->msg_queue); + p->pending++; + req->send.fc_peer = p; + } + } + + *peer = p; + + return FI_SUCCESS; +} + +/* + * cxip_fc_resume() - Replay dropped Sends. + * + * Called by sending EP after being notified disabled peer was re-enabled. + * + * Replay all dropped Sends in order. + */ +int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid) +{ + struct cxip_txc *txc = &ep_obj->txc; + struct cxip_fc_peer *peer; + struct cxip_addr caddr = { + .nic = nic_addr, + .pid = pid, + }; + struct cxip_req *req; + struct dlist_entry *tmp; + int ret __attribute__((unused)); + + peer = cxip_fc_peer_lookup(txc, caddr); + if (!peer) + TXC_FATAL(txc, "Fatal, FC peer not found: NIC: %#x PID: %d\n", + nic_addr, pid); + + TXC_DBG(txc, "Replaying dropped sends, NIC: %#x PID: %d\n", + nic_addr, pid); + + dlist_foreach_container_safe(&peer->msg_queue, struct cxip_req, + req, send.txc_entry, tmp) { + /* -FI_EAGAIN can be return if the command queue is full. Loop + * until this goes through. + */ + do { + ret = _cxip_send_req(req); + } while (ret == -FI_EAGAIN); + assert(ret == FI_SUCCESS); + + /* Move request back to the message queue. */ + dlist_remove(&req->send.txc_entry); + req->send.fc_peer = NULL; + dlist_insert_tail(&req->send.txc_entry, &txc->msg_queue); + + TXC_DBG(txc, "Replayed %p\n", req); + } + + /* Peer flow control structure can only be freed if replay is complete + * and all acks accounted for. + */ + if (!peer->pending_acks) + cxip_fc_peer_fini(peer); + else + peer->replayed = true; + + return FI_SUCCESS; +} + +/* + * cxip_send_req_dropped() - Mark the Send request dropped. + * + * Mark the Send request dropped. Mark the target peer as disabled. Track all + * outstanding Sends targeting the disabled peer. When all outstanding Sends + * are completed, recovery will be performed. + */ +static int cxip_send_req_dropped(struct cxip_txc *txc, struct cxip_req *req) +{ + struct cxip_fc_peer *peer; + int ret; + + /* Check if peer is already disabled */ + peer = cxip_fc_peer_lookup(txc, req->send.caddr); + if (!peer) { + ret = cxip_fc_peer_init(txc, req->send.caddr, &peer); + if (ret != FI_SUCCESS) + return ret; + + TXC_DBG(txc, + "Disabled peer detected, NIC: %#x PID: %u pending: %u\n", + peer->caddr.nic, peer->caddr.pid, peer->pending); + } + + /* Account for the dropped message. 
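+ *
+ * The dropped count accumulated here is advertised to the peer in the
+ * FC_NOTIFY control message (peer->req.send.mb.drops) once all pending
+ * Sends targeting the peer have completed.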
*/ + peer->dropped++; + ret = cxip_fc_peer_put(peer); + if (ret) + peer->dropped--; + else + TXC_DBG(txc, + "Send dropped, req: %p NIC: %#x PID: %u pending: %u dropped: %u\n", + req, peer->caddr.nic, peer->caddr.pid, peer->pending, + peer->dropped); + + return ret; +} + +/* + * cxip_send_req_queue() - Queue Send request on TXC. + * + * Place the Send request in an ordered SW queue. Return error if the target + * peer is disabled. + */ +static int cxip_send_req_queue(struct cxip_txc *txc, struct cxip_req *req) +{ + struct cxip_fc_peer *peer; + + if (!dlist_empty(&txc->fc_peers)) { + peer = cxip_fc_peer_lookup(txc, req->send.caddr); + if (peer) { + /* Peer is disabled. Progress control EQs so future + * cxip_send_req_queue() may succeed. + */ + cxip_ep_ctrl_progress_locked(txc->ep_obj); + + return -FI_EAGAIN; + } + } + + dlist_insert_tail(&req->send.txc_entry, &txc->msg_queue); + + return FI_SUCCESS; +} + +/* + * cxip_send_req_dequeue() - Dequeue Send request from TXC. + * + * Remove the Send requst from the ordered message queue. Update peer + * flow-control state, if necessary. + */ +static int cxip_send_req_dequeue(struct cxip_txc *txc, struct cxip_req *req) +{ + int ret; + + if (req->send.fc_peer) { + /* The peer was disabled after this message arrived. */ + TXC_DBG(txc, + "Send not dropped, req: %p NIC: %#x PID: %u pending: %u dropped: %u\n", + req, req->send.fc_peer->caddr.nic, + req->send.fc_peer->caddr.pid, + req->send.fc_peer->pending, req->send.fc_peer->dropped); + + ret = cxip_fc_peer_put(req->send.fc_peer); + if (ret != FI_SUCCESS) + return ret; + + req->send.fc_peer = NULL; + } + + dlist_remove(&req->send.txc_entry); + + return FI_SUCCESS; +} + +static void cxip_send_buf_fini(struct cxip_req *req) +{ + if (req->send.send_md) + cxip_unmap(req->send.send_md); + if (req->send.ibuf) + cxip_txc_ibuf_free(req->send.txc, req->send.ibuf); +} + +static int cxip_send_buf_init(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + int ret; + + /* Nothing to do for zero byte sends. */ + if (!req->send.len) + return FI_SUCCESS; + + /* Triggered operation always requires memory registration. */ + if (req->triggered) + return cxip_map(txc->domain, req->send.buf, req->send.len, 0, + &req->send.send_md); + + /* FI_INJECT operations always require an internal bounce buffer. This + * is needed to replay FI_INJECT operations which may experience flow + * control. + */ + if (req->send.flags & FI_INJECT) { + + req->send.ibuf = cxip_txc_ibuf_alloc(txc); + if (!req->send.ibuf) + return -FI_EAGAIN; + + if (txc->hmem) { + ret = cxip_txc_copy_from_hmem(txc, NULL, req->send.ibuf, + req->send.buf, + req->send.len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_buf_fini; + } + + return FI_SUCCESS; + } + + memcpy(req->send.ibuf, req->send.buf, req->send.len); + return FI_SUCCESS; + } + + /* If message is going to be sent as an IDC, a bounce buffer is needed + * if FI_HMEM is being used. This is due to the buffer type being + * unknown. + */ + if (cxip_send_eager_idc(req)) { + if (txc->hmem) { + + req->send.ibuf = cxip_txc_ibuf_alloc(txc); + if (!req->send.ibuf) { + ret = -FI_EAGAIN; + goto err_buf_fini; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, req->send.ibuf, + req->send.buf, + req->send.len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_buf_fini; + } + } + + return FI_SUCCESS; + } + + /* Everything else requires memory registeration. 
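+ *
+ * cxip_map() below registers the source buffer with the NIC for the DMA
+ * path. The FI_INJECT bounce buffer above is what preserves fi_inject()
+ * semantics -- the caller may reuse its buffer as soon as the call
+ * returns, even if the send is later replayed after flow control. A
+ * hedged illustration; "payload" is a placeholder:
+ *
+ *   ret = fi_inject(ep, payload, len, dest_addr);
+ *   // payload may be modified immediately after fi_inject() returns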
*/ + return cxip_map(txc->domain, req->send.buf, req->send.len, 0, + &req->send.send_md); + +err_buf_fini: + cxip_send_buf_fini(req); + + return ret; +} + +/* + * cxip_send_common() - Common message send function. Used for tagged and + * untagged sends of all sizes. This includes triggered operations. + */ +ssize_t cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, + size_t len, void *desc, uint64_t data, + fi_addr_t dest_addr, uint64_t tag, void *context, + uint64_t flags, bool tagged, bool triggered, + uint64_t trig_thresh, struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr) +{ + struct cxip_req *req; + struct cxip_addr caddr; + int ret; + + if (len && !buf) + return -FI_EINVAL; + + if (len > CXIP_EP_MAX_MSG_SZ) + return -FI_EMSGSIZE; + + if (tagged && tag & ~CXIP_TAG_MASK) { + TXC_WARN(txc, "Invalid tag: %#018lx (%#018lx)\n", + tag, CXIP_TAG_MASK); + return -FI_EINVAL; + } + + if (flags & FI_INJECT && len > CXIP_INJECT_SIZE) { + TXC_WARN(txc, "Invalid inject length: %lu\n", len); + return -FI_EMSGSIZE; + } + + ofi_genlock_lock(&txc->ep_obj->lock); + + req = cxip_evtq_req_alloc(&txc->tx_evtq, false, txc); + if (!req) { + TXC_DBG(txc, "Failed to allocate request, return -FI_EAGAIN\n"); + ret = -FI_EAGAIN; + goto unlock; + } + + /* Restrict outstanding success event requests to queue size */ + if (ofi_atomic_inc32(&txc->otx_reqs) > txc->attr.size) { + ret = -FI_EAGAIN; + goto err_req_free; + } + + req->triggered = triggered; + req->trig_thresh = trig_thresh; + req->trig_cntr = trig_cntr; + + /* Save Send parameters to replay */ + req->type = CXIP_REQ_SEND; + req->send.txc = txc; + req->send.tclass = tclass; + + req->send.cntr = triggered ? comp_cntr : txc->send_cntr; + req->send.buf = buf; + req->send.len = len; + req->send.data = data; + req->send.flags = flags; + + /* Set completion parameters */ + req->context = (uint64_t)context; + req->flags = FI_SEND | (flags & (FI_COMPLETION | FI_MATCH_COMPLETE)); + if (tagged) { + req->send.tagged = tagged; + req->send.tag = tag; + req->flags |= FI_TAGGED; + } else { + req->flags |= FI_MSG; + } + + ret = cxip_send_buf_init(req); + if (ret) { + TXC_WARN(txc, "cxip_send_buf_init failed: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_req_free; + } + + /* Look up target CXI address */ + ret = cxip_av_lookup_addr(txc->ep_obj->av, dest_addr, &caddr); + if (ret != FI_SUCCESS) { + TXC_WARN(txc, "Failed to look up FI addr: %d\n", ret); + goto err_req_buf_fini; + } + + req->send.caddr = caddr; + req->send.dest_addr = dest_addr; + + if (cxip_evtq_saturated(&txc->tx_evtq)) { + TXC_DBG(txc, "TX HW EQ saturated\n"); + ret = -FI_EAGAIN; + goto err_req_buf_fini; + } + + /* Check if target peer is disabled */ + ret = cxip_send_req_queue(req->send.txc, req); + if (ret != FI_SUCCESS) { + TXC_DBG(txc, "Target peer disabled\n"); + goto err_req_buf_fini; + } + + /* Try Send */ + ret = _cxip_send_req(req); + if (ret != FI_SUCCESS) + goto err_req_dequeue; + + ofi_genlock_unlock(&txc->ep_obj->lock); + + TXC_DBG(txc, + "req: %p buf: %p len: %lu dest_addr: 0x%lX nic: %d pid: %d tag(%c): 0x%lx context %#lx\n", + req, req->send.buf, req->send.len, dest_addr, caddr.nic, + caddr.pid, req->send.tagged ? 
'*' : '-', req->send.tag, + req->context); + + return FI_SUCCESS; + +err_req_dequeue: + cxip_send_req_dequeue(req->send.txc, req); +err_req_buf_fini: + cxip_send_buf_fini(req); +err_req_free: + ofi_atomic_dec32(&txc->otx_reqs); + cxip_evtq_req_free(req); +unlock: + ofi_genlock_unlock(&txc->ep_obj->lock); + + return ret; +} + +/* + * Libfabric APIs + */ +static ssize_t cxip_trecv(struct fid_ep *fid_ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t tag, + uint64_t ignore, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, desc, src_addr, + tag, ignore, context, ep->rx_attr.op_flags, + true, NULL); +} + +static ssize_t cxip_trecvv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? desc[0] : NULL; + } else { + RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, mr_desc, src_addr, + tag, ignore, context, ep->rx_attr.op_flags, + true, NULL); +} + +static ssize_t cxip_trecvmsg(struct fid_ep *fid_ep, + const struct fi_msg_tagged *msg, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + void *buf; + void *mr_desc; + + if (flags & ~(CXIP_RX_OP_FLAGS | CXIP_RX_IGNORE_OP_FLAGS | + FI_PEEK | FI_CLAIM)) + return -FI_EBADFLAGS; + + if (!msg) { + RXC_WARN(&ep->ep_obj->rxc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + /* If selective completion is not requested, always generate + * completions. + */ + if (!ep->ep_obj->rxc.selective_completion) + flags |= FI_COMPLETION; + + if (!(flags & FI_PEEK)) { + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, mr_desc, + msg->addr, msg->tag, msg->ignore, + msg->context, flags, true, NULL); + } + + /* FI_PEEK does not post a recv or return message payload */ + return cxip_recv_common(&ep->ep_obj->rxc, NULL, 0UL, NULL, msg->addr, + msg->tag, msg->ignore, msg->context, flags, + true, NULL); +} + +static ssize_t cxip_tsend(struct fid_ep *fid_ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + desc, 0, dest_addr, tag, context, + ep->tx_attr.op_flags, true, false, 0, + NULL, NULL); +} + +static ssize_t cxip_tsendv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + const void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? 
desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + mr_desc, 0, dest_addr, tag, context, + ep->tx_attr.op_flags, true, false, 0, NULL, + NULL); +} + +static ssize_t cxip_tsendmsg(struct fid_ep *fid_ep, + const struct fi_msg_tagged *msg, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + size_t len; + const void *buf; + void *mr_desc; + + if (!msg) { + TXC_WARN(&ep->ep_obj->txc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + if (flags & ~CXIP_TX_OP_FLAGS) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. + */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, + msg->data, msg->addr, msg->tag, msg->context, + flags, true, false, 0, NULL, NULL); +} + +static ssize_t cxip_tinject(struct fid_ep *fid_ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + NULL, 0, dest_addr, tag, NULL, FI_INJECT, + true, false, 0, NULL, NULL); +} + +static ssize_t cxip_tsenddata(struct fid_ep *fid_ep, const void *buf, + size_t len, void *desc, uint64_t data, + fi_addr_t dest_addr, uint64_t tag, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, + desc, data, dest_addr, tag, context, + ep->tx_attr.op_flags | FI_REMOTE_CQ_DATA, + true, false, 0, NULL, NULL); +} + +static ssize_t cxip_tinjectdata(struct fid_ep *fid_ep, const void *buf, + size_t len, uint64_t data, fi_addr_t dest_addr, + uint64_t tag) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, + len, NULL, data, dest_addr, tag, NULL, + FI_INJECT | FI_REMOTE_CQ_DATA, + true, false, 0, NULL, NULL); +} + +struct fi_ops_tagged cxip_ep_tagged_no_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = fi_no_tagged_send, + .sendv = fi_no_tagged_sendv, + .sendmsg = fi_no_tagged_sendmsg, + .inject = fi_no_tagged_inject, + .senddata = fi_no_tagged_senddata, + .injectdata = fi_no_tagged_injectdata, +}; + +struct fi_ops_tagged cxip_ep_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = cxip_trecv, + .recvv = cxip_trecvv, + .recvmsg = cxip_trecvmsg, + .send = cxip_tsend, + .sendv = cxip_tsendv, + .sendmsg = cxip_tsendmsg, + .inject = cxip_tinject, + .senddata = cxip_tsenddata, + .injectdata = cxip_tinjectdata, +}; + +struct fi_ops_tagged cxip_ep_tagged_no_tx_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = cxip_trecv, + .recvv = cxip_trecvv, + .recvmsg = cxip_trecvmsg, + .send = fi_no_tagged_send, + .sendv = fi_no_tagged_sendv, + .sendmsg = fi_no_tagged_sendmsg, + 
.inject = fi_no_tagged_inject, + .senddata = fi_no_tagged_senddata, + .injectdata = fi_no_tagged_injectdata, +}; + +struct fi_ops_tagged cxip_ep_tagged_no_rx_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = cxip_tsend, + .sendv = cxip_tsendv, + .sendmsg = cxip_tsendmsg, + .inject = cxip_tinject, + .senddata = cxip_tsenddata, + .injectdata = cxip_tinjectdata, +}; + +static ssize_t cxip_recv(struct fid_ep *fid_ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, desc, src_addr, 0, + 0, context, ep->rx_attr.op_flags, false, NULL); +} + +static ssize_t cxip_recvv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? desc[0] : NULL; + } else { + RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_recv_common(&ep->ep_obj->rxc, buf, len, mr_desc, src_addr, + 0, 0, context, ep->rx_attr.op_flags, false, + NULL); +} + +static ssize_t cxip_recvmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, + uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_rxc *rxc = &ep->ep_obj->rxc; + size_t len; + void *buf; + void *mr_desc; + + if (flags & ~(CXIP_RX_OP_FLAGS | CXIP_RX_IGNORE_OP_FLAGS)) + return -FI_EBADFLAGS; + + if (!msg) { + RXC_WARN(rxc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + /* If selective completion is not requested, always generate + * completions. + */ + if (!rxc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_recv_common(rxc, buf, len, mr_desc, msg->addr, 0, 0, + msg->context, flags, false, NULL); +} + +static ssize_t cxip_send(struct fid_ep *fid_ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, + buf, len, desc, 0, dest_addr, 0, context, + ep->tx_attr.op_flags, false, false, 0, + NULL, NULL); +} + +static ssize_t cxip_sendv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + const void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? 
desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, + buf, len, mr_desc, 0, dest_addr, 0, context, + ep->tx_attr.op_flags, false, false, 0, + NULL, NULL); +} + +static ssize_t cxip_sendmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, + uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + size_t len; + const void *buf; + void *mr_desc; + + if (!msg) { + TXC_WARN(&ep->ep_obj->txc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + if (flags & ~CXIP_TX_OP_FLAGS) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. + */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, + msg->data, msg->addr, 0, msg->context, flags, + false, false, 0, NULL, NULL); +} + +static ssize_t cxip_inject(struct fid_ep *fid_ep, const void *buf, size_t len, + fi_addr_t dest_addr) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, + len, NULL, 0, dest_addr, 0, NULL, FI_INJECT, + false, false, 0, NULL, NULL); +} + +static ssize_t cxip_senddata(struct fid_ep *fid_ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, + len, desc, data, dest_addr, 0, context, + ep->tx_attr.op_flags | FI_REMOTE_CQ_DATA, + false, false, 0, NULL, NULL); +} + +static ssize_t cxip_injectdata(struct fid_ep *fid_ep, const void *buf, + size_t len, uint64_t data, fi_addr_t dest_addr) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, + len, NULL, data, dest_addr, 0, NULL, + FI_INJECT | FI_REMOTE_CQ_DATA, + false, false, 0, NULL, NULL); +} + +struct fi_ops_msg cxip_ep_msg_no_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +struct fi_ops_msg cxip_ep_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = cxip_recv, + .recvv = cxip_recvv, + .recvmsg = cxip_recvmsg, + .send = cxip_send, + .sendv = cxip_sendv, + .sendmsg = cxip_sendmsg, + .inject = cxip_inject, + .senddata = cxip_senddata, + .injectdata = cxip_injectdata, +}; + +struct fi_ops_msg cxip_ep_msg_no_tx_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = cxip_recv, + .recvv = cxip_recvv, + .recvmsg = cxip_recvmsg, + .send = fi_no_msg_send, + .sendv = fi_no_msg_sendv, + .sendmsg = fi_no_msg_sendmsg, + .inject = fi_no_msg_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +struct fi_ops_msg 
cxip_ep_msg_no_rx_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = cxip_send, + .sendv = cxip_sendv, + .sendmsg = cxip_sendmsg, + .inject = cxip_inject, + .senddata = cxip_senddata, + .injectdata = cxip_injectdata, +}; diff --git a/prov/cxi/src/cxip_nic.c b/prov/cxi/src/cxip_nic.c new file mode 100644 index 00000000000..90df4cf73d8 --- /dev/null +++ b/prov/cxi/src/cxip_nic.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "config.h" +#include "cxip.h" +#include "ofi.h" +#include "ofi_str.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_FABRIC, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_FABRIC, __VA_ARGS__) + +static int cxip_nic_get_ss_env_get_vni(void) +{ + char *vni_str; + char *vni_str_dup; + char *token; + int vni = -FI_EINVAL; + + vni_str = getenv("SLINGSHOT_VNIS"); + if (!vni_str) { + CXIP_INFO("SLINGSHOT_VNIS not found\n"); + return -FI_ENOSYS; + } + + vni_str_dup = strdup(vni_str); + if (!vni_str_dup) + return -FI_ENOMEM; + + /* Index/token zero is the per job-step VNI. Only use this value. Index + * one is the inter-job-step VNI. Ignore this one. + */ + token = strtok(vni_str_dup, ","); + if (token) + vni = (uint16_t)atoi(token); + else + CXIP_WARN("VNI not found in SLINGSHOT_VNIS: %s\n", vni_str); + + free(vni_str_dup); + + return vni; +} + +static int cxip_gen_auth_key_ss_env_get_svc_id(struct cxip_if *nic_if) +{ + char *svc_id_str; + char *dev_str; + char *svc_id_str_dup; + char *dev_str_dup; + int device_index; + char *token; + bool found; + int svc_id; + + svc_id_str = getenv("SLINGSHOT_SVC_IDS"); + if (!svc_id_str) { + CXIP_INFO("SLINGSHOT_SVC_IDS not found\n"); + return -FI_ENOSYS; + } + + dev_str = getenv("SLINGSHOT_DEVICES"); + if (!dev_str) { + CXIP_INFO("SLINGSHOT_DEVICES not found\n"); + return -FI_ENOSYS; + } + + dev_str_dup = strdup(dev_str); + if (!dev_str_dup) + return -FI_ENOMEM; + + found = false; + device_index = 0; + token = strtok(dev_str_dup, ","); + while (token != NULL) { + if (strcmp(token, nic_if->info->device_name) == 0) { + found = true; + break; + } + + device_index++; + token = strtok(NULL, ","); + } + + free(dev_str_dup); + + if (!found) { + CXIP_WARN("Failed to find %s in SLINGSHOT_DEVICES: %s\n", + nic_if->info->device_name, dev_str); + return -FI_ENOSYS; + } + + svc_id_str_dup = strdup(svc_id_str); + if (!svc_id_str_dup) + return -FI_ENOMEM; + + found = false; + token = strtok(svc_id_str_dup, ","); + while (token != NULL) { + if (device_index == 0) { + svc_id = atoi(token); + found = true; + break; + } + + device_index--; + token = strtok(NULL, ","); + } + + free(svc_id_str_dup); + + if (!found) { + CXIP_WARN("Failed to find service ID in SLINGSHOT_SVC_IDS: %s\n", + svc_id_str); + return -FI_EINVAL; + } + + return svc_id; +} + +static int cxip_nic_get_rgroup_vni_ss_env(struct cxip_if *nic_if, + unsigned int *rgroup, + unsigned int *vni) +{ + int ret; + + ret = cxip_nic_get_ss_env_get_vni(); + if (ret < 0) + return ret; + + *vni = ret; + + ret = cxip_gen_auth_key_ss_env_get_svc_id(nic_if); + if (ret < 0) + return ret; + + *rgroup = ret; + + CXIP_INFO("Generated (%u:%u) for %s\n", *rgroup, *vni, + nic_if->info->device_name); + + return FI_SUCCESS; +} + +static int cxip_nic_get_best_rgroup_vni(struct cxip_if *nic_if, + unsigned int *rgroup, + unsigned int *vni) +{ + int ret; + 
struct cxil_svc_list *svc_list; + uid_t uid; + gid_t gid; + int i; + int j; + struct cxi_svc_desc *desc; + int found_uid; + int found_gid; + int found_unrestricted; + + uid = geteuid(); + gid = getegid(); + + ret = cxil_get_svc_list(nic_if->dev, &svc_list); + if (ret) { + CXIP_WARN("cxil_get_svc_list failed: %d:%s\n", ret, + strerror(-ret)); + return ret; + } + + /* Find the service indexes which can be used by this process. These are + * services which are unrestricted, have a matching UID, or have a + * matching GID. If there are multiple service IDs which could match + * unrestricted, UID, and GID, only the first one found is selected. + */ + found_uid = -1; + found_gid = -1; + found_unrestricted = -1; + + for (i = svc_list->count - 1; i >= 0; i--) { + desc = svc_list->descs + i; + + if (!desc->enable || desc->is_system_svc) + continue; + + if (!desc->restricted_members) { + if (found_unrestricted == -1) + found_unrestricted = i; + continue; + } + + for (j = 0; j < CXI_SVC_MAX_MEMBERS; j++) { + if (desc->members[j].type == CXI_SVC_MEMBER_UID && + desc->members[j].svc_member.uid == uid && + found_uid == -1) + found_uid = i; + else if (desc->members[j].type == CXI_SVC_MEMBER_GID && + desc->members[j].svc_member.gid == gid && + found_gid == -1) + found_gid = i; + } + } + + /* Prioritized list for matching service ID. */ + if (found_uid != -1) + i = found_uid; + else if (found_gid != -1) { + i = found_gid; + } else if (found_unrestricted != -1) { + i = found_unrestricted; + } else { + cxil_free_svc_list(svc_list); + return -FI_ENOSYS; + } + + /* Generate auth_key using matched service ID. */ + desc = svc_list->descs + i; + + if (desc->restricted_vnis) { + if (desc->num_vld_vnis == 0) { + CXIP_WARN("No valid VNIs for %s service ID %u\n", + nic_if->info->device_name, i); + + cxil_free_svc_list(svc_list); + + return -FI_EINVAL; + } + + *vni = (uint16_t)desc->vnis[0]; + } else { + *vni = (uint16_t)cxip_env.default_vni; + } + + *rgroup = desc->svc_id; + + CXIP_INFO("Found (%u:%u) for %s\n", *rgroup, *vni, + nic_if->info->device_name); + + return FI_SUCCESS; +} + +static int cxip_nic_get_rgroup_vni(struct cxip_if *nic_if, + unsigned int *rgroup, unsigned int *vni) +{ + int ret; + + ret = cxip_nic_get_rgroup_vni_ss_env(nic_if, rgroup, vni); + if (ret == FI_SUCCESS) + return FI_SUCCESS; + + ret = cxip_nic_get_best_rgroup_vni(nic_if, rgroup, vni); + if (ret == -FI_ENOSYS) { + CXIP_WARN("Failed to find valid default rgroup and vni for %s\n", + nic_if->info->device_name); + *rgroup = 0; + *vni = 0; + ret = FI_SUCCESS; + } + + return ret; +} + +static int cxip_nic_close(struct fid *fid) +{ + struct fid_nic *nic = (struct fid_nic *) fid; + + free(nic->prov_attr); + return ofi_nic_close(fid); +} + +static int cxip_nic_control(struct fid *fid, int command, void *arg) +{ + int ret; + struct fid_nic *nic = container_of(fid, struct fid_nic, fid); + struct cxip_nic_attr *nic_attr = nic->prov_attr; + struct fid_nic **dup = (struct fid_nic **) arg; + struct cxip_if *nic_if; + + if (command == FI_OPT_CXI_NIC_REFRESH_ATTR) { + ret = cxip_get_if(nic_attr->addr, &nic_if); + if (ret != FI_SUCCESS) + return ret; + + ret = cxip_nic_get_rgroup_vni(nic_if, + (void *)&nic_attr->default_rgroup_id, + (void *)&nic_attr->default_vni); + + cxip_put_if(nic_if); + + return ret; + } + + ret = ofi_nic_control(fid, command, arg); + if (ret != FI_SUCCESS) + return ret; + + if (command == FI_DUP) { + (*dup)->prov_attr = mem_dup(nic->prov_attr, sizeof(struct cxip_nic_attr)); + if (!(*dup)->prov_attr) { + cxip_nic_close(&(*dup)->fid); 
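+ /* mem_dup() of the CXI NIC attributes failed; the duplicate
+ * fid_nic was released above, so fail the FI_DUP request. */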
+ return -FI_ENOMEM; + } + } + + return FI_SUCCESS; +} + +static int cxip_nic_tostr(const struct fid *fid_nic, char *buf, size_t len) +{ + return ofi_nic_tostr(fid_nic, buf, len); +} + +static struct fi_ops cxip_nic_ops = { + .size = sizeof(struct fi_ops), + .close = cxip_nic_close, + .control = cxip_nic_control, + .tostr = cxip_nic_tostr, +}; + +int cxip_nic_alloc(struct cxip_if *nic_if, struct fid_nic **fid_nic) +{ + struct fid_nic *nic; + struct cxip_nic_attr *nic_attr; + int ret; + + /* Reuse the common fid_nic as must as possible. */ + nic = ofi_nic_dup(NULL); + if (!nic) + return -FI_ENOMEM; + + nic_attr = calloc(1, sizeof(*nic_attr)); + if (!nic_attr) { + ret = -FI_ENOMEM; + goto err_free_nic; + } + + nic->prov_attr = nic_attr; + + ret = cxip_nic_get_rgroup_vni(nic_if, + (void *)&nic_attr->default_rgroup_id, + (void *)&nic_attr->default_vni); + if (ret != FI_SUCCESS) + goto err_free_nic; + + memcpy((void *)&nic_attr->addr, &nic_if->info->nic_addr, + sizeof(nic_attr->addr)); + nic_attr->version = FI_CXI_NIC_ATTR_VER; + + /* Update the fid_nic to point to our operations. */ + nic->fid.ops = &cxip_nic_ops; + + nic->device_attr->name = strdup(nic_if->info->device_name); + if (!nic->device_attr->name) { + ret = -FI_ENOMEM; + goto err_free_nic; + } + + ret = asprintf(&nic->device_attr->device_id, "0x%x", + nic_if->info->device_id); + if (ret < 0) + goto err_free_nic; + + ret = asprintf(&nic->device_attr->device_version, "%u", + nic_if->info->device_rev); + if (ret < 0) + goto err_free_nic; + + ret = asprintf(&nic->device_attr->vendor_id, "0x%x", + nic_if->info->vendor_id); + if (ret < 0) + goto err_free_nic; + + nic->device_attr->driver = strdup(nic_if->info->driver_name); + + nic->bus_attr->bus_type = FI_BUS_PCI; + nic->bus_attr->attr.pci.domain_id = nic_if->info->pci_domain; + nic->bus_attr->attr.pci.bus_id = nic_if->info->pci_bus; + nic->bus_attr->attr.pci.device_id = nic_if->info->pci_device; + nic->bus_attr->attr.pci.function_id = nic_if->info->pci_function; + + ret = asprintf(&nic->link_attr->address, "0x%x", + nic_if->info->nic_addr); + if (ret < 0) + goto err_free_nic; + + nic->link_attr->mtu = nic_if->info->link_mtu; + /* Convert Mb/s to libfabric reported b/s */ + nic->link_attr->speed = (size_t)nic_if->speed * 1000000; + nic->link_attr->state = nic_if->link ? FI_LINK_UP : FI_LINK_DOWN; + nic->link_attr->network_type = strdup("HPC Ethernet"); + + *fid_nic = nic; + + return FI_SUCCESS; + +err_free_nic: + cxip_nic_close(&nic->fid); + + return ret; +} diff --git a/prov/cxi/src/cxip_portals_table.c b/prov/cxi/src/cxip_portals_table.c new file mode 100644 index 00000000000..625b349fe1d --- /dev/null +++ b/prov/cxi/src/cxip_portals_table.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) 
_CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +int cxip_portals_table_alloc(struct cxip_lni *lni, uint16_t *vni, + size_t vni_count, uint32_t pid, + struct cxip_portals_table **ptable) +{ + struct cxip_portals_table *table; + int ret; + int i; + + + if (!vni_count) { + CXIP_WARN("Invalid VNI count\n"); + return -FI_EINVAL; + } + + table = calloc(1, sizeof(*table)); + if (!table) { + CXIP_WARN("Failed to allocate IF domain\n"); + return -FI_ENOMEM; + } + + table->doms = calloc(vni_count, sizeof(*table->doms)); + if (!table->doms) { + CXIP_WARN("Failed to allocate domain array\n"); + ret = -FI_ENOMEM; + goto err_free_table; + } + + for (i = 0; i < vni_count; i++) { + ret = cxil_alloc_domain(lni->lni, vni[i], pid, &table->doms[i]); + if (ret) { + CXIP_WARN("Failed to allocate CXI Domain, ret: %d\n", + ret); + ret = -FI_ENOSPC; + goto err_free_doms; + } + + /* To handle C_PID_ANY correctly, the same PID needs to be used + * for each domain. Thus, update PID after the first domain + * is allocated to a valid value. + */ + pid = table->doms[i]->pid; + } + + table->pid = pid; + table->doms_count = vni_count; + table->lni = lni; + + CXIP_DBG("Allocated portals table, %s PID: %u\n", + lni->iface->info->device_name, table->pid); + + *ptable = table; + + return FI_SUCCESS; + +err_free_doms: + for (i--; i >= 0; i--) { + ret = cxil_destroy_domain(table->doms[i]); + if (ret) + CXIP_WARN("Failed to destroy domain: %d\n", ret); + } + + free(table->doms); +err_free_table: + free(table); + + return ret; +} + +/* + * cxip_free_if_domain() - Free an IF Domain. + */ +void cxip_portals_table_free(struct cxip_portals_table *ptable) +{ + int ret; + int i; + + CXIP_DBG("Freeing portals table, %s PID: %u\n", + ptable->lni->iface->info->device_name, ptable->pid); + + for (i = 0; i < ptable->doms_count; i++) { + ret = cxil_destroy_domain(ptable->doms[i]); + if (ret) + CXIP_WARN("Failed to destroy domain: %d\n", ret); + } + + free(ptable->doms); + free(ptable); +} diff --git a/prov/cxi/src/cxip_pte.c b/prov/cxi/src/cxip_pte.c new file mode 100644 index 00000000000..bdbcda67e3b --- /dev/null +++ b/prov/cxi/src/cxip_pte.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_DOMAIN, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_DOMAIN, __VA_ARGS__) + +/* Caller musthold ep_obj->lock. */ +int cxip_pte_set_state(struct cxip_pte *pte, struct cxip_cmdq *cmdq, + enum c_ptlte_state new_state, uint32_t drop_count) +{ + int ret; + struct c_set_state_cmd set_state = { + .command.opcode = C_CMD_TGT_SETSTATE, + .ptlte_index = pte->pte->ptn, + .ptlte_state = new_state, + .drop_count = drop_count, + }; + + ret = cxi_cq_emit_target(cmdq->dev_cmdq, &set_state); + if (ret) { + CXIP_WARN("Failed to enqueue command: %d\n", ret); + return -FI_EAGAIN; + } + + cxi_cq_ring(cmdq->dev_cmdq); + + return FI_SUCCESS; +} + +/* + * cxip_pte_set_wait() - Set a new PTE state synchronously. + * + * TODO: EP lock associated with the EP must be taken. 
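+ *
+ * The wait is implemented by spinning on cxip_evtq_progress() (with
+ * sched_yield()) until the PtlTE state-change event updates pte->state
+ * to the requested value.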
+ */ +int cxip_pte_set_state_wait(struct cxip_pte *pte, struct cxip_cmdq *cmdq, + struct cxip_evtq *evtq, + enum c_ptlte_state new_state, uint32_t drop_count) +{ + int ret; + + ret = cxip_pte_set_state(pte, cmdq, new_state, drop_count); + if (ret == FI_SUCCESS) { + do { + sched_yield(); + cxip_evtq_progress(evtq); + } while (pte->state != new_state); + } + + return ret; +} + +/* + * cxip_pte_append() - Append a buffer to a PtlTE. + * + * Caller must hold ep_obj->lock. + */ +int cxip_pte_append(struct cxip_pte *pte, uint64_t iova, size_t len, + unsigned int lac, enum c_ptl_list list, + uint32_t buffer_id, uint64_t match_bits, + uint64_t ignore_bits, uint32_t match_id, + uint64_t min_free, uint32_t flags, + struct cxip_cntr *cntr, struct cxip_cmdq *cmdq, + bool ring) +{ + union c_cmdu cmd = {}; + int rc; + + cmd.command.opcode = C_CMD_TGT_APPEND; + cmd.target.ptl_list = list; + cmd.target.ptlte_index = pte->pte->ptn; + cmd.target.buffer_id = buffer_id; + cmd.target.lac = lac; + cmd.target.start = iova; + cmd.target.length = len; + cmd.target.ct = cntr ? cntr->ct->ctn : 0; + cmd.target.match_bits = match_bits; + cmd.target.ignore_bits = ignore_bits; + cmd.target.match_id = match_id; + cmd.target.min_free = min_free; + + cxi_target_cmd_setopts(&cmd.target, flags); + + rc = cxi_cq_emit_target(cmdq->dev_cmdq, &cmd); + if (rc) { + CXIP_DBG("Failed to write Append command: %d\n", rc); + /* Return error according to Domain Resource Management */ + return -FI_EAGAIN; + } + + if (ring) + cxi_cq_ring(cmdq->dev_cmdq); + + return FI_SUCCESS; +} + +/* + * cxip_pte_unlink() - Unlink a buffer from a PtlTE. + * + * Caller must hold ep_obj->lock. + */ +int cxip_pte_unlink(struct cxip_pte *pte, enum c_ptl_list list, + int buffer_id, struct cxip_cmdq *cmdq) +{ + union c_cmdu cmd = {}; + int rc; + + cmd.command.opcode = C_CMD_TGT_UNLINK; + cmd.target.ptl_list = list; + cmd.target.ptlte_index = pte->pte->ptn; + cmd.target.buffer_id = buffer_id; + + rc = cxi_cq_emit_target(cmdq->dev_cmdq, &cmd); + if (rc) { + CXIP_DBG("Failed to write Append command: %d\n", rc); + /* Return error according to Domain Resource Management */ + return -FI_EAGAIN; + } + + cxi_cq_ring(cmdq->dev_cmdq); + + return FI_SUCCESS; +} + +static void cxip_pte_unmap_list(struct dlist_entry *map_list) +{ + struct cxip_pte_map_entry *entry; + int ret; + + while ((entry = + dlist_first_entry_or_null(map_list, struct cxip_pte_map_entry, + entry))) { + dlist_remove(&entry->entry); + + ret = cxil_unmap_pte(entry->map); + if (ret) + CXIP_WARN("Failed to unmap PTE: %d\n", ret); + + free(entry); + } +} + +/* + * cxip_pte_map() - Map a PtlTE to a specific PID index. A single PtlTE can be + * mapped into MAX_PTE_MAP_COUNT different PID indices. 
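+ *
+ * The PtlTE is mapped once for every domain in the portals table (one per
+ * configured VNI), so the same pid_idx is reachable on each VNI. If any
+ * mapping fails, the partially built map list is unwound and
+ * -FI_EADDRINUSE is returned.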
+ */ +int cxip_pte_map(struct cxip_pte *pte, uint64_t pid_idx, bool is_multicast) +{ + DEFINE_LIST(map_list); + struct cxip_pte_map_entry *entry; + int ret; + int i; + + for (i = 0; i < pte->ptable->doms_count; i++) { + + entry = calloc(1, sizeof(*entry)); + if (!entry) { + CXIP_WARN("Failed to allocated map entry memory"); + goto err_unmap; + } + + ret = cxil_map_pte(pte->pte, pte->ptable->doms[i], pid_idx, + is_multicast, &entry->map); + if (ret) { + CXIP_WARN("Failed to map PTE: %d\n", ret); + free(entry); + ret = -FI_EADDRINUSE; + goto err_unmap; + } + + dlist_insert_tail(&entry->entry, &map_list); + } + + dlist_splice_tail(&pte->map_list, &map_list); + + return FI_SUCCESS; + +err_unmap: + cxip_pte_unmap_list(&map_list); + + return ret; +} + +/* + * cxip_pte_alloc_nomap() - Allocate a PtlTE without performing any mapping + * during allocation. + */ +int cxip_pte_alloc_nomap(struct cxip_portals_table *ptable, struct cxi_eq *evtq, + struct cxi_pt_alloc_opts *opts, + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event), + void *ctx, struct cxip_pte **pte) +{ + struct cxip_pte *new_pte; + int ret; + + new_pte = calloc(1, sizeof(*new_pte)); + if (!new_pte) { + CXIP_WARN("Failed to allocate PTE structure\n"); + return -FI_ENOMEM; + } + + /* Allocate a PTE */ + ret = cxil_alloc_pte(ptable->lni->lni, evtq, opts, + &new_pte->pte); + if (ret) { + CXIP_WARN("Failed to allocate PTE: %d\n", ret); + ret = -FI_ENOSPC; + goto free_mem; + } + + ofi_spin_lock(&ptable->lni->iface->lock); + dlist_insert_tail(&new_pte->pte_entry, &ptable->lni->iface->ptes); + ofi_spin_unlock(&ptable->lni->iface->lock); + + new_pte->ptable = ptable; + new_pte->state_change_cb = state_change_cb; + new_pte->ctx = ctx; + new_pte->state = C_PTLTE_DISABLED; + dlist_init(&new_pte->map_list); + + *pte = new_pte; + + return FI_SUCCESS; + +free_mem: + free(new_pte); + + return ret; +} + +/* + * cxip_pte_alloc() - Allocate and map a PTE for use. + */ +int cxip_pte_alloc(struct cxip_portals_table *ptable, struct cxi_eq *evtq, + uint64_t pid_idx, bool is_multicast, + struct cxi_pt_alloc_opts *opts, + void (*state_change_cb)(struct cxip_pte *pte, + const union c_event *event), + void *ctx, struct cxip_pte **pte) +{ + int ret; + + ret = cxip_pte_alloc_nomap(ptable, evtq, opts, state_change_cb, + ctx, pte); + if (ret) + return ret; + + ret = cxip_pte_map(*pte, pid_idx, is_multicast); + if (ret) + goto free_pte; + + return FI_SUCCESS; + +free_pte: + cxip_pte_free(*pte); + + return ret; +} + +/* + * cxip_pte_free() - Free a PTE. + */ +void cxip_pte_free(struct cxip_pte *pte) +{ + int ret; + + ofi_spin_lock(&pte->ptable->lni->iface->lock); + dlist_remove(&pte->pte_entry); + ofi_spin_unlock(&pte->ptable->lni->iface->lock); + + cxip_pte_unmap_list(&pte->map_list); + + assert(dlist_empty(&pte->map_list)); + + ret = cxil_destroy_pte(pte->pte); + if (ret) + CXIP_WARN("Failed to free PTE: %d\n", ret); + + free(pte); +} + +/* + * cxip_pte_state_change() - Atomically update PTE state. Used during + * STATE_CHANGE event processing. 
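+ *
+ * The interface's PTE list is walked under iface->lock; the entry whose
+ * ptn matches the event's ptlte_index has its cached state updated and
+ * its state_change_cb, if any, invoked.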
+ */ +int cxip_pte_state_change(struct cxip_if *dev_if, const union c_event *event) +{ + struct cxip_pte *pte; + + ofi_spin_lock(&dev_if->lock); + + dlist_foreach_container(&dev_if->ptes, + struct cxip_pte, pte, pte_entry) { + if (pte->pte->ptn == event->tgt_long.ptlte_index) { + pte->state = event->tgt_long.initiator.state_change.ptlte_state; + if (pte->state_change_cb) + pte->state_change_cb(pte, event); + + ofi_spin_unlock(&dev_if->lock); + return FI_SUCCESS; + } + } + + ofi_spin_unlock(&dev_if->lock); + + return -FI_EINVAL; +} diff --git a/prov/cxi/src/cxip_ptelist_buf.c b/prov/cxi/src/cxip_ptelist_buf.c new file mode 100644 index 00000000000..bfeaddb058c --- /dev/null +++ b/prov/cxi/src/cxip_ptelist_buf.c @@ -0,0 +1,553 @@ +/* + * Copyright (c) 2018,2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include "config.h" + +#include +#include +#include +#include + +#include "cxip.h" + +static const char* +cxip_ptelist_to_str(struct cxip_ptelist_bufpool *pool) +{ + return cxi_ptl_list_to_str(pool->attr.list_type); +} + +static int cxip_ptelist_unlink_buf(struct cxip_ptelist_buf *buf) +{ + struct cxip_rxc *rxc = buf->rxc; + int ret; + + ret = cxip_pte_unlink(rxc->rx_pte, buf->pool->attr.list_type, + buf->req->req_id, rxc->rx_cmdq); + if (ret) + RXC_DBG(rxc, "Failed to write command %d %s\n", + ret, fi_strerror(-ret)); + + return ret; +} + +static int cxip_ptelist_link_buf(struct cxip_ptelist_buf *buf, + bool seq_restart) +{ + struct cxip_rxc *rxc = buf->rxc; + uint32_t le_flags = C_LE_MANAGE_LOCAL | C_LE_NO_TRUNCATE | + C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_PUT | + C_LE_UNRESTRICTED_END_RO | C_LE_EVENT_UNLINK_DISABLE; + int ret; + + /* Match all eager, long sends */ + union cxip_match_bits mb = { + .le_type = CXIP_LE_TYPE_RX + }; + union cxip_match_bits ib = { + .tag = ~0, + .tx_id = ~0, + .cq_data = 1, + .tagged = 1, + .match_comp = 1, + .rdzv_done = 1, + }; + + if (!(buf->pool->attr.list_type == C_PTL_LIST_OVERFLOW && + cxip_env.hybrid_preemptive)) + le_flags |= C_LE_EVENT_LINK_DISABLE; + + if (seq_restart) + le_flags |= C_LE_RESTART_SEQ; + + RXC_DBG(rxc, "%s link buf %p num linked %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&buf->pool->bufs_linked)); + + /* Reset request buffer stats used to know when the buffer is consumed. + */ + assert(dlist_empty(&buf->request.pending_ux_list)); + buf->unlink_length = -1; + buf->cur_offset = 0; + + /* Take a request buffer reference for the link. */ + ret = cxip_pte_append(rxc->rx_pte, + CXI_VA_TO_IOVA(buf->md->md, buf->data), + buf->pool->attr.buf_size, buf->md->md->lac, + buf->pool->attr.list_type, + buf->req->req_id, mb.raw, + ib.raw, CXI_MATCH_ID_ANY, + buf->pool->attr.min_space_avail, + le_flags, NULL, rxc->rx_cmdq, true); + if (ret) { + RXC_WARN(rxc, "Failed to write %s append %d %s\n", + cxip_ptelist_to_str(buf->pool), + ret, fi_strerror(-ret)); + return ret; + } + + dlist_remove(&buf->buf_entry); + dlist_insert_tail(&buf->buf_entry, + &buf->pool->active_bufs); + ofi_atomic_inc32(&buf->pool->bufs_linked); + + /* Reference taken until buffer is consumed or manually + * unlinked. + */ + cxip_ptelist_buf_get(buf); + + RXC_DBG(rxc, "APPEND %s buf %p num linked %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&buf->pool->bufs_linked)); + + return ret; +} + +/* + * cxip_ptelist_buf_alloc() - Allocate a buffer for the Ptl buffer pool. 
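+ *
+ * Each buffer gets a page-aligned data region, optional HMEM host
+ * registration, a NIC mapping via cxip_map(), and an event-queue request
+ * that is used to process the hardware events delivered for the buffer.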
+ */ +static struct cxip_ptelist_buf* +cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool) +{ + struct cxip_rxc *rxc = pool->rxc; + struct cxip_ptelist_buf *buf; + int ret; + + buf = calloc(1, sizeof(*buf)); + if (!buf) + goto err; + + buf->data = aligned_alloc(pool->buf_alignment, pool->attr.buf_size); + if (!buf->data) + goto err_free_buf; + + if (rxc->hmem && !cxip_env.disable_host_register) { + ret = ofi_hmem_host_register(buf->data, pool->attr.buf_size); + if (ret) { + RXC_WARN(rxc, + "Failed to register buffer with HMEM: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_data_buf; + } + } + + ret = cxip_map(rxc->domain, buf->data, pool->attr.buf_size, + OFI_MR_NOCACHE, &buf->md); + if (ret) + goto err_unreg_buf; + + buf->req = cxip_evtq_req_alloc(&rxc->rx_evtq, true, buf); + if (!buf->req) + goto err_unmap_buf; + + buf->pool = pool; + buf->req->cb = pool->attr.ptelist_cb; + buf->rxc = rxc; + buf->le_type = CXIP_LE_TYPE_RX; + + if (pool->attr.list_type == C_PTL_LIST_REQUEST) + buf->req->type = CXIP_REQ_RBUF; + else + buf->req->type = CXIP_REQ_OFLOW; + + ofi_atomic_initialize32(&buf->refcount, 0); + dlist_init(&buf->request.pending_ux_list); + dlist_init(&buf->buf_entry); + ofi_atomic_inc32(&pool->bufs_allocated); + + RXC_DBG(rxc, "Allocated %s buf %p num alloc %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&pool->bufs_allocated)); + + return buf; + +err_unmap_buf: + cxip_unmap(buf->md); +err_unreg_buf: + if (rxc->hmem && !cxip_env.disable_host_register) + ofi_hmem_host_unregister(buf); +err_free_data_buf: + free(buf->data); +err_free_buf: + free(buf); +err: + return NULL; +} + +static void cxip_ptelist_buf_free(struct cxip_ptelist_buf *buf) +{ + struct cxip_ux_send *ux; + struct dlist_entry *tmp; + struct cxip_rxc *rxc = buf->rxc; + + /* Sanity check making sure the buffer was properly removed before + * freeing. + */ + assert(dlist_empty(&buf->buf_entry)); + + if (buf->pool->attr.list_type == C_PTL_LIST_REQUEST) { + dlist_foreach_container_safe(&buf->request.pending_ux_list, + struct cxip_ux_send, + ux, rxc_entry, tmp) { + dlist_remove(&ux->rxc_entry); + _cxip_req_buf_ux_free(ux, false); + } + } + + if (ofi_atomic_get32(&buf->refcount) != 0) + RXC_FATAL(rxc, "%s buf %p non-zero refcount %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&buf->refcount)); + cxip_evtq_req_free(buf->req); + cxip_unmap(buf->md); + if (rxc->hmem && !cxip_env.disable_host_register) + ofi_hmem_host_unregister(buf->data); + + ofi_atomic_dec32(&buf->pool->bufs_allocated); + + RXC_DBG(rxc, "Freeing %s buf %p num_alloc %u\n", + cxip_ptelist_to_str(buf->pool), buf, + ofi_atomic_get32(&buf->pool->bufs_allocated)); + free(buf->data); + free(buf); +} + +static void cxip_ptelist_buf_dlist_free(struct dlist_entry *head) +{ + struct cxip_ptelist_buf *buf; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(head, struct cxip_ptelist_buf, buf, + buf_entry, tmp) { + dlist_remove_init(&buf->buf_entry); + cxip_ptelist_buf_free(buf); + } +} + +void cxip_ptelist_buf_link_err(struct cxip_ptelist_buf *buf, + int rc_link_error) +{ + struct cxip_rxc *rxc = buf->pool->rxc; + + RXC_WARN(rxc, "%s buffer %p link error %s\n", + cxip_ptelist_to_str(buf->pool), + buf, cxi_rc_to_str(rc_link_error)); + + assert(rc_link_error == C_RC_NO_SPACE); + + cxip_ptelist_buf_put(buf, false); + ofi_atomic_dec32(&buf->pool->bufs_linked); + + /* We are running out of LE resources, do not repost + * immediately. 
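+ * Instead, park the buffer on the pool's free list below; it will be retried
+ * by cxip_ptelist_buf_replenish() once LE resources become available again.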
+ */ + assert(ofi_atomic_get32(&buf->refcount) == 0); + dlist_remove(&buf->buf_entry); + dlist_insert_tail(&buf->buf_entry, &buf->pool->free_bufs); + ofi_atomic_inc32(&buf->pool->bufs_free); +} + +void cxip_ptelist_buf_unlink(struct cxip_ptelist_buf *buf) +{ + struct cxip_ptelist_bufpool *pool = buf->pool; + + cxip_ptelist_buf_put(buf, false); + ofi_atomic_dec32(&pool->bufs_linked); + + RXC_DBG(pool->rxc, "%s buffer unlink\n", cxip_ptelist_to_str(pool)); +} + +int cxip_ptelist_bufpool_init(struct cxip_rxc *rxc, + struct cxip_ptelist_bufpool **pool, + struct cxip_ptelist_bufpool_attr *attr) +{ + int i; + struct cxip_ptelist_buf *buf; + struct dlist_entry tmp_buf_list; + struct dlist_entry *tmp; + struct cxip_ptelist_bufpool *_pool; + int ret; + size_t buf_size; + + + if (attr->list_type != C_PTL_LIST_REQUEST && + attr->list_type != C_PTL_LIST_OVERFLOW) + return -FI_EINVAL; + + _pool = calloc(1, sizeof(*_pool)); + if (!_pool) + return -FI_ENOMEM; + + _pool->buf_alignment = ofi_get_page_size(); + + buf_size = roundup(attr->buf_size, _pool->buf_alignment); + if (attr->buf_size != buf_size) + RXC_INFO(rxc, + "Aligning buf size to %lu: prev_size=%lu new_size=%lu\n", + _pool->buf_alignment, attr->buf_size, buf_size); + attr->buf_size = buf_size; + + _pool->attr = *attr; + _pool->rxc = rxc; + dlist_init(&_pool->active_bufs); + dlist_init(&_pool->consumed_bufs); + dlist_init(&_pool->free_bufs); + ofi_atomic_initialize32(&_pool->bufs_linked, 0); + ofi_atomic_initialize32(&_pool->bufs_allocated, 0); + ofi_atomic_initialize32(&_pool->bufs_free, 0); + + dlist_init(&tmp_buf_list); + + for (i = 0; i < _pool->attr.min_posted; i++) { + buf = cxip_ptelist_buf_alloc(_pool); + if (!buf) { + ret = -FI_ENOMEM; + goto err_free_bufs; + } + + dlist_insert_tail(&buf->buf_entry, &tmp_buf_list); + } + + /* Since this is called during RXC initialization, RXQ CMDQ should be + * empty. Thus, linking a request buffer should not fail. + */ + dlist_foreach_container_safe(&tmp_buf_list, struct cxip_ptelist_buf, + buf, buf_entry, tmp) { + ret = cxip_ptelist_link_buf(buf, false); + if (ret != FI_SUCCESS) + CXIP_FATAL("Failed to link request buffer: %d %s\n", + ret, fi_strerror(-ret)); + } + + *pool = _pool; + return FI_SUCCESS; + +err_free_bufs: + cxip_ptelist_buf_dlist_free(&tmp_buf_list); + + return ret; +} + +void cxip_ptelist_bufpool_fini(struct cxip_ptelist_bufpool *pool) +{ + struct cxip_rxc *rxc = pool->rxc; + struct cxip_ptelist_buf *buf; + int ret; + + assert(rxc->rx_pte->state == C_PTLTE_DISABLED); + + RXC_INFO(rxc, "Number of %s buffers allocated %d\n", + cxip_ptelist_to_str(pool), + ofi_atomic_get32(&pool->bufs_allocated)); + + /* All request buffers are split between the active and consumed list. + * Only active buffers need to be unlinked. + */ + dlist_foreach_container(&pool->active_bufs, struct cxip_ptelist_buf, + buf, buf_entry) { + ret = cxip_ptelist_unlink_buf(buf); + if (ret != FI_SUCCESS) + CXIP_FATAL("PtlTE %d failed to unlink %s buf %d %s\n", + rxc->rx_pte->pte->ptn, + cxip_ptelist_to_str(pool), ret, + fi_strerror(-ret)); + } + + do { + cxip_evtq_progress(&rxc->rx_evtq); + } while (ofi_atomic_get32(&pool->bufs_linked)); + + cxip_ptelist_buf_dlist_free(&pool->active_bufs); + cxip_ptelist_buf_dlist_free(&pool->consumed_bufs); + cxip_ptelist_buf_dlist_free(&pool->free_bufs); + + assert(ofi_atomic_get32(&pool->bufs_allocated) == 0); + + assert(pool); + free(pool); +} + +/* + * cxip_ptelist_buf_replenish() - Replenish PtlTE overflow or request list + * buffers. + * + * Caller must hold ep_obj->lock. 
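+ *
+ * Buffers already parked on the pool free list are preferred; new buffers are
+ * allocated only when that list is empty, and linking continues until
+ * attr.min_posted buffers are posted.  Illustrative call site (a sketch, with
+ * the required locking shown explicitly):
+ *
+ *   ofi_genlock_lock(&rxc->ep_obj->lock);
+ *   ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, false);
+ *   ofi_genlock_unlock(&rxc->ep_obj->lock);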
+ */ +int cxip_ptelist_buf_replenish(struct cxip_ptelist_bufpool *pool, + bool seq_restart) +{ + struct cxip_rxc *rxc = pool->rxc; + struct cxip_ptelist_buf *buf; + int bufs_added = 0; + int ret = FI_SUCCESS; + + if (rxc->msg_offload && pool->attr.list_type == C_PTL_LIST_REQUEST) + return FI_SUCCESS; + + while (ofi_atomic_get32(&pool->bufs_linked) < pool->attr.min_posted) { + + /* Always prefer to use a free buffer for which + * reposting was deferred or an append failed. + */ + if (!dlist_empty(&pool->free_bufs)) { + dlist_pop_front(&pool->free_bufs, + struct cxip_ptelist_buf, buf, + buf_entry); + ofi_atomic_dec32(&buf->pool->bufs_free); + + RXC_DBG(rxc, "%s LINK REPOST buf %p\n", + cxip_ptelist_to_str(pool), buf); + } else { + buf = cxip_ptelist_buf_alloc(pool); + + RXC_DBG(rxc, "%s LINK NEW buf %p\n", + cxip_ptelist_to_str(pool), buf); + + } + + if (!buf) { + RXC_WARN(rxc, "%s buffer allocation err\n", + cxip_ptelist_to_str(pool)); + break; + } + + RXC_DBG(rxc, "Link %s buf entry %p\n", + cxip_ptelist_to_str(pool), buf); + + ret = cxip_ptelist_link_buf(buf, !bufs_added); + if (ret) { + RXC_WARN(rxc, "%s append failure %d %s\n", + cxip_ptelist_to_str(pool), ret, + fi_strerror(-ret)); + + dlist_insert_tail(&buf->buf_entry, + &pool->free_bufs); + ofi_atomic_inc32(&pool->bufs_free); + break; + } + bufs_added++; + } + + /* If no buffer appended, check for fatal conditions. */ + if (!bufs_added) { + if (ofi_atomic_get32(&pool->bufs_linked) < 1) + RXC_FATAL(rxc, "%s buffer list exhausted\n", + cxip_ptelist_to_str(pool)); + } + + RXC_DBG(rxc, "%s current bufs alloc %u, num linked %u\n", + cxip_ptelist_to_str(pool), + ofi_atomic_get32(&pool->bufs_allocated), + ofi_atomic_get32(&pool->bufs_linked)); + + return ret; +} + +void cxip_ptelist_buf_get(struct cxip_ptelist_buf *buf) +{ + ofi_atomic_inc32(&buf->refcount); + + RXC_DBG(buf->rxc, "%s GET buf %p refcnt %u\n", + cxip_ptelist_to_str(buf->pool), + buf, ofi_atomic_get32(&buf->refcount)); +} + +void cxip_ptelist_buf_put(struct cxip_ptelist_buf *buf, bool repost) +{ + int ret; + int refcount = ofi_atomic_dec32(&buf->refcount); + + RXC_DBG(buf->rxc, "%s PUT buf %p refcnt %u repost %d\n", + cxip_ptelist_to_str(buf->pool), buf, refcount, repost); + + if (refcount < 0) { + RXC_FATAL(buf->rxc, "%s buffer refcount underflow %d\n", + cxip_ptelist_to_str(buf->pool), refcount); + /* not needed */ + return; + } + + if (refcount == 0 && repost) { + + /* Overflow buffers should just be freed if no longer + * in hardware RX match mode. + */ + if (buf->pool->attr.list_type == C_PTL_LIST_OVERFLOW && + (!buf->rxc->msg_offload || buf->rxc->state != RXC_ENABLED)) + goto free_buf; + + if (buf->pool->attr.list_type == C_PTL_LIST_REQUEST && + buf->rxc->state != RXC_ENABLED_SOFTWARE) + goto skip_repost; + + /* Limit immediate repost if already sufficient */ + if (ofi_atomic_get32(&buf->pool->bufs_linked) < + buf->pool->attr.max_posted) { + + do { + ret = cxip_ptelist_link_buf(buf, false); + } while (ret == -FI_EAGAIN); + + if (ret != FI_SUCCESS) + RXC_FATAL(buf->rxc, + "Fatal %s buf link err %d %s", + cxip_ptelist_to_str(buf->pool), + ret, fi_strerror(-ret)); + + return; + } + +skip_repost: + /* To avoid thrashing on buffer allocation, cache + * free buffers until a sufficient number are kept + * for reuse. This will help bursty traffic from + * holding on to unnecessary buffers. 
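+ *
+ * For example (illustrative numbers): with attr.max_cached = 16, a buffer
+ * released while 12 are linked and 3 are free is cached (12 + 3 < 16); with
+ * 4 already free it would be freed instead.  attr.max_cached == 0 disables
+ * the cap and the buffer is always cached.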
+ */ + if (!buf->pool->attr.max_cached || + (ofi_atomic_get32(&buf->pool->bufs_linked) + + ofi_atomic_get32(&buf->pool->bufs_free) < + buf->pool->attr.max_cached)) { + + dlist_remove(&buf->buf_entry); + dlist_insert_tail(&buf->buf_entry, + &buf->pool->free_bufs); + ofi_atomic_inc32(&buf->pool->bufs_free); + + return; + } + +free_buf: + dlist_remove_init(&buf->buf_entry); + cxip_ptelist_buf_free(buf); + } +} + +void cxip_ptelist_buf_consumed(struct cxip_ptelist_buf *buf) +{ + RXC_DBG(buf->rxc, "%s CONSUMED off %ld len %ld buf %p\n", + cxip_ptelist_to_str(buf->pool), buf->cur_offset, + buf->unlink_length, buf); + + dlist_remove(&buf->buf_entry); + dlist_insert_tail(&buf->buf_entry, + &buf->pool->consumed_bufs); + + /* Since buffer is consumed, return reference + * taken during the initial linking. + */ + cxip_ptelist_buf_put(buf, true); +} + +void _cxip_req_buf_ux_free(struct cxip_ux_send *ux, bool repost) +{ + struct cxip_ptelist_buf *buf = ux->req->req_ctx; + + assert(ux->req->type == CXIP_REQ_RBUF); + + cxip_ptelist_buf_put(buf, repost); + free(ux); + + RXC_DBG(buf->rxc, "%s buf %p ux %p\n", + cxip_ptelist_to_str(buf->pool), buf, ux); +} diff --git a/prov/cxi/src/cxip_rdzv_pte.c b/prov/cxi/src/cxip_rdzv_pte.c new file mode 100644 index 00000000000..a0af4ea10ce --- /dev/null +++ b/prov/cxi/src/cxip_rdzv_pte.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "config.h" +#include "cxip.h" + +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +static void cxip_rdzv_pte_cb(struct cxip_pte *pte, const union c_event *event) +{ + switch (pte->state) { + case C_PTLTE_ENABLED: + break; + default: + CXIP_FATAL("Unexpected state received: %u\n", pte->state); + } +} + +static bool cxip_rdzv_pte_append_done(struct cxip_rdzv_pte *pte, + int expected_success_count) +{ + if (ofi_atomic_get32(&pte->le_linked_success_count) == + expected_success_count) + return true; + + if (ofi_atomic_get32(&pte->le_linked_failure_count) != 0) + return true; + + return false; +} + +static int cxip_rdzv_pte_wait_append(struct cxip_rdzv_pte *pte, + uint32_t expected_count) +{ + int ret = FI_SUCCESS; + + /* Poll until the LE is linked or a failure occurs. */ + do { + cxip_evtq_progress(&pte->txc->tx_evtq); + sched_yield(); + } while (!cxip_rdzv_pte_append_done(pte, expected_count)); + + if (ofi_atomic_get32(&pte->le_linked_failure_count)) { + ret = -FI_EIO; + CXIP_WARN("Failed to append zero byte put LE: %d:%s\n", ret, + fi_strerror(-ret)); + } + + return ret; +} + +static void cxip_rdzv_pte_src_reqs_free(struct cxip_rdzv_match_pte *pte) +{ + int i; + + /* The corresponding LE is not freed using an unlink command. Instead, + * this logic relies on the freeing of the hardware PtlTE to release the + * LEs. + */ + for (i = 0; i < RDZV_SRC_LES; i++) { + if (pte->src_reqs[i]) + cxip_evtq_req_free(pte->src_reqs[i]); + } +} + +/* caller should hold ep_obj->lock */ +int cxip_rdzv_pte_src_req_alloc(struct cxip_rdzv_match_pte *pte, int lac) +{ + int ret; + union cxip_match_bits mb; + union cxip_match_bits ib; + uint32_t le_flags; + int expected_success_count; + struct cxip_req *req; + struct cxip_rdzv_pte *base = &pte->base_pte; + + /* Reuse a previously allocated request whenever possible. 
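+ * One source request/LE is kept per LAC: it is created lazily on first use
+ * (the early return below) and released only when the matching PtlTE is
+ * freed, see cxip_rdzv_pte_src_reqs_free().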
*/ + if (pte->src_reqs[lac]) + return FI_SUCCESS; + + mb.raw = 0; + mb.rdzv_lac = lac; + ib.raw = ~0; + ib.rdzv_lac = 0; + le_flags = C_LE_UNRESTRICTED_BODY_RO | C_LE_UNRESTRICTED_END_RO | + C_LE_OP_GET | C_LE_EVENT_UNLINK_DISABLE; + + req = cxip_evtq_req_alloc(&base->txc->tx_evtq, 1, base); + if (!req) { + ret = -FI_EAGAIN; + CXIP_WARN("Failed to allocate %d rendezvous source request: %d:%s\n", + lac, ret, fi_strerror(-ret)); + return ret; + } + req->cb = cxip_rdzv_pte_src_cb; + + expected_success_count = + ofi_atomic_get32(&base->le_linked_success_count) + 1; + + ret = cxip_pte_append(base->pte, 0, -1ULL, lac, C_PTL_LIST_PRIORITY, + req->req_id, mb.raw, ib.raw, + CXI_MATCH_ID_ANY, 0, le_flags, NULL, + base->txc->rx_cmdq, true); + if (ret) { + CXIP_WARN("Failed to issue %d rendezvous source request LE append: %d:%s\n", + lac, ret, fi_strerror(-ret)); + goto err_free_req; + } + + ret = cxip_rdzv_pte_wait_append(base, expected_success_count); + if (ret != FI_SUCCESS) + goto err_free_req; + + pte->src_reqs[lac] = req; + + return FI_SUCCESS; + +err_free_req: + cxip_evtq_req_free(req); + + return ret; +} + +static void cxip_rdzv_pte_zbp_req_free(struct cxip_rdzv_match_pte *pte) +{ + /* The corresponding LE is not freed using an unlink command. Instead, + * this logic relies on the freeing of the hardware PtlTE to release the + * LEs. + */ + cxip_evtq_req_free(pte->zbp_req); +} + +static int cxip_rdzv_pte_zbp_req_alloc(struct cxip_rdzv_match_pte *pte) +{ + uint32_t le_flags = C_LE_UNRESTRICTED_BODY_RO | + C_LE_UNRESTRICTED_END_RO | C_LE_OP_PUT | + C_LE_EVENT_UNLINK_DISABLE; + union cxip_match_bits mb = { + .le_type = CXIP_LE_TYPE_ZBP, + }; + union cxip_match_bits ib = { + .tag = ~0, + .tx_id = ~0, + .cq_data = 1, + .tagged = 1, + .match_comp = 1, + .rdzv_done = 1, + }; + struct cxip_rdzv_pte *base = &pte->base_pte; + int ret; + int expected_success_count; + + pte->zbp_req = cxip_evtq_req_alloc(&base->txc->tx_evtq, 1, pte); + if (!pte->zbp_req) { + ret = -FI_ENOMEM; + CXIP_WARN("Failed to allocate zero byte put request: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + pte->zbp_req->cb = cxip_rdzv_pte_zbp_cb; + + expected_success_count = + ofi_atomic_get32(&base->le_linked_success_count) + 1; + + ret = cxip_pte_append(base->pte, 0, 0, 0, C_PTL_LIST_PRIORITY, + pte->zbp_req->req_id, mb.raw, ib.raw, + CXI_MATCH_ID_ANY, 0, le_flags, NULL, + base->txc->rx_cmdq, true); + if (ret) { + CXIP_WARN("Failed to issue zero byte put LE append: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_req; + } + + ret = cxip_rdzv_pte_wait_append(base, expected_success_count); + if (ret != FI_SUCCESS) + goto err_free_req; + + return FI_SUCCESS; + +err_free_req: + cxip_evtq_req_free(pte->zbp_req); + + return ret; +} + +static void cxip_rdzv_pte_free(struct cxip_rdzv_pte *pte) +{ + /* Freeing the PtlTE causes the PtlTE to be reset and all LEs to be + * freed. Thus, no need to issue disable and/or unlink commands. + */ + cxip_pte_free(pte->pte); + + /* Flush the CQ to ensure any events referencing the rendezvous requests + * are processed. + */ + cxip_evtq_progress(&pte->txc->tx_evtq); +} + +void cxip_rdzv_match_pte_free(struct cxip_rdzv_match_pte *pte) +{ + cxip_rdzv_pte_free(&pte->base_pte); + + /* Release all the rendezvous requests. 
*/ + cxip_rdzv_pte_src_reqs_free(pte); + cxip_rdzv_pte_zbp_req_free(pte); + + free(pte); +} + +void cxip_rdzv_nomatch_pte_free(struct cxip_rdzv_nomatch_pte *pte) +{ + cxip_rdzv_pte_free(&pte->base_pte); + cxip_evtq_req_free(pte->le_req); + + free(pte); +} + +static int cxip_rdzv_base_pte_alloc(struct cxip_txc *txc, + uint32_t write_pid_idx, bool write, + uint32_t read_pid_idx, bool read, + bool matching, + struct cxip_rdzv_pte *base_pte) +{ + int ret; + struct cxi_pt_alloc_opts pt_opts = { + .is_matching = matching, + }; + + base_pte->txc = txc; + ofi_atomic_initialize32(&base_pte->le_linked_success_count, 0); + ofi_atomic_initialize32(&base_pte->le_linked_failure_count, 0); + + if (matching && txc->ep_obj->av->symmetric) + pt_opts.use_logical = 1; + + /* Reserve the Rendezvous Send PTE */ + ret = cxip_pte_alloc_nomap(txc->ep_obj->ptable, txc->tx_evtq.eq, + &pt_opts, cxip_rdzv_pte_cb, txc, + &base_pte->pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to alloc base rendezvous PtlTE: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + if (write) { + ret = cxip_pte_map(base_pte->pte, write_pid_idx, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map write PTE: %d\n", ret); + goto err_free_rdzv_pte; + } + } + + if (read) { + ret = cxip_pte_map(base_pte->pte, read_pid_idx, false); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map read PTE: %d\n", ret); + goto err_free_rdzv_pte; + } + } + + /* Set to enable, event will be processed on link */ + ret = cxip_pte_set_state(base_pte->pte, txc->rx_cmdq, + C_PTLTE_ENABLED, 0); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to enqueue enable command: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_free_rdzv_pte; + } + + return FI_SUCCESS; + +err_free_rdzv_pte: + cxip_pte_free(base_pte->pte); + base_pte->pte = NULL; + + return ret; +} + +/* ep_obj->lock should be held by caller */ +int cxip_rdzv_match_pte_alloc(struct cxip_txc *txc, + struct cxip_rdzv_match_pte **rdzv_pte) +{ + int ret; + struct cxip_rdzv_match_pte *match_pte; + uint32_t pid_idx = txc->domain->iface->dev->info.rdzv_get_idx; + struct cxip_rdzv_pte *base; + + match_pte = calloc(1, sizeof(*match_pte)); + if (!match_pte) { + ret = -ENOMEM; + CXIP_WARN("Rendzvous matching PtlTE allocation error: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + ret = cxip_rdzv_base_pte_alloc(txc, pid_idx, true, + CXIP_PTL_IDX_RDZV_DEST, true, + true, &match_pte->base_pte); + if (ret != FI_SUCCESS) + goto err_free_rdzv_pte_mem; + + /* Matching specific initialization */ + base = &match_pte->base_pte; + + ret = cxip_rdzv_pte_zbp_req_alloc(match_pte); + if (ret) { + CXIP_WARN("Failed to allocate zero byte put request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_rdzv_pte; + } + *rdzv_pte = match_pte; + + return FI_SUCCESS; + +err_free_rdzv_pte: + cxip_pte_free(base->pte); +err_free_rdzv_pte_mem: + free(match_pte); + + return ret; +} + +/* ep_obj->lock should be held by caller */ +int cxip_rdzv_nomatch_pte_alloc(struct cxip_txc *txc, int lac, + struct cxip_rdzv_nomatch_pte **rdzv_pte) +{ + int ret; + struct cxip_rdzv_nomatch_pte *nomatch_pte; + struct cxip_rdzv_pte *base; + uint32_t le_flags; + uint32_t expected_success_count; + uint64_t ib = 0; + uint32_t pid_idx = CXIP_PTL_IDX_RDZV_RESTRICTED(lac); + +#if ENABLE_DEBUG + /* Enable testing of fallback to default rendezvous protocol + * if unable to allocate required non-matching PTE/LE resources. 
+ */ + if (txc->force_err & CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC) + return -FI_ENOSPC; +#endif + nomatch_pte = calloc(1, sizeof(*nomatch_pte)); + if (!nomatch_pte) { + ret = -ENOMEM; + CXIP_WARN("Rdzv restricted PtlTE allocation error: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + ret = cxip_rdzv_base_pte_alloc(txc, 0, false, pid_idx, true, + false, &nomatch_pte->base_pte); + if (ret != FI_SUCCESS) + goto err_free_rdzv_pte_mem; + + /* Non-matching specific initialization */ + base = &nomatch_pte->base_pte; + + nomatch_pte->le_req = cxip_evtq_req_alloc(&txc->tx_evtq, 1, + nomatch_pte); + if (!nomatch_pte->le_req) { + ret = -FI_EAGAIN; + CXIP_WARN("Rdzv PtlTE LAC %d request allocation error: %d:%s\n", + lac, ret, fi_strerror(-ret)); + goto err_free_rdzv_pte; + } + + nomatch_pte->le_req->cb = cxip_rdzv_pte_src_cb; + + le_flags = C_LE_EVENT_COMM_DISABLE | C_LE_EVENT_SUCCESS_DISABLE | + C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_GET; + ib = 1; + expected_success_count = + ofi_atomic_get32(&base->le_linked_success_count) + 1; + + ret = cxip_pte_append(base->pte, 0, -1ULL, lac, C_PTL_LIST_PRIORITY, + nomatch_pte->le_req->req_id, 0, ib, + CXI_MATCH_ID_ANY, 0, le_flags, NULL, + txc->rx_cmdq, true); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to enqueue append cmd: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_rdzv_pte; + } + + ret = cxip_rdzv_pte_wait_append(base, expected_success_count); + if (ret != FI_SUCCESS) + goto err_free_req; + + *rdzv_pte = nomatch_pte; + return FI_SUCCESS; + +err_free_req: + cxip_evtq_req_free(nomatch_pte->le_req); +err_free_rdzv_pte: + cxip_pte_free(nomatch_pte->base_pte.pte); +err_free_rdzv_pte_mem: + free(nomatch_pte); + + return ret; +} diff --git a/prov/cxi/src/cxip_repsum.c b/prov/cxi/src/cxip_repsum.c new file mode 100644 index 00000000000..6c0f5c93186 --- /dev/null +++ b/prov/cxi/src/cxip_repsum.c @@ -0,0 +1,283 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + */ + +/* Notes: + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cxip.h" + +/** + * @brief REPRODUCIBLE SUM IMPLEMENATION + * + * - Reference: + * - https://www2.eecs.berkeley.edu/Pubs/TechRpts/2016/EECS-2016-121.pdf + * Algorithm 7 + * - Example Code: + * - https://github.com/peterahrens/ReproBLAS.git + * + * This code supplies the software component of the RSDG Reproducible Sum + * collective reduction operation. + * + * Conceptually, the 52-bit mantissa of a double precision IEEE floating point + * value, extended to 53-bits to include the "hidden" bit, is placed in a + * register containing 2048 bits (the full possible range of IEEE double + * exponents) and shifted so that the MSBit of the mantissa is aligned with the + * 11-bit exponent. + * + * This large register is then divided into numbered "bins" of W bits. Each bin + * is then expanded by adding (64 - W) zero bits to the most-significant end of + * each bin, and these 64-bit quantities are copied into an array of Kt 64-bit + * registers, along with the bin number M in which the MSBit of the value is + * located. + * + * The extra space in each bin allow us to sum without carry from bin-to-bin + * until the end of the computation. With W=40, there are 24 bits of overflow, + * allowing at least 2^24 summations to occur before overflow can occur. + * + * If overflow does occur, both Rosetta and this software set an overflow flag, + * and the final result should be treated as invalid. 
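+ *
+ * Worked example (W=40, Kt=4): d = 1.5 has biased exponent e = 1023, so
+ * BIN(e) = 0 and OFF(e) = 0.  The 53-bit mantissa 0x18000000000000 splits
+ * into M = 0, T[3] = 0x1 (the hidden bit), T[2] = 0x8000000000 (the top 40
+ * fraction bits) and T[1] = T[0] = 0; cxip_rep_to_dbl() then reconstructs
+ * 1 * 2^0 + 2^39 * 2^-40 = 1.5.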
+ * + * Low order bits can be discarded in the process, and this will set an inexact + * flag. The result should still be reproducible, and accurate to within + * round-off error. + */ + +#define W 40 +#define Kt 4 + +/* special values of M for non-numbers */ +#define MNInf 125 +#define MInf 126 +#define MNaN 127 + +/** + * @Description + * + * BIN() converts the exponent 'e' to a W-bit bin number. + * + * OFF() provides the offset of exponent 'e' within the W-bit bin. + * + * MSK() provides a bitmask for the W LSBits. + */ +#define BIN(e) (((e) - 1023 + 1024*W)/W - 1024) +#define OFF(e) ((e) - 1023 - W*BIN(e)) +#define MSK(w) ((1ULL << w) - 1) + +/** + * @brief Convert double to repsum + * + * Rosetta expects T[0] to be the LSBits of the value, so we load from Kt-1 + * downward. Because W=40, T[0] will always be zero: 53 bits of mantissa cannot + * span more than three 40-bit registers, regardless of alignment. + * + * Note that injection of a sNaN will set the invalid bit. + * + * @param x returned repsum object + * @param d double to convert + */ +void cxip_dbl_to_rep(struct cxip_repsum *x, double d) +{ + unsigned long m; // double mantissa + int e; // double exponent + int s; // double sign + int w; // bin offset of MSbit + int lsh; // left-shift amount + int rem; // remaining bits to shift + int siz; // number of bits to keep + int i; + + memset(x, 0, sizeof(*x)); + _decompose_dbl(d, &s, &e, &m); + if (isnan(d)) { + // NaN, bit 51 clear is sNaN, sign ignored + x->M = MNaN; + w = 0; + m = 0; + // injecting sNaN sets the invalid bit + x->invalid = !(m & 0x0008000000000000); + } else if (isinf(d)) { + // inf, sign captured in x->M + x->M = (s < 0) ? MNInf : MInf; + w = 0; + m = 0; + // injecting inf sets the overflow bit + x->overflow = true; + x->overflow_id = 3; + } else if (e) { + // Normal values, extend m with implicit MSBit == 1 + x->M = BIN(e); + w = OFF(e); + m |= 1ULL << 52; + } else { + // Subnormal values, zero + x->M = BIN(1); + w = OFF(1); + } + + /** + * Copy the mantissa into the correct locations within T[]. + * + * T[3] should contain the w+1 MSBits of m, aligned to bit 0. + * T[2] should contain the next W bits, aligned to bit W-1. + * T[1] should contain any remaining bits, aligned to bit W-1. + * T[0] will always be zero. + */ + rem = 53; // number of bits to process + siz = w + 1; // bits to include in MSRegister + lsh = 0; // left-shift to align + i = Kt; // start with most significant + while (rem) { + x->T[--i] = s*((m >> (rem - siz)) << lsh); + rem -= siz; // siz MSBits consumed + m &= MSK(rem); // keep only rem LSBits + siz = (rem < W) ? rem : W; + lsh = W - siz; // align to bit W-1 + } + while (i) + x->T[--i] = 0; // clear remaining bins +} + +/** + * @brief Convert repsum back to double. + * + * Simply use scalbn() to scale the signed mantissas and add to the accumulator. + * + * @param x repsum object + * @return double returned value + */ +void cxip_rep_to_dbl(double *d, const struct cxip_repsum *x) +{ + int i, m; + + *d = 0.0; + switch (x->M) { + case MNaN: // quiet NaN only + *d = NAN; + return; + case MNInf: + *d = -INFINITY; + return; + case MInf: + *d = INFINITY; + return; + } + m = x->M; + for (i = Kt-1; i >= 0; i--) { + *d += scalbn(1.0*(int64_t)x->T[i], W*m); + m--; + } +} + +/** + * @brief Add two repsum objects, and return the result in x. 
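+ *
+ * Illustrative usage (a sketch; flag checks omitted):
+ *
+ *   struct cxip_repsum a, b;
+ *   double r;
+ *
+ *   cxip_dbl_to_rep(&a, 1.0e16);
+ *   cxip_dbl_to_rep(&b, 1.0);
+ *   cxip_rep_add(&a, &b);       // bin-wise add, no rounding yet
+ *   cxip_rep_to_dbl(&r, &a);    // rounds once, at the end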
+ * + * @param x accumulator + * @param y added to accumulator + */ +void cxip_rep_add(struct cxip_repsum *x, const struct cxip_repsum *y) +{ + struct cxip_repsum swap; + int i, j; + + /* swap x and y if necessary, to make x the largest M. + * NaN is largest, followed by +Inf, -Inf, and numbers + */ + if (y->M > x->M) { + memcpy(&swap, x, sizeof(struct cxip_repsum)); + memcpy(x, y, sizeof(struct cxip_repsum)); + y = (const struct cxip_repsum *)&swap; + } + /* +Inf > -Inf, and if added, promote to NaN */ + if (x->M == MInf && y->M == MNInf) { + x->M = MNaN; + /* subtracting infinities sets the invalid bit */ + x->invalid = true; + } + /* Handle the not-numbers */ + if (x->M == MNaN || x->M == MInf || x->M == MNInf) + return; + /* inexact always propagates, no matter how small */ + if (y->inexact) + x->inexact = true; + /* advance j until bins are aligned, note bits discarded */ + for (j = 0; j < Kt && j + y->M < x->M; j++) { + if (y->T[j]) + x->inexact = true; + } + /* any remaining overflow propagates */ + if (y->overflow && y->overflow_id >= j) { + x->overflow = true; + x->overflow_id = y->overflow_id - j; + } + /* Add remaining y to x in each aligned bin, check for overflow */ + for (i = 0; i < Kt && j < Kt; i++, j++) { + int sgn0, sgn1; + + sgn0 = x->T[i] >> 63; + x->T[i] += y->T[j]; + sgn1 = x->T[i] >> 63; + /* sign change in wrong direction */ + if (sgn0 != sgn1 && sgn1 != y->T[j] >> 63) { + x->inexact = true; + x->overflow = true; + x->overflow_id = MAX(x->overflow_id, i); + } + } +} + +/** + * @brief Add two doubles using the repsum method. + * + * @param d1 : operand 1 + * @param d2 : operand 2 + * @return double result + */ +double cxip_rep_add_dbl(double d1, double d2) +{ + struct cxip_repsum x, y; + + cxip_dbl_to_rep(&x, d1); + cxip_dbl_to_rep(&y, d2); + cxip_rep_add(&x, &y); + cxip_rep_to_dbl(&d1, &x); + + return d1; +} + +/** + * @brief Sum over a list of values. 
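+ *
+ * Because partial sums are held in fixed W-bit bins, the result is
+ * independent of the order of the input values, unlike a naive
+ * left-to-right double sum.  For example:
+ *
+ *   double v[] = { 1.0e16, 1.0, -1.0e16 };
+ *   double r = cxip_rep_sum(3, v);   // same r for any permutation of v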
+ * + * @param count : count of values + * @param values : array of values to sum + * @return double result + */ +double cxip_rep_sum(size_t count, double *values) +{ + struct cxip_repsum x, y; + double d; + size_t i; + + if (count <= 0) + return 0.0; + if (count == 1) + return values[0]; + + cxip_dbl_to_rep(&x, values[0]); + for (i = 1; i < count; i++) { + cxip_dbl_to_rep(&y, values[i]); + cxip_rep_add(&x, &y); + } + cxip_rep_to_dbl(&d, &x); + return d; +} diff --git a/prov/cxi/src/cxip_req_buf.c b/prov/cxi/src/cxip_req_buf.c new file mode 100644 index 00000000000..84c72cd3488 --- /dev/null +++ b/prov/cxi/src/cxip_req_buf.c @@ -0,0 +1,321 @@ +/* + * (C) Copyright 2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "config.h" +#include "cxip.h" + +static bool cxip_req_buf_is_head(struct cxip_ptelist_buf *buf) +{ + struct cxip_ptelist_buf *head_buf = + container_of(buf->pool->active_bufs.next, + struct cxip_ptelist_buf, buf_entry); + + return head_buf == buf; +} + +static bool cxip_req_buf_is_consumed(struct cxip_ptelist_buf *buf) +{ + return buf->unlink_length && buf->unlink_length == buf->cur_offset && + dlist_empty(&buf->request.pending_ux_list); +} + +static bool cxip_req_buf_is_next_put(struct cxip_ptelist_buf *buf, + const union c_event *event) +{ + return (CXI_VA_TO_IOVA(buf->md->md, buf->data) + buf->cur_offset) == + event->tgt_long.start; +} + +static void cxip_req_buf_get_header_info(struct cxip_ptelist_buf *buf, + struct cxip_ux_send *ux, + size_t *header_length, + uint64_t *remote_offset) +{ + struct c_port_fab_hdr *fab_hdr = + (void *)CXI_IOVA_TO_VA(buf->md->md, ux->put_ev.tgt_long.start); + struct c_port_unrestricted_hdr *unres_hdr = + (void *)((char *)fab_hdr + sizeof(*fab_hdr)); + + if (fab_hdr->ver != 4) + RXC_FATAL(buf->rxc, "Unsupported fabric header version: %u\n", + fab_hdr->ver); + + switch (unres_hdr->ver_pkt_type) { + case C_V4_PKT_UNRESTRICTED: + *header_length = sizeof(*fab_hdr) + + sizeof(struct c_port_unrestricted_hdr); + *remote_offset = + c_port_unrestricted_hdr_get_remote_offset(unres_hdr); + break; + case C_V4_PKT_SMALLMSG: + *header_length = sizeof(*fab_hdr) + + sizeof(struct c_port_small_msg_hdr); + *remote_offset = 0; + break; + default: + RXC_FATAL(buf->rxc, "Unsupported packet type: %u\n", + unres_hdr->ver_pkt_type); + } +} + +void cxip_req_buf_ux_free(struct cxip_ux_send *ux) +{ + _cxip_req_buf_ux_free(ux, true); +} + +static struct cxip_ux_send *cxip_req_buf_ux_alloc(struct cxip_ptelist_buf *buf, + const union c_event *event) +{ + struct cxip_ux_send *ux; + + ux = calloc(1, sizeof(*ux)); + if (!ux) + return NULL; + + ux->put_ev = *event; + ux->req = buf->req; + dlist_init(&ux->rxc_entry); + cxip_ptelist_buf_get(buf); + + RXC_DBG(buf->rxc, "Referenced REQ buf=%p ux=%p\n", buf, ux); + + return ux; +} + +/* Caller must hold ep_obj->lock */ +static int cxip_req_buf_process_ux(struct cxip_ptelist_buf *buf, + struct cxip_ux_send *ux) +{ + struct cxip_rxc *rxc = buf->rxc; + size_t header_length; + uint64_t remote_offset; + int ret; + size_t unlink_length; + bool unlinked = ux->put_ev.tgt_long.auto_unlinked; + + /* Pre-processing of unlink events. */ + if (unlinked) + unlink_length = ux->put_ev.tgt_long.start - + CXI_VA_TO_IOVA(buf->md->md, buf->data) + + ux->put_ev.tgt_long.mlength; + + buf->cur_offset += ux->put_ev.tgt_long.mlength; + + /* Fixed the put event to point to where the payload resides in the + * request buffer. 
In addition, extract the remote offset needed for + * rendezvous. + */ + cxip_req_buf_get_header_info(buf, ux, &header_length, &remote_offset); + assert((ssize_t)ux->put_ev.tgt_long.mlength - + (ssize_t)header_length >= 0); + + ux->put_ev.tgt_long.start += header_length; + ux->put_ev.tgt_long.mlength -= header_length; + ux->put_ev.tgt_long.remote_offset = remote_offset + + ux->put_ev.tgt_long.mlength; + + rxc->sw_ux_list_len++; + + ret = cxip_recv_ux_sw_matcher(ux); + switch (ret) { + /* Unexpected message needs to be processed again. Put event fields + * need to be reset. + */ + case -FI_EAGAIN: + ux->put_ev.tgt_long.mlength += header_length; + ux->put_ev.tgt_long.start -= header_length; + buf->cur_offset -= ux->put_ev.tgt_long.mlength; + + rxc->sw_ux_list_len--; + return -FI_EAGAIN; + + /* Unexpected message failed to match a user posted request. Need to + * queue the unexpected message for future processing. + */ + case -FI_ENOMSG: + /* Check to see if a PtlTE transition to software managed EP + * is in progress, and if so add to the pending UX list which + * will be appended to software UX message list following + * completion of the on-loading. + */ + if (rxc->state != RXC_ENABLED_SOFTWARE && + rxc->state != RXC_FLOW_CONTROL) { + rxc->sw_ux_list_len--; + dlist_insert_tail(&ux->rxc_entry, + &rxc->sw_pending_ux_list); + rxc->sw_pending_ux_list_len++; + + RXC_DBG(buf->rxc, + "rbuf=%p ux=%p sw_pending_ux_list_len=%u\n", + buf, ux, buf->rxc->sw_pending_ux_list_len); + } else { + dlist_insert_tail(&ux->rxc_entry, &rxc->sw_ux_list); + + RXC_DBG(buf->rxc, "rbuf=%p ux=%p sw_ux_list_len=%u\n", + buf, ux, buf->rxc->sw_ux_list_len); + } + break; + + /* Unexpected message successfully matched a user posted request. */ + case FI_SUCCESS: + break; + + default: + RXC_FATAL(rxc, "Unexpected cxip_recv_ux_sw_matcher() rc: %d\n", + ret); + } + + /* Once unexpected send has been accepted, complete processing of the + * unlink. + */ + if (unlinked) { + buf->unlink_length = unlink_length; + ofi_atomic_dec32(&buf->pool->bufs_linked); + + RXC_DBG(rxc, "rbuf=%p rxc_rbuf_linked=%u\n", buf, + ofi_atomic_get32(&buf->pool->bufs_linked)); + + /* Replenish to keep minimum linked */ + ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, false); + if (ret) + RXC_WARN(rxc, "Request replenish failed: %d\n", ret); + } + + RXC_DBG(rxc, "rbuf=%p processed ux_send=%p\n", buf, ux); + + return FI_SUCCESS; +} + +static void cxip_req_buf_progress_pending_ux(struct cxip_ptelist_buf *buf) +{ + struct cxip_ux_send *ux; + struct dlist_entry *tmp; + int ret; + +again: + dlist_foreach_container_safe(&buf->request.pending_ux_list, + struct cxip_ux_send, ux, rxc_entry, tmp) { + if (cxip_req_buf_is_next_put(buf, &ux->put_ev)) { + dlist_remove(&ux->rxc_entry); + + /* The corresponding event from the completion queue has + * already been consumed. Thus, -FI_EAGAIN cannot be + * returned. + */ + do { + ret = cxip_req_buf_process_ux(buf, ux); + } while (ret == -FI_EAGAIN); + + /* Previously processed unexpected messages may now be + * valid. Need to reprocess the entire list. 
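+ * Puts must be consumed in request buffer offset order, so completing the
+ * entry at the current offset may make another queued entry the new "next
+ * put"; restart the scan from the head of the pending list to pick it up.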
+ */ + goto again; + } + } +} + +static int cxip_req_buf_process_put_event(struct cxip_ptelist_buf *buf, + const union c_event *event) +{ + struct cxip_ux_send *ux; + int ret = FI_SUCCESS; + struct cxip_rxc *rxc = buf->rxc; + struct cxip_ptelist_bufpool *pool = buf->pool; + + assert(event->tgt_long.mlength >= CXIP_REQ_BUF_HEADER_MIN_SIZE); + + ux = cxip_req_buf_ux_alloc(buf, event); + if (!ux) { + RXC_WARN(rxc, "Memory allocation error\n"); + return -FI_EAGAIN; + } + + /* Target events can be out-of-order with respect to how they were + * matched on the PtlTE request list. To maintain the hardware matched + * order, software unexpected entries are only processed in the order in + * which they land in the request buffer. + */ + if (cxip_req_buf_is_head(buf) && cxip_req_buf_is_next_put(buf, event)) { + ret = cxip_req_buf_process_ux(buf, ux); + if (ret == -FI_EAGAIN) { + _cxip_req_buf_ux_free(ux, false); + return ret; + } + + /* Since events arrive out-of-order, it is possible that a + * non-head request buffer receive an event. Scrub all request + * buffers processing their pending unexpected lists until a + * request buffer is not consumed. + */ + while ((buf = dlist_first_entry_or_null(&pool->active_bufs, + struct cxip_ptelist_buf, + buf_entry))) { + cxip_req_buf_progress_pending_ux(buf); + + if (cxip_req_buf_is_consumed(buf)) { + RXC_DBG(rxc, "buf=%p consumed\n", buf); + cxip_ptelist_buf_consumed(buf); + } else { + break; + } + } + } else { + /* Out-of-order target event. Queue unexpected message on + * pending list until these addition events occur. + */ + dlist_insert_tail(&ux->rxc_entry, + &buf->request.pending_ux_list); + + RXC_DBG(rxc, "rbuf=%p pend ux_send=%p\n", buf, ux); + } + + return ret; +} + +static int cxip_req_buf_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_ptelist_buf *buf = req->req_ctx; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* Success events not requested */ + cxip_ptelist_buf_link_err(buf, cxi_event_rc(event)); + return FI_SUCCESS; + + case C_EVENT_UNLINK: + assert(!event->tgt_long.auto_unlinked); + cxip_ptelist_buf_unlink(buf); + return FI_SUCCESS; + + case C_EVENT_PUT: + return cxip_req_buf_process_put_event(buf, event); + + default: + RXC_FATAL(buf->rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +int cxip_req_bufpool_init(struct cxip_rxc *rxc) +{ + struct cxip_ptelist_bufpool_attr attr = { + .list_type = C_PTL_LIST_REQUEST, + .ptelist_cb = cxip_req_buf_cb, + .buf_size = cxip_env.req_buf_size, + .min_space_avail = CXIP_REQ_BUF_HEADER_MAX_SIZE + + rxc->max_eager_size, + .min_posted = cxip_env.req_buf_min_posted, + /* Allow growing the number request bufs posted */ + .max_posted = cxip_env.req_buf_min_posted << 3, + .max_cached = cxip_env.req_buf_max_cached, + }; + + return cxip_ptelist_bufpool_init(rxc, &rxc->req_list_bufpool, &attr); +} + +void cxip_req_bufpool_fini(struct cxip_rxc *rxc) +{ + return cxip_ptelist_bufpool_fini(rxc->req_list_bufpool); +} diff --git a/prov/cxi/src/cxip_rma.c b/prov/cxi/src/cxip_rma.c new file mode 100644 index 00000000000..25877a73b04 --- /dev/null +++ b/prov/cxi/src/cxip_rma.c @@ -0,0 +1,866 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. 
+ * Copyright (c) 2018,2021-2023 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* + * cxip_rma_selective_completion_cb() - RMA selective completion callback. + */ +int cxip_rma_selective_completion_cb(struct cxip_req *req, + const union c_event *event) +{ + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + if (event->hdr.event_type == C_EVENT_SEND) { + CXIP_WARN("Unexpected %s event: rc=%s\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + int event_rc; + + event_rc = cxi_init_event_rc(event); + int ret_err; + + ret_err = proverr2errno(event_rc); + return cxip_cq_req_error(req, 0, ret_err, + cxi_event_rc(event), NULL, 0, + FI_ADDR_UNSPEC); +} + +/* + * cxip_rma_write_selective_completion_req() - Return request state associated + * with all RMA write with selective completion transactions on the transmit + * context. + * + * The request is freed when the TXC send CQ is closed. + */ +static struct cxip_req *cxip_rma_write_selective_completion_req(struct cxip_txc *txc) +{ + if (!txc->rma_write_selective_completion_req) { + struct cxip_req *req; + + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) + return NULL; + + req->cb = cxip_rma_selective_completion_cb; + req->context = (uint64_t)txc->context; + req->flags = FI_RMA | FI_WRITE; + req->addr = FI_ADDR_UNSPEC; + + txc->rma_write_selective_completion_req = req; + } + + return txc->rma_write_selective_completion_req; +} + +/* + * cxip_rma_read_selective_completion_req() - Return request state associated + * with all RMA read with selective completion transactions on the transmit + * context. + * + * The request is freed when the TXC send CQ is closed. + */ +static struct cxip_req *cxip_rma_read_selective_completion_req(struct cxip_txc *txc) +{ + if (!txc->rma_read_selective_completion_req) { + struct cxip_req *req; + + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) + return NULL; + + req->cb = cxip_rma_selective_completion_cb; + req->context = (uint64_t)txc->context; + req->flags = FI_RMA | FI_READ; + req->addr = FI_ADDR_UNSPEC; + + txc->rma_read_selective_completion_req = req; + } + + return txc->rma_read_selective_completion_req; +} + +/* + * cxip_rma_cb() - RMA event callback. + */ +static int cxip_rma_cb(struct cxip_req *req, const union c_event *event) +{ + int ret; + int event_rc; + int ret_err; + bool success_event = !!(req->flags & FI_COMPLETION); + struct cxip_txc *txc = req->rma.txc; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. 
+ */ + if (event->hdr.event_type == C_EVENT_SEND) { + TXC_WARN(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + req->flags &= (FI_RMA | FI_READ | FI_WRITE); + + if (req->rma.local_md) + cxip_unmap(req->rma.local_md); + + if (req->rma.ibuf) + cxip_txc_ibuf_free(txc, req->rma.ibuf); + + event_rc = cxi_init_event_rc(event); + if (event_rc == C_RC_OK) { + if (success_event) { + ret = cxip_cq_req_complete(req); + if (ret != FI_SUCCESS) + TXC_WARN(txc, + "Failed to report completion: %d\n", + ret); + } + } else { + ret_err = proverr2errno(event_rc); + ret = cxip_cq_req_error(req, 0, ret_err, event_rc, + NULL, 0, FI_ADDR_UNSPEC); + if (ret != FI_SUCCESS) + TXC_WARN(txc, "Failed to report error: %d\n", ret); + } + + ofi_atomic_dec32(&req->rma.txc->otx_reqs); + cxip_evtq_req_free(req); + + return FI_SUCCESS; +} + +static int cxip_rma_emit_dma(struct cxip_txc *txc, const void *buf, size_t len, + struct cxip_mr *mr, union c_fab_addr *dfa, + uint8_t *idx_ext, uint16_t vni, uint64_t addr, + uint64_t key, uint64_t data, uint64_t flags, + void *context, bool write, bool unr, + uint32_t tclass, + enum cxi_traffic_class_type tc_type, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr) +{ + struct cxip_req *req = NULL; + struct cxip_md *dma_md = NULL; + void *dma_buf; + struct c_full_dma_cmd dma_cmd = {}; + int ret; + struct cxip_domain *dom = txc->domain; + struct cxip_cntr *cntr; + void *inject_req; + + /* MR desc cannot be value unless hybrid MR desc is enabled. */ + if (!dom->hybrid_mr_desc) + mr = NULL; + + /* DMA commands always require a request structure regardless if + * FI_COMPLETION is set. This is due to the provider doing internally + * memory registration and having to clean up the registration on DMA + * operation completion. + */ + if ((len && (flags & FI_INJECT)) || (flags & FI_COMPLETION) || !mr) { + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, "Failed to allocate request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err; + } + + req->context = (uint64_t)context; + req->cb = cxip_rma_cb; + req->flags = FI_RMA | (write ? FI_WRITE : FI_READ) | + (flags & FI_COMPLETION); + req->rma.txc = txc; + req->type = CXIP_REQ_RMA; + req->trig_cntr = trig_cntr; + } + + if (len) { + /* If the operation is an DMA inject operation (which can occur + * when doing RMA commands to unoptimized MRs), a provider + * bounce buffer is always needed to store the user payload. + * + * Always prefer user provider MR over internally mapping the + * buffer. 
+ */ + if (flags & FI_INJECT) { + assert(req != NULL); + + req->rma.ibuf = cxip_txc_ibuf_alloc(txc); + if (!req->rma.ibuf) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate bounce buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_cq_req; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, req->rma.ibuf, + buf, len); + if (ret){ + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_rma_buf; + } + + dma_buf = (void *)req->rma.ibuf; + dma_md = cxip_txc_ibuf_md(req->rma.ibuf); + } else if (mr) { + dma_buf = (void *)buf; + dma_md = mr->md; + } else { + assert(req != NULL); + + ret = cxip_map(dom, buf, len, 0, &req->rma.local_md); + if (ret) { + TXC_WARN(txc, "Failed to map buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_cq_req; + } + + dma_buf = (void *)buf; + dma_md = req->rma.local_md; + } + } + + dma_cmd.command.cmd_type = C_CMD_TYPE_DMA; + dma_cmd.index_ext = *idx_ext; + dma_cmd.event_send_disable = 1; + dma_cmd.dfa = *dfa; + ret = cxip_adjust_remote_offset(&addr, key); + if (ret) { + TXC_WARN(txc, "Remote offset overflow\n"); + goto err_free_cq_req; + } + dma_cmd.remote_offset = addr; + dma_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + dma_cmd.match_bits = CXIP_KEY_MATCH_BITS(key); + + if (req) { + dma_cmd.user_ptr = (uint64_t)req; + } else { + if (write) + inject_req = cxip_rma_write_selective_completion_req(txc); + else + inject_req = cxip_rma_read_selective_completion_req(txc); + + if (!inject_req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate inject request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_rma_buf; + } + + dma_cmd.user_ptr = (uint64_t)inject_req; + dma_cmd.event_success_disable = 1; + } + + if (!unr) + dma_cmd.restricted = 1; + + if (write) { + dma_cmd.command.opcode = C_CMD_PUT; + + /* Triggered DMA operations have their own completion counter + * and the one associated with the TXC cannot be used. + */ + cntr = triggered ? comp_cntr : txc->write_cntr; + if (cntr) { + dma_cmd.event_ct_ack = 1; + dma_cmd.ct = cntr->ct->ctn; + } + + if (flags & (FI_DELIVERY_COMPLETE | FI_MATCH_COMPLETE)) + dma_cmd.flush = 1; + } else { + dma_cmd.command.opcode = C_CMD_GET; + + /* Triggered DMA operations have their own completion counter + * and the one associated with the TXC cannot be used. + */ + cntr = triggered ? comp_cntr : txc->read_cntr; + if (cntr) { + dma_cmd.event_ct_reply = 1; + dma_cmd.ct = cntr->ct->ctn; + } + } + + /* Only need to fill if DMA command address fields if MD is valid. 
*/ + if (dma_md) { + dma_cmd.lac = dma_md->md->lac; + dma_cmd.local_addr = CXI_VA_TO_IOVA(dma_md->md, dma_buf); + dma_cmd.request_len = len; + } + + ret = cxip_txc_emit_dma(txc, vni, cxip_ofi_to_cxi_tc(tclass), + tc_type, trig_cntr, trig_thresh, + &dma_cmd, flags); + if (ret) { + TXC_WARN(txc, "Failed to emit dma command: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_free_rma_buf; + } + + return FI_SUCCESS; + +err_free_rma_buf: + if (req && req->rma.ibuf) + cxip_txc_ibuf_free(txc, req->rma.ibuf); +err_free_cq_req: + if (req) + cxip_evtq_req_free(req); +err: + return ret; +} + +static int cxip_rma_emit_idc(struct cxip_txc *txc, const void *buf, size_t len, + union c_fab_addr *dfa, uint8_t *idx_ext, + uint16_t vni, uint64_t addr, uint64_t key, + uint64_t data, uint64_t flags, void *context, + bool unr, uint32_t tclass, + enum cxi_traffic_class_type tc_type) +{ + int ret; + struct cxip_req *req = NULL; + void *hmem_buf = NULL; + void *idc_buf; + struct c_cstate_cmd cstate_cmd = {}; + struct c_idc_put_cmd idc_put = {}; + void *inject_req; + + /* IDCs must be traffic if the user requests a completion event. */ + if (flags & FI_COMPLETION) { + req = cxip_evtq_req_alloc(&txc->tx_evtq, 0, txc); + if (!req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, "Failed to allocate request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err; + } + + req->context = (uint64_t)context; + req->cb = cxip_rma_cb; + req->flags = FI_RMA | FI_WRITE | (flags & FI_COMPLETION); + req->rma.txc = txc; + req->type = CXIP_REQ_RMA; + } + + /* If HMEM is request and since the buffer type may not be host memory, + * doing a memcpy could result in a segfault. Thus, an HMEM bounce + * buffer is required to ensure IDC payload is in host memory. + */ + if (txc->hmem && len) { + hmem_buf = cxip_txc_ibuf_alloc(txc); + if (!hmem_buf) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate bounce buffer: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_cq_req; + } + + ret = cxip_txc_copy_from_hmem(txc, NULL, hmem_buf, buf, len); + if (ret) { + TXC_WARN(txc, + "cxip_txc_copy_from_hmem failed: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_hmem_buf; + } + + idc_buf = hmem_buf; + } else { + idc_buf = (void *)buf; + } + + cstate_cmd.event_send_disable = 1; + cstate_cmd.index_ext = *idx_ext; + cstate_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + + if (flags & (FI_DELIVERY_COMPLETE | FI_MATCH_COMPLETE)) + cstate_cmd.flush = 1; + + if (!unr) + cstate_cmd.restricted = 1; + + if (txc->write_cntr) { + cstate_cmd.event_ct_ack = 1; + cstate_cmd.ct = txc->write_cntr->ct->ctn; + } + + /* If the user has not request a completion, success events will be + * disabled. But, if for some reason the operation completes with an + * error, an event will occur. For this case, a TXC inject request is + * allocated. This request enables the reporting of failed operation to + * the completion queue. This request is freed when the TXC is closed. 
+ */ + if (req) { + cstate_cmd.user_ptr = (uint64_t)req; + } else { + inject_req = cxip_rma_write_selective_completion_req(txc); + if (!inject_req) { + ret = -FI_EAGAIN; + TXC_WARN(txc, + "Failed to allocate inject request: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_free_hmem_buf; + } + + cstate_cmd.user_ptr = (uint64_t)inject_req; + cstate_cmd.event_success_disable = 1; + } + + idc_put.idc_header.dfa = *dfa; + + ret = cxip_adjust_remote_offset(&addr, key); + if (ret) { + TXC_WARN(txc, "Remote offset overflow\n"); + goto err_free_hmem_buf; + } + idc_put.idc_header.remote_offset = addr; + + ret = cxip_txc_emit_idc_put(txc, vni, cxip_ofi_to_cxi_tc(tclass), + tc_type, &cstate_cmd, &idc_put, idc_buf, + len, flags); + if (ret) { + TXC_WARN(txc, "Failed to emit idc_put command: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_free_hmem_buf; + } + + if (hmem_buf) + cxip_txc_ibuf_free(txc, hmem_buf); + + return FI_SUCCESS; + +err_free_hmem_buf: + if (hmem_buf) + cxip_txc_ibuf_free(txc, hmem_buf); +err_free_cq_req: + if (req) + cxip_evtq_req_free(req); +err: + return ret; +} + +static bool cxip_rma_is_unrestricted(struct cxip_txc *txc, uint64_t key, + uint64_t msg_order, bool write) +{ + /* Unoptimized keys are implemented with match bits and must always be + * unrestricted. + */ + if (!cxip_generic_is_mr_key_opt(key)) + return true; + + /* If MR indicates remote events are required unrestricted must be + * used. If the MR is a client key, we assume if FI_RMA_EVENTS are + * requested, the remote client key MR is attached to a counter or + * requires RMA events, so unrestricted is used. + */ + if (cxip_generic_is_mr_key_events(txc->ep_obj->caps, key)) + return true; + + /* If the operation is an RMA write and the user has requested fabric + * write after write ordering, unrestricted must be used. + */ + if (write && msg_order & (FI_ORDER_WAW | FI_ORDER_RMA_WAW)) + return true; + + return false; +} + +static bool cxip_rma_is_idc(struct cxip_txc *txc, uint64_t key, size_t len, + bool write, bool triggered, bool unr) +{ + size_t max_idc_size = unr ? CXIP_INJECT_SIZE : C_MAX_IDC_PAYLOAD_RES; + + /* IDC commands are not supported for unoptimized MR since the IDC + * small message format does not support remote offset which is needed + * for RMA commands. + */ + if (!cxip_generic_is_mr_key_opt(key)) + return false; + + /* IDC commands are only support with RMA writes. */ + if (!write) + return false; + + /* IDC commands only support a limited payload size. */ + if (len > max_idc_size) + return false; + + /* Triggered operations never can be issued with an IDC. */ + if (triggered) + return false; + + return true; +} + +/* + * cxip_rma_common() - Perform an RMA operation. + * + * Common RMA function. Performs RMA reads and writes of all kinds. + * + * Generally, operations are supported by Cassini DMA commands. IDC commands + * are used instead for Write operations smaller than the maximum IDC payload + * size. + * + * If the FI_COMPLETION flag is specified, the operation will generate a + * libfabric completion event. If an event is not requested and an IDC command + * is used, hardware success events will be suppressed. If a completion is + * required but an IDC can't be used, the provider tracks the request + * internally, but will suppress the libfabric event. The provider must track + * DMA commands in order to clean up the source buffer mapping on completion. 
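+ *
+ * Illustrative decision path (simplified; helper names from this file):
+ *
+ *   unr = cxip_rma_is_unrestricted(txc, key, msg_order, write);
+ *   idc = cxip_rma_is_idc(txc, key, len, write, triggered, unr);
+ *   if (idc)
+ *           cxip_rma_emit_idc(...);   // payload carried in the command
+ *   else
+ *           cxip_rma_emit_dma(...);   // may require memory registration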
+ */ +ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, + const void *buf, size_t len, void *desc, + fi_addr_t tgt_addr, uint64_t addr, uint64_t key, + uint64_t data, uint64_t flags, uint32_t tclass, + uint64_t msg_order, void *context, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr) +{ + struct cxip_addr caddr; + union c_fab_addr dfa; + uint8_t idx_ext; + uint32_t pid_idx; + enum cxi_traffic_class_type tc_type; + bool write = op == FI_OP_WRITE; + bool unr; + bool idc; + int ret; + uint16_t vni; + + if (len && !buf) { + TXC_WARN(txc, "Invalid buffer\n"); + return -FI_EINVAL; + } + + if ((flags & FI_INJECT) && len > CXIP_INJECT_SIZE) { + TXC_WARN(txc, "RMA inject size exceeds limit\n"); + return -FI_EMSGSIZE; + } + + if (len > CXIP_EP_MAX_MSG_SZ) { + TXC_WARN(txc, "RMA length exceeds limit\n"); + return -FI_EMSGSIZE; + } + + if (!cxip_generic_is_valid_mr_key(key)) { + TXC_WARN(txc, "Invalid remote key: 0x%lx\n", key); + return -FI_EKEYREJECTED; + } + + unr = cxip_rma_is_unrestricted(txc, key, msg_order, write); + idc = cxip_rma_is_idc(txc, key, len, write, triggered, unr); + + /* Build target network address. */ + ret = cxip_av_lookup_addr(txc->ep_obj->av, tgt_addr, &caddr); + if (ret) { + TXC_WARN(txc, "Failed to look up FI addr: %d:%s\n", + ret, fi_strerror(-ret)); + return ret; + } + + if (txc->ep_obj->av_auth_key) + vni = caddr.vni; + else + vni = txc->ep_obj->auth_key.vni; + + pid_idx = cxip_generic_mr_key_to_ptl_idx(txc->domain, key, write); + cxi_build_dfa(caddr.nic, caddr.pid, txc->pid_bits, pid_idx, &dfa, + &idx_ext); + + /* Select the correct traffic class type within a traffic class. */ + if (!unr && (flags & FI_CXI_HRP)) + tc_type = CXI_TC_TYPE_HRP; + else if (!unr) + tc_type = CXI_TC_TYPE_RESTRICTED; + else + tc_type = CXI_TC_TYPE_DEFAULT; + + /* IDC commands are preferred wherever possible since the payload is + * written with the command thus avoiding all memory registration. In + * addition, this allows for success events to be surpressed if + * FI_COMPLETION is not requested. + */ + ofi_genlock_lock(&txc->ep_obj->lock); + if (idc) + ret = cxip_rma_emit_idc(txc, buf, len, &dfa, &idx_ext, vni, + addr, key, data, flags, context, unr, + tclass, tc_type); + else + ret = cxip_rma_emit_dma(txc, buf, len, desc, &dfa, &idx_ext, + vni, addr, key, data, flags, context, + write, unr, tclass, tc_type, + triggered, trig_thresh, + trig_cntr, comp_cntr); + ofi_genlock_unlock(&txc->ep_obj->lock); + + if (ret) + TXC_WARN(txc, + "%s RMA %s failed: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + idc ? "IDC" : "DMA", write ? "write" : "read", + buf, len, key, addr, caddr.nic, caddr.pid, pid_idx); + else + TXC_DBG(txc, + "%s RMA %s emitted: buf=%p len=%lu rkey=%#lx roffset=%#lx nic=%#x pid=%u pid_idx=%u\n", + idc ? "IDC" : "DMA", write ? 
"write" : "read", + buf, len, key, addr, caddr.nic, caddr.pid, pid_idx); + + return ret; +} + +/* + * Libfabric APIs + */ +static ssize_t cxip_rma_write(struct fid_ep *fid_ep, const void *buf, + size_t len, void *desc, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_rma_common(FI_OP_WRITE, &ep->ep_obj->txc, buf, len, desc, + dest_addr, addr, key, 0, ep->tx_attr.op_flags, + ep->tx_attr.tclass, ep->tx_attr.msg_order, + context, false, 0, NULL, NULL); +} + +static ssize_t cxip_rma_writev(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + const void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_rma_common(FI_OP_WRITE, &ep->ep_obj->txc, buf, len, + mr_desc, dest_addr, addr, key, 0, + ep->tx_attr.op_flags, ep->tx_attr.tclass, + ep->tx_attr.msg_order, context, false, 0, NULL, + NULL); +} + +static ssize_t cxip_rma_writemsg(struct fid_ep *fid_ep, + const struct fi_msg_rma *msg, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + size_t len; + const void *buf; + void *mr_desc; + + if (!msg) { + TXC_WARN(txc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->rma_iov_count != 1) { + TXC_WARN(txc, "Invalid RMA iov\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | FI_CXI_HRP | + FI_CXI_WEAK_FENCE)) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_rma_common(FI_OP_WRITE, txc, buf, len, mr_desc, msg->addr, + msg->rma_iov[0].addr, msg->rma_iov[0].key, + msg->data, flags, ep->tx_attr.tclass, + ep->tx_attr.msg_order, msg->context, false, 0, + NULL, NULL); +} + +ssize_t cxip_rma_inject(struct fid_ep *fid_ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t addr, uint64_t key) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_rma_common(FI_OP_WRITE, &ep->ep_obj->txc, buf, len, NULL, + dest_addr, addr, key, 0, FI_INJECT, + ep->tx_attr.tclass, ep->tx_attr.msg_order, NULL, + false, 0, NULL, NULL); +} + +static ssize_t cxip_rma_read(struct fid_ep *fid_ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + + return cxip_rma_common(FI_OP_READ, &ep->ep_obj->txc, buf, len, desc, + src_addr, addr, key, 0, ep->tx_attr.op_flags, + ep->tx_attr.tclass, ep->tx_attr.msg_order, + context, false, 0, NULL, NULL); +} + +static ssize_t cxip_rma_readv(struct fid_ep *fid_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + size_t len; + const void *buf; + void *mr_desc; + + if (count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (iov && count == 1) { + len = iov[0].iov_len; + buf = iov[0].iov_base; + mr_desc = desc ? desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + return cxip_rma_common(FI_OP_READ, &ep->ep_obj->txc, buf, len, mr_desc, + src_addr, addr, key, 0, ep->tx_attr.op_flags, + ep->tx_attr.tclass, ep->tx_attr.msg_order, + context, false, 0, NULL, NULL); +} + +static ssize_t cxip_rma_readmsg(struct fid_ep *fid_ep, + const struct fi_msg_rma *msg, uint64_t flags) +{ + struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = &ep->ep_obj->txc; + size_t len; + const void *buf; + void *mr_desc; + + if (!msg) { + TXC_WARN(txc, "NULL msg not supported\n"); + return -FI_EINVAL; + } + + if (msg->rma_iov_count != 1) { + TXC_WARN(txc, "Invalid RMA iov\n"); + return -FI_EINVAL; + } + + if (msg->iov_count == 0) { + len = 0; + buf = NULL; + mr_desc = NULL; + } else if (msg->msg_iov && msg->iov_count == 1) { + len = msg->msg_iov[0].iov_len; + buf = msg->msg_iov[0].iov_base; + mr_desc = msg->desc ? msg->desc[0] : NULL; + } else { + TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + return -FI_EINVAL; + } + + if (flags & ~CXIP_READMSG_ALLOWED_FLAGS) + return -FI_EBADFLAGS; + + if (flags & FI_FENCE && !(txc->attr.caps & FI_FENCE)) + return -FI_EINVAL; + + /* If selective completion is not requested, always generate + * completions. 
+ */ + if (!txc->selective_completion) + flags |= FI_COMPLETION; + + return cxip_rma_common(FI_OP_READ, txc, buf, len, mr_desc, msg->addr, + msg->rma_iov[0].addr, msg->rma_iov[0].key, + msg->data, flags, ep->tx_attr.tclass, + ep->tx_attr.msg_order, msg->context, false, 0, + NULL, NULL); +} + +struct fi_ops_rma cxip_ep_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = cxip_rma_read, + .readv = cxip_rma_readv, + .readmsg = cxip_rma_readmsg, + .write = cxip_rma_write, + .writev = cxip_rma_writev, + .writemsg = cxip_rma_writemsg, + .inject = cxip_rma_inject, + .injectdata = fi_no_rma_injectdata, + .writedata = fi_no_rma_writedata, +}; + +struct fi_ops_rma cxip_ep_rma_no_ops = { + .size = sizeof(struct fi_ops_rma), + .read = fi_no_rma_read, + .readv = fi_no_rma_readv, + .readmsg = fi_no_rma_readmsg, + .write = fi_no_rma_write, + .writev = fi_no_rma_writev, + .writemsg = fi_no_rma_writemsg, + .inject = fi_no_rma_inject, + .injectdata = fi_no_rma_injectdata, + .writedata = fi_no_rma_writedata, +}; diff --git a/prov/cxi/src/cxip_rxc.c b/prov/cxi/src/cxip_rxc.c new file mode 100644 index 00000000000..3fce655a6d7 --- /dev/null +++ b/prov/cxi/src/cxip_rxc.c @@ -0,0 +1,555 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2019,2020-2023 Hewlett Packard Enterprise Development LP + */ + +/* CXI RX Context Management */ + +#include "config.h" + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) + +#define CXIP_SC_STATS "FC/SC stats - EQ full: %d append fail: %d no match: %d"\ + " request full: %d unexpected: %d, NIC HW2SW unexp: %d"\ + " NIC HW2SW append fail: %d\n" + +/* + * cxip_rxc_msg_enable() - Enable RXC messaging. + * + * Change the RXC RX PtlTE to enabled state. Once in enabled state, messages + * will be accepted by hardware. Prepare all messaging resources before + * enabling the RX PtlTE. + * + * Caller must hold ep_obj->lock. + */ +int cxip_rxc_msg_enable(struct cxip_rxc *rxc, uint32_t drop_count) +{ + int ret; + + /* If transitioning from disabled to the software managed state a + * synchronous call is used which handles drop count mismatches. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE) { + ret = cxil_pte_transition_sm(rxc->rx_pte->pte, drop_count); + if (ret) + RXC_WARN(rxc, + "Error transitioning to SW EP %d %s\n", + ret, fi_strerror(-ret)); + return ret; + } + + return cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, + C_PTLTE_ENABLED, drop_count); +} + +/* + * rxc_msg_disable() - Disable RXC messaging. + * + * Change the RXC RX PtlTE to disabled state. Once in disabled state, the PtlTE + * will receive no additional events. + * + * Caller must hold rxc->ep_obj->lock. + */ +static int rxc_msg_disable(struct cxip_rxc *rxc) +{ + int ret; + + if (rxc->state != RXC_ENABLED && + rxc->state != RXC_ENABLED_SOFTWARE) + RXC_FATAL(rxc, "RXC in bad state to be disabled: state=%d\n", + rxc->state); + + rxc->state = RXC_DISABLED; + + ret = cxip_pte_set_state_wait(rxc->rx_pte, rxc->rx_cmdq, &rxc->rx_evtq, + C_PTLTE_DISABLED, 0); + if (ret == FI_SUCCESS) + CXIP_DBG("RXC PtlTE disabled: %p\n", rxc); + + return ret; +} + +#define RXC_RESERVED_FC_SLOTS 1 + +/* + * rxc_msg_init() - Initialize an RX context for messaging. 
+ * + * Allocates and initializes hardware resources used for receiving expected and + * unexpected message data. + * + * Caller must hold ep_obj->lock. + */ +static int rxc_msg_init(struct cxip_rxc *rxc) +{ + int ret; + struct cxi_pt_alloc_opts pt_opts = { + .use_long_event = 1, + .is_matching = 1, + .en_flowctrl = 1, + .lossless = cxip_env.msg_lossless, + }; + struct cxi_cq_alloc_opts cq_opts = {}; + + ret = cxip_ep_cmdq(rxc->ep_obj, false, FI_TC_UNSPEC, + rxc->rx_evtq.eq, &rxc->rx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate RX CMDQ, ret: %d\n", ret); + return -FI_EDOMAIN; + } + + /* For FI_TC_UNSPEC, reuse the TX context command queue if possible. If + * a specific traffic class is requested, allocate a new command queue. + * This is done to prevent performance issues with reusing the TX + * context command queue and changing the communication profile. + */ + if (cxip_env.rget_tc == FI_TC_UNSPEC) { + ret = cxip_ep_cmdq(rxc->ep_obj, true, FI_TC_UNSPEC, + rxc->rx_evtq.eq, &rxc->tx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate TX CMDQ, ret: %d\n", ret); + ret = -FI_EDOMAIN; + goto put_rx_cmdq; + } + } else { + cq_opts.count = rxc->ep_obj->txq_size * 4; + cq_opts.flags = CXI_CQ_IS_TX; + cq_opts.policy = cxip_env.cq_policy; + + ret = cxip_cmdq_alloc(rxc->ep_obj->domain->lni, + rxc->rx_evtq.eq, &cq_opts, + rxc->ep_obj->auth_key.vni, + cxip_ofi_to_cxi_tc(cxip_env.rget_tc), + CXI_TC_TYPE_DEFAULT, &rxc->tx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate CMDQ, ret: %d\n", ret); + ret = -FI_ENOSPC; + goto put_rx_cmdq; + } + } + + /* If applications AVs are symmetric, use logical FI addresses for + * matching. Otherwise, physical addresses will be used. + */ + if (rxc->ep_obj->av->symmetric) { + CXIP_DBG("Using logical PTE matching\n"); + pt_opts.use_logical = 1; + } + + ret = cxip_pte_alloc(rxc->ep_obj->ptable, + rxc->rx_evtq.eq, CXIP_PTL_IDX_RXQ, false, + &pt_opts, cxip_recv_pte_cb, rxc, &rxc->rx_pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate RX PTE: %d\n", ret); + goto put_tx_cmdq; + } + + /* One slot must be reserved to support hardware generated state change + * events. + */ + ret = cxip_evtq_adjust_reserved_fc_event_slots(&rxc->rx_evtq, + RXC_RESERVED_FC_SLOTS); + if (ret) { + CXIP_WARN("Unable to adjust RX reserved event slots: %d\n", + ret); + goto free_pte; + } + + return FI_SUCCESS; + +free_pte: + cxip_pte_free(rxc->rx_pte); +put_tx_cmdq: + if (cxip_env.rget_tc == FI_TC_UNSPEC) + cxip_ep_cmdq_put(rxc->ep_obj, true); + else + cxip_cmdq_free(rxc->tx_cmdq); +put_rx_cmdq: + cxip_ep_cmdq_put(rxc->ep_obj, false); + + return ret; +} + +/* + * rxc_msg_fini() - Finalize RX context messaging. + * + * Free hardware resources allocated when the RX context was initialized for + * messaging. + * + * Caller must hold ep_obj->lock. + */ +static int rxc_msg_fini(struct cxip_rxc *rxc) +{ + int ret __attribute__((unused)); + + cxip_pte_free(rxc->rx_pte); + + cxip_ep_cmdq_put(rxc->ep_obj, false); + + if (cxip_env.rget_tc == FI_TC_UNSPEC) + cxip_ep_cmdq_put(rxc->ep_obj, true); + else + cxip_cmdq_free(rxc->tx_cmdq); + + cxip_evtq_adjust_reserved_fc_event_slots(&rxc->rx_evtq, + -1 * RXC_RESERVED_FC_SLOTS); + + cxip_evtq_fini(&rxc->rx_evtq); + + return FI_SUCCESS; +} + +static void cxip_rxc_free_ux_entries(struct cxip_rxc *rxc) +{ + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + + /* TODO: Manage freeing of UX entries better. This code is redundant + * with the freeing in cxip_recv_sw_matcher(). 
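The logical PTE matching path selected above in rxc_msg_init() is only available when the application's address vector is symmetric. A hedged caller-side sketch of declaring that with the standard AV attributes; the helper name and counts are illustrative:

#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Sketch: open an address vector whose contents and ordering are identical
 * on every rank.  FI_SYMMETRIC is what allows a provider to use logical
 * (index-based) addressing such as the logical PTE matching above.
 */
static int open_symmetric_av(struct fid_domain *domain, size_t count,
			     struct fid_av **av)
{
	struct fi_av_attr attr = {
		.type = FI_AV_TABLE,	/* index-addressable table */
		.count = count,		/* expected number of peers */
		.flags = FI_SYMMETRIC,	/* same content on all ranks */
	};

	return fi_av_open(domain, &attr, av, NULL);
}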
+ */ + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + dlist_remove(&ux_send->rxc_entry); + if (ux_send->req && ux_send->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux_send); + else + free(ux_send); + + rxc->sw_ux_list_len--; + } + + if (rxc->sw_ux_list_len != 0) + CXIP_WARN("sw_ux_list_len %d != 0\n", rxc->sw_ux_list_len); + assert(rxc->sw_ux_list_len == 0); + + /* Free any pending UX entries waiting from the request list */ + dlist_foreach_container_safe(&rxc->sw_pending_ux_list, + struct cxip_ux_send, ux_send, + rxc_entry, tmp) { + dlist_remove(&ux_send->rxc_entry); + if (ux_send->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux_send); + else + free(ux_send); + + rxc->sw_pending_ux_list_len--; + } + + if (rxc->sw_pending_ux_list_len != 0) + CXIP_WARN("sw_pending_ux_list_len %d != 0\n", + rxc->sw_pending_ux_list_len); + assert(rxc->sw_pending_ux_list_len == 0); +} + +static size_t cxip_rxc_get_num_events(struct cxip_rxc *rxc) +{ + size_t num_events; + + /* Hardware will ensure incoming RDMA operations have event queue space. + * It is the responsibility of software to ensure that any SW initiated + * target commands which may generate an event (e.g. append with failure + * or search) have enough space in the EQ. This can be done in two ways. + * + * 1. Continually increase EQ buffer size until EQ overflows go away. + * This option is not ideal since many application variables are in play + * which impact number of events needed. + * + * 2. Use hybrid endpoint mode to preemptively transition to software + * endpoint when event queue space may be under pressure. When in + * software endpoint mode, software should not be issuing commands, like + * append and search/search & delete, which could result in events being + * generated. + * + * For both cases, RXC size will be used to size number of events. To + * accommodate a stream of unexpected puts and append failures, RXC size + * is added again. With correct credit control for hybrid endpoint to + * preemptively transition to software endpoint, 2* RXC size should be + * enough to prevent EQ overflow. For all other cases, EQ size needs to + * be increased. + */ + + num_events = rxc->attr.size * 2; + + /* Add 1 more event for software initiated state change. */ + num_events++; + + return num_events; +} + +/* + * cxip_rxc_enable() - Enable an RX context for use. + * + * Called via fi_enable(). The context could be used in a standard endpoint or + * a scalable endpoint. + */ +int cxip_rxc_enable(struct cxip_rxc *rxc) +{ + int ret; + int tmp; + size_t num_events; + enum c_ptlte_state state; + + if (rxc->state != RXC_DISABLED) + return FI_SUCCESS; + + if (!ofi_recv_allowed(rxc->attr.caps)) { + rxc->state = RXC_ENABLED; + return FI_SUCCESS; + } + + if (!rxc->recv_cq) { + CXIP_WARN("Undefined recv CQ\n"); + return -FI_ENOCQ; + } + + num_events = cxip_rxc_get_num_events(rxc); + ret = cxip_evtq_init(&rxc->rx_evtq, rxc->recv_cq, num_events, 1); + if (ret) { + CXIP_WARN("Failed to initialize RXC event queue: %d, %s\n", + ret, fi_strerror(-ret)); + return ret; + } + + ret = rxc_msg_init(rxc); + if (ret != FI_SUCCESS) { + CXIP_WARN("rxc_msg_init returned: %d\n", ret); + ret = -FI_EDOMAIN; + goto evtq_fini; + } + + /* If starting in or able to transition to software managed + * PtlTE, append request list entries first. 
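The software-managed and hybrid PtlTE behavior referenced above is normally chosen before the provider initializes. A sketch of doing so programmatically; the environment variable name and values follow the fi_cxi(7) page added by this patch and should be treated as assumptions here:

#include <stdlib.h>
#include <rdma/fabric.h>

/* Sketch: select the RX match mode before the provider loads.  The fi_cxi(7)
 * man page is the authoritative reference for the accepted values.
 */
static struct fi_info *get_cxi_info_hybrid(void)
{
	struct fi_info *info = NULL;

	/* Assumed variable name; must be set before provider init. */
	setenv("FI_CXI_RX_MATCH_MODE", "hybrid", 1);

	if (fi_getinfo(FI_VERSION(1, 15), NULL, NULL, 0, NULL, &info))
		return NULL;

	return info;
}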
+ */ + if (cxip_software_pte_allowed()) { + ret = cxip_req_bufpool_init(rxc); + if (ret != FI_SUCCESS) + goto err_msg_fini; + } + + if (rxc->msg_offload) { + state = C_PTLTE_ENABLED; + ret = cxip_oflow_bufpool_init(rxc); + if (ret != FI_SUCCESS) + goto err_req_buf_fini; + } else { + state = C_PTLTE_SOFTWARE_MANAGED; + } + + /* Start accepting Puts. */ + ret = cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, state, 0); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_pte_set_state returned: %d\n", ret); + goto err_oflow_buf_fini; + } + + /* Wait for PTE state change */ + do { + sched_yield(); + cxip_evtq_progress(&rxc->rx_evtq); + } while (rxc->rx_pte->state != state); + + rxc->pid_bits = rxc->domain->iface->dev->info.pid_bits; + CXIP_DBG("RXC messaging enabled: %p, pid_bits: %d\n", + rxc, rxc->pid_bits); + + return FI_SUCCESS; + +err_oflow_buf_fini: + if (rxc->msg_offload) + cxip_oflow_bufpool_fini(rxc); + +err_req_buf_fini: + if (cxip_software_pte_allowed()) + cxip_req_bufpool_fini(rxc); + +err_msg_fini: + tmp = rxc_msg_fini(rxc); + if (tmp != FI_SUCCESS) + CXIP_WARN("rxc_msg_fini returned: %d\n", tmp); + +evtq_fini: + cxip_evtq_fini(&rxc->rx_evtq); + + return ret; +} + +/* + * rxc_cleanup() - Attempt to free outstanding requests. + * + * Outstanding commands may be dropped when the RX Command Queue is freed. + * This leads to missing events. Attempt to gather all events before freeing + * the RX CQ. If events go missing, resources will be leaked until the + * Completion Queue is freed. + */ +static void rxc_cleanup(struct cxip_rxc *rxc) +{ + int ret; + uint64_t start; + int canceled = 0; + struct cxip_fc_drops *fc_drops; + struct dlist_entry *tmp; + + if (!ofi_atomic_get32(&rxc->orx_reqs)) + return; + + cxip_evtq_req_discard(&rxc->rx_evtq, rxc); + + do { + ret = cxip_evtq_req_cancel(&rxc->rx_evtq, rxc, 0, false); + if (ret == FI_SUCCESS) + canceled++; + } while (ret == FI_SUCCESS); + + if (canceled) + CXIP_DBG("Canceled %d Receives: %p\n", canceled, rxc); + + start = ofi_gettime_ms(); + while (ofi_atomic_get32(&rxc->orx_reqs)) { + sched_yield(); + cxip_evtq_progress(&rxc->rx_evtq); + + if (ofi_gettime_ms() - start > CXIP_REQ_CLEANUP_TO) { + CXIP_WARN("Timeout waiting for outstanding requests.\n"); + break; + } + } + + dlist_foreach_container_safe(&rxc->fc_drops, struct cxip_fc_drops, + fc_drops, rxc_entry, tmp) { + dlist_remove(&fc_drops->rxc_entry); + free(fc_drops); + } + + if (rxc->num_fc_eq_full || rxc->num_fc_no_match || + rxc->num_fc_req_full || rxc->num_fc_unexp || + rxc->num_fc_append_fail || rxc->num_sc_nic_hw2sw_unexp || + rxc->num_sc_nic_hw2sw_append_fail) + CXIP_INFO(CXIP_SC_STATS, rxc->num_fc_eq_full, + rxc->num_fc_append_fail, rxc->num_fc_no_match, + rxc->num_fc_req_full, rxc->num_fc_unexp, + rxc->num_sc_nic_hw2sw_unexp, + rxc->num_sc_nic_hw2sw_append_fail); +} + +static void cxip_rxc_dump_counters(struct cxip_rxc *rxc) +{ + int i; + int j; + int k; + size_t msg_size; + bool print_header; + int count; + + for (i = 0; i < CXIP_LIST_COUNTS; i++) { + for (j = 0; j < OFI_HMEM_MAX; j++) { + + print_header = true; + + for (k = 0; k < CXIP_COUNTER_BUCKETS; k++) { + if (k == 0) + msg_size = 0; + else + msg_size = (1ULL << (k - 1)); + + count = ofi_atomic_get32(&rxc->cntrs.msg_count[i][j][k]); + if (count) { + if (print_header) { + RXC_INFO(rxc, "Recv Message Size %s - %s Histogram\n", + c_ptl_list_strs[i], + fi_tostr(&j, FI_TYPE_HMEM_IFACE)); + RXC_INFO(rxc, "%-14s Count\n", "Size"); + print_header = false; + } + + RXC_INFO(rxc, "%-14lu %u\n", msg_size, + count); + } + } + } + + } +} + 
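As a reading aid for the histogram dump above, here is a small standalone sketch of the size-to-bucket mapping implied by the printed labels. The provider's actual recording routine is not part of this hunk, so the boundary behavior shown is an assumption:

#include <assert.h>
#include <stddef.h>

/* Illustrative helper, not part of the provider: bucket 0 holds zero-length
 * messages and bucket k (k >= 1) is labeled 1 << (k - 1), so sizes in
 * [2^n, 2^(n+1)) share the bucket labeled 2^n.
 */
static int msg_size_bucket(size_t size)
{
	int lg = 0;

	if (!size)
		return 0;

	while (size >>= 1)
		lg++;		/* lg = floor(log2(size)) */

	return lg + 1;
}

static void msg_size_bucket_example(void)
{
	assert(msg_size_bucket(0) == 0);	/* bucket labeled 0 */
	assert(msg_size_bucket(1) == 1);	/* bucket labeled 1 */
	assert(msg_size_bucket(8) == 4);	/* bucket labeled 8 */
	assert(msg_size_bucket(9) == 4);	/* still the bucket labeled 8 */
}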
+void cxip_rxc_struct_init(struct cxip_rxc *rxc, const struct fi_rx_attr *attr, + void *context) +{ + int i; + + dlist_init(&rxc->ep_list); + ofi_atomic_initialize32(&rxc->orx_hw_ule_cnt, 0); + ofi_atomic_initialize32(&rxc->orx_reqs, 0); + ofi_atomic_initialize32(&rxc->orx_tx_reqs, 0); + rxc->max_tx = cxip_env.sw_rx_tx_init_max; + + rxc->context = context; + rxc->attr = *attr; + + for (i = 0; i < CXIP_DEF_EVENT_HT_BUCKETS; i++) + dlist_init(&rxc->deferred_events.bh[i]); + + dlist_init(&rxc->fc_drops); + dlist_init(&rxc->replay_queue); + dlist_init(&rxc->sw_ux_list); + dlist_init(&rxc->sw_recv_queue); + dlist_init(&rxc->sw_pending_ux_list); + + rxc->max_eager_size = cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; + rxc->drop_count = rxc->ep_obj->asic_ver < CASSINI_2_0 ? -1 : 0; + + /* TODO make configurable */ + rxc->min_multi_recv = CXIP_EP_MIN_MULTI_RECV; + rxc->state = RXC_DISABLED; + rxc->msg_offload = cxip_env.msg_offload; + rxc->hmem = !!(attr->caps & FI_HMEM); + rxc->sw_ep_only = cxip_env.rx_match_mode == CXIP_PTLTE_SOFTWARE_MODE; + rxc->rget_align_mask = cxip_env.rdzv_aligned_sw_rget ? + cxip_env.cacheline_size - 1 : 0; + + cxip_msg_counters_init(&rxc->cntrs); +} + +/* + * cxip_rxc_disable() - Disable the RX context of an base endpoint object. + * + * Free hardware resources allocated when the context was enabled. Called via + * fi_close(). + */ +void cxip_rxc_disable(struct cxip_rxc *rxc) +{ + int ret; + + cxip_rxc_dump_counters(rxc); + + if (rxc->state == RXC_DISABLED) + return; + + if (ofi_recv_allowed(rxc->attr.caps)) { + /* Stop accepting Puts. */ + ret = rxc_msg_disable(rxc); + if (ret != FI_SUCCESS) + CXIP_WARN("rxc_msg_disable returned: %d\n", ret); + + cxip_rxc_free_ux_entries(rxc); + + rxc_cleanup(rxc); + + if (cxip_software_pte_allowed()) + cxip_req_bufpool_fini(rxc); + + if (cxip_env.msg_offload) + cxip_oflow_bufpool_fini(rxc); + + /* Free hardware resources. */ + ret = rxc_msg_fini(rxc); + if (ret != FI_SUCCESS) + CXIP_WARN("rxc_msg_fini returned: %d\n", ret); + } +} diff --git a/prov/cxi/src/cxip_telemetry.c b/prov/cxi/src/cxip_telemetry.c new file mode 100644 index 00000000000..6bbb16bea0c --- /dev/null +++ b/prov/cxi/src/cxip_telemetry.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include "config.h" +#include "cxip.h" + +#define TELEMETRY_FILE_FMT "/sys/class/cxi/cxi%u/device/telemetry/%s" + +static long cxip_telemetry_entry_read_value(struct cxip_telemetry_entry *entry) +{ + long ret; + char path[FI_PATH_MAX]; + FILE *f; + long value; + long tstamp_sec; + long tstamp_nsec; + + ret = snprintf(path, FI_PATH_MAX, TELEMETRY_FILE_FMT, + entry->telemetry->dom->iface->info->dev_id, entry->name); + if (ret < 0) + return ret; + + f = fopen(path, "r"); + if (!f) + return -errno; + + /* Even though only value is needed, extract 3 values to ensure + * telemetry data is in the expected format. 
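For reference, the "value@seconds.nanoseconds" layout parsed above can be read outside the provider with plain stdio. A hedged standalone sketch; the device index and counter name are caller-supplied, and no particular counter name is assumed:

#include <errno.h>
#include <stdio.h>

/* Sketch: read one Cassini telemetry counter using the same sysfs path
 * layout as TELEMETRY_FILE_FMT and the same "%ld@%ld.%ld" format.
 */
static int read_cxi_counter(unsigned int dev_id, const char *name,
			    long *value)
{
	char path[512];
	long sec, nsec;
	FILE *f;
	int ret;

	snprintf(path, sizeof(path),
		 "/sys/class/cxi/cxi%u/device/telemetry/%s", dev_id, name);

	f = fopen(path, "r");
	if (!f)
		return -errno;

	ret = fscanf(f, "%ld@%ld.%ld", value, &sec, &nsec);
	fclose(f);

	return ret == 3 ? 0 : -EINVAL;
}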
+ */ + ret = fscanf(f, "%ld@%ld.%ld", &value, &tstamp_sec, &tstamp_nsec); + if (ret != 3) { + if (ret == EOF) + ret = -errno; + else + ret = -FI_EINVAL; + } else { + ret = value; + } + + fclose(f); + + return ret; +} + +static void cxip_telemetry_entry_dump_delta(struct cxip_telemetry_entry *entry) +{ + long delta; + + delta = cxip_telemetry_entry_read_value(entry); + if (delta < 0) { + DOM_WARN(entry->telemetry->dom, "Failed to read %s: %ld:%s\n", + entry->name, delta, fi_strerror(-delta)); + return; + } + + if (delta < entry->value) { + DOM_WARN(entry->telemetry->dom, + "Failed to perform delta due to %s reset\n", + entry->name); + return; + } + + delta -= entry->value; + + DOM_INFO(entry->telemetry->dom, "%s: %ld\n", entry->name, delta); +} + +static int cxip_telemetry_entry_reset_value(struct cxip_telemetry_entry *entry) +{ + long ret; + + ret = cxip_telemetry_entry_read_value(entry); + if (ret < 0) { + DOM_WARN(entry->telemetry->dom, "Failed to read %s: %ld:%s\n", + entry->name, ret, fi_strerror(-ret)); + return ret; + } + + entry->value = ret; + + return FI_SUCCESS; +} + +static void +cxip_telemetry_entry_free(struct cxip_telemetry_entry *entry) +{ + dlist_remove(&entry->telemetry_entry); + free(entry); +} + +static bool +cxip_telemetry_entry_validate_token_file(struct cxip_telemetry *telemetry, + const char *telemetry_token) +{ + char path[FI_PATH_MAX]; + int ret; + + ret = snprintf(path, FI_PATH_MAX, TELEMETRY_FILE_FMT, + telemetry->dom->iface->info->dev_id, telemetry_token); + if (ret < 0) + return false; + + /* Verify user read access to the telemetry file. */ + if (access(path, R_OK)) + return false; + + return true; +} + +static bool +cxip_telemetry_entry_validate_token(struct cxip_telemetry *telemetry, + const char *telemetry_token) +{ + /* The telemetry directory has an ALL-in-binary entry. This file is + * considered invalid for this telemetry implementation. + */ + if (strcmp(telemetry_token, "ALL-in-binary") == 0) + return false; + + return cxip_telemetry_entry_validate_token_file(telemetry, + telemetry_token); +} + +static int cxip_telemetry_entry_alloc(struct cxip_telemetry *telemetry, + const char *telemetry_token) +{ + struct cxip_telemetry_entry *entry; + int ret; + + if (!cxip_telemetry_entry_validate_token(telemetry, telemetry_token)) { + DOM_WARN(telemetry->dom, "Invalid telemetry: %s\n", + telemetry_token); + return -FI_EINVAL; + } + + entry = calloc(1, sizeof(*entry)); + if (!entry) + return -FI_ENOMEM; + + entry->telemetry = telemetry; + + strncpy(entry->name, telemetry_token, TELEMETRY_ENTRY_NAME_SIZE - 1); + entry->name[TELEMETRY_ENTRY_NAME_SIZE - 1] = '\0'; + + /* Revalidate the name after the memcpy. */ + if (!cxip_telemetry_entry_validate_token(telemetry, entry->name)) { + DOM_WARN(telemetry->dom, "Invalid telemetry: %s\n", + entry->name); + ret = FI_EINVAL; + goto err_free_entry; + } + + ret = cxip_telemetry_entry_reset_value(entry); + if (ret) + goto err_free_entry; + + dlist_insert_tail(&entry->telemetry_entry, &telemetry->telemetry_list); + + return FI_SUCCESS; + +err_free_entry: + free(entry); + + return ret; +} + +static int cxip_telemetry_sleep_duration(void) +{ + int ret; + int msec_sleep; + char *path = "/sys/module/cxi_core/parameters/cntr_refresh_interval"; + FILE *f; + + f = fopen(path, "r"); + if (!f) + return -errno; + + ret = fscanf(f, "%d", &msec_sleep); + if (ret != 1) { + if (ret == EOF) + ret = -errno; + else + ret = -FI_EINVAL; + } else { + /* Convert sleep duration to seconds. 
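The comma-separated list walked by cxip_telemetry_alloc() further below normally arrives through the environment. A sketch of supplying it from application code; the variable names mirror the cxip_env fields used here and the counter names are placeholders, so treat both as assumptions:

#include <stdlib.h>

/* Sketch: request delta reporting for two hypothetical counters and,
 * optionally, restrict reporting to one resource group ID.
 */
static void enable_cxi_telemetry_example(void)
{
	setenv("FI_CXI_TELEMETRY", "counter_a,counter_b", 1);
	setenv("FI_CXI_TELEMETRY_RGID", "0", 1);
}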
*/ + ret = msec_sleep / 1000; + if (msec_sleep % 1000) + ret++; + ret = MAX(ret, 1); + } + + fclose(f); + + return ret; +} + +void cxip_telemetry_dump_delta(struct cxip_telemetry *telemetry) +{ + struct cxip_telemetry_entry *entry; + int sleep_duration; + + /* Since sysfs telemetry entries are refreshed as some interval, we need + * to sleep for a refresh interval to get updates. Else, the application + * could run and telemetry deltas would be zero. + */ + sleep_duration = cxip_telemetry_sleep_duration(); + if (sleep_duration < 0) { + DOM_WARN(telemetry->dom, + "Failed to retrieve telemetry sleep duration: %d:%s\n", + sleep_duration, fi_strerror(-sleep_duration)); + return; + } + + sleep(sleep_duration); + + dlist_foreach_container(&telemetry->telemetry_list, + struct cxip_telemetry_entry, entry, + telemetry_entry) + cxip_telemetry_entry_dump_delta(entry); +} + +void cxip_telemetry_free(struct cxip_telemetry *telemetry) +{ + struct cxip_telemetry_entry *entry; + struct dlist_entry *tmp; + + dlist_foreach_container_safe(&telemetry->telemetry_list, + struct cxip_telemetry_entry, + entry, telemetry_entry, tmp) + cxip_telemetry_entry_free(entry); + + free(telemetry); +} + +int cxip_telemetry_alloc(struct cxip_domain *dom, + struct cxip_telemetry **telemetry) +{ + struct cxip_telemetry *_telemetry; + const char *telemetry_token; + char *telemetry_copy; + int ret = FI_SUCCESS; + + /* If user defined telemtry string is NULL or RGID does not match, + * return -FI_ENOSYS signalling this function is not supported. + */ + if (!cxip_env.telemetry || + (cxip_env.telemetry_rgid >= 0 && + dom->lni->lni->id != cxip_env.telemetry_rgid)) + return -FI_ENOSYS; + + _telemetry = calloc(1, sizeof(*_telemetry)); + if (!_telemetry) + return -FI_ENOMEM; + + _telemetry->dom = dom; + dlist_init(&_telemetry->telemetry_list); + + telemetry_copy = malloc(strlen(cxip_env.telemetry) + 1); + if (!telemetry_copy) { + ret = -FI_ENOMEM; + goto err_free_telemetry; + } + + strcpy(telemetry_copy, cxip_env.telemetry); + + /* The following will parse the comma separated list and attempt to + * allocate a telemetry entry for any valid substring/token. If a + * telemetry entry fails to be allocated for a given substring/token, + * this is not considered fatal and parsing will continue. + */ + telemetry_token = strtok(telemetry_copy, ","); + while (telemetry_token != NULL) { + ret = cxip_telemetry_entry_alloc(_telemetry, telemetry_token); + if (ret) + DOM_WARN(dom, "Failed to allocated %s telemetry entry: %d:%s\n", + telemetry_token, ret, fi_strerror(-ret)); + else + DOM_INFO(dom, "Telemetry entry allocated for %s\n", + telemetry_token); + + telemetry_token = strtok(NULL, ","); + } + + free(telemetry_copy); + + if (dlist_empty(&_telemetry->telemetry_list)) { + DOM_WARN(dom, "Failed to allocated any telemetry entries\n"); + ret = -FI_EINVAL; + goto err_free_telemetry; + } + + *telemetry = _telemetry; + + return FI_SUCCESS; + +err_free_telemetry: + cxip_telemetry_free(_telemetry); + + return ret; +} diff --git a/prov/cxi/src/cxip_trace.c b/prov/cxi/src/cxip_trace.c new file mode 100644 index 00000000000..5d3a371b5f4 --- /dev/null +++ b/prov/cxi/src/cxip_trace.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +/** + * @brief TRACE function for producing runtime debugging logs + * + * The following should be inserted at the top of a code module to trace: + * + * #define TRACE(fmt, ...) 
CXIP_TRACE(<module>, fmt, ##__VA_ARGS__)
 + *
 + * If ENABLE_DEBUG is false at compile time, CXIP_TRACE is a syntactically
 + * robust NOOP which results in no code being emitted, ensuring that these
 + * trace calls do not affect performance in production, and none of the
 + * following comments apply.
 + *
 + * - cxip_trace_fn is the function that logs a trace message.
 + * - cxip_trace_flush_fn can be used to flush buffered trace messages.
 + * - cxip_trace_close_fn can be used to flush and close the output.
 + * - cxip_trace_enable_fn is used to enable/disable all tracing.
 + * - cxip_trace_set() is used to enable a tracing module.
 + * - cxip_trace_clr() is used to disable a tracing module.
 + *
 + * Modules are defined by the list of enum cxip_trace_module values, which
 + * can be extended as needed to provide finer control over tracing.
 + *
 + * The initial values are set in cxip_trace_init() below, using run-time
 + * environment variables. cxip_trace_enable() can be used to dynamically
 + * enable or disable tracing. cxip_trace_set() and cxip_trace_clr() can be
 + * used to dynamically modify which traces will generate output.
 + *
 + * Some initialization is driven by environment variables:
 + *
 + * Specifying the environment variable CXIP_TRACE_FILENAME will deliver
 + * output to a file with the specified name, followed by the PMI_RANK value
 + * (if there is one).
 + *
 + * Specifying CXIP_TRACE_APPEND in conjunction with CXIP_TRACE_FILENAME will
 + * open the file in append mode. This is important for NETSIM tests under
 + * Criterion, since each test is run in a separate process and closes all
 + * files at completion of each test.
 + *
 + * Specifying PMI_RANK as a rank value will apply a prefix to the trace lines
 + * that identifies the rank of the trace.
 + *
 + * Specifying PMI_SIZE will expand the prefix to show the number of ranks.
 + *
 + * cxip_trace_fid is exposed, and can be manipulated using the normal stream
 + * file functions. Default buffering is fully buffered output, which can
 + * result in delays in the appearance of logging information. Using
 + * setlinebuf() will run slower, but will display lines more quickly.
 + *
 + * cxip_trace_flush() forces all output to be flushed AND written to disk,
 + * but leaves the file open for more writing.
 + *
 + * cxip_trace_close() flushes all output and closes the file. 
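A compact sketch of how a provider source file opts into this tracing, mirroring the TRACE definition used by cxip_zbcoll.c further below; the environment settings would normally be exported by the launcher rather than set in code:

#include <stdlib.h>
#include "cxip.h"

#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_ZBCOLL, fmt, ##__VA_ARGS__)

static void trace_example(void)
{
	/* Shown here for completeness; typically set before launch. */
	setenv("CXIP_TRACE_ENABLE", "1", 1);
	setenv("CXIP_TRC_ZBCOLL", "1", 1);
	setenv("CXIP_TRACE_FILENAME", "trace", 1);

	TRACE("zbcoll tracing active on rank %d\n", 0);
}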
+ */ +#include "config.h" + +#include +#include +#include +#include +#include + +#include "cxip.h" + +bool cxip_trace_initialized; +bool cxip_trace_enabled; +bool cxip_trace_append; +bool cxip_trace_linebuf; // set line buffering for trace +int cxip_trace_rank; +int cxip_trace_numranks; +char *cxip_trace_filename; +FILE *cxip_trace_fid; +uint64_t cxip_trace_mask; + +/* Static initialization of default trace functions, can be overridden */ +cxip_trace_t cxip_trace_attr cxip_trace_fn = cxip_trace; +cxip_trace_flush_t cxip_trace_flush_fn = cxip_trace_flush; +cxip_trace_close_t cxip_trace_close_fn = cxip_trace_close; +cxip_trace_enable_t cxip_trace_enable_fn = cxip_trace_enable; + +/* Get environment variable as string representation of int */ +static int getenv_int(const char *name) +{ + char *env; + int value; + + value = -1; + env = getenv(name); + if (env) + sscanf(env, "%d", &value); + return value; +} + +void cxip_trace_init(void) +{ + const char *fname; + + if (cxip_trace_initialized) + return; + + cxip_trace_initialized = true; + cxip_trace_enabled = !!getenv("CXIP_TRACE_ENABLE"); + cxip_trace_append = !!getenv("CXIP_TRACE_APPEND"); + cxip_trace_linebuf = !!getenv("CXIP_TRACE_LINEBUF"); + cxip_trace_rank = getenv_int("PMI_RANK"); + cxip_trace_numranks = getenv_int("PMI_SIZE"); + cxip_trace_append = getenv("CXIP_TRACE_APPEND"); + fname = getenv("CXIP_TRACE_FILENAME"); + + cxip_trace_mask = 0L; + if (getenv("CXIP_TRC_CTRL")) + cxip_trace_set(CXIP_TRC_CTRL); + if (getenv("CXIP_TRC_ZBCOLL")) + cxip_trace_set(CXIP_TRC_ZBCOLL); + if (getenv("CXIP_TRC_CURL")) + cxip_trace_set(CXIP_TRC_CURL); + if (getenv("CXIP_TRC_COLL_PKT")) + cxip_trace_set(CXIP_TRC_COLL_PKT); + if (getenv("CXIP_TRC_COLL_JOIN")) + cxip_trace_set(CXIP_TRC_COLL_JOIN); + if (getenv("CXIP_TRC_COLL_DEBUG")) + cxip_trace_set(CXIP_TRC_COLL_DEBUG); + if (getenv("CXIP_TRC_TEST_CODE")) + cxip_trace_set(CXIP_TRC_TEST_CODE); + + if (!fname) + fname = "trace"; + if (fname) { + asprintf(&cxip_trace_filename, "./%s%d", + fname, cxip_trace_rank); + cxip_trace_fid = fopen(cxip_trace_filename, + cxip_trace_append ? "a" : "w"); + if (!cxip_trace_fid) { + fprintf(stderr, "open(%s) failed: %s\n", + cxip_trace_filename, strerror(errno)); + } + if (cxip_trace_linebuf && cxip_trace_fid) + setlinebuf(cxip_trace_fid); + } +} + +void cxip_trace_flush(void) +{ + cxip_trace_init(); + if (cxip_trace_fid) { + fflush(cxip_trace_fid); + fsync(fileno(cxip_trace_fid)); + } +} + +void cxip_trace_close(void) +{ + cxip_trace_init(); + if (cxip_trace_fid) { + cxip_trace_flush(); + fclose(cxip_trace_fid); + cxip_trace_fid = NULL; + cxip_trace_initialized = false; + } +} + +int cxip_trace_attr cxip_trace(const char *fmt, ...) +{ + va_list args; + char *str; + int len; + + cxip_trace_init(); + if (!cxip_trace_enabled) + return 0; + va_start(args, fmt); + len = vasprintf(&str, fmt, args); + va_end(args); + if (len >= 0) { + len = fprintf(cxip_trace_fid, "[%2d|%2d] %s", + cxip_trace_rank, cxip_trace_numranks, str); + free(str); + } + return len; +} + +bool cxip_trace_enable(bool enable) +{ + bool was_enabled = cxip_trace_enabled; + + cxip_trace_init(); + cxip_trace_enabled = enable; + return was_enabled; +} diff --git a/prov/cxi/src/cxip_txc.c b/prov/cxi/src/cxip_txc.c new file mode 100644 index 00000000000..a15ed8ee65b --- /dev/null +++ b/prov/cxi/src/cxip_txc.c @@ -0,0 +1,695 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2014 Intel Corporation. All rights reserved. 
+ * Copyright (c) 2019-2023 Hewlett Packard Enterprise Development LP + */ + +/* CXI TX Context Management */ + +#include "config.h" + +#include +#include + +#include "cxip.h" + +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +/* 8 Rendezvous, 2 RMA and 2 Atomic + 4 extra */ +#define CXIP_INTERNAL_TX_REQS 16 + +struct cxip_md *cxip_txc_ibuf_md(void *ibuf) +{ + return ofi_buf_hdr(ibuf)->region->context; +} + +/* + * cxip_txc_ibuf_alloc() - Allocate an inject buffer. + * + * Caller must hold txc->ep_obj.lock + */ +void *cxip_txc_ibuf_alloc(struct cxip_txc *txc) +{ + void *ibuf; + + ibuf = (struct cxip_req *)ofi_buf_alloc(txc->ibuf_pool); + if (ibuf) + CXIP_DBG("Allocated inject buffer: %p\n", ibuf); + else + CXIP_WARN("Failed to allocate inject buffer\n"); + + return ibuf; +} + +/* + * cxip_txc_ibuf_free() - Free an inject buffer. + * + * Caller must hold txc->ep_obj.lock + */ +void cxip_txc_ibuf_free(struct cxip_txc *txc, void *ibuf) +{ + ofi_buf_free(ibuf); + CXIP_DBG("Freed inject buffer: %p\n", ibuf); +} + +int cxip_ibuf_chunk_init(struct ofi_bufpool_region *region) +{ + struct cxip_txc *txc = region->pool->attr.context; + struct cxip_md *md; + int ret; + + ret = cxip_map(txc->domain, region->mem_region, + region->pool->region_size, OFI_MR_NOCACHE, &md); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to map inject buffer chunk\n"); + return ret; + } + + region->context = md; + + return FI_SUCCESS; +} + +void cxip_ibuf_chunk_fini(struct ofi_bufpool_region *region) +{ + cxip_unmap(region->context); +} + +int cxip_txc_ibuf_create(struct cxip_txc *txc) +{ + struct ofi_bufpool_attr bp_attrs = {}; + int ret; + + bp_attrs.size = CXIP_INJECT_SIZE; + bp_attrs.alignment = 8; + bp_attrs.max_cnt = UINT16_MAX; + bp_attrs.chunk_cnt = 64; + bp_attrs.alloc_fn = cxip_ibuf_chunk_init; + bp_attrs.free_fn = cxip_ibuf_chunk_fini; + bp_attrs.context = txc; + + /* Avoid creating VA holes outside the buffer pool + * if CXI_FORK_SAFE/CXI_FORK_SAFE_HP is enabled. + */ + if (cxip_env.fork_safe_requested) + bp_attrs.flags = OFI_BUFPOOL_NONSHARED; + + ret = ofi_bufpool_create_attr(&bp_attrs, &txc->ibuf_pool); + if (ret) + ret = -FI_ENOMEM; + + return ret; +} + +/* + * cxip_tx_id_alloc() - Allocate a TX ID. + * + * TX IDs are assigned to Put operations that need to be tracked by the target. + * One example of this is a Send with completion that guarantees match + * completion at the target. This only applies to eager, unexpected Sends. + * + * Caller must hold txc->ep_obj.lock + */ +int cxip_tx_id_alloc(struct cxip_txc *txc, void *ctx) +{ + int id; + + id = ofi_idx_insert(&txc->tx_ids, ctx); + if (id < 0 || id >= CXIP_TX_IDS) { + CXIP_DBG("Failed to allocate TX ID: %d\n", id); + if (id > 0) + ofi_idx_remove(&txc->tx_ids, id); + + return -FI_ENOSPC; + } + + CXIP_DBG("Allocated ID: %d\n", id); + + return id; +} + +/* + * cxip_tx_id_free() - Free a TX ID. + * + * Caller must hold txc->ep_obj.lock + */ +int cxip_tx_id_free(struct cxip_txc *txc, int id) +{ + if (id < 0 || id >= CXIP_TX_IDS) + return -FI_EINVAL; + + ofi_idx_remove(&txc->tx_ids, id); + CXIP_DBG("Freed ID: %d\n", id); + + return FI_SUCCESS; +} + +/* Caller must hold txc->ep_obj.lock */ +void *cxip_tx_id_lookup(struct cxip_txc *txc, int id) +{ + return ofi_idx_lookup(&txc->tx_ids, id); +} + +/* + * cxip_rdzv_id_alloc() - Allocate a rendezvous ID. + * + * A Rendezvous ID are assigned to rendezvous Send operation. 
The ID is used by + * the target to differentiate rendezvous Send operations initiated by a source. + * + * Caller must hold txc->ep_obj->lock. + */ +int cxip_rdzv_id_alloc(struct cxip_txc *txc, struct cxip_req *req) +{ + struct indexer *rdzv_ids; + int max_rdzv_id; + int id_offset; + int id; + + /* FI_TAGGED sends by definition do not support FI_MULTI_RECV; + * they can utilize the pool of rendezvous ID [256 to 32K-1]. + * FI_MSG which supports FI_MULTI_RECV is restricted to a rendezvous + * ID range of [0 to 255]. + */ + if (req->send.tagged) { + rdzv_ids = &txc->rdzv_ids; + max_rdzv_id = CXIP_RDZV_IDS; + id_offset = CXIP_RDZV_IDS_MULTI_RECV; + } else { + rdzv_ids = &txc->msg_rdzv_ids; + max_rdzv_id = CXIP_RDZV_IDS_MULTI_RECV; + id_offset = 0; + } + + id = ofi_idx_insert(rdzv_ids, req); + if (id < 0 || id + id_offset >= max_rdzv_id) { + CXIP_DBG("Failed to allocate rdzv ID: %d\n", id); + if (id > 0) + ofi_idx_remove(rdzv_ids, id); + + return -FI_ENOSPC; + } + + id += id_offset; + CXIP_DBG("Allocated ID: %d\n", id); + + return id; +} + +/* + * cxip_rdzv_id_free() - Free a rendezvous ID. + * + * Caller must hold txc->ep_obj->lock. + */ +int cxip_rdzv_id_free(struct cxip_txc *txc, int id) +{ + if (id < 0 || id >= CXIP_RDZV_IDS) + return -FI_EINVAL; + + CXIP_DBG("Freed RDZV ID: %d\n", id); + + /* ID value indicates which pool it comes from */ + if (id >= CXIP_RDZV_IDS_MULTI_RECV) { + id -= CXIP_RDZV_IDS_MULTI_RECV; + ofi_idx_remove(&txc->rdzv_ids, id); + } else { + ofi_idx_remove(&txc->msg_rdzv_ids, id); + } + + return FI_SUCCESS; +} + +/* Caller must hold txc->ep_obj->lock. */ +void *cxip_rdzv_id_lookup(struct cxip_txc *txc, int id) +{ + + if (id >= CXIP_RDZV_IDS_MULTI_RECV) { + id -= CXIP_RDZV_IDS_MULTI_RECV; + return ofi_idx_lookup(&txc->rdzv_ids, id); + } + return ofi_idx_lookup(&txc->msg_rdzv_ids, id); +} + +/* + * txc_msg_init() - Initialize an RX context for messaging. + * + * Allocates and initializes hardware resources used for transmitting messages. + * + * Caller must hold ep_obj->lock + */ +static int txc_msg_init(struct cxip_txc *txc) +{ + int ret; + + /* Allocate TGQ for posting source data */ + ret = cxip_ep_cmdq(txc->ep_obj, false, FI_TC_UNSPEC, + txc->tx_evtq.eq, &txc->rx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate TGQ, ret: %d\n", ret); + return -FI_EDOMAIN; + } + + ret = cxip_rdzv_match_pte_alloc(txc, &txc->rdzv_pte); + if (ret) { + CXIP_WARN("Failed to allocate rendezvous PtlTE: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_put_rx_cmdq; + } + txc->rdzv_proto = cxip_env.rdzv_proto; + + CXIP_DBG("TXC RDZV PtlTE enabled: %p proto: %s\n", + txc, cxip_rdzv_proto_to_str(txc->rdzv_proto)); + + return FI_SUCCESS; + +err_put_rx_cmdq: + cxip_ep_cmdq_put(txc->ep_obj, false); + + return ret; +} + +/* + * txc_msg_fini() - Finalize TX context messaging. + * + * Free hardware resources allocated when the TX context was initialized for + * messaging. + * + * Caller must hold txc->ep_obj->lock. + */ +static int txc_msg_fini(struct cxip_txc *txc) +{ + int i; + + cxip_rdzv_match_pte_free(txc->rdzv_pte); + + for (i = 0; i < RDZV_NO_MATCH_PTES; i++) { + if (txc->rdzv_nomatch_pte[i]) + cxip_rdzv_nomatch_pte_free(txc->rdzv_nomatch_pte[i]); + } + + cxip_ep_cmdq_put(txc->ep_obj, false); + + return FI_SUCCESS; +} + +static size_t cxip_txc_get_num_events(struct cxip_txc *txc) +{ + size_t num_events; + + /* Need enough events to accommodate initiator credits which is + * based on TX attr size. 
+ */ + num_events = txc->attr.size; + + /* Worse case is an initiator credit needs two events (e.g. rendezvous + * send). + */ + num_events *= 2; + + /* For messaging, target initiator rendezvous gets has its own set of + * credits. These are always single event operations. + */ + num_events += cxip_env.sw_rx_tx_init_max; + + /* Account for internal operations. */ + num_events += CXIP_INTERNAL_TX_REQS; + + return num_events; +} + +/* + * cxip_txc_enable() - Enable a TX context for use. + * + * Called via fi_enable(). The context could be used in a standard endpoint or + * a scalable endpoint. + */ +int cxip_txc_enable(struct cxip_txc *txc) +{ + int ret = FI_SUCCESS; + size_t num_events; + + if (txc->enabled) + return FI_SUCCESS; + + if (!txc->send_cq) { + CXIP_WARN("Undefined send CQ\n"); + return -FI_ENOCQ; + } + + ret = cxip_txc_ibuf_create(txc); + if (ret) { + CXIP_WARN("Failed to create inject bufpool %d\n", ret); + return ret; + } + + /* Protected with ep_obj->lock */ + memset(&txc->rdzv_ids, 0, sizeof(txc->rdzv_ids)); + memset(&txc->msg_rdzv_ids, 0, sizeof(txc->msg_rdzv_ids)); + memset(&txc->tx_ids, 0, sizeof(txc->tx_ids)); + + num_events = cxip_txc_get_num_events(txc); + ret = cxip_evtq_init(&txc->tx_evtq, txc->send_cq, num_events, 0); + if (ret) { + CXIP_WARN("Failed to initialize TX event queue: %d, %s\n", + ret, fi_strerror(-ret)); + goto destroy_ibuf; + } + + ret = cxip_ep_cmdq(txc->ep_obj, true, txc->tclass, + txc->tx_evtq.eq, &txc->tx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate TX CMDQ, ret: %d\n", ret); + ret = -FI_EDOMAIN; + /* CQ disable will be done at CQ close */ + goto destroy_evtq; + } + + if (ofi_send_allowed(txc->attr.caps)) { + ret = txc_msg_init(txc); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to init TX CTX, ret: %d\n", ret); + goto put_tx_cmdq; + } + } + + txc->pid_bits = txc->domain->iface->dev->info.pid_bits; + txc->enabled = true; + + return FI_SUCCESS; + +put_tx_cmdq: + cxip_ep_cmdq_put(txc->ep_obj, true); +destroy_evtq: + cxip_evtq_fini(&txc->tx_evtq); +destroy_ibuf: + ofi_idx_reset(&txc->tx_ids); + ofi_idx_reset(&txc->rdzv_ids); + ofi_idx_reset(&txc->msg_rdzv_ids); + ofi_bufpool_destroy(txc->ibuf_pool); + + return ret; +} + +/* + * txc_cleanup() - Attempt to free outstanding requests. + * + * Outstanding commands may be dropped when the TX Command Queue is freed. + * This leads to missing events. Attempt to gather all events before freeing + * the TX CQ. If events go missing, resources will be leaked until the + * Completion Queue is freed. 
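The leak window described above is also visible to applications: completions that are never reaped cannot be recovered once queues are torn down. A caller-side sketch of draining a send CQ before fi_close(); the expected count and object names are illustrative:

#include <rdma/fabric.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_errno.h>

/* Sketch: poll the CQ until every outstanding operation has reported. */
static int drain_tx_completions(struct fid_cq *cq, size_t expected)
{
	struct fi_cq_tagged_entry entry;
	ssize_t ret;

	while (expected) {
		ret = fi_cq_read(cq, &entry, 1);
		if (ret == 1)
			expected--;
		else if (ret == -FI_EAGAIN)
			continue;	/* nothing ready yet, keep polling */
		else
			return (int)ret;	/* -FI_EAVAIL etc. on error */
	}

	return 0;
}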
+ */ +static void txc_cleanup(struct cxip_txc *txc) +{ + uint64_t start; + struct cxip_fc_peer *fc_peer; + struct dlist_entry *tmp; + + if (!ofi_atomic_get32(&txc->otx_reqs)) + goto free_fc_peers; + + cxip_evtq_req_discard(&txc->tx_evtq, txc); + + start = ofi_gettime_ms(); + while (ofi_atomic_get32(&txc->otx_reqs)) { + sched_yield(); + + cxip_evtq_progress(&txc->tx_evtq); + cxip_ep_ctrl_progress_locked(txc->ep_obj); + + if (ofi_gettime_ms() - start > CXIP_REQ_CLEANUP_TO) { + CXIP_WARN("Timeout waiting for outstanding requests.\n"); + break; + } + } + + assert(ofi_atomic_get32(&txc->otx_reqs) == 0); + +free_fc_peers: + dlist_foreach_container_safe(&txc->fc_peers, struct cxip_fc_peer, + fc_peer, txc_entry, tmp) { + dlist_remove(&fc_peer->txc_entry); + free(fc_peer); + } +} + +void cxip_txc_struct_init(struct cxip_txc *txc, const struct fi_tx_attr *attr, + void *context) +{ + dlist_init(&txc->ep_list); + ofi_atomic_initialize32(&txc->otx_reqs, 0); + dlist_init(&txc->msg_queue); + dlist_init(&txc->fc_peers); + + txc->context = context; + txc->attr = *attr; + txc->max_eager_size = cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; + txc->rdzv_eager_size = cxip_env.rdzv_eager_size; + txc->hmem = !!(attr->caps & FI_HMEM); +} + +/* + * cxip_txc_disable() - Disable a TX context for a base endpoint object. + * + * Free hardware resources allocated when the context was enabled. Called via + * fi_close(). + */ +void cxip_txc_disable(struct cxip_txc *txc) +{ + int ret; + + if (!txc->enabled) + return; + + txc->enabled = false; + txc_cleanup(txc); + + ofi_idx_reset(&txc->tx_ids); + ofi_idx_reset(&txc->rdzv_ids); + ofi_idx_reset(&txc->msg_rdzv_ids); + ofi_bufpool_destroy(txc->ibuf_pool); + + if (ofi_send_allowed(txc->attr.caps)) { + ret = txc_msg_fini(txc); + if (ret) + CXIP_WARN("Unable to destroy TX CTX, ret: %d\n", + ret); + } + + cxip_ep_cmdq_put(txc->ep_obj, true); + cxip_evtq_fini(&txc->tx_evtq); +} + +/* Caller must hold ep_obj->lock. */ +void cxip_txc_flush_msg_trig_reqs(struct cxip_txc *txc) +{ + struct cxip_req *req; + struct dlist_entry *tmp; + + /* Drain the message queue. */ + dlist_foreach_container_safe(&txc->msg_queue, struct cxip_req, req, + send.txc_entry, tmp) { + if (cxip_is_trig_req(req)) { + ofi_atomic_dec32(&txc->otx_reqs); + dlist_remove(&req->send.txc_entry); + cxip_unmap(req->send.send_md); + cxip_evtq_req_free(req); + } + } +} + +static bool cxip_txc_can_emit_op(struct cxip_txc *txc, + bool event_success_disabled) +{ + if (cxip_evtq_saturated(&txc->tx_evtq)) { + TXC_WARN(txc, "TX HW EQ saturated\n"); + return false; + } + + /* If taking a successful completion, limit outstanding operations */ + if (!event_success_disabled && + (ofi_atomic_get32(&txc->otx_reqs) >= txc->attr.size)) { + TXC_WARN(txc, "TXC attr size saturated\n"); + return false; + } + + return true; +} + +int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags) +{ + int ret; + + if (!cxip_txc_can_emit_op(txc, c_state->event_success_disable)) + return -FI_EAGAIN; + + /* Ensure correct traffic class is used. 
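The traffic class applied here originates in the endpoint's TX attributes. A sketch of how an application would request one through fi_getinfo() hints; FI_TC_LOW_LATENCY is only an example value, not a recommendation from this patch:

#include <string.h>
#include <rdma/fabric.h>

/* Sketch: build hints that ask the cxi provider for a low-latency class. */
static struct fi_info *cxi_hints_with_tclass(void)
{
	struct fi_info *hints = fi_allocinfo();

	if (!hints)
		return NULL;

	hints->fabric_attr->prov_name = strdup("cxi");
	hints->tx_attr->tclass = FI_TC_LOW_LATENCY;	/* example value */

	return hints;
}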
*/ + ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + if (ret) { + TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emit_idc_put(txc->tx_cmdq, c_state, put, buf, len, + flags); + if (ret) { + TXC_WARN(txc, "Failed to emit idc_put command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Kick the command queue. */ + cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), + ofi_atomic_get32(&txc->otx_reqs)); + + if (!c_state->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return FI_SUCCESS; +} + +int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_full_dma_cmd *dma, uint64_t flags) +{ + int ret; + + if (!cxip_txc_can_emit_op(txc, dma->event_success_disable)) + return -FI_EAGAIN; + + if (trig_cntr) { + ret = cxip_domain_dwq_emit_dma(txc->domain, vni, + tc, tc_type, trig_cntr, + trig_thresh, dma, flags); + if (ret) + TXC_WARN(txc, + "Failed to emit trigger dma command: %d:%s\n", + ret, fi_strerror(-ret)); + else if (!dma->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return ret; + } + + /* Ensure correct traffic class is used. */ + ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + if (ret) { + TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emit_dma(txc->tx_cmdq, dma, flags); + if (ret) { + TXC_WARN(txc, "Failed to emit dma command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Kick the command queue. */ + cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), + ofi_atomic_get32(&txc->otx_reqs)); + + if (!dma->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return FI_SUCCESS; +} + +int cxip_txc_emit_idc_amo(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush) +{ + int ret; + + if (!cxip_txc_can_emit_op(txc, c_state->event_success_disable)) + return -FI_EAGAIN; + + /* Ensure correct traffic class is used. */ + ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + if (ret) { + TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emic_idc_amo(txc->tx_cmdq, c_state, amo, flags, + fetching, flush); + if (ret) { + TXC_WARN(txc, "Failed to emit idc_put command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Kick the command queue. 
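FI_MORE, checked above when ringing the command queue, lets a sender batch several operations behind a single doorbell. A hedged caller-side sketch; the msgs array is assumed to be fully populated by the caller:

#include <rdma/fabric.h>
#include <rdma/fi_rma.h>

/* Sketch: all but the final operation carry FI_MORE so the provider may
 * defer kicking its command queue until the batch is complete.
 */
static ssize_t post_write_batch(struct fid_ep *ep,
				const struct fi_msg_rma *msgs, size_t count)
{
	size_t i;
	ssize_t ret;

	for (i = 0; i < count; i++) {
		uint64_t flags = (i + 1 < count) ? FI_MORE : 0;

		ret = fi_writemsg(ep, &msgs[i], flags);
		if (ret)
			return ret;
	}

	return 0;
}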
*/ + cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), + ofi_atomic_get32(&txc->otx_reqs)); + + if (!c_state->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return FI_SUCCESS; +} + +int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct cxip_cntr *trig_cntr, size_t trig_thresh, + struct c_dma_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush) +{ + int ret; + + if (!cxip_txc_can_emit_op(txc, amo->event_success_disable)) + return -FI_EAGAIN; + + if (trig_cntr) { + ret = cxip_domain_dwq_emit_amo(txc->domain, vni, tc, + CXI_TC_TYPE_DEFAULT, trig_cntr, + trig_thresh, amo, flags, + fetching, flush); + if (ret) + TXC_WARN(txc, + "Failed to emit trigger amo command: %d:%s\n", + ret, fi_strerror(-ret)); + else if (!amo->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return ret; + } + + /* Ensure correct traffic class is used. */ + ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + if (ret) { + TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emit_dma_amo(txc->tx_cmdq, amo, flags, fetching, flush); + if (ret) { + TXC_WARN(txc, "Failed to emit DMA amo command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Kick the command queue. */ + cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), + ofi_atomic_get32(&txc->otx_reqs)); + + if (!amo->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return FI_SUCCESS; +} diff --git a/prov/cxi/src/cxip_zbcoll.c b/prov/cxi/src/cxip_zbcoll.c new file mode 100644 index 00000000000..7f59b2ba599 --- /dev/null +++ b/prov/cxi/src/cxip_zbcoll.c @@ -0,0 +1,1686 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2022 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cxip.h" + +/* Distinctions: + * CXIP_DBG() is generally useless in a multi-node collective. Use TRACE(). + * CXIP_INFO() is generally useless in internal code of this sort. + * CXIP_WARN() is used to leave a log trace to identify failures. + * -FI_ENOMEM is not logged, since where it occurs is irrelevant: all + * memory allocation in this module is small, so heap exhaustion + * indicates a systemic failure. + * -FI_EAGAIN and -FI_EBUSY are not logged, as they are transient + */ +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) + +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_ZBCOLL, fmt, ##__VA_ARGS__) + +/* see data packing structures below */ +#define ZB_MAP_BITS 54 +#define ZB_GRPID_BITS 6 +#define ZB_SIM_BITS 5 +#define ZB_SIM_MAX (1 << (ZB_SIM_BITS)) +#define ZB_NEG_BIT ((ZB_MAP_BITS) - 1) + +static int zbdata_send_cb(struct cxip_ctrl_req *req, + const union c_event *event); + +/**************************************************************************** + * OVERVIEW + * + * There are two related components in this file. + * - An abstract radix tree constructor + * - A collective implemention built on the Zero-Buffer Put control channel. + * + * The basic operational flow is as follows: + * - cxip_zbcoll_init() prepares the system for zbcoll collectives. + * - cxip_zbcoll_alloc() allocates and configures a collective structure. 
+ * - cxip_zbcoll_getgroup() negotiates a collective identifier (one time). + * - cxip_zbcoll_barrier() performs a barrier (can be repeated). + * - cxip_zbcoll_broadcast() performs a broadcast (can be repeated). + * - cxip_zbcoll_progress() progresses getgroup/barrier/broadcast/reduce on ep. + * - cxip_zbcoll_free() releases the collective structure and identifier. + * - cxip_zbcoll_fini() releases all collectives and cleans up. + * + * Any number of collective structures can be created, spanning the same, or + * different node-sets. + * + * To enable the structure, it must acquire a group identifier using the + * getgroup operation, which is itself a collective operation. Getgroup + * negotiates for and acquires one of 53 possible group identifiers (43 in + * simulation). The group identifier remains with that structure until the + * structure is deleted, allowing it to be used for multiple collective + * operations without renegotiating. + * + * Collective operations are concurrent for groups with different group + * identifiers. Collective operations for a single group are serialized, + * returning -FI_EAGAIN if there is already a collective operation in progress + * for that group. + * + * The getgroup, barrier, and broadcast functions support a callback stack that + * allows caller-defined callback functions to be stacked for execution upon + * completion of a collective. The callback can initiate a new collective on the + * same object. + * + * Note that this is NOT a general-purpose collective implementation. + */ + +/**************************************************************************** + * ABSTRACT RADIX TREE + * + * We lay out all of the node indices (0..maxnodes-1) in layers, as follows: + * + * RADIX 1: + * row: nodeidx + * 0: 0 + * 1: 1 + * 2: 2 + * ... + * + * RADIX 2: + * row: nodeidx + * 0: 0 + * 1: 1, 2 + * 2: 3, 4, 5, 6 + * 3: 7, 8, 9, 10, 11, 12, 13, 14 + * ... + * + * RADIX 3: + * row: nodeidx + * 0: 0 + * 1: 1, 2, 3 + * 2: 4, 5, 6, 7, 8, 9, 10, 11, 12 + * 3: 13, 14, 15, 16, 17, 18, ... 38, 39 + * ... + * + * The parent of any node is in the row above it, and the children are in the + * row below it. The width of any row is (RADIX ^ row), so for every node, there + * can be up to RADIX children, and one parent, with the exception of the root + * node (no parent). + */ + +/** + * @brief Compute row and column for a given node index. + * + * @param radix : radix of tree + * @param nodeidx : node index + * @param row : returned row of this node + * @param col : returned offset of this node in the row + * @param siz : returned size of the row, (0 <= col < siz) + */ +void cxip_tree_rowcol(int radix, int nodeidx, int *row, int *col, int *siz) +{ + int rownum = 0; + int rowcum = 0; + int rowsiz = 1; + + *row = 0; + *col = 0; + *siz = rowsiz; + if (radix < 1) + return; + while (nodeidx > rowcum) { + rowsiz *= radix; + *row = rownum + 1; + *col = nodeidx - rowcum - 1; + *siz = rowsiz; + rowcum += rowsiz; + rownum += 1; + } +} + +/** + * @brief Compute the node index for a give row and column. + * + * Note that illegal columns can be specified for a row, which results + * in a return index of -1. 
+ * + * @param radix : radix of tree + * @param row : row of node + * @param col : column of node + * @param nodeidx : returned node index, or -1 if illegal + */ +void cxip_tree_nodeidx(int radix, int row, int col, int *nodeidx) +{ + int rownum = 0; + int rowcum = 0; + int rowsiz = 1; + + *nodeidx = 0; + while (radix && rownum < row) { + rowsiz *= radix; + *nodeidx = rowcum + col + 1; + rowcum += rowsiz; + rownum += 1; + } + if (col >= rowsiz) + *nodeidx = -1; +} + +/** + * @brief Provide the relatives (parent, children) of a node + * + * The rels array must be provided, and must have RADIX+1 entries. + * + * The parent position [0] will always be populated, but with -1 if the node is + * the root node. + * + * Only valid child positions in [1..RADIX] will be populated. + * + * This returns the total number of positions populated. + * + * If radix < 1, there can be no relatives, and this returns 0. + * + * @param radix : radix of tree + * @param nodeidx : index of node to find relatives for + * @param maxnodes : maximum valid node indices available + * @param rels : relative index array + * @return int : number of valid relatives found + */ +int cxip_tree_relatives(int radix, int nodeidx, int maxnodes, int *rels) +{ + int row, col, siz, idx, n; + + if (radix < 1 || !maxnodes || !rels) + return 0; + + cxip_tree_rowcol(radix, nodeidx, &row, &col, &siz); + + idx = 0; + if (row) + cxip_tree_nodeidx(radix, row - 1, col / radix, &rels[idx++]); + else + rels[idx++] = -1; + + cxip_tree_nodeidx(radix, row+1, col*radix, &nodeidx); + for (n = 0; n < radix; n++) { + if ((nodeidx + n) >= maxnodes) + break; + rels[idx++] = nodeidx + n; + } + + return idx; +} + +/**************************************************************************** + * @brief Zero-buffer collectives. + * + * ZB collectives are intended for implementation of the fi_join_collective() + * function. + * + * The ep_obj has a container structure of type cxip_ep_zbcoll_obj, which + * maintains endpoint-global state for all zb collectives on that NIC endpoint. + * We refer to this as the zbcoll object, and it is an extension of the endpoint + * itself. + * + * The zbcoll object contains dynamic zb objects, each representing a collective + * group. + * + * Each zb object contains one or more state structures, which support simulated + * operations on a single node. Production code will use only one state for the + * NID. + * + * Diagnostic counters are maintained: + * + * - ack_count == successful sends + * - err_count == failed sends + * - rcv_count == successful receives + * - dsc_count == discarded receives + */ + +static inline void _setbit(uint64_t *mask, int bit) +{ + *mask |= (1ULL << bit); +} + +static inline void _clrbit(uint64_t *mask, int bit) +{ + *mask &= ~(1ULL << bit); +} + +void cxip_zbcoll_get_counters(struct cxip_ep_obj *ep_obj, uint32_t *dsc, + uint32_t *err, uint32_t *ack, uint32_t *rcv) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + zbcoll = &ep_obj->zbcoll; + if (dsc) + *dsc = ofi_atomic_get32(&zbcoll->dsc_count); + if (err) + *err = ofi_atomic_get32(&zbcoll->err_count); + if (ack) + *ack = ofi_atomic_get32(&zbcoll->ack_count); + if (rcv) + *rcv = ofi_atomic_get32(&zbcoll->rcv_count); +} + +/** + * @brief Link a secondary zb object to a primary zb object. + * + * This is used with multi-zb object simulation. The basic (single-zb) model is + * that the zb object has num_caddrs state structures to manage the state of + * each simulated destination address, each of which has a backpointer to the + * containing zb object. 
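A worked example for the two tree helpers above, assuming their prototypes are visible through cxip.h: with radix 2 and 15 nodes, node 5 lands in row 2, column 2, with parent 2 and children 11 and 12, matching the RADIX 2 layout sketched earlier in this file.

#include <assert.h>
#include "cxip.h"

static void tree_example(void)
{
	int row, col, siz;
	int rels[3];	/* radix + 1 entries */
	int n;

	cxip_tree_rowcol(2, 5, &row, &col, &siz);
	assert(row == 2 && col == 2 && siz == 4);

	n = cxip_tree_relatives(2, 5, 15, rels);
	assert(n == 3);
	assert(rels[0] == 2);			/* parent */
	assert(rels[1] == 11 && rels[2] == 12);	/* children */
}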
+ * + * For the multi-zb simulation, the root zb (simrank == 0) has num_caddrs state + * structures, but each points back to a different zb object. When packets are + * received, the are initially received by the root zb, which determines the + * state structure to use from the simulated 'dst' address embedded in the + * packet, and that is then re-routed through the state[dst].zb pointer to the + * correct target zb object and state[dst] on that object. + * + * - zb[0]->state[0].zb -> zb[0] + * - zb[0]->state[1].zb -> zb[1] + * - ... + * - zb[0]->state[n].zb -> zb[n] + * + * This also modifies each of the other structures to backlink state[0] to the + * root structure. This allows data from the leaf nodes to be placed in the root + * structure for sending. + * + * - zb[1]->state[0].zb -> zb[0] + * - ... + * - zb[n]->state[0].zb -> zb[0] + * + * Note that only zb->state[0].zb is a "real" zb pointer. If the pointer + * reference is needed, use the BASEZB() macro below. + * + * @param zb0 : primary (root) zb structure + * @param zb : secondary zb structure to link to the root + * @return int error if conditions aren't met + */ + +#define BASEZB(zb) zb->state[0].zb + +int cxip_zbcoll_simlink(struct cxip_zbcoll_obj *zb0, + struct cxip_zbcoll_obj *zb1) +{ + int i; + + if (zb0 == zb1) + return FI_SUCCESS; + if (!zb0 || !zb1) { + CXIP_WARN("arguments cannot be NULL\n"); + return -FI_EINVAL; + } + if (zb0->num_caddrs != zb1->num_caddrs) { + CXIP_WARN("address counts do not match\n"); + return -FI_EINVAL; + } + for (i = 0; i < zb0->num_caddrs; i++) + if (!CXIP_ADDR_EQUAL(zb0->caddrs[i], zb1->caddrs[i])) { + CXIP_WARN("address values do not match caddr[%d]\n", i); + return -FI_EINVAL; + } + /* zb0 must be root */ + if (zb0->simrank != 0) { + CXIP_WARN("zb0 simrank != 0\n"); + return -FI_EINVAL; + } + /* zb1 must be valid simrank */ + if (zb1->simrank <= 0 || zb1->simrank >= zb1->num_caddrs) { + CXIP_WARN("zb1 simrank %d invalid, max = %d\n", + zb1->simrank, zb1->num_caddrs); + return -FI_EINVAL; + } + /* may only link once for a simrank */ + if (zb0->state[zb1->simrank].zb != zb0) { + CXIP_WARN("zb0 state[%d] cannot be re-linked\n", zb1->simrank); + return -FI_EINVAL; + } + /* may not re-link after linking is done */ + if (zb1->state[0].zb != zb1) { + CXIP_WARN("zb1 state[0] cannot be re-linked\n"); + return -FI_EINVAL; + } + + /* link each to the other */ + zb0->simref++; + zb0->state[zb1->simrank].zb = zb1; + zb1->state[zb0->simrank].zb = zb0; + + return FI_SUCCESS; +} + +/* utility to free a zbcoll object */ +static void _free_zbcoll(struct cxip_zbcoll_obj *zb) +{ + int i; + + if (zb->state) + for (i = 0; i < zb->simcount; i++) + free(zb->state[i].relatives); + cxip_zbcoll_rlsgroup(zb); + free(zb->caddrs); + free(zb->state); + free(zb->shuffle); + free(zb); +} + +/** + * @brief Free zb object. + * + * This flushes the callback stack, and releases the group identifier associated + * with this zb object. It also removes the backreference in the multi-zb + * simulation. + * + * In the multi-zb simulation, it must defer actual deletion until all of the zb + * objects in the collective have been deleted, since the tree may still be in + * use until all of have deleted. 
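+ *
+ * For example (illustrative numbers), in a four-object multi-zb simulation the
+ * root holds simref == 4 (1 from allocation plus 3 links), so the first three
+ * calls to cxip_zbcoll_free() only decrement that count; the fourth call frees
+ * the three linked objects and then the root itself.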
+ * + * @param zb : zb object to free + */ +void cxip_zbcoll_free(struct cxip_zbcoll_obj *zb) +{ + int i; + + if (!zb) + return; + + /* edge case in some tests */ + if (!zb->state) { + _free_zbcoll(zb); + return; + } + if (zb->simrank >= 0) { + zb = BASEZB(zb); + if (--zb->simref) + return; + for (i = 1; i < zb->simcount; i++) { + _free_zbcoll(zb->state[i].zb); + } + } + _free_zbcoll(zb); +} + +/* configure the zb object -- error frees zb in caller */ +static int _state_config(struct cxip_zbcoll_obj *zb) +{ + struct cxip_zbcoll_state *zbs; + int radix, n; + + radix = cxip_env.zbcoll_radix; + + zb->state = calloc(zb->simcount, sizeof(*zbs)); + if (!zb->state) + return -FI_ENOMEM; + + for (n = 0; n < zb->simcount; n++) { + zbs = &zb->state[n]; + zbs->zb = zb; + + /* do not create relatives if no addrs */ + if (!zb->num_caddrs) + continue; + + /* simcount == 1, production: user specifies rank + * simcount > 1, simulation: each state has its own rank + */ + zbs->grp_rank = (zb->simcount == 1) ? zb->simrank : n; + + /* create space for relatives */ + zbs->relatives = calloc(radix + 1, sizeof(*zbs->relatives)); + if (!zbs->relatives) + return -FI_ENOMEM; + + /* This produces indices in an abstract tree */ + zbs->num_relatives = + cxip_tree_relatives(radix, zbs->grp_rank, + zb->num_caddrs, zbs->relatives); + } + return FI_SUCCESS; +} + +/* sort out the various configuration cases -- error frees zb in caller */ +static int _zbcoll_config(struct cxip_zbcoll_obj *zb, int num_addrs, + fi_addr_t *fiaddrs) +{ + int i, ret; + + if (!num_addrs) { + /* test case: no nics, send-to-self only */ + zb->num_caddrs = 1; + zb->caddrs = calloc(zb->num_caddrs, sizeof(*zb->caddrs)); + if (!zb->caddrs) + return -FI_ENOMEM; + zb->caddrs[0] = zb->ep_obj->src_addr; + zb->simrank = 0; + zb->simcount = 1; + } else if (zb->simrank != ZB_NOSIM) { + /* test case: regression with simulated addresses */ + if (num_addrs > ZB_SIM_MAX || zb->simrank >= num_addrs) { + CXIP_WARN("Simulation maximum size = %d\n", + MIN(num_addrs, ZB_SIM_MAX)); + return -FI_EINVAL; + } + zb->num_caddrs = num_addrs; + zb->caddrs = calloc(zb->num_caddrs, sizeof(*zb->caddrs)); + if (!zb->caddrs) + return -FI_ENOMEM; + for (i = 0; i < num_addrs; i++) { + zb->caddrs[i].nic = i; + zb->caddrs[i].pid = zb->ep_obj->src_addr.pid; + } + zb->simcount = num_addrs; + } else { + /* production case: real addresses supplied */ + zb->num_caddrs = num_addrs; + zb->caddrs = calloc(zb->num_caddrs, sizeof(*zb->caddrs)); + if (!zb->caddrs) + return -FI_ENOMEM; + zb->simrank = -1; + for (i = 0; i < num_addrs; i++) { + ret = cxip_av_lookup_addr(zb->ep_obj->av, + fiaddrs[i], &zb->caddrs[i]); + if (ret) { + CXIP_WARN("Lookup on fiaddr=%ld failed\n", + fiaddrs[i]); + return -FI_ECONNREFUSED; + } + if (zb->simrank < 0 && + CXIP_ADDR_EQUAL(zb->caddrs[i], + zb->ep_obj->src_addr)) + zb->simrank = i; + } + if (zb->simrank < 0) { + CXIP_WARN("Endpoint addr not in addrs[]\n"); + return -FI_ECONNREFUSED; + } + zb->simcount = 1; + } + + /* find the index of the source address in the address list */ + return _state_config(zb); +} + +/** + * @brief Allocate and configure a zb object. + * + * The zb object represents a radix tree through multiple nics that can perform + * sequential synchronizing collectives. It can be reused. + * + * This supports several test modes. + * + * If num_nics == 0, the zb object can only be used to test cxip_zbcoll_send(), + * to exercise a send-to-self using the ctrl channel, and will work with NETSIM. 
+ * + * If simrank is ZB_NOSIM, this will be used to perform real collectives over + * the group specified by the specified nics. The self-address of the node + * calling this must be a member of this set. + * + * If simrank is ZB_ALLSIM, this will be used to perform an internal simulation + * of all the nics with a single call to a collective operation. + * + * If simrank is >= 0, then it represents the rank to be simulated by this zb + * object. The test will need to create num_nics zb objects, each with a + * different rank, and the zb collective operation will have to be initiated on + * each of these to complete the collective. + * + * Simulation is limited to (1 << ZB_SIM_BITS) simulated endpoints. Simulation + * also reduces the number of group identifiers that can be used. + * + * nid[0] is defined as the collective root nid. + * + * @param ep_obj : NIC endpoint object + * @param num_addrs : number of fabric addresses + * @param fiaddrs : fabric addresses + * @param simrank : simulated rank + * @param zbp : returned zb object + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_alloc(struct cxip_ep_obj *ep_obj, + int num_addrs, fi_addr_t *fiaddrs, int simrank, + struct cxip_zbcoll_obj **zbp) +{ + struct cxip_zbcoll_obj *zb; + int ret; + + if (!zbp) { + CXIP_WARN("zbp is NULL\n"); + return -FI_EINVAL; + } + + /* allocate the zb object */ + *zbp = NULL; + zb = calloc(1, sizeof(*zb)); + if (!zb) + return -FI_ENOMEM; + dlist_init(&zb->ready_link); + zb->ep_obj = ep_obj; + zb->grpmskp = &ep_obj->zbcoll.grpmsk; + zb->grpid = ZB_NEG_BIT; + zb->simrank = simrank; + zb->simref = 1; + + /* configure the zb object */ + ret = _zbcoll_config(zb, num_addrs, fiaddrs); + if (ret) { + cxip_zbcoll_free(zb); + CXIP_WARN("Failed to configure zb object = %s\n", + fi_strerror(-ret)); + return ret; + } + + /* return the zb object */ + *zbp = zb; + return FI_SUCCESS; +} + +/** + * Data packing structures. + * + * This defines the specific bit meanings in the 64-bit zb put packet. Bit + * mapping could be modified, see considerations below. + * + * Considerations for the (production) network field: + * + * - dat MUST hold a multicast address and hardware root data + * - grpid size limits the number of concurrent zbcoll operations + * - sim requires only one bit and applies only to devel testing + * - pad is fixed by the control channel implementation + * + * Implementation of the negotiation operation requires that dat contain a + * bitmap. The choice of 54 allows for 54 grpid values (0-53), which will fit + * into a 6-bit grpid value. This is a large number for concurrencies. The grpid + * field could be reduced to 5 bits, offering only 32 concurrent operations. The + * map bits should then be reduced to 32, which would free up 23 bits for other + * information during negotiation, should extra bits be required. + * + * For broadcast, the full dat field is available for multicast information. The + * multicast address is currently 13 bits. Future revisions of Rosetta may + * increase this. The remaining bits can be used for a representation of the + * root node. A full caddr would require 32 bits, while using a 32-bit index + * into the fi_av_set would allow for a collective spanning up to 4 billion + * endpoints. This allows the multicast address to expand by another 9 bits, for + * a total of 22 bits, or 4 million multicast addresses. 
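+ *
+ * For reference, assuming the sizing described above (ZB_MAP_BITS = 54,
+ * ZB_GRPID_BITS = 6, ZB_SIM_BITS = 5), both layouts account for all 64 bits:
+ *
+ * - net: 54 (dat) + 6 (grpid) + 1 (sim) + 3 (pad) = 64
+ * - sim: 44 (dat) + 5 (src) + 5 (dst) + 6 (grpid) + 1 (sim) + 3 (pad) = 64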
+ * + * Considerations for the simulation fields: + * + * - src and dst must have the same number of bits + * - src/dst bits constrain the size of the simulated zbcoll tree + */ +union packer { + struct { + uint64_t dat: (ZB_MAP_BITS - 2*ZB_SIM_BITS); + uint64_t src: ZB_SIM_BITS; + uint64_t dst: ZB_SIM_BITS; + uint64_t grpid: ZB_GRPID_BITS; + uint64_t sim: 1; + uint64_t pad: 3; + } sim __attribute__((__packed__)); + struct { + uint64_t dat: ZB_MAP_BITS; + uint64_t grpid: ZB_GRPID_BITS; + uint64_t sim: 1; + uint64_t pad: 3; + } net __attribute__((__packed__)); + uint64_t raw; +}; + + +/* pack data */ +static inline uint64_t zbpack(int sim, int src, int dst, int grpid, + uint64_t dat) +{ + union packer x = {.raw = 0}; + if (sim) { + x.sim.sim = 1; + x.sim.src = src; + x.sim.dst = dst; + x.sim.grpid = grpid; + x.sim.dat = dat; + } else { + x.sim.sim = 0; + x.net.grpid = grpid; + x.net.dat = dat; + } + return x.raw; +} + +/* unpack data */ +static inline int zbunpack(uint64_t data, int *src, int *dst, int *grpid, + uint64_t *dat) +{ + union packer x = {.raw = data}; + if (x.sim.sim) { + *src = x.sim.src; + *dst = x.sim.dst; + *grpid = x.sim.grpid; + *dat = x.sim.dat; + } else { + *src = 0; + *dst = 0; + *grpid = x.net.grpid; + *dat = x.net.dat; + } + return x.sim.sim; +} + +/** + * zbcoll state machine. + * + * The zbcollectives are intended to perform necessary synchronization among all + * NIDs participating in a fi_join_collective() operation. Every join will have + * its own set of NIDs, which may overlap with the NIDs used in another + * concurrently-executing fi_join_collective(). Thus, every NID may be + * participating simultaneously in a different number of join operations. + * + * Every process (NID) in the collective sits somewhere in a radix tree, with + * one parent as relative[0] (except for the root), and up to RADIX-1 children + * at relative[1,...]. + * + * The collective follows a two-stage data flow, first from children toward the + * root (upstream), then from root toward the children (downstream). + * + * Processes (NIDs) must wait for all children to report before forwarding their + * own contribution toward the root. When the children of the root all report, + * the root reflects the result back to its children, and completes. As each + * child receives from its parent, it propagates the result to its children, and + * completes. + * + * Packets are unrestricted, and thus receive confirmation ACK messages from the + * hardware, or NAK and retry if delivery fails. + * + * The leaf (childless) NIDs contribute immediately and send the zb->dataval + * data upstream. Each parent collects data from its children and bitwise-ANDs + * the data with its own zb->dataval. When all children have reported to the + * root, the root sends the root contents of *zb->dataptr downstream, and the + * children simply propagate the received data to the leaves. This fixed + * behavior covers all our use-cases. + * + * For the barrier operation, zb->dataptr is set to NULL, and zb->dataval is set + * to zero. Both are effectively ignored. + * + * For the broadcast operation, zb->dataptr is a caller-supplied pointer, and + * zb->dataval is ignored. When all contributions have arrived on the root, the + * user-supplied value of *zb->dataptr is sent downstream, and propagated to all + * leaves, overwriting *zb->dataptr on each endpoint. + * + * For the reduce operation, zb->dataptr is set to a caller-supplied pointer, + * and zb->dataval is set to the value contained in this pointer. 
All of these + * caller values are sent upstream and reduced using a bitwise-AND reduction. + * When all contributions have arrived on the root, the value of the root + * *zb->dataptr is overwritten with the reduced zb->dataval, and then propagated + * to all leaves. + * + * Barrier, broadcast, and reduce must be preceded by a getgroup operation, to + * obtain a grpid value for the zb object. + * + * For the getgroup operation, zb->dataptr points to &zb->dataval, and + * zb->dataval contains a copy of the endpoint zbcoll grpmsk, which has a bit + * set to 1 for every grpid that is available for that NID. NIDs may have + * different grpmsk values. All of these masks are passed upstream through + * zb->dataval in a bitwise-AND reduction. When it reaches the root, the set + * bits in zb->dataval are the grpid values still available across all of the + * NIDs in the group. Because zb->dataptr == &zb-dataval, *zb->dataptr on the + * root contains the final reduced value, which is then propagated to all the + * leaves. + * + * The negotiated group id is the lowest numbered bit still set, and every NID + * computes this from the bitmask. + * + * It is possible for all group ID values to be exhausted. In this case, the + * getgroup operation will report -FI_EBUSY, and the caller should retry until a + * join operation completes, releasing one of the group ID values. If zb + * collective objects are never released, new operations will be blocked + * indefinitely. + * + * Getgroup operations are always serialized across the entire endpoint. + * Attempting a second getgroup on any (new) zb object before the first has + * completed will return -FI_EAGAIN. This is required to prevent race conditions + * that would issue the same group id to multiple zbcoll objects. + * + * We are externally guaranteed that all fi_join_collective() operations will + * observe proper collective ordering. Specifically, if any two joins share two + * or more NIDs, those joins will be initiated in the same order on all shared + * NIDs (possibly interspersed with other joins for unrelated groups). This + * behavior is necessary to ensure that all NIDs in a group obtain the same + * grpid value. + */ + +/* send a zbcoll packet -- wrapper for cxip_ctrl_msg_send(). + * + * Caller must hold ep_obj->lock. + */ +static void zbsend(struct cxip_ep_obj *ep_obj, uint32_t dstnic, uint32_t dstpid, + uint64_t mbv) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_ctrl_req *req; + int ret; + + zbcoll = &ep_obj->zbcoll; + + req = calloc(1, sizeof(*req)); + if (!req) { + CXIP_WARN("failed request allocation\n"); + ofi_atomic_inc32(&zbcoll->err_count); + return; + } + + req->ep_obj = ep_obj; + req->cb = zbdata_send_cb; + req->send.nic_addr = dstnic; + req->send.pid = dstpid; + req->send.mb.raw = mbv; + req->send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; + req->send.mb.ctrl_msg_type = CXIP_CTRL_MSG_ZB_DATA; + + /* If we can't send, collective cannot complete, just spin */ + do { + ret = cxip_ctrl_msg_send(req); + if (ret == -FI_EAGAIN) + cxip_ep_ctrl_progress_locked(ep_obj); + } while (ret == -FI_EAGAIN); + if (ret) { + CXIP_WARN("failed CTRL message send\n"); + ofi_atomic_inc32(&zbcoll->err_count); + } +} + +/* send a rejection packet */ +static void reject(struct cxip_ep_obj *ep_obj, int dstnic, int dstpid, + int sim, int src, int dst, int grpid) +{ + union cxip_match_bits mb; + + mb.raw = zbpack(sim, src, dst, grpid, 0); + zbsend(ep_obj, dstnic, dstpid, mb.raw); +} + +/** + * @brief Send a zero-buffer collective packet. 
+ * + * Creates a request packet that must be freed (or retried) in callback. + * + * This can physically send ONLY from the endpoint source address, but the src + * address can be provided for simulation. + * + * Only the lower bits of the 64-bit payload will be delivered, depending on the + * specific packing model. Upper control bits will be overwritten as necessary. + * + * @param zb : indexed zb structure + * @param srcidx : source address index (ignored unless simulating) + * @param dstidx : destination address index (required) + * @param payload : packet value to send + */ +void cxip_zbcoll_send(struct cxip_zbcoll_obj *zb, int srcidx, int dstidx, + uint64_t payload) +{ + union cxip_match_bits mb = {.raw = 0}; + struct cxip_addr dstaddr; + + /* resolve NETSIM testcase */ + TRACE("SND %04x->%04x %016lx\n", srcidx, dstidx, payload); + if (zb->simcount > 1) { + if (dstidx >= zb->simcount) { + ofi_atomic_inc32(&zb->ep_obj->zbcoll.err_count); + return; + } + /* alter the data to pass srcaddr/dstaddr */ + mb.zb_data = zbpack(1, srcidx, dstidx, zb->grpid, payload); + dstaddr = zb->ep_obj->src_addr; + } else { + /* srcidx, dstaddr are discarded in zbpack() */ + if (dstidx >= zb->num_caddrs) { + ofi_atomic_inc32(&zb->ep_obj->zbcoll.err_count); + return; + } + mb.zb_data = zbpack(0, 0, 0, zb->grpid, payload); + dstaddr = zb->caddrs[dstidx]; + } + zbsend(zb->ep_obj, dstaddr.nic, dstaddr.pid, mb.raw); +} + +/* set the group ID */ +static void setgrpid(struct cxip_zbcoll_obj *zb, uint64_t mask) +{ + uint64_t v; + int grpid; + + TRACE("search for grpid in %016lx\n", mask); + for (grpid = 0, v = 1; grpid <= ZB_NEG_BIT; grpid++, v<<=1) + if (v & mask) + break; + TRACE("found grpid = %d\n", grpid); + + /* manage a rejection due to a transient race condition */ + if (grpid > ZB_NEG_BIT) { + /* race condition reported */ + TRACE("cancel: getgroup transient race\n"); + zb->error = -FI_EAGAIN; + return; + } + + /* manage failure due to all grpid values in-use */ + if (grpid == ZB_NEG_BIT) { + /* no group IDs available */ + TRACE("cancel: getgroup no grpid available\n"); + zb->error = -FI_EBUSY; + return; + } + + /* we found our group ID */ + TRACE("set grpid = %d\n", grpid); + zb->grpid = grpid; + _clrbit(zb->grpmskp, grpid); +} + +/* mark a collective operation done */ +static inline void zbdone(struct cxip_zbcoll_state *zbs, uint64_t mbv) +{ + struct cxip_zbcoll_obj *zb; + struct cxip_ep_zbcoll_obj *zbcoll; + + /* getgroup: + * single-zb sim: refcnt=1, busy=N + * multi-zb sim: refcnt=N, busy=1 + * production : refcnt=1, busy=1 + * reduction: + * single-zb sim: refcnt=0, busy=N + * multi-zb sim: refcnt=0, busy=1 + * production : refcnt=0, busy=1 + */ + zb = zbs->zb; + zbcoll = &zbs->zb->ep_obj->zbcoll; + TRACE("%s: zb[%d] contribs=%d\n", __func__, zb->simrank, zbs->contribs); + + ofi_spin_lock(&zbcoll->lock); + zbs->contribs = 0; + TRACE("--REFCNT=%d in %s\n", zbcoll->refcnt, __func__); + TRACE("--BUSY =%d in %s\n", zb->busy, __func__); + /* Reduce the refcnt when we are no longer busy */ + if (zb->busy && !--zb->busy) { + if (zb->grpid == ZB_NEG_BIT) + setgrpid(zb, mbv); + /* Complete the negotiation on the last reference */ + if (!zbcoll->refcnt || !--zbcoll->refcnt) { + if (zbcoll->grptbl[ZB_NEG_BIT] == BASEZB(zb)) { + TRACE("GETGROUP FINISHED\n"); + zbcoll->grptbl[zb->grpid] = BASEZB(zb); + zbcoll->grptbl[ZB_NEG_BIT] = NULL; + } + } + TRACE(".. 
append to zb[%d]\n", zb->simrank); + dlist_insert_tail(&zb->ready_link, &zbcoll->ready_list); + } + ofi_spin_unlock(&zbcoll->lock); +} + +/* mark a collective send failure and end the collective */ +static void zbsend_fail(struct cxip_zbcoll_state *zbs, + struct cxip_ctrl_req *req, int ret) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + /* highly unexpected ret == -FI_EIO */ + zbcoll = &zbs->zb->ep_obj->zbcoll; + ofi_atomic_inc32(&zbcoll->err_count); + zbs->zb->error = ret; + free(req); +} + +/* root has no parent */ +static inline bool isroot(struct cxip_zbcoll_state *zbs) +{ + return (zbs->relatives[0] < 0); +} + +/* receive is complete when all contributors have spoken */ +static inline bool rcvcomplete(struct cxip_zbcoll_state *zbs) +{ + return (zbs->contribs == zbs->num_relatives); +} + +/* send upstream to the parent */ +static void zbsend_up(struct cxip_zbcoll_state *zbs, + uint64_t mbv) +{ + TRACE("%04x->%04x: %-10s %-10s %d/%d\n", + zbs->grp_rank, zbs->relatives[0], "", __func__, + zbs->contribs, zbs->num_relatives); + cxip_zbcoll_send(zbs->zb, zbs->grp_rank, zbs->relatives[0], mbv); + } + +/* send downstream to all of the children */ +static void zbsend_dn(struct cxip_zbcoll_state *zbs, + uint64_t mbv) +{ + int relidx; + + for (relidx = 1; relidx < zbs->num_relatives; relidx++) { + TRACE("%04x->%04x: %-10s %-10s\n", + zbs->grp_rank, zbs->relatives[relidx], + __func__, ""); + cxip_zbcoll_send(zbs->zb, zbs->grp_rank, + zbs->relatives[relidx], mbv); + } +} + +/* advance the upstream data flow, reverse direction at root */ +static void advance(struct cxip_zbcoll_state *zbs, uint64_t mbv) +{ + union cxip_match_bits mb = {.raw = mbv}; + + if (!rcvcomplete(zbs)) + return; + + if (isroot(zbs)) { + /* Reduction overwrites root data */ + if (zbs->dataptr && zbs->zb->reduce) + *zbs->dataptr = zbs->dataval; + /* The root always reflects its data down */ + mb.zb_data = (zbs->dataptr) ? (*zbs->dataptr) : 0; + zbsend_dn(zbs, mb.raw); + zbdone(zbs, mbv); + } else { + /* completed children send up */ + zbsend_up(zbs, mbv); + } +} + +/* standard message for discarding a packet (should be rare) */ +static void discard_msg(uint32_t inic, uint32_t ipid, char *msg) +{ + CXIP_WARN("discard: INI=%04x PID=%d: %s\n", inic, ipid, msg); + TRACE("discard: INI=%04x PID=%d: %s\n", inic, ipid, msg); +} + +/** + * @brief zbcoll message receive callback. + * + * This is called by the cxip_ctrl handler when a ZB collective packet is + * received. This function is "installed" at ep initialization, so it can begin + * receiving packets before a zb object has been allocated to receive the data. + * Races are handled by issuing a rejection packet back to the sender, which + * results in a retry. + * + * All incoming packets pass through this function. The group identifier is part + * of the packet format, and directs the packet to the zb object in the grptbl[] + * associated with that grpid, which allows for multiple concurrent collective + * operations. + * + * For the production case, there is only one zb associated with a grpid, with + * one state entry. The source address is provided to us by the NIC, and the + * destination is (obviously) this NIC. + * + * For the single-zb simulation case, there is only one zb associated with a + * grpid, with a state entry for each simulated collective endpoint. The + * simulated source and destination is present in the packet format, and this is + * used to identify the source, and direct the packet to the correct destination + * state object. 
The actual source address (always this NIC) is ignored. + * + * In the multi-zb simulation, there are multiple (linked) zb objects associated + * with the grpid, each with a state entry for each simulated endpoint. The + * grptbl[] only selects a single zb, which is the root (simrank=0) zb. Each + * state in this object contains a backpointer that normally points to the + * containing zb, but the linking operation modifies this to point to the + * separate zb objects. So a simple redirection through the state backpointer + * gets us to the correct zb and state within that zb. The linking operation + * also modifies the state[0] entry in each of the different zb objects to point + * back to the simrank=0 zb. The other state entries are unused. While this + * requires an O(N^2) memory where only O(N) is used, we are fundamentally + * limited to N=32 simulated endpoints by the space available in the packet for + * addresses, so the waste is negligible. + * + * Calling code does not handle error returns gracefully, so handle all errors, + * and return FI_SUCCESS. + * + * @param ep_obj : endpoint + * @param init_nic : received (actual) initiator NIC + * @param init_pid : received (actual) initiator PID + * @param mbv : received match bits + * @return int : FI_SUCCESS (formal return) + */ +int cxip_zbcoll_recv_cb(struct cxip_ep_obj *ep_obj, uint32_t init_nic, + uint32_t init_pid, uint64_t mbv) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_zbcoll_obj *zb; + struct cxip_zbcoll_state *zbs; + int sim, src, dst, grpid; + uint32_t inic, ipid; + uint64_t dat; + union cxip_match_bits mb = {.raw = mbv}; + int relidx; + + zbcoll = &ep_obj->zbcoll; + /* src, dst always zero for production */ + sim = zbunpack(mbv, &src, &dst, &grpid, &dat); + /* determine the initiator to use */ + if (sim) { + inic = src; + ipid = ep_obj->src_addr.pid; + } else { + inic = init_nic; + ipid = init_pid; + } + TRACE("RCV INI=%04x PID=%04x sim=%d %d->%d grp=%d dat=%016lx\n", + inic, ipid, sim, src, dst, grpid, dat); + + /* discard if grpid is explicitly invalid (bad packet) */ + if (grpid > ZB_NEG_BIT) { + discard_msg(inic, ipid, "rejected by target"); + ofi_atomic_inc32(&zbcoll->dsc_count); + return FI_SUCCESS; + } + /* low-level packet test */ + if (zbcoll->disable) { + /* Attempting a low-level test */ + ofi_atomic_inc32(&zbcoll->rcv_count); + return FI_SUCCESS; + } + /* resolve the zb object */ + zb = zbcoll->grptbl[grpid]; + if (grpid == ZB_NEG_BIT) { + /* This is a negotiation packet */ + if (!zb) { + /* mask from downstream node, we aren't ready */ + TRACE("reject: getgroup negotiation conflict %08lx\n", + dat); + reject(ep_obj, inic, ipid, sim, dst, src, grpid); + return FI_SUCCESS; + } + if (!dat) { + /* negotiation rejection from upstream node */ + zbs = &zb->state[dst]; + zbs->dataval = *zb->grpmskp; + zbs->dataptr = &zbs->dataval; + mb.zb_data = zbs->dataval; + TRACE("rejected: re-send %016lx\n", mb.raw); + zbsend_up(zbs, mb.zb_data); + return FI_SUCCESS; + } + /* upstream/downstream mask for negotiating zb */ + } else { + /* This is a collective packet */ + if (!zb) { + /* Received packet for unknown group */ + discard_msg(inic, ipid, "reject unknown group ID"); + reject(ep_obj, inic, ipid, sim, dst, src, ZB_MAP_BITS); + ofi_atomic_inc32(&zbcoll->dsc_count); + return FI_SUCCESS; + } + /* upstream/downstream data for collective zb */ + } + /* discard bad state indices */ + if (src >= zb->simcount || dst >= zb->simcount) { + TRACE("discard: simsrc=%d simdst=%d\n", src, dst); + CXIP_WARN("Bad simulation: src=%d 
dst=%d max=%d\n", + src, dst, zb->simcount); + ofi_atomic_inc32(&zbcoll->dsc_count); + return FI_SUCCESS; + } + /* set the state object */ + zbs = &zb->state[dst]; + /* simulation redirection for multi-zb simulation */ + if (zbs->zb != zb) { + zb = zbs->zb; + zbs = &zb->state[dst]; + } + /* raw send test case, we are done */ + if (!zbs->num_relatives) { + TRACE("ZBCOLL no relatives: test case\n"); + return FI_SUCCESS; + } + /* determine which relative this came from (upstream or downstream) */ + for (relidx = 0; relidx < zbs->num_relatives; relidx++) { + if (inic == zb->caddrs[zbs->relatives[relidx]].nic && + ipid == zb->caddrs[zbs->relatives[relidx]].pid) + break; + } + if (relidx == zbs->num_relatives) { + /* not a relative */ + discard_msg(inic, ipid, "reject initiator not in tree"); + reject(ep_obj, inic, ipid, sim, dst, src, grpid); + ofi_atomic_inc32(&zbcoll->dsc_count); + return FI_SUCCESS; + } + /* data received, increment the counter */ + ofi_atomic_inc32(&zbcoll->rcv_count); + + /* advance the state */ + if (relidx == 0) { + /* downstream recv from parent */ + + /* copy the data to the zbs */ + zbs->dataval = dat; + if (zbs->dataptr) + *zbs->dataptr = dat; + TRACE("%04x<-%04x: %-10s %-10s %d/%d (%016lx)\n", + zbs->grp_rank, zbs->relatives[0], "dn_recvd", "", + zbs->contribs, zbs->num_relatives, dat); + + /* send downstream to children */ + zbsend_dn(zbs, mb.raw); + zbdone(zbs, mb.raw); + } else { + /* upstream recv from child */ + + /* bitwise-AND the upstream data value */ + zbs->dataval &= mb.raw; + mb.zb_data = zbs->dataval; + /* upstream packets contribute */ + zbs->contribs += 1; + TRACE("%04x<-%04x: %-10s %-10s %d/%d\n", + zbs->grp_rank, inic, "", "up_recvd", zbs->contribs, + zbs->num_relatives); + + /* advance the collective */ + advance(zbs, mb.raw); + } + return FI_SUCCESS; +} + +/** + * @brief Send callback function to manage source ACK. + * + * The request must be retried, or freed. + * + * NETSIM will simply drop packets sent to non-existent addresses, which leaks + * the request packet. + * + * Calling code does not handle error returns gracefully. Handle all errors, and + * return FI_SUCCESS. 
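+ *
+ * Disposition summary (mirrors the switch statement below):
+ *
+ * - C_EVENT_ACK with C_RC_OK: success, free the request.
+ * - C_EVENT_ACK with C_RC_ENTRY_NOT_FOUND or C_RC_PTLTE_NOT_FOUND: wait
+ *   fc_retry_usec_delay and resend the request.
+ * - Any other event or return code: -FI_EIO, zbsend_fail() counts the error
+ *   and frees the request.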
+ * + * @param req : original request + * @param event : CXI driver event + * @return int : FI_SUCCESS (formal return) + */ +static int zbdata_send_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_zbcoll_obj *zb; + struct cxip_zbcoll_state *zbs; + int src, dst, grpid; + int sim __attribute__((unused)); + uint64_t dat; + int ret; + + sim = zbunpack(req->send.mb.zb_data, &src, &dst, &grpid, &dat); + TRACE("ACK sim=%d %d->%d grp=%d dat=%016lx\n", + sim, src, dst, grpid, dat); + + zbcoll = &req->ep_obj->zbcoll; + ofi_atomic_inc32(&zbcoll->ack_count); + + if (grpid > ZB_NEG_BIT) { + /* rejection packet sent */ + TRACE("ACK: rejection sent\n"); + goto done; + } + zb = zbcoll->grptbl[grpid]; + if (!zb) { + /* Low-level testing, or ack is late */ + TRACE("ACK: late arrival\n"); + goto done; + } + if (src >= zb->simcount || dst >= zb->simcount) { + TRACE("ACK: bad simulation\n"); + goto done; + } + zbs = &zb->state[dst]; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + switch (cxi_event_rc(event)) { + case C_RC_OK: + ret = FI_SUCCESS; + free(req); + break; + case C_RC_ENTRY_NOT_FOUND: + /* likely a target queue is full, retry */ + CXIP_WARN("Target dropped packet, retry\n"); + usleep(cxip_env.fc_retry_usec_delay); + ret = cxip_ctrl_msg_send(req); + break; + case C_RC_PTLTE_NOT_FOUND: + /* could be a race during setup, retry */ + CXIP_WARN("Target connection failed, retry\n"); + usleep(cxip_env.fc_retry_usec_delay); + ret = cxip_ctrl_msg_send(req); + break; + default: + CXIP_WARN("ACK return code = %d, failed\n", + cxi_event_rc(event)); + ret = -FI_EIO; + break; + } + break; + default: + /* fail the send */ + CXIP_WARN(CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + ret = -FI_EIO; + break; + } + if (ret != FI_SUCCESS) + zbsend_fail(zbs, req, ret); + + return FI_SUCCESS; +done: + free(req); + return FI_SUCCESS; +} + +/** + * @brief Define the user callback function to execute on completion. + * + * @param zb + * @param userfunc + * @param userptr + * @return int + */ +void cxip_zbcoll_set_user_cb(struct cxip_zbcoll_obj *zb, + zbcomplete_t userfunc, void *userptr) +{ + zb->userfunc = userfunc; + zb->userptr = userptr; +} + +/** + * @brief Return the maximum number of groups for concurrent zbcoll operations. + * + * Maximum slots are ZB_NEG_BIT+1, with one reserved for negotiation. Using + * simulation reduces the number of bits available for negotiation. + * + * @param sim : true if nics are simulated + * @return int maximum group ID value + */ +int cxip_zbcoll_max_grps(bool sim) +{ + return (!sim) ? ZB_NEG_BIT : ZB_NEG_BIT - 2*ZB_SIM_BITS; +} + +/* used in each loop over states for each collective operation */ +static bool _skip_or_shuffle(struct cxip_zbcoll_obj *zb, int i, int *n) +{ + /* default is that this returns n as value of i */ + *n = i; + /* production means proceed over loop (of 1) with n = i */ + if (zb->simcount == 1) + return false; + /* multi-zb simulation should skip unless simrank == i */ + if (zb->simrank >= 0 && zb->simrank != i) + return true; + /* single-zb simulation simulates all values, with shuffling */ + if (zb->shuffle) + *n = zb->shuffle[i]; + return false; +} + +/** + * @brief Negotiate a group id among participants. + * + * We are guaranteed that any two negotiations that take place on any two zb + * objects will occur in the same order. 
However, either of those negotiations + * could be separated by an arbitrary number of other negotiations for other + * collectives that don't involve both of those zb objects. E.g. + * + * - zb1: A1 A2 + * - zb2: A1 B1 A2 + * + * zb1 is able to start negotiation A2 as soon as A1 completes, but zb2 cannot + * begin until B1 has completed. To prevent issuing the same grpid to two + * different groups, or issuing different grpids to a single group, all getgroup + * collectives are serialized over the NIC endpoint. Thus, attempting to + * negotiate for A2 on zb2 before B1 has completed will result in -FI_EAGAIN. + * + * In production, each zb represents a different process, on a different NIC + * endpoint, and these typically represent different compute nodes. + * + * In the single-zb and multi-zb simulations, the entire simulation is + * single-threaded in a single process, in a common memory space. + * + * In the single-zb simulation, there is only one zb, and each zb->state[] + * represents the different simulated collective endpoints. Operations across + * all simulated endpoints are done sequentially, though the ordering is + * randomized using the shuffle[] array. + * + * In the multi-zb simulation, there is a separate zb for each collective + * endpoint. The same collective operation must be called independently on each + * zb object, and all zb objects in that group must be called. Ordering is + * controlled by the ordering of the operations using each zb. + * + * In production and the single-zb simulation, this is a simple first-come + * first-served use of the NIC endpoint zbcoll->grptbl[ZB_NEG_BIT] pointer. + * Serialization is guaranteed by simply testing whether grptbl[ZB_NEG_BIT] is + * NULL. + * + * In the multi-zb simulation, acquiring zbcoll->grptbl[ZB_NEG_BIT] is a + * multi-step process that requires multiple calls to getgroup, using different + * (linked) zb objects. Serialization means that multiple calls must be allowed, + * provided that they all belong to the same set of linked zb objects, until all + * endpoints have been called. We use the refcnt value to determine when all + * calls have been made. 
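+ *
+ * Illustrative production call pattern (a sketch only; the real caller is the
+ * fi_join_collective() path, which also handles retries and completion
+ * callbacks):
+ *
+ *   ret = cxip_zbcoll_getgroup(zb);
+ *   if (ret == -FI_EAGAIN)
+ *       ...another negotiation owns the endpoint; retry later...
+ *   ...progress with cxip_ep_zbcoll_progress() until the user callback runs...
+ *   if (zb->error == FI_SUCCESS)
+ *       ...zb->grpid now holds the negotiated group id...
+ *   else if (zb->error == -FI_EBUSY)
+ *       ...all group ids are in use; retry after another join completes...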
+ * + * @param zb : zbcoll structure + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_getgroup(struct cxip_zbcoll_obj *zb) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_zbcoll_state *zbs; + union cxip_match_bits mb = {.raw = 0}; + int i, n, ret; + + /* function could be called by non-participating nodes */ + if (!zb) { + TRACE("zb is NULL\n"); + CXIP_WARN("zb is NULL\n"); + return -FI_EINVAL; + } + + /* if disabled, exit */ + zbcoll = &zb->ep_obj->zbcoll; + if (zbcoll->disable) { + TRACE("Disabled zb\n"); + return FI_SUCCESS; + } + + /* check for already grouped */ + if (zb->grpid != ZB_NEG_BIT) { + TRACE("grpid already set = %d\n", zb->grpid); + CXIP_WARN("Cannot acquire a second group id\n"); + return -FI_EINVAL; + } + + /* getgroup operations must be serialized */ + ret = FI_SUCCESS; + ofi_spin_lock(&zbcoll->lock); + if (!zbcoll->grptbl[ZB_NEG_BIT]) { + /* free to start negotiating */ + zbcoll->grptbl[ZB_NEG_BIT] = BASEZB(zb); + zbcoll->refcnt++; + } else if (zbcoll->grptbl[ZB_NEG_BIT] == BASEZB(zb) && + zbcoll->refcnt < zb->simcount && + zb->busy < zb->simcount) { + /* single-zb sim, refcnt=1, busy=simcount + * multi-zb sim, refcnt=simcount, busy=1 + */ + zbcoll->refcnt++; + TRACE("continue grpid negotiation, refcnt=%d\n", + zbcoll->refcnt); + } else { + /* any other attempt has to wait */ + ret = -FI_EAGAIN; + TRACE("failed grpid negotiation, retry later\n"); + } + ofi_spin_unlock(&zbcoll->lock); + TRACE("++REFCNT=%d ret=%d in %s\n", zbcoll->refcnt, ret, __func__); + if (ret) + return ret; + + /* process all states */ + zb->error = FI_SUCCESS; + zb->reduce = false; + for (i = 0; i < zb->simcount; i++) { + if (_skip_or_shuffle(zb, i, &n)) + continue; + zbs = &zb->state[n]; + zbs->dataval = *zb->grpmskp; + zbs->dataptr = &zbs->dataval; + zbs->contribs++; + zb->busy++; + TRACE("%s: zb[%d] contribs=%d\n", __func__, i, zbs->contribs); + /* if terminal leaf node, will send up immediately */ + mb.zb_data = zbs->dataval; + advance(zbs, mb.raw); + } + TRACE("++BUSY =%d in %s\n", zb->busy, __func__); + return FI_SUCCESS; +} + +/** + * @brief Release negotiated group id. 
+ * + * @param zb : zbcoll structure + */ +void cxip_zbcoll_rlsgroup(struct cxip_zbcoll_obj *zb) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + if (!zb || zb->grpid > ZB_NEG_BIT) + return; + + zbcoll = &zb->ep_obj->zbcoll; + + ofi_spin_lock(&zbcoll->lock); + _setbit(zb->grpmskp, zb->grpid); + zbcoll->grptbl[zb->grpid] = NULL; + zb->grpid = ZB_NEG_BIT; + ofi_spin_unlock(&zbcoll->lock); +} + +/* All exported functions are variants of this core function */ +static int _zbreduce(struct cxip_zbcoll_obj *zb, uint64_t *dataptr, bool reduce) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + struct cxip_zbcoll_state *zbs; + union cxip_match_bits mb = {.raw = 0}; + int i, n; + + /* function could be called on non-participating NIDs */ + if (!zb) { + TRACE("[-] zb is NULL\n"); + CXIP_WARN("zb is NULL\n"); + return -FI_EINVAL; + } + + /* low level testing */ + zbcoll = &zb->ep_obj->zbcoll; + if (zbcoll->disable) { + TRACE("[%d] Disabled zb\n", zb->simrank); + return FI_SUCCESS; + } + + /* operations on a single zb_obj are serialized */ + if (zb->busy) { + TRACE("[%d] busy\n", zb->simrank); + return -FI_EAGAIN; + } + + /* check for not grouped */ + if (zb->grpid >= ZB_NEG_BIT) { + TRACE("[%d] Requires a group ID\n", zb->simrank); + CXIP_WARN("Requires group id\n"); + return -FI_EINVAL; + } + TRACE("[%d] grpid = %d\n", zb->simrank, zb->grpid); + + /* process all states */ + zb->error = FI_SUCCESS; + zb->reduce = reduce; + /* Note that for simulation, dataptr must be an array */ + for (i = 0; i < zb->simcount; i++) { + if (_skip_or_shuffle(zb, i, &n)) + continue; + zbs = &zb->state[n]; + zbs->dataval = (dataptr) ? *dataptr : 0; + zbs->dataptr = (dataptr) ? dataptr++ : NULL; + zbs->contribs++; + zb->busy++; + TRACE("%s: zb[%d] contribs=%d\n", __func__, i, zbs->contribs); + /* if terminal leaf node, will send up immediately */ + mb.zb_data = zbs->dataval; + advance(zbs, mb.raw); + TRACE("%s: zb[%d] contribs=%d\n", __func__, i, zbs->contribs); + } + TRACE("%s: busy=%d\n", __func__, zb->busy); + + return FI_SUCCESS; +} + +/** + * @brief Initiate a bitwise-AND reduction. + * + * All participants call this. + * + * On entry, *dataptr contains the data to be reduced. On return, *dataptr + * contains the reduced data. + * + * NOTE: When testing in simulation, dataptr should reference an array of + * uint64_t with one item for each endpoint. + * + * @param zb : zbcoll structure + * @param dataptr : pointer to return data + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_reduce(struct cxip_zbcoll_obj *zb, uint64_t *dataptr) +{ + return _zbreduce(zb, dataptr, true); +} + +/** + * @brief Initiate a broadcast from root to leaves. + * + * All participants call this. + * + * On entry, *dataptr on root contains the data to be broadcast. + * On return, *dataptr contains the broadcast data from root. + * + * NOTE: When testing in simulation, dataptr should reference an array of + * uint64_t with one item for each endpoint. + * + * @param zb : zbcoll structure + * @param dataptr : pointer to return data + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_broadcast(struct cxip_zbcoll_obj *zb, uint64_t *dataptr) +{ + return _zbreduce(zb, dataptr, false); +} + +/** + * @brief Initiate a no-data barrier. + * + * All participants call this. + * + * @param zb : zbcoll structure + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_barrier(struct cxip_zbcoll_obj *zb) +{ + return _zbreduce(zb, NULL, false); +} + +/** + * @brief Progress completion. 
+ * + * This is called from cxip_coll_progress_join(), which is called when reading + * the endpoint EQ as part of progressing the zb collective operation. + * + * The callback function can thus initiate new operations without concerns about + * recursion. + * + * @param ep_obj : endpoint + * + * Caller holds eq_obj->lock. + */ +void cxip_ep_zbcoll_progress(struct cxip_ep_obj *ep_obj) +{ + struct cxip_zbcoll_obj *zb; + struct cxip_ep_zbcoll_obj *zbcoll; + + zbcoll = &ep_obj->zbcoll; + while (true) { + /* progress the underlying ctrl transfers */ + cxip_ep_ctrl_progress_locked(ep_obj); + + /* see if there is a zb ready to be advanced */ + zb = NULL; + ofi_spin_lock(&zbcoll->lock); + if (!dlist_empty(&zbcoll->ready_list)) + dlist_pop_front(&zbcoll->ready_list, + struct cxip_zbcoll_obj, + zb, ready_link); + ofi_spin_unlock(&zbcoll->lock); + if (!zb) + break; + TRACE("SAW COMPLETION on zb[%d], error=%d!!!\n", + zb->simrank, zb->error); + if (zb->userfunc) + (zb->userfunc)(zb, zb->userptr); + } +} + +/** + * @brief Intialize the zbcoll system. + * + * @param ep_obj : endpoint + * @return int : FI_SUCCESS or error value + */ +int cxip_zbcoll_init(struct cxip_ep_obj *ep_obj) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + zbcoll = &ep_obj->zbcoll; + memset(zbcoll, 0, sizeof(*zbcoll)); + dlist_init(&zbcoll->ready_list); + zbcoll->grpmsk = -1ULL; + zbcoll->grptbl = calloc(ZB_MAP_BITS, sizeof(void *)); + if (!zbcoll->grptbl) + return -FI_ENOMEM; + ofi_spin_init(&zbcoll->lock); + ofi_atomic_initialize32(&zbcoll->dsc_count, 0); + ofi_atomic_initialize32(&zbcoll->err_count, 0); + ofi_atomic_initialize32(&zbcoll->ack_count, 0); + ofi_atomic_initialize32(&zbcoll->rcv_count, 0); + + return FI_SUCCESS; +} + +/** + * @brief Cleanup all operations in progress. + * + * @param ep_obj : endpoint + */ +void cxip_zbcoll_fini(struct cxip_ep_obj *ep_obj) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + int i; + + zbcoll = &ep_obj->zbcoll; + for (i = 0; i < ZB_MAP_BITS; i++) + cxip_zbcoll_free(zbcoll->grptbl[i]); + free(zbcoll->grptbl); + zbcoll->grptbl = NULL; +} + +/** + * @brief Reset the endpoint counters. + * + * @param ep : endpoint + */ +void cxip_zbcoll_reset_counters(struct cxip_ep_obj *ep_obj) +{ + struct cxip_ep_zbcoll_obj *zbcoll; + + zbcoll = &ep_obj->zbcoll; + ofi_atomic_set32(&zbcoll->dsc_count, 0); + ofi_atomic_set32(&zbcoll->err_count, 0); + ofi_atomic_set32(&zbcoll->ack_count, 0); + ofi_atomic_set32(&zbcoll->rcv_count, 0); +} diff --git a/prov/cxi/test/README.md b/prov/cxi/test/README.md new file mode 100644 index 00000000000..fac6ef7d7a4 --- /dev/null +++ b/prov/cxi/test/README.md @@ -0,0 +1,16 @@ +*SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP* + +# Libfabric CXI Provider Tests + +All tests in this directory are built under the Criterion tool. See [https://criterion.readthedocs.io/en/master/index.html](url). + +Common setup/teardown routines are found in cxip_test_common.c. + +Collections of related tests are found in the other files. + +The build produces an executable cxitest, which runs the pre-supplied Criterion main() function, and supports selecting launch of individual tests, or the entire test suite. + +## Running Tests + +See the test.sh file for examples of launching tests with cxitest. 
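+
+For illustration, with the standard Criterion command line options a full run is
+`./cxitest --verbose`, and a single test can be selected with a filter such as
+`./cxitest --filter="atomic/simple_amo"` (the `atomic` suite's `simple_amo` test
+from atomic.c). The exact options used in automation are in test.sh.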
diff --git a/prov/cxi/test/atomic.c b/prov/cxi/test/atomic.c new file mode 100644 index 00000000000..d87d361ea0e --- /dev/null +++ b/prov/cxi/test/atomic.c @@ -0,0 +1,4433 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +#define AMO_DISABLED false + +#define RMA_WIN_LEN 64 +#define RMA_WIN_KEY 2 +#define RMA_WIN_ACCESS (FI_REMOTE_READ | FI_REMOTE_WRITE) +#define MR_KEY_STD 200 + +/* Create MR -- works like a "remote_calloc()" */ +static void *_cxit_create_mr(struct mem_region *mr, uint64_t *key) +{ + int ret; + + mr->mem = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(mr->mem); + + ret = fi_mr_reg(cxit_domain, mr->mem, RMA_WIN_LEN, RMA_WIN_ACCESS, 0, + *key, 0, &mr->mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed %d", ret); + + ret = fi_mr_bind(mr->mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind(ep) failed %d", ret); + + if (cxit_fi->caps & FI_RMA_EVENT && cxit_rem_cntr) { + ret = fi_mr_bind(mr->mr, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind(cntr) failed %d", + ret); + } + + ret = fi_mr_enable(mr->mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed %d", ret); + + *key = fi_mr_key(mr->mr); + + return mr->mem; +} + +/* Destroy MR -- works like a "remote_free()" */ +static void _cxit_destroy_mr(struct mem_region *mr) +{ + fi_close(&mr->mr->fid); + + free(mr->mem); +} + +/* Test failures associated with bad call parameters. + */ +TestSuite(atomic_invalid, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = AMO_DISABLED, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(atomic_invalid, invalid_amo) +{ + uint64_t operand1 = 0; + struct fi_ioc iov = { + .addr = &operand1, + .count = 1 + }; + int ret; + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_ATOMIC_OP_LAST, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, -1, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, + FI_DATATYPE_LAST, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, 0, + -1, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 0, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, &operand1, 2, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomic(cxit_ep, 0, 1, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + + ret = fi_atomicv(cxit_ep, &iov, 0, 0, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomicv(cxit_ep, &iov, 0, 2, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 0; + ret = fi_atomicv(cxit_ep, &iov, 0, 1, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 2; + ret = fi_atomicv(cxit_ep, &iov, 0, 1, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_atomicv(cxit_ep, 0, 0, 1, cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); +} + +Test(atomic_invalid, invalid_fetch) +{ + uint64_t operand1 = 0; + uint64_t result = 0; + struct fi_ioc iov = { 
+ .addr = &operand1, + .count = 1 + }; + struct fi_ioc riov = { + .addr = &result, + .count = 1 + }; + int ret; + + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, + FI_ATOMIC_OP_LAST, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, -1, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_DATATYPE_LAST, FI_SUM, + 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, -1, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, 0, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 0, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, &operand1, 2, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomic(cxit_ep, 0, 1, 0, &result, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + + + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 0, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 2, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, 0, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 0, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 2, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_fetch_atomicv(cxit_ep, 0, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 0; + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 2; + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + riov.count = 1; + iov.count = 0; + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 2; + ret = fi_fetch_atomicv(cxit_ep, &iov, 0, 1, &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, FI_UINT64, FI_SUM, 0); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 1; + cr_assert_eq(ret, -FI_EINVAL); +} + +Test(atomic_invalid, invalid_swap) +{ + uint64_t operand1 = 0; + uint64_t compare = 0; + uint64_t result = 0; + struct fi_ioc iov = { + .addr = &operand1, + .count = 1 + }; + struct fi_ioc ciov = { + .addr = &compare, + .count = 1 + }; + struct fi_ioc riov = { + .addr = &result, + .count = 1 + }; + int ret; + + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_ATOMIC_OP_LAST, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, -1, 0); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_DATATYPE_LAST, FI_CSWAP_NE, 
NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + -1, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + 0, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + 0, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 2, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + &operand1, 0, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomic(cxit_ep, + 0, 1, 0, + &compare, 0, + &result, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 2, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 0, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 2, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 0, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 2, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 0, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 2; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 0; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + riov.count = 1; + ciov.count = 2; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ciov.count = 0; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + ciov.count = 1; + iov.count = 2; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 0; + ret = fi_compare_atomicv(cxit_ep, + &iov, 0, 1, + &ciov, 0, 1, + &riov, 0, 1, + cxit_ep_fi_addr, 0, 0, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert_eq(ret, -FI_EINVAL); + iov.count = 1; +} + +/* Test simple operations: AMO SUM UINT64_T, FAMO SUM UINT64_T, and CAMO SWAP_NE + * UINT64_T. If this doesn't work, nothing else will. 
+ */ +TestSuite(atomic, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = AMO_DISABLED, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(atomic, simple_amo) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + int i; + uint64_t key; + + /* Test standard and optimized MRs. */ + for (i = 0; i < 2; i++) { + key = 199 + i; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 3; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 9; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + _cxit_destroy_mr(&mr); + } +} + +/* Test atomic inject interface */ +Test(atomic, simple_inject) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + int count = 0; + uint64_t key = RMA_WIN_KEY; + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 3; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 9; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure no events were delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); + + /* Try using standard MR */ + + exp_remote = 
0; + key = 1000; + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure no events were delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); +} + +Test(atomic, simple_fetch) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t exp_result; + uint64_t *rma; + uint64_t *loc; + int ret; + int i; + uint64_t key; + + for (i = 0; i < 2; i++) { + key = 199 + i; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + exp_result = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + fi_cntr_set(cxit_read_cntr, 0); + while (fi_cntr_read(cxit_read_cntr)); + + operand1 = 1; + *loc = -1; + exp_result = exp_remote; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + operand1 = 3; + *loc = -1; + exp_result = exp_remote; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + operand1 = 9; + *loc = -1; + exp_result = exp_remote; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + while (fi_cntr_read(cxit_read_cntr) != 3) + ; + + free(loc); + _cxit_destroy_mr(&mr); + } +} + +Test(atomic, simple_fetch_read) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t exp_remote; + uint64_t exp_result; + uint64_t *rma; + uint64_t *loc; + int ret; + int i; + uint64_t key; + + for (i = 0; i < 2; i++) { + key = 199 + i; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + exp_result = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + loc = 
calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + fi_cntr_set(cxit_read_cntr, 0); + while (fi_cntr_read(cxit_read_cntr)) + ; + *rma = 1; + *loc = -1; + exp_remote = *rma; + exp_result = exp_remote; + + ret = fi_fetch_atomic(cxit_ep, NULL, 1, NULL, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_ATOMIC_READ, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Read Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + *rma = 10; + *loc = -1; + exp_remote = *rma; + exp_result = exp_remote; + + ret = fi_fetch_atomic(cxit_ep, NULL, 1, NULL, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_ATOMIC_READ, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Read Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + *rma = 0x0123456789abcdef; + *loc = -1; + exp_remote = *rma; + exp_result = exp_remote; + + ret = fi_fetch_atomic(cxit_ep, NULL, 1, NULL, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_ATOMIC_READ, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Read Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + while (fi_cntr_read(cxit_read_cntr) != 3) + ; + + free(loc); + _cxit_destroy_mr(&mr); + } +} + +Test(atomic, simple_swap) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t compare; + uint64_t exp_remote; + uint64_t exp_result; + uint64_t *rma; + uint64_t *loc; + int ret; + int i; + uint64_t key; + + for (i = 0; i < 2; i++) { + key = 199 + i; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + exp_result = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + *rma = 0; /* remote == 0 */ + operand1 = 1; /* change remote to 1 */ + compare = 2; /* if remote != 2 (true) */ + *loc = -1; /* initialize result */ + exp_remote = 1; /* expect remote == 1 */ + exp_result = 0; /* expect result == 0 */ + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + *rma = 2; /* remote == 2 */ + operand1 = 1; /* change remote to 1 */ + compare = 2; /* if remote != 2 (false) */ + *loc = -1; /* initialize result */ + exp_remote = 2; /* expect remote == 2 
(no op) */ + exp_result = 2; /* expect result == 2 (does return value) */ + ret = fi_compare_atomic(cxit_ep, + &operand1, 1, 0, + &compare, 0, + loc, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_CSWAP_NE, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Add Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(*loc, exp_result, + "Fetch Result = %016lx, expected = %016lx", + *loc, exp_result); + + free(loc); + _cxit_destroy_mr(&mr); + } +} + +/* Perform a full combinatorial test suite. + */ +#define MAX_TEST_SIZE 16 + +/** + * Compare a seen value with an expected value, with 'len' valid bytes. This + * checks the seen buffer all the way to MAX_TEST_SIZE, and looks for a + * predefined value in every byte, to ensure that there is no overflow. + * The seen buffer will always be either the rma or the loc buffer, which have + * 64 bytes of space in them. + * + * Summation of real and complex types is trickier. Every decimal constant is + * internally represented by a binary approximation, and summation can + * accumulate errors. With only a single sum with two arguments, the error could + * be +1 or -1 in the LSBit. + * + * @param saw 'seen' buffer + * @param exp 'expected' value + * @param len number of valid bytes + * + * @return bool true if successful, false if comparison fails + */ +static bool _compare(void *saw, void *exp, int len, + enum fi_op op, enum fi_datatype dt) +{ + uint8_t *bval = saw; + uint8_t *bexp = exp; + uint64_t uval = 0; + uint64_t uexp = 0; + int i; + + /* Test MS pad bits */ + for (i = MAX_TEST_SIZE-1; i >= len; i--) { + if (bval[i] != bexp[i]) + return false; + } + if (op == FI_SUM) { + switch (dt) { + case FI_FLOAT: + case FI_DOUBLE: + /* Copy to UINT64, adjust diff (-1,1) to (0,2) */ + memcpy(&uval, bval, len); + memcpy(&uexp, bexp, len); + if ((uval - uexp) + 1 > 2) + return false; + return true; + case FI_FLOAT_COMPLEX: + case FI_DOUBLE_COMPLEX: + /* Do real and imag parts separately */ + memcpy(&uval, bval, len/2); + memcpy(&uexp, bexp, len/2); + if (uval - uexp + 1 > 2) + return false; + memcpy(&uval, bval+len/2, len/2); + memcpy(&uexp, bexp+len/2, len/2); + if (uval - uexp + 1 > 2) + return false; + return true; + default: + break; + } + } + /* Test LS value bits */ + for (i = len-1; i >= 0; i--) { + if (bval[i] != bexp[i]) + return false; + } + return true; +} + +/** + * Generates a useful error message. + * + * @param op opcode + * @param dt dtcode + * @param saw 'seen' buffer + * @param exp 'expected' value + * @param len number of valid bytes + * @param buf buffer to fill with message + * @param siz buffer size + * + * @return const char* returns the buf pointer + */ +static const char *_errmsg(enum fi_op op, enum fi_datatype dt, + void *saw, void *exp, int len, + char *buf, size_t siz) +{ + char *p = &buf[0]; + char *e = &buf[siz]; + uint8_t *bsaw = saw; + uint8_t *bexp = exp; + int i; + + p += snprintf(p, e-p, "%d:%d: saw=", op, dt); + for (i = MAX_TEST_SIZE-1; i >= 0; i--) + p += snprintf(p, e-p, "%02x%s", bsaw[i], i == len ? "/" : ""); + p += snprintf(p, e-p, " exp="); + for (i = MAX_TEST_SIZE-1; i >= 0; i--) + p += snprintf(p, e-p, "%02x%s", bexp[i], i == len ? "/" : ""); + return buf; +} + +/** + * The general AMO test. 
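+ * Depending on which pointers are supplied, this issues a single-element
+ * simple AMO (no loc), fetching AMO (loc only), or compare AMO (compare and
+ * loc), then validates the remote buffer, and the local fetch buffer when
+ * one is used, against the expected values.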
+ * + * @param index value used to help identify the test if error + * @param dt FI datatype + * @param op FI operation + * @param err 0 if success expected, 1 if failure expected + * @param operand1 operation data value pointer + * @param compare operation compare value pointer + * @param loc operation result (local) buffer pointer + * @param loc_init operation result initialization value pointer + * @param rma operation rma (remote) buffer pointer + * @param rma_init operation rma initialization value pointer + * @param rma_expect operation rma (remote) expected value pointer + */ +static void _test_amo(int index, enum fi_datatype dt, enum fi_op op, int err, + void *operand1, + void *compare, + void *loc, void *loc_init, + void *rma, void *rma_init, void *rma_expect, + uint64_t key) +{ + struct fi_cq_tagged_entry cqe; + char msgbuf[128]; + char opstr[64]; + char dtstr[64]; + uint8_t rexp[MAX_TEST_SIZE]; + uint8_t lexp[MAX_TEST_SIZE]; + void *rma_exp = rexp; + void *loc_exp = lexp; + int len = ofi_datatype_size(dt); + int ret; + + strcpy(opstr, fi_tostr(&op, FI_TYPE_ATOMIC_OP)); + strcpy(dtstr, fi_tostr(&dt, FI_TYPE_ATOMIC_TYPE)); + + cr_log_info("Testing %s %s (%d)\n", opstr, dtstr, len); + + memset(rma, -1, MAX_TEST_SIZE); + memset(rma_exp, -1, MAX_TEST_SIZE); + memcpy(rma, rma_init, len); + memcpy(rma_exp, rma_expect, len); + + if (loc && loc_init) { + memset(loc, -1, MAX_TEST_SIZE); + memset(loc_exp, -1, MAX_TEST_SIZE); + memcpy(loc, loc_init, len); + memcpy(loc_exp, rma_init, len); + } + if (compare && loc) { + /* This is a compare command */ + ret = fi_compare_atomic(cxit_ep, operand1, 1, 0, + compare, 0, loc, 0, + cxit_ep_fi_addr, 0, key, dt, + op, NULL); + } else if (loc) { + /* This is a fetch command */ + ret = fi_fetch_atomic(cxit_ep, operand1, 1, 0, loc, 0, + cxit_ep_fi_addr, 0, key, dt, op, + NULL); + } else { + /* This is a simple command */ + ret = fi_atomic(cxit_ep, operand1, 1, 0, + cxit_ep_fi_addr, 0, key, dt, op, NULL); + } + + if (err) { + /* Expected an error. Tests only invoke "unsupported" failures, + * so any other error is fatal. Success is also fatal if we + * expect a failure. + */ + cr_assert_eq(ret, -FI_EOPNOTSUPP, + "rtn #%d:%d:%d saw=%d exp=%d\n", + index, op, dt, ret, -FI_EOPNOTSUPP); + return; + } + + + /* If we weren't expecting an error, any error is fatal */ + cr_assert_eq(ret, 0, + "rtn #%d:%d:%d saw=%d exp=%d\n", + index, op, dt, ret, err); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | (loc ? FI_READ : FI_WRITE), NULL); + + /* We expect the RMA effect to be as predicted */ + cr_expect(_compare(rma, rma_exp, len, op, dt), + "rma #%d:%s\n", index, + _errmsg(op, dt, rma, rma_exp, len, msgbuf, + sizeof(msgbuf))); + + /* We expect the local result to be as predicted, if there is one */ + if (loc && loc_init) { + cr_expect(_compare(loc, loc_exp, len, op, dt), + "loc #%d:%s\n", index, + _errmsg(op, dt, loc, loc_exp, len, msgbuf, + sizeof(msgbuf))); + } +} + +/* Every parameter list can create an OR of the following values, to indicate + * what forms should be attempted. + */ +#define _AMO 1 +#define _FAMO 2 +#define _CAMO 4 + +/* The INT tests test 8, 16, 32, and 64 bits for each line item. 
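+ * Each entry is run for every integer datatype from FI_INT8 through
+ * FI_UINT64, and the parameter table is duplicated at setup so every entry
+ * also runs against a standard MR key (with 1000 added to its index).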
+ */ +struct test_int_parms { + int opmask; + int index; + enum fi_op op; + int err; + uint64_t comp; + uint64_t o1; + uint64_t rini; + uint64_t rexp; + uint64_t key; +}; + +static struct test_int_parms int_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 0, 0, 123, 120, 120 }, + { _AMO|_FAMO, 12, FI_MIN, 0, 0, 120, 123, 120 }, + { _AMO|_FAMO, 21, FI_MAX, 0, 0, 123, 120, 123 }, + { _AMO|_FAMO, 22, FI_MAX, 0, 0, 120, 123, 123 }, + { _AMO|_FAMO, 31, FI_SUM, 0, 0, 1, 0, 1 }, + { _AMO|_FAMO, 32, FI_SUM, 0, 0, 1, 10, 11 }, + { _AMO|_FAMO, 33, FI_SUM, 0, 0, 2, -1, 1 }, + { _AMO|_FAMO, 41, FI_LOR, 0, 0, 0, 0, 0 }, + { _AMO|_FAMO, 42, FI_LOR, 0, 0, 128, 0, 1 }, + { _AMO|_FAMO, 43, FI_LOR, 0, 0, 0, 128, 1 }, + { _AMO|_FAMO, 44, FI_LOR, 0, 0, 64, 128, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 0, 0, 0, 0, 0 }, + { _AMO|_FAMO, 52, FI_LAND, 0, 0, 128, 0, 0 }, + { _AMO|_FAMO, 53, FI_LAND, 0, 0, 0, 128, 0 }, + { _AMO|_FAMO, 54, FI_LAND, 0, 0, 64, 128, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 0, 0, 0, 0, 0 }, + { _AMO|_FAMO, 62, FI_LXOR, 0, 0, 128, 0, 1 }, + { _AMO|_FAMO, 63, FI_LXOR, 0, 0, 0, 128, 1 }, + { _AMO|_FAMO, 64, FI_LXOR, 0, 0, 64, 128, 0 }, + { _AMO|_FAMO, 71, FI_BOR, 0, 0, + 0xf0e1f2e3f4e5f6e7, + 0x1818181818181818, + 0xf8f9fafbfcfdfeff }, + { _AMO|_FAMO, 81, FI_BAND, 0, 0, + 0xf0e1f2e3f4e5f6e7, + 0x1818181818181818, + 0x1000100010001000 }, + { _AMO|_FAMO, 91, FI_BXOR, 0, 0, + 0xf0e1f2e3f4e5f6e7, + 0x1818181818181818, + 0xe8f9eafbecfdeeff }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, 0, + 0x1234123412341234, + 0xabcdabcdabcdabcd, + 0x1234123412341234 }, + { _AMO|_FAMO, 102, FI_ATOMIC_WRITE, 0, 0, + 0x1234123412341234, + 0x1234123412341234, + 0x1234123412341234 }, + { _FAMO, 111, FI_ATOMIC_READ, 0, 0, + 0x1010101010101010, + 0x4321432143214321, + 0x4321432143214321 }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, 120, 123, 100, 100 }, + { _CAMO, 122, FI_CSWAP, 0, 100, 123, 100, 123 }, + { _CAMO, 131, FI_CSWAP_NE, 0, 120, 123, 100, 123 }, + { _CAMO, 132, FI_CSWAP_NE, 0, 100, 123, 100, 100 }, + { _CAMO, 141, FI_CSWAP_LE, 0, 101, 123, 100, 100 }, + { _CAMO, 142, FI_CSWAP_LE, 0, 100, 123, 100, 123 }, + { _CAMO, 143, FI_CSWAP_LE, 0, 99, 123, 100, 123 }, + { _CAMO, 151, FI_CSWAP_LT, 0, 101, 123, 100, 100 }, + { _CAMO, 152, FI_CSWAP_LT, 0, 100, 123, 100, 100 }, + { _CAMO, 153, FI_CSWAP_LT, 0, 99, 123, 100, 123 }, + { _CAMO, 161, FI_CSWAP_GE, 0, 101, 123, 100, 123 }, + { _CAMO, 162, FI_CSWAP_GE, 0, 100, 123, 100, 123 }, + { _CAMO, 163, FI_CSWAP_GE, 0, 99, 123, 100, 100 }, + { _CAMO, 171, FI_CSWAP_GT, 0, 101, 123, 100, 123 }, + { _CAMO, 173, FI_CSWAP_GT, 0, 100, 123, 100, 100 }, + { _CAMO, 173, FI_CSWAP_GT, 0, 99, 123, 100, 100 }, + { _CAMO, 181, FI_MSWAP, 0, + 0xf0f0f0f0f0f0f0f0, + 0xaaaaaaaaaaaaaaaa, + 0x1111111111111111, + 0xa1a1a1a1a1a1a1a1 + }, +}; + +ParameterizedTestParameters(atomic, test_int) +{ + struct test_int_parms *params; + int tests = ARRAY_SIZE(int_parms); + int i; + + params = malloc(sizeof(int_parms) * 2); + + memcpy(params, int_parms, sizeof(int_parms)); + memcpy((uint8_t *)params + sizeof(int_parms), int_parms, + sizeof(int_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_int_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_int_parms *p, atomic, test_int) +{ + struct mem_region mr; + enum fi_datatype dt; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + + rma = _cxit_create_mr(&mr, &p->key); + + loc 
= calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + for (dt = FI_INT8; dt <= FI_UINT64; dt++) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + p->key); + } + } + + if (p->opmask & _FAMO) { + for (dt = FI_INT8; dt <= FI_UINT64; dt++) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + p->key); + } + } + + if (p->opmask & _CAMO) { + for (dt = FI_INT8; dt <= FI_UINT64; dt++) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, + p->key); + } + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +/* The FLT tests only test the float type. + */ +struct test_flt_parms { + int opmask; + int index; + enum fi_op op; + int err; + float comp; + float o1; + float rini; + float rexp; + uint64_t key; +}; + +static struct test_flt_parms flt_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 0, 0.0f, 12.3f, 12.0f, 12.0f }, + { _AMO|_FAMO, 12, FI_MIN, 0, 0.0f, 12.0f, 12.3f, 12.0f }, + { _AMO|_FAMO, 21, FI_MAX, 0, 0.0f, 12.3f, 12.0f, 12.3f }, + { _AMO|_FAMO, 22, FI_MAX, 0, 0.0f, 12.0f, 12.3f, 12.3f }, + { _AMO|_FAMO, 31, FI_SUM, 0, 0.0f, 1.1f, 1.2f, (1.1f + 1.2f) }, + { _AMO|_FAMO, 32, FI_SUM, 0, 0.0f, 0.4f, 1.7f, (0.4f + 1.7f) }, + { _AMO|_FAMO, 41, FI_LOR, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 1 }, + { _AMO|_FAMO, 71, FI_BOR, 1 }, + { _AMO|_FAMO, 81, FI_BAND, 1 }, + { _AMO|_FAMO, 91, FI_BXOR, 1 }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, 0.0f, 10.2f, 96.6f, 10.2f }, + { _FAMO, 111, FI_ATOMIC_READ, 0, 0.0f, 1.1f, 10.2f, 10.2f }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, 12.0f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 122, FI_CSWAP, 0, 10.0f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 131, FI_CSWAP_NE, 0, 12.0f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 132, FI_CSWAP_NE, 0, 10.0f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 141, FI_CSWAP_LE, 0, 10.1f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 142, FI_CSWAP_LE, 0, 10.0f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 143, FI_CSWAP_LE, 0, 9.9f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 151, FI_CSWAP_LT, 0, 10.1f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 152, FI_CSWAP_LT, 0, 10.0f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 153, FI_CSWAP_LT, 0, 9.9f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 161, FI_CSWAP_GE, 0, 10.1f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 162, FI_CSWAP_GE, 0, 10.0f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 163, FI_CSWAP_GE, 0, 9.9f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 171, FI_CSWAP_GT, 0, 10.1f, 12.3f, 10.0f, 12.3f }, + { _CAMO, 172, FI_CSWAP_GT, 0, 10.0f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 173, FI_CSWAP_GT, 0, 9.9f, 12.3f, 10.0f, 10.0f }, + { _CAMO, 181, FI_MSWAP, 1 }, +}; + +ParameterizedTestParameters(atomic, test_flt) +{ + struct test_flt_parms *params; + int tests = ARRAY_SIZE(flt_parms); + int i; + + params = malloc(sizeof(flt_parms) * 2); + + memcpy(params, flt_parms, sizeof(flt_parms)); + memcpy((uint8_t *)params + sizeof(flt_parms), flt_parms, + sizeof(flt_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_flt_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_flt_parms *p, atomic, test_flt) +{ + struct mem_region mr; + enum fi_datatype dt = FI_FLOAT; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + + rma = _cxit_create_mr(&mr, &p->key); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + _test_amo(p->index, 
dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _FAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _CAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, + p->key); + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +/* The DBL tests only test the double type. + */ +struct test_dbl_parms { + int opmask; + int index; + enum fi_op op; + int err; + double comp; + double o1; + double rini; + double rexp; + uint64_t key; +}; + +static struct test_dbl_parms dbl_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 0, 0.0, 12.3, 12.0, 12.0 }, + { _AMO|_FAMO, 12, FI_MIN, 0, 0.0, 12.0, 12.3, 12.0 }, + { _AMO|_FAMO, 21, FI_MAX, 0, 0.0, 12.3, 12.0, 12.3 }, + { _AMO|_FAMO, 22, FI_MAX, 0, 0.0, 12.0, 12.3, 12.3 }, + { _AMO|_FAMO, 31, FI_SUM, 0, 0.0, 1.1, 1.2, (1.1 + 1.2) }, + { _AMO|_FAMO, 32, FI_SUM, 0, 0.0, 0.4, 1.7, (0.4 + 1.7) }, + { _AMO|_FAMO, 41, FI_LOR, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 1 }, + { _AMO|_FAMO, 71, FI_BOR, 1 }, + { _AMO|_FAMO, 81, FI_BAND, 1 }, + { _AMO|_FAMO, 91, FI_BXOR, 1 }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, 0.0, 10.2, 123.4, 10.2 }, + { _FAMO, 111, FI_ATOMIC_READ, 0, 0.0, 1.1, 10.2, 10.2 }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, 12.0, 12.3, 10.0, 10.0 }, + { _CAMO, 122, FI_CSWAP, 0, 10.0, 12.3, 10.0, 12.3 }, + { _CAMO, 131, FI_CSWAP_NE, 0, 12.0, 12.3, 10.0, 12.3 }, + { _CAMO, 132, FI_CSWAP_NE, 0, 10.0, 12.3, 10.0, 10.0 }, + { _CAMO, 141, FI_CSWAP_LE, 0, 10.1, 12.3, 10.0, 10.0 }, + { _CAMO, 142, FI_CSWAP_LE, 0, 10.0, 12.3, 10.0, 12.3 }, + { _CAMO, 143, FI_CSWAP_LE, 0, 9.9, 12.3, 10.0, 12.3 }, + { _CAMO, 151, FI_CSWAP_LT, 0, 10.1, 12.3, 10.0, 10.0 }, + { _CAMO, 152, FI_CSWAP_LT, 0, 10.0, 12.3, 10.0, 10.0 }, + { _CAMO, 153, FI_CSWAP_LT, 0, 9.9, 12.3, 10.0, 12.3 }, + { _CAMO, 161, FI_CSWAP_GE, 0, 10.1, 12.3, 10.0, 12.3 }, + { _CAMO, 162, FI_CSWAP_GE, 0, 10.0, 12.3, 10.0, 12.3 }, + { _CAMO, 163, FI_CSWAP_GE, 0, 9.9, 12.3, 10.0, 10.0 }, + { _CAMO, 171, FI_CSWAP_GT, 0, 10.1, 12.3, 10.0, 12.3 }, + { _CAMO, 172, FI_CSWAP_GT, 0, 10.0, 12.3, 10.0, 10.0 }, + { _CAMO, 173, FI_CSWAP_GT, 0, 9.9, 12.3, 10.0, 10.0 }, + { _CAMO, 181, FI_MSWAP, 1 }, +}; + +ParameterizedTestParameters(atomic, test_dbl) +{ + struct test_dbl_parms *params; + int tests = ARRAY_SIZE(dbl_parms); + int i; + + params = malloc(sizeof(dbl_parms) * 2); + + memcpy(params, dbl_parms, sizeof(dbl_parms)); + memcpy((uint8_t *)params + sizeof(dbl_parms), dbl_parms, + sizeof(dbl_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_dbl_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_dbl_parms *p, atomic, test_dbl) +{ + struct mem_region mr; + enum fi_datatype dt = FI_DOUBLE; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + + rma = _cxit_create_mr(&mr, &p->key); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _FAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _CAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, 
+ p->key); + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +/* The CMPLX tests only test the float complex type. + */ +struct test_cplx_parms { + int opmask; + int index; + enum fi_op op; + int err; + + float complex comp; + float complex o1; + float complex rini; + float complex rexp; + uint64_t key; +}; + +static struct test_cplx_parms cplx_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 1 }, + { _AMO|_FAMO, 21, FI_MAX, 1 }, + { _AMO|_FAMO, 31, FI_SUM, 0, 0.0, 1.1, 1.2, (1.1 + 1.2) }, + { _AMO|_FAMO, 32, FI_SUM, 0, 0.0, 0.4, 1.7, (0.4 + 1.7) }, + { _AMO|_FAMO, 31, FI_SUM, 0, + 0.0f, 1.1f+I*0.4f, 1.2f+I*1.7f, (1.1f+I*0.4f + 1.2f+I*1.7f) }, + { _AMO|_FAMO, 32, FI_SUM, 0, + 0.0f, 1.1f+I*1.7f, 1.2f+I*0.4f, (1.1f+I*1.7f + 1.2f+I*0.4f) }, + { _AMO|_FAMO, 41, FI_LOR, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 1 }, + { _AMO|_FAMO, 71, FI_BOR, 1 }, + { _AMO|_FAMO, 81, FI_BAND, 1 }, + { _AMO|_FAMO, 91, FI_BXOR, 1 }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, + 0.0f, 10.2f+I*1.1f, 0.3f+I*2.2f, 10.2f+I*1.1f }, + { _FAMO, 111, FI_ATOMIC_READ, 0, + 0.0f, 1.1f+I*1.1f, 10.2f+I*1.1f, 10.2f+I*1.1f }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, + 12.0f+I*1.1f, 12.3f+I*1.1f, 10.0f+I*1.1f, 10.0f+I*1.1f }, + { _CAMO, 122, FI_CSWAP, 0, + 10.0f+I*1.1f, 12.3f+I*1.1f, 10.0f+I*1.1f, 12.3f+I*1.1f }, + { _CAMO, 131, FI_CSWAP_NE, 0, + 12.0f+I*1.1f, 12.3f+I*1.1f, 10.0f+I*1.1f, 12.3f+I*1.1f }, + { _CAMO, 132, FI_CSWAP_NE, 0, + 10.0f+I*1.1f, 12.3f+I*1.1f, 10.0f+I*1.1f, 10.0f+I*1.1f }, + { _CAMO, 141, FI_CSWAP_LE, 1 }, + { _CAMO, 151, FI_CSWAP_LT, 1 }, + { _CAMO, 161, FI_CSWAP_GE, 1 }, + { _CAMO, 171, FI_CSWAP_GT, 1 }, + { _CAMO, 181, FI_MSWAP, 1 }, +}; + +ParameterizedTestParameters(atomic, test_cplx) +{ + struct test_cplx_parms *params; + int tests = ARRAY_SIZE(cplx_parms); + int i; + + params = malloc(sizeof(cplx_parms) * 2); + + memcpy(params, cplx_parms, sizeof(cplx_parms)); + memcpy((uint8_t *)params + sizeof(cplx_parms), cplx_parms, + sizeof(cplx_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_cplx_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_cplx_parms *p, atomic, test_cplx) +{ + struct mem_region mr; + enum fi_datatype dt = FI_FLOAT_COMPLEX; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + uint64_t key = 0; + + rma = _cxit_create_mr(&mr, &key); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + key); + } + + if (p->opmask & _FAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + key); + } + + if (p->opmask & _CAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, + key); + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +/* The DCMPLX tests only test the double complex type. 
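+ * Entries with err == 1 mark operations that are unsupported for this type;
+ * _test_amo() expects those to fail with -FI_EOPNOTSUPP.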
+ */ + +struct test_dcplx_parms { + int opmask; + int index; + enum fi_op op; + int err; + + double complex comp; + double complex o1; + double complex rini; + double complex rexp; + uint64_t key; +}; + +static struct test_dcplx_parms dcplx_parms[] = { + { _AMO|_FAMO, 11, FI_MIN, 1 }, + { _AMO|_FAMO, 21, FI_MAX, 1 }, + { _AMO|_FAMO, 31, FI_SUM, 0, + 0.0, 1.1+I*0.4, 1.2+I*1.7, (1.1+I*0.4 + 1.2+I*1.7) }, + { _AMO|_FAMO, 32, FI_SUM, 0, + 0.0, 1.1+I*1.7, 1.2+I*0.4, (1.1+I*1.7 + 1.2+I*0.4) }, + { _AMO|_FAMO, 41, FI_LOR, 1 }, + { _AMO|_FAMO, 51, FI_LAND, 1 }, + { _AMO|_FAMO, 61, FI_LXOR, 1 }, + { _AMO|_FAMO, 71, FI_BOR, 1 }, + { _AMO|_FAMO, 81, FI_BAND, 1 }, + { _AMO|_FAMO, 91, FI_BXOR, 1 }, + { _AMO|_FAMO, 101, FI_ATOMIC_WRITE, 0, + 0.0, 10.2+I*1.1, 0.3+I*2.2, 10.2+I*1.1 }, + { _FAMO, 111, FI_ATOMIC_READ, 0, + 0.0, 1.1+I*1.1, 10.2+I*1.1, 10.2+I*1.1 }, + { _AMO, 112, FI_ATOMIC_READ, 1 }, + { _CAMO, 121, FI_CSWAP, 0, + 12.0+I*1.1, 12.3+I*1.1, 10.0+I*1.1, 10.0+I*1.1 }, + { _CAMO, 122, FI_CSWAP, 0, + 10.0+I*1.1, 12.3+I*1.1, 10.0+I*1.1, 12.3+I*1.1 }, + { _CAMO, 131, FI_CSWAP_NE, 0, + 12.0+I*1.1, 12.3+I*1.1, 10.0+I*1.1, 12.3+I*1.1 }, + { _CAMO, 132, FI_CSWAP_NE, 0, + 10.0+I*1.1, 12.3+I*1.1, 10.0+I*1.1, 10.0+I*1.1 }, + { _CAMO, 141, FI_CSWAP_LE, 1 }, + { _CAMO, 151, FI_CSWAP_LT, 1 }, + { _CAMO, 161, FI_CSWAP_GE, 1 }, + { _CAMO, 171, FI_CSWAP_GT, 1 }, + { _CAMO, 181, FI_MSWAP, 1 }, +}; + +ParameterizedTestParameters(atomic, test_dcplx) +{ + struct test_dcplx_parms *params; + int tests = ARRAY_SIZE(dcplx_parms); + int i; + + params = malloc(sizeof(dcplx_parms) * 2); + + memcpy(params, dcplx_parms, sizeof(dcplx_parms)); + memcpy((uint8_t *)params + sizeof(dcplx_parms), dcplx_parms, + sizeof(dcplx_parms)); + + /* Make duplicate tests that use a standard MR key */ + for (i = 0; i < tests; i++) { + params[tests + i].key = MR_KEY_STD; + params[tests + i].index += 1000; + } + + return cr_make_param_array(struct test_dcplx_parms, params, + tests * 2); +} + +ParameterizedTest(struct test_dcplx_parms *p, atomic, test_dcplx) +{ + struct mem_region mr; + enum fi_datatype dt = FI_DOUBLE_COMPLEX; + uint64_t *rma; + uint64_t *loc; + uint64_t lini = -1; + + rma = _cxit_create_mr(&mr, &p->key); + + loc = calloc(1, RMA_WIN_LEN); + cr_assert_not_null(loc); + + if (p->opmask & _AMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, 0, 0, + rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _FAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + 0, loc, &lini, rma, &p->rini, &p->rexp, + p->key); + } + + if (p->opmask & _CAMO) { + _test_amo(p->index, dt, p->op, p->err, &p->o1, + &p->comp, loc, &lini, rma, &p->rini, + &p->rexp, + p->key); + } + + free(loc); + _cxit_destroy_mr(&mr); +} + +Test(atomic, amo_cleanup) +{ + int ret; + long i; + uint8_t *send_buf; + int win_len = 0x1000; + int writes = 50; + struct mem_region mr; + uint64_t operand1 = 0; + uint64_t key = RMA_WIN_KEY; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + for (i = 0; i < win_len; i++) + send_buf[i] = 0xb1 * i; + + _cxit_create_mr(&mr, &key); + + /* Send 8 bytes from send buffer data to RMA window 0 */ + for (i = 0; i < writes; i++) { + do { + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_tx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + } + + _cxit_destroy_mr(&mr); + + /* Exit without gathering events. */ +} + +/* Perform a batch of AMOs. 
A C_STATE update is required for each transaction + * since each transaction in the batch uses a unique internal request. + */ +Test(atomic, amo_batch) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + int ret; + int i; + uint64_t key = RMA_WIN_KEY; + + _cxit_create_mr(&mr, &key); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + while (fi_cntr_read(cxit_write_cntr) != 4) + ; + + for (i = 0; i < 4; i++) { + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + } + + _cxit_destroy_mr(&mr); +} + +void cxit_setup_amo_selective_completion(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = FI_COMPLETION; + cxit_setup_rma(); +} + +/* Test selective completion behavior with AMOs. */ +Test(atomic_sel, selective_completion, + .init = cxit_setup_amo_selective_completion, + .fini = cxit_teardown_rma) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t compare; + uint64_t result; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc compare_ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + int count = 0; + uint64_t key = RMA_WIN_KEY; + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + result_ioc.addr = &result; + result_ioc.count = 1; + + compare_ioc.addr = &compare; + compare_ioc.count = 1; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Non-fetching AMOs */ + + /* Completion requested by default. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Completion explicitly requested. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion. 
*/ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Inject never generates an event */ + operand1 = 1; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Fetching AMOs */ + count = 0; + + /* Completion requested by default. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + &result, NULL, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Completion explicitly requested. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + /* Completion explicitly requested with inject. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_COMPLETION | FI_INJECT); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion with inject. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_INJECT); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Comp AMOs */ + + /* Completion requested by default. 
*/ + ret = fi_compare_atomic(cxit_ep, &operand1, 1, 0, + &compare, NULL, + &result, NULL, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_CSWAP, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Completion explicitly requested. */ + msg.op = FI_CSWAP; + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Suppress completion. */ + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); +} + +void cxit_setup_amo_selective_completion_suppress(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = 0; + cxit_setup_rma(); +} + +/* Test selective completion behavior with RMA. */ +Test(atomic_sel, selective_completion_suppress, + .init = cxit_setup_amo_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t compare; + uint64_t result; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc compare_ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + int count = 0; + uint64_t key = RMA_WIN_KEY; + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + result_ioc.addr = &result; + result_ioc.count = 1; + + compare_ioc.addr = &compare; + compare_ioc.count = 1; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Non-fetching AMOs */ + + /* Completion suppressed by default. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Completion explicitly requested. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + count++; + + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion. 
*/ + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Inject never generates an event */ + operand1 = 1; + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Fetching AMOs */ + count = 0; + + /* Completion suppressed by default. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, 0, + &result, NULL, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Completion explicitly requested. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Suppress completion. */ + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Comp AMOs */ + + /* Completion suppressed by default. */ + ret = fi_compare_atomic(cxit_ep, &operand1, 1, 0, + &compare, NULL, + &result, NULL, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_CSWAP, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Completion explicitly requested. */ + msg.op = FI_CSWAP; + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Suppress completion. */ + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + /* Completion explicitly requested with inject. 
*/ + msg.op = FI_CSWAP; + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, + FI_COMPLETION | FI_INJECT); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Suppress completion with inject. */ + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, FI_INJECT); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); +} + +/* Test remote counter events with AMOs */ +Test(atomic, rem_cntr) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + int count = 0; + uint64_t key = RMA_WIN_KEY; + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Wait for remote counter event, then check data */ + count++; + + while (fi_cntr_read(cxit_rem_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + operand1 = 3; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Wait for remote counter event, then check data */ + count++; + + while (fi_cntr_read(cxit_rem_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + operand1 = 9; + exp_remote += operand1; + ret = fi_atomic(cxit_ep, &operand1, 1, 0, + cxit_ep_fi_addr, 0, key, + FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Wait for remote counter event, then check data */ + count++; + + while (fi_cntr_read(cxit_rem_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + _cxit_destroy_mr(&mr); +} + +/* Test simple operations: AMO SUM UINT64_T, FAMO SUM UINT64_T, and CAMO SWAP_NE + * UINT64_T. If this doesn't work, nothing else will. + */ +TestSuite(atomic_flush, .init = cxit_setup_rma_disable_fi_rma_event, + .fini = cxit_teardown_rma, .disabled = AMO_DISABLED, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Perform a fetching AMO with flush at target. 
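+ * The flush is verified by sampling the C_CNTR_IXE_DMAWR_FLUSH_REQS counter
+ * before and after the operation and asserting that it increased.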
*/ +Test(atomic_flush, fetch_flush) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t fetch_remote = 4; + uint64_t exp_remote = fetch_remote; + uint64_t *rma; + int ret; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + uint64_t result = 0; + struct fi_ioc result_ioc = { .count = 1, .addr = &result }; + int count = 0; + uint64_t flushes_start; + uint64_t flushes_end; + uint64_t key = RMA_WIN_KEY; + bool enable = false; + + /* If FI_MR_PROV_KEY disable the remote provider key cache */ + fi_control(&cxit_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, + &enable); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + rma = _cxit_create_mr(&mr, &key); + *rma = fetch_remote; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + operand1 = 1; + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + cr_assert_eq(result, fetch_remote, + "Result = %ld, expected = %ld", + result, fetch_remote); + + _cxit_destroy_mr(&mr); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_assert(flushes_end > flushes_start); +} + +/* Perform a fetching AMO with flush at target, but use an illegal + * RMA offset. Verify that an error is returned in the CQE even though + * the subsequent flush succeeds. 
+ */ +Test(atomic_flush, fetch_flush_bounds_err) +{ + struct mem_region mr; + struct fi_cq_err_entry err; + struct fi_cq_tagged_entry cqe; + uint64_t operand1 = 1; + uint64_t result = 0; + uint64_t *rma; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + struct fi_ioc result_ioc = { .count = 1, .addr = &result }; + uint64_t key = RMA_WIN_KEY; + int ret; + bool enable = false; + + /* If FI_MR_PROV_KEY disable the remote provider key cache */ + fi_control(&cxit_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, + &enable); + + rma = _cxit_create_mr(&mr, &key); + cr_assert_not_null(rma); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = RMA_WIN_LEN + 1; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic flush success"); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "fi_cq_readerr error %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value: %d", err.err); + + _cxit_destroy_mr(&mr); +} + +/* Perform an AMO that uses a flushing ZBR at the target. */ +Test(atomic, flush) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + int count = 0; + uint64_t flushes_start; + uint64_t flushes_end; + uint64_t key = RMA_WIN_KEY; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + + rma = _cxit_create_mr(&mr, &key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + operand1 = 1; + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + _cxit_destroy_mr(&mr); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_assert(flushes_end > flushes_start); +} + +/* Test AMO FI_MORE */ +Test(atomic, more) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + int i = 0; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 
1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_MORE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + operand1 = 3; + exp_remote += operand1; + + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* Wait for two events. */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* Validate sent data */ + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + _cxit_destroy_mr(&mr); +} + +/* Test AMO FI_FENCE */ +Test(atomic, fence) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_FENCE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* Validate sent data */ + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + _cxit_destroy_mr(&mr); +} + +void cxit_amo_setup_nofence(void) +{ + cxit_setup_getinfo(); + cxit_fi_hints->caps = CXIP_EP_PRI_CAPS; + cxit_setup_rma(); +} + +/* Test AMO without FI_FENCE */ +Test(atomic_nofence, nofence, + .init = cxit_amo_setup_nofence, + .fini = cxit_teardown_rma) +{ + struct mem_region mr; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + exp_remote += operand1; + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_FENCE); + cr_assert(ret == -FI_EINVAL); + + _cxit_destroy_mr(&mr); +} + +void cxit_setup_amo_opt(void) +{ + cxit_setup_getinfo(); + + /* Explicitly request unordered RMA */ + cxit_fi_hints->caps = FI_ATOMIC; + cxit_fi_hints->tx_attr->msg_order = 0; + + cxit_setup_rma(); +} + 
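+/* The amo_opt suite runs with tx_attr->msg_order cleared (see
+ * cxit_setup_amo_opt() above). Its tests exercise the CXI-specific
+ * FI_CXI_UNRELIABLE and FI_CXI_HRP flags and cross-check the results against
+ * the restricted-packet and HRP ACK hardware counters.
+ */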
+TestSuite(amo_opt, .init = cxit_setup_amo_opt, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test Unreliable/HRP AMOs */ +Test(amo_opt, hrp) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote; + uint64_t *rma; + int ret; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + uint64_t res_start; + uint64_t res_end; + uint64_t hrp_acks_start; + uint64_t hrp_acks_end; + struct cxip_ep *cxi_ep; + uint64_t compare; + uint64_t result; + struct fi_ioc compare_ioc = { .count = 1, .addr = &compare }; + struct fi_ioc result_ioc = { .count = 1, .addr = &result }; + + /* HRP not supported in netsim */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + if (is_netsim(cxi_ep->ep_obj)) + return; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &res_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + ret = cxit_dom_read_cntr(C_CNTR_HNI_HRP_ACK, + &hrp_acks_start, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + rma = _cxit_create_mr(&mr, &key); + exp_remote = 0; + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + + exp_remote += operand1; + + ioc.addr = &operand1; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_UNRELIABLE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* Validate sent data */ + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* HRP requires UNRELIABLE */ + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_HRP); + cr_assert(ret == -FI_EINVAL, "Return code = %d", ret); + + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + exp_remote += operand1; + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* HRP FAMO is invalid */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == -FI_EBADFLAGS, "Return code = %d", ret); + + /* Try unreliable FAMO */ + exp_remote += operand1; + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + /* wait a second to check the operation was performed. The HRP response + * returns before the request hits the NIC. 
+ */ + usleep(1000); + + /* Validate sent data */ + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* HRP compare AMO is invalid */ + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == -FI_EBADFLAGS, "Return code = %d", ret); + + /* Try unreliable compare AMO. */ + msg.op = FI_CSWAP; + compare = exp_remote; + operand1 = exp_remote + 1; + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, NULL, 1, + &result_ioc, NULL, 1, FI_CXI_UNRELIABLE); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + sleep(1); + + /* Validate data */ + cr_assert_eq(*rma, operand1, + "Result = %ld, expected = %ld", + *rma, operand1); + cr_assert_eq(result, exp_remote, + "Result = %ld, expected = %ld", + result, exp_remote); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &res_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + ret = cxit_dom_read_cntr(C_CNTR_HNI_HRP_ACK, + &hrp_acks_end, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + cr_assert_eq(hrp_acks_end - hrp_acks_start, 1, + "unexpected hrp_acks count: %lu\n", + hrp_acks_end - hrp_acks_start); + cr_assert_eq(res_end - res_start, 4, + "unexpected restricted packets count: %lu\n", + res_end - res_start); + + /* HRP does not support Fetching AMOS. */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == -FI_EBADFLAGS, "Return code = %d", ret); + + ret = fi_compare_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + &result_ioc, NULL, 1, + FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == -FI_EBADFLAGS, "Return code = %d", ret); + + _cxit_destroy_mr(&mr); +} + +Test(atomic, std_mr_inject) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + uint64_t operand1; + uint64_t exp_remote = 0; + uint64_t *rma; + int ret; + int count = 0; + int i; + uint64_t win_key = CXIP_PTL_IDX_MR_OPT_CNT; + + rma = _cxit_create_mr(&mr, &win_key); + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + operand1 = 1; + + for (i = 0; i < 10; i++) { + exp_remote += operand1; + ret = fi_inject_atomic(cxit_ep, &operand1, 1, + cxit_ep_fi_addr, 0, win_key, + FI_UINT64, FI_SUM); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + count++; + } + + /* Corrupt the user operand buffer to make sure the NIC is not using it + * for an inject. 
+ */ + operand1 = 0; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + cr_assert_eq(*rma, exp_remote, + "Result = %ld, expected = %ld", + *rma, exp_remote); + + /* Make sure no events were delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + _cxit_destroy_mr(&mr); +} + +/* Test ERRATA-2794 32bit non-fetch AMO with HRP work-around */ +Test(amo_opt, errata_2794) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + union { + uint32_t _32bit; + uint64_t _64bit; + } operand, exp_remote, *rma; + int ret; + uint64_t key = 0xa; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_rma_ioc rma_ioc; + struct cxip_ep *cxi_ep; + + /* HRP not supported in netsim */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + if (is_netsim(cxi_ep->ep_obj)) + return; + + rma = _cxit_create_mr(&mr, &key); + + ioc.addr = &operand; + ioc.count = 1; + + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Use 64-bit to make sure we are using a HRP communication profile */ + exp_remote._64bit = 0; + cr_assert_eq(rma->_64bit, exp_remote._64bit, + "Result = %" PRId64 ", expected = %" PRId64, + rma->_64bit, exp_remote._64bit); + + operand._64bit = 1UL; + exp_remote._64bit += operand._64bit; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* wait a second to check the operation was performed. The HRP response + * returns before the request hits the NIC. Validate data and that + * CQ configured for HRP. + */ + usleep(1000); + cr_assert_eq(rma->_64bit, exp_remote._64bit, + "Result = %" PRId64 ", expected = %" PRId64, + rma->_64bit, exp_remote._64bit); + + /* ERRATA-2794 */ + rma->_32bit = 0; + exp_remote._32bit = 0; + msg.datatype = FI_UINT32; + + operand._32bit = 1; + exp_remote._32bit += operand._32bit; + + ret = fi_atomicmsg(cxit_ep, &msg, FI_CXI_UNRELIABLE | FI_CXI_HRP); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* wait a second to check the operation was performed, and validate + * data. + */ + usleep(1000); + cr_assert_eq(rma->_32bit, exp_remote._32bit, + "Result = %d, expected = %d", + rma->_32bit, exp_remote._32bit); + + /* Perform successive 32-bit unsigned non-fetching atomic, no + * communication profile change would be required. + */ + exp_remote._32bit += operand._32bit; + ret = fi_atomicmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS, "Return code = %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* wait a second to check the operation was performed, and + * validate data. 
+ */ + usleep(1000); + cr_assert_eq(rma->_32bit, exp_remote._32bit, + "Result = %d, expected = %d", + rma->_32bit, exp_remote._32bit); + + _cxit_destroy_mr(&mr); +} + +static void amo_hybrid_mr_desc_test_runner(bool fetching, bool compare, + bool cq_events, bool buf_mr, + bool compare_mr, bool result_mr, + bool mswap, bool read, bool flush) +{ + struct mem_region buf_window; + struct mem_region compare_window; + struct mem_region result_window; + struct mem_region remote_window; + uint64_t remote_key = 0x1; + uint64_t buf_key = 0x2; + uint64_t compare_key = 0x3; + uint64_t result_key = 0x4; + int win_len = 1; + void *buf_desc[1] = {}; + void *compare_desc[1] = {}; + void *result_desc[1] = {}; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_ioc fetch_ioc = {}; + struct fi_ioc compare_ioc = {}; + int ret; + uint64_t cqe_flags = fetching ? FI_ATOMIC | FI_READ : + FI_ATOMIC | FI_WRITE; + struct fid_cntr *cntr = fetching ? cxit_read_cntr : cxit_write_cntr; + struct fi_cq_tagged_entry cqe; + uint64_t amo_flags = cq_events ? FI_COMPLETION : 0; + + if (flush) + amo_flags |= FI_DELIVERY_COMPLETE; + else + amo_flags |= FI_TRANSMIT_COMPLETE; + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &buf_key, + &buf_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &compare_key, + &compare_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &result_key, + &result_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_REMOTE_READ | FI_REMOTE_WRITE, 0x3, + &remote_key, &remote_window); + cr_assert(ret == FI_SUCCESS); + + if (buf_mr) + buf_desc[0] = fi_mr_desc(buf_window.mr); + + if (compare_mr) + compare_desc[0] = fi_mr_desc(compare_window.mr); + + if (result_mr) + result_desc[0] = fi_mr_desc(result_window.mr); + + ioc.addr = buf_window.mem; + ioc.count = 1; + + rma_ioc.count = 1; + rma_ioc.key = remote_key; + + msg.msg_iov = &ioc; + msg.desc = buf_desc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + + if (!compare) { + msg.datatype = FI_UINT8; + + if (fetching && read) + msg.op = FI_ATOMIC_READ; + else + msg.op = FI_SUM; + + *buf_window.mem = 1; + *result_window.mem = 0; + *remote_window.mem = 1; + + if (fetching) { + fetch_ioc.addr = result_window.mem; + fetch_ioc.count = 1; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, + result_desc, 1, amo_flags); + cr_assert(ret == FI_SUCCESS); + } else { + ret = fi_atomicmsg(cxit_ep, &msg, amo_flags); + cr_assert(ret == FI_SUCCESS); + } + + while (1) { + ret = fi_cntr_wait(cntr, 1, 1000); + if (ret == FI_SUCCESS) + break; + } + + if (!read) + cr_assert_eq(*remote_window.mem, 2, + "Data mismatch: expected=2 got=%d\n", + *remote_window.mem); + + if (fetching) + cr_assert_eq(*result_window.mem, 1, + "Data mismatch: expected=1 got=%d\n", + *result_window.mem); + } else if (mswap) { + msg.datatype = FI_UINT8; + msg.op = FI_MSWAP; + + compare_ioc.addr = compare_window.mem; + compare_ioc.count = 1; + + fetch_ioc.addr = result_window.mem; + fetch_ioc.count = 1; + + *buf_window.mem = 0xA0; + *compare_window.mem = 0xB; + *result_window.mem = 1; + *remote_window.mem = 0xF; + + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, + compare_desc, 1, &fetch_ioc, + result_desc, 1, amo_flags); + cr_assert_eq(ret, FI_SUCCESS, "Bad rc=%d\n", ret); + + while (1) { + ret = fi_cntr_wait(cntr, 1, 1000); + if (ret == FI_SUCCESS) + break; + } + + 
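+		/* Expected FI_MSWAP results (comment added for clarity),
+		 * following the fi_atomic(3) mask-swap definition
+		 * dest = (buf & compare) | (dest & ~compare): the remote byte
+		 * becomes (0xA0 & 0x0B) | (0x0F & ~0x0B) = 0x04, and the fetch
+		 * buffer receives the prior remote value 0x0F.
+		 */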
cr_assert_eq(*remote_window.mem, 4, + "Data mismatch: expected=4 got=%d\n", + *remote_window.mem); + + cr_assert_eq(*result_window.mem, 0xF, + "Data mismatch: expected=0xF got=%d\n", + *result_window.mem); + } else { + msg.datatype = FI_UINT8; + msg.op = FI_CSWAP; + + compare_ioc.addr = compare_window.mem; + compare_ioc.count = 1; + + fetch_ioc.addr = result_window.mem; + fetch_ioc.count = 1; + + *buf_window.mem = 3; + *compare_window.mem = 1; + *result_window.mem = 0; + *remote_window.mem = 1; + + ret = fi_compare_atomicmsg(cxit_ep, &msg, &compare_ioc, + compare_desc, 1, &fetch_ioc, + result_desc, 1, amo_flags); + cr_assert_eq(ret, FI_SUCCESS, "Bad rc=%d\n", ret); + + while (1) { + ret = fi_cntr_wait(cntr, 1, 1000); + if (ret == FI_SUCCESS) + break; + } + + cr_assert_eq(*remote_window.mem, 3, + "Data mismatch: expected=3 got=%d\n", + *remote_window.mem); + + cr_assert_eq(*result_window.mem, 1, + "Data mismatch: expected=1 got=%d\n", + *result_window.mem); + } + + if (cq_events) { + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, cqe_flags, NULL); + } + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&remote_window); + mr_destroy(&result_window); + mr_destroy(&compare_window); + mr_destroy(&buf_window); +} + +TestSuite(amo_hybrid_mr_desc, .init = cxit_setup_rma_hybrid_mr_desc, + .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(amo_hybrid_mr_desc, non_fetching_no_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(false, false, false, false, false, + false, false, false, false); +} + +Test(amo_hybrid_mr_desc, non_fetching_buf_result_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(false, false, false, true, false, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, fetching_no_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, fetching_buf_result_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, true, false, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, non_fetching_no_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(false, false, true, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, non_fetching_buf_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(false, false, true, true, false, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, fetching_no_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, fetching_buf_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, true, false, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_no_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_buf_compare_result_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, true, true, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_no_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, false, false, false, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_buf_compare_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, true, true, true, + false, false, false); +} + +Test(amo_hybrid_mr_desc, compare_mswap_buf_compare_result_mr_desc_no_cqe) +{ + 
amo_hybrid_mr_desc_test_runner(true, true, false, true, true, true, + true, false, false); +} + +Test(amo_hybrid_mr_desc, compare_mswap_buf_compare_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, true, true, true, + true, false, false); +} + +Test(amo_hybrid_mr_desc, read_buf_result_mr_desc_no_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, true, false, true, + false, true, false); +} + +Test(amo_hybrid_mr_desc, read_buf_result_mr_desc_cqe) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, true, false, true, + false, true, false); +} + +Test(amo_hybrid_mr_desc, non_fetching_no_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(false, false, false, false, false, + false, false, false, true); +} + +Test(amo_hybrid_mr_desc, non_fetching_buf_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(false, false, false, true, false, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, fetching_no_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, fetching_buf_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, true, false, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, non_fetching_no_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(false, false, true, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, non_fetching_buf_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(false, false, true, true, false, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, fetching_no_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, fetching_buf_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, true, false, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_no_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_buf_compare_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, true, true, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_no_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, false, false, false, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_buf_compare_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, true, true, true, + false, false, true); +} + +Test(amo_hybrid_mr_desc, compare_mswap_buf_compare_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, false, true, true, true, + true, false, true); +} + +Test(amo_hybrid_mr_desc, compare_mswap_buf_compare_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, true, true, true, true, true, + true, false, true); +} + +Test(amo_hybrid_mr_desc, read_buf_result_mr_desc_no_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, false, true, false, true, + false, true, true); +} + +Test(amo_hybrid_mr_desc, read_buf_result_mr_desc_cqe_flush) +{ + amo_hybrid_mr_desc_test_runner(true, false, true, true, false, true, + false, true, true); +} + +Test(amo_hybrid_mr_desc, fetching_amo_failure) +{ + struct mem_region buf_window; + struct mem_region result_window; + uint64_t remote_key = 0x1; + uint64_t buf_key = 0x2; + uint64_t result_key = 0x4; + int win_len = 1; + void *buf_desc[1] = {}; + void 
*result_desc[1] = {}; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_ioc fetch_ioc = {}; + int ret; + struct fid_cntr *cntr = cxit_read_cntr; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry cq_err; + uint64_t amo_flags = FI_TRANSMIT_COMPLETE; + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &buf_key, + &buf_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &result_key, + &result_window); + cr_assert(ret == FI_SUCCESS); + + buf_desc[0] = fi_mr_desc(buf_window.mr); + result_desc[0] = fi_mr_desc(result_window.mr); + + ioc.addr = buf_window.mem; + ioc.count = 1; + + rma_ioc.count = 1; + rma_ioc.key = remote_key; + + msg.msg_iov = &ioc; + msg.desc = buf_desc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.datatype = FI_UINT8; + msg.op = FI_SUM; + + fetch_ioc.addr = result_window.mem; + fetch_ioc.count = 1; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, result_desc, 1, + amo_flags); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cntr) != 1) + ; + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == -FI_EAVAIL); + + ret = fi_cq_readerr(cxit_tx_cq, &cq_err, 0); + cr_assert(ret == 1); + + cr_assert(cq_err.flags == (FI_ATOMIC | FI_READ)); + cr_assert(cq_err.op_context == NULL); + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&result_window); + mr_destroy(&buf_window); +} + +Test(amo_hybrid_mr_desc, amo_failure) +{ + struct mem_region buf_window; + uint64_t remote_key = 0x1; + uint64_t buf_key = 0x2; + int win_len = 1; + void *buf_desc[1] = {}; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + int ret; + struct fid_cntr *cntr = cxit_write_cntr; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry cq_err; + uint64_t amo_flags = FI_TRANSMIT_COMPLETE; + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &buf_key, + &buf_window); + cr_assert(ret == FI_SUCCESS); + + buf_desc[0] = fi_mr_desc(buf_window.mr); + + ioc.addr = buf_window.mem; + ioc.count = 1; + + rma_ioc.count = 1; + rma_ioc.key = remote_key; + + msg.msg_iov = &ioc; + msg.desc = buf_desc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.datatype = FI_UINT8; + msg.op = FI_SUM; + + ret = fi_atomicmsg(cxit_ep, &msg, amo_flags); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cntr) != 1) + ; + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == -FI_EAVAIL); + + ret = fi_cq_readerr(cxit_tx_cq, &cq_err, 0); + cr_assert(ret == 1); + + cr_assert(cq_err.flags == (FI_ATOMIC | FI_WRITE)); + cr_assert(cq_err.op_context == NULL); + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&buf_window); +} + +Test(amo_hybrid_mr_desc, invalid_addr_fetching_amo_failure) +{ + struct mem_region buf_window; + struct mem_region result_window; + uint64_t remote_key = 0x1; + uint64_t result_key = 0x4; + int win_len = 1; + void *buf_desc[1] = {}; + void *result_desc[1] = {}; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_ioc fetch_ioc = {}; + int ret; + struct fid_cntr *cntr = cxit_read_cntr; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry cq_err; + uint64_t amo_flags = FI_TRANSMIT_COMPLETE; + + ret = mr_create(win_len, 
FI_REMOTE_READ | FI_REMOTE_WRITE, + 0xa, &remote_key, &buf_window); + cr_assert(ret == FI_SUCCESS); + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &result_key, + &result_window); + cr_assert(ret == FI_SUCCESS); + + buf_desc[0] = fi_mr_desc(buf_window.mr); + result_desc[0] = fi_mr_desc(result_window.mr); + + ioc.addr = buf_window.mem; + ioc.count = 1; + + rma_ioc.count = 1; + rma_ioc.key = remote_key; + + msg.msg_iov = &ioc; + msg.desc = buf_desc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.datatype = FI_UINT8; + msg.op = FI_SUM; + + fetch_ioc.addr = result_window.mem + 0xffffffffff; + fetch_ioc.count = 1; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, result_desc, 1, + amo_flags); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cntr) != 1) + ; + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == -FI_EAVAIL); + + ret = fi_cq_readerr(cxit_tx_cq, &cq_err, 0); + cr_assert(ret == 1); + + cr_assert(cq_err.flags == (FI_ATOMIC | FI_READ)); + cr_assert(cq_err.op_context == NULL); + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&result_window); + mr_destroy(&buf_window); +} + +struct fi_query_atomic_test { + enum fi_datatype datatype; + enum fi_op op; + bool valid_atomic_attr; + uint64_t flags; + int expected_rc; + int amo_remap_to_pcie_fadd; +}; + +ParameterizedTestParameters(atomic, query_atomic) +{ + static struct fi_query_atomic_test params[] = { + /* NULL atomic attributes. */ + { + .datatype = FI_INT8, + .op = FI_MIN, + .valid_atomic_attr = false, + .flags = 0, + .expected_rc = -FI_EINVAL, + }, + /* Bad dataype. */ + { + .datatype = 0xffff, + .op = FI_MIN, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EINVAL, + }, + /* Bad op. */ + { + .datatype = FI_INT8, + .op = 0xffff, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EINVAL, + }, + /* Bad flags. */ + { + .datatype = FI_INT8, + .op = FI_MIN, + .valid_atomic_attr = true, + .flags = FI_COMPARE_ATOMIC | FI_FETCH_ATOMIC, + .expected_rc = -FI_EINVAL, + }, + /* Valid SUM FI_INT8. */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = FI_SUCCESS, + }, + /* Valid SUM FI_INT8 fetching. */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + } + }; + size_t param_sz = ARRAY_SIZE(params); + + return cr_make_param_array(struct fi_query_atomic_test, params, + param_sz); +} + +ParameterizedTest(struct fi_query_atomic_test *params, atomic, query_atomic) +{ + int ret; + struct fi_atomic_attr atomic_attr; + struct fi_atomic_attr *attr = + params->valid_atomic_attr ? &atomic_attr : NULL; + + ret = fi_query_atomic(cxit_domain, params->datatype, params->op, attr, + params->flags); + + cr_assert_eq(ret, params->expected_rc, + "Unexpected fi_query_atomic() rc: expected=%d got=%d\n", + params->expected_rc, ret); +} + +TestSuite(pcie_atomic, .init = reset_amo_remap_to_pcie_fadd, + .fini = reset_amo_remap_to_pcie_fadd, + .timeout = CXIT_DEFAULT_TIMEOUT); + +ParameterizedTestParameters(pcie_atomic, query_atomic) +{ + static struct fi_query_atomic_test params[] = { + /* Valid SUM FI_INT8. */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = -1, + }, + + /* Invalid PCIe SUM FI_INT8. Only 32 and 64 bit operations are + * supported. 
+ */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = -1, + }, + + /* Valid SUM FI_INT32. */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = -1, + }, + + /* Invalid PCIe SUM FI_INT32 due to amo_remap_to_pcie_fadd being + * -1. + */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = -1, + }, + + /* Invalid PCIe SUM FI_INT32 due to missing FI_FETCH_ATOMIC. + */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_INT32 since FI_COMPARE_ATOMIC is invalid. + */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_COMPARE_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid PCIe SUM FI_INT32 remapping C_AMO_OP_MIN. */ + { + .datatype = FI_INT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid PCIe SUM FI_UINT32 remapping C_AMO_OP_MIN. */ + { + .datatype = FI_UINT32, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid PCIe SUM FI_INT64 remapping C_AMO_OP_MIN. */ + { + .datatype = FI_INT64, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid PCIe SUM FI_UINT64 remapping C_AMO_OP_MIN. */ + { + .datatype = FI_UINT64, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = FI_SUCCESS, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_INT8. */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_UINT8. */ + { + .datatype = FI_UINT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_INT16. */ + { + .datatype = FI_INT16, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_UINT16. */ + { + .datatype = FI_UINT16, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_FLOAT. */ + { + .datatype = FI_FLOAT, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_DOUBLE. 
*/ + { + .datatype = FI_DOUBLE, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_FLOAT_COMPLEX. */ + { + .datatype = FI_FLOAT_COMPLEX, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_DOUBLE_COMPLEX. */ + { + .datatype = FI_DOUBLE_COMPLEX, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_LONG_DOUBLE. */ + { + .datatype = FI_LONG_DOUBLE, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid PCIe SUM FI_LONG_DOUBLE_COMPLEX. */ + { + .datatype = FI_LONG_DOUBLE_COMPLEX, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = FI_CXI_PCIE_AMO | FI_FETCH_ATOMIC, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid FI_MIN operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_MIN, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Invalid FI_MAX operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_MAX, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_MAX, + }, + + /* Invalid FI_SUM operation without PCIe AMO since it is + * remapped. + */ + { + .datatype = FI_INT8, + .op = FI_SUM, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_SUM, + }, + + /* Invalid FI_LOR operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_LOR, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_LOR, + }, + + /* Invalid FI_LAND operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_LAND, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_LAND, + }, + + /* Invalid FI_BOR operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_BOR, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_BOR, + }, + + /* Invalid FI_BAND operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_BAND, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_BAND, + }, + + /* Invalid FI_LXOR operation since it is remapped. */ + { + .datatype = FI_INT8, + .op = FI_LXOR, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_LXOR, + }, + + /* Invalid FI_BXOR operation since it is remapped. 
*/ + { + .datatype = FI_INT8, + .op = FI_BXOR, + .valid_atomic_attr = true, + .flags = 0, + .expected_rc = -FI_EOPNOTSUPP, + .amo_remap_to_pcie_fadd = C_AMO_OP_BXOR, + }, + }; + size_t param_sz = ARRAY_SIZE(params); + + return cr_make_param_array(struct fi_query_atomic_test, params, + param_sz); +} + +ParameterizedTest(struct fi_query_atomic_test *params, pcie_atomic, + query_atomic) +{ + int ret; + struct fi_atomic_attr atomic_attr; + struct fi_atomic_attr *attr = + params->valid_atomic_attr ? &atomic_attr : NULL; + + /* The AMO remap value must be set before the libfabric domain is allocated. + * Otherwise, an inconsistent view of the AMO remap value will be read. + */ + set_amo_remap_to_pcie_fadd(params->amo_remap_to_pcie_fadd); + cxit_setup_rma(); + + ret = fi_query_atomic(cxit_domain, params->datatype, params->op, attr, + params->flags); + + cr_assert_eq(ret, params->expected_rc, + "Unexpected fi_query_atomic() rc: expected=%d got=%d\n", + params->expected_rc, ret); + + cxit_teardown_rma(); +} + +struct fi_pcie_fadd_test { + enum fi_datatype dt; + union { + uint64_t u64_src; + int64_t s64_src; + uint32_t u32_src; + int32_t s32_src; + } src; + union { + uint64_t u64_dst; + int64_t s64_dst; + uint32_t u32_dst; + int32_t s32_dst; + } dst; + union { + uint64_t u64_result; + int64_t s64_result; + uint32_t u32_result; + int32_t s32_result; + } result; + int amo_remap_to_pcie_fadd; +}; + +ParameterizedTestParameters(pcie_atomic, fadd) +{ + static struct fi_pcie_fadd_test params[] = { + /* Integer overflow. */ + { + .dt = FI_INT32, + .src.s32_src = 2147483647, + .dst.s32_dst = 1, + .result.s32_result = -2147483648, + .amo_remap_to_pcie_fadd = C_AMO_OP_SWAP, + }, + + /* Unsigned integer overflow. */ + { + .dt = FI_UINT32, + .src.u32_src = 0xFFFFFFFF, + .dst.u32_dst = 1, + .result.u32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_SWAP, + }, + + /* Long overflow. */ + { + .dt = FI_INT64, + .src.s64_src = 9223372036854775807, + .dst.s64_dst = 1, + .result.u64_result = 0x8000000000000000, + .amo_remap_to_pcie_fadd = C_AMO_OP_SWAP, + }, + + /* Unsigned long overflow. */ + { + .dt = FI_UINT64, + .src.u64_src = 0xFFFFFFFFFFFFFFFF, + .dst.u64_dst = 1, + .result.u64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_SWAP, + }, + + /* Valid 32-bit AMO with C_AMO_OP_MIN remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid 64-bit AMO with C_AMO_OP_MIN remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_MIN, + }, + + /* Valid 32-bit AMO with C_AMO_OP_MAX remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_MAX, + }, + + /* Valid 64-bit AMO with C_AMO_OP_MAX remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_MAX, + }, + + /* Valid 32-bit AMO with C_AMO_OP_SUM remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_SUM, + }, + + /* Valid 64-bit AMO with C_AMO_OP_SUM remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_SUM, + }, + + /* Valid 32-bit AMO with C_AMO_OP_LOR remapped. 
*/ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LOR, + }, + + /* Valid 64-bit AMO with C_AMO_OP_LOR remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LOR, + }, + + /* Valid 32-bit AMO with C_AMO_OP_LAND remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LAND, + }, + + /* Valid 64-bit AMO with C_AMO_OP_LAND remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LAND, + }, + + /* Valid 32-bit AMO with C_AMO_OP_BOR remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BOR, + }, + + /* Valid 64-bit AMO with C_AMO_OP_BOR remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BOR, + }, + + /* Valid 32-bit AMO with C_AMO_OP_BAND remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BAND, + }, + + /* Valid 64-bit AMO with C_AMO_OP_BAND remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BAND, + }, + + /* Valid 32-bit AMO with C_AMO_OP_LXOR remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LXOR, + }, + + /* Valid 64-bit AMO with C_AMO_OP_LXOR remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_LXOR, + }, + + /* Valid 32-bit AMO with C_AMO_OP_BXOR remapped. */ + { + .dt = FI_INT32, + .src.s32_src = -1, + .dst.s32_dst = 1, + .result.s32_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BXOR, + }, + + /* Valid 64-bit AMO with C_AMO_OP_BXOR remapped. */ + { + .dt = FI_INT64, + .src.s64_src = -4294967296, + .dst.s64_dst = 4294967296, + .result.s64_result = 0, + .amo_remap_to_pcie_fadd = C_AMO_OP_BXOR, + }, + }; + size_t param_sz = ARRAY_SIZE(params); + + return cr_make_param_array(struct fi_pcie_fadd_test, params, + param_sz); +} + +ParameterizedTest(struct fi_pcie_fadd_test *params, pcie_atomic, fadd) +{ + int ret; + size_t amo_size; + uint64_t rkey = 0x1; + uint64_t nic_rkey = 0x2; + struct mem_region remote_window; + struct mem_region nic_remote_window; + union { + uint64_t u64_fetch; + int64_t s64_fetch; + uint32_t u32_fetch; + int32_t s32_fetch; + } fetch; + union { + uint64_t u64_fetch; + int64_t s64_fetch; + uint32_t u32_fetch; + int32_t s32_fetch; + } nic_fetch; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_ioc fetch_ioc = {}; + struct fi_cq_tagged_entry cqe; + uint64_t cur_cpu_fetch_cntr; + uint64_t new_cpu_fetch_cntr; + struct cxip_ep *cxi_ep; + + if (params->dt == FI_INT32 || params->dt == FI_UINT32) + amo_size = 4; + else + amo_size = 8; + + /* The AMO remap value must be set before the libfabric domain is allocated. + * Otherwise, an inconsistent view of the AMO remap value will be read. 
+ */ + set_amo_remap_to_pcie_fadd(params->amo_remap_to_pcie_fadd); + cxit_setup_rma(); + + /* PCIe AMOs not supported on netsim. */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + if (is_netsim(cxi_ep->ep_obj)) + goto teardown; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_CPU_FTCH_AMO_REQS, + &cur_cpu_fetch_cntr, NULL, true); + cr_assert(ret == 0); + + /* Create target MR and copy destination contents into it. */ + ret = mr_create(amo_size, FI_REMOTE_READ | FI_REMOTE_WRITE, 0, &rkey, + &remote_window); + cr_assert(ret == FI_SUCCESS); + memcpy(remote_window.mem, &params->dst, amo_size); + + /* Create another target MR to be used for NIC AMO SUM comparison to the + * PCIe AMO. + */ + ret = mr_create(amo_size, FI_REMOTE_READ | FI_REMOTE_WRITE, 0, + &nic_rkey, &nic_remote_window); + cr_assert(ret == FI_SUCCESS); + memcpy(nic_remote_window.mem, &params->dst, amo_size); + + /* Fill in fetching AMO descriptors. */ + ioc.addr = &params->src; + ioc.count = 1; + + rma_ioc.key = rkey; + rma_ioc.count = 1; + + fetch_ioc.addr = &fetch; + fetch_ioc.count = 1; + + msg.datatype = params->dt; + msg.op = FI_SUM; + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + + /* Issue PCIe fetch add. */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, NULL, 1, + FI_TRANSMIT_COMPLETE | FI_COMPLETION | + FI_CXI_PCIE_AMO); + cr_assert(ret == FI_SUCCESS); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Issue NIC fetching SUM AMO. */ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + rma_ioc.key = nic_rkey; + fetch_ioc.addr = &nic_fetch; + + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &fetch_ioc, NULL, 1, + FI_TRANSMIT_COMPLETE | FI_COMPLETION); + cr_assert(ret == FI_SUCCESS); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + } + + if (params->dt == FI_INT32) { + /* Compare PCIe FADD to the expected values. */ + cr_assert_eq(*((int32_t *)remote_window.mem), + params->result.s32_result, + "Unexpected remote AMO result: got=%d expected=%d\n", + *((int32_t *)remote_window.mem), + params->result.s32_result); + cr_assert_eq(fetch.s32_fetch, params->dst.s32_dst, + "Unexpected fetch AMO result: got=%d expected=%d\n", + fetch.s32_fetch, params->dst.s32_dst); + + /* Compare PCIe FADD to the NIC fetch add/sum values. */ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + cr_assert_eq(*((int32_t *)remote_window.mem), + *((int32_t *)nic_remote_window.mem), + "Unexpected remote AMO result: got=%d expected=%d\n", + *((int32_t *)remote_window.mem), + *((int32_t *)nic_remote_window.mem)); + cr_assert_eq(fetch.s32_fetch, nic_fetch.s32_fetch, + "Unexpected fetch AMO result: got=%d expected=%d\n", + fetch.s32_fetch, nic_fetch.s32_fetch); + } + } else if (params->dt == FI_UINT32) { + /* Compare PCIe FADD to the expected values. */ + cr_assert_eq(*((uint32_t *)remote_window.mem), + params->result.u32_result, + "Unexpected remote AMO result: got=%u expected=%u\n", + *((uint32_t *)remote_window.mem), + params->result.u32_result); + cr_assert_eq(fetch.u32_fetch, params->dst.u32_dst, + "Unexpected fetch AMO result: got=%u expected=%u\n", + fetch.u32_fetch, params->dst.u32_dst); + + /* Compare PCIe FADD to the NIC fetch add/sum values. 
*/ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + cr_assert_eq(*((uint32_t *)remote_window.mem), + *((uint32_t *)nic_remote_window.mem), + "Unexpected remote AMO result: got=%u expected=%u\n", + *((uint32_t *)remote_window.mem), + *((uint32_t *)nic_remote_window.mem)); + cr_assert_eq(fetch.u32_fetch, nic_fetch.u32_fetch, + "Unexpected fetch AMO result: got=%u expected=%u\n", + fetch.u32_fetch, nic_fetch.u32_fetch); + } + } else if (params->dt == FI_INT64) { + /* Compare PCIe FADD to the expected values. */ + cr_assert_eq(*((int64_t *)remote_window.mem), + params->result.s64_result, + "Unexpected remote AMO result: got=%ld expected=%ld\n", + *((int64_t *)remote_window.mem), + params->result.s64_result); + cr_assert_eq(fetch.s64_fetch, params->dst.s64_dst, + "Unexpected fetch AMO result: got=%ld expected=%ld\n", + fetch.s64_fetch, params->dst.s64_dst); + + /* Compare PCIe FADD to the NIC fetch add/sum values. */ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + cr_assert_eq(*((int64_t *)remote_window.mem), + *((int64_t *)nic_remote_window.mem), + "Unexpected remote AMO result: got=%ld expected=%ld\n", + *((int64_t *)remote_window.mem), + *((int64_t *)nic_remote_window.mem)); + cr_assert_eq(fetch.s64_fetch, nic_fetch.s64_fetch, + "Unexpected fetch AMO result: got=%ld expected=%ld\n", + fetch.s64_fetch, nic_fetch.s64_fetch); + } + } else { + /* Compare PCIe FADD to the expected values. */ + cr_assert_eq(*((uint64_t *)remote_window.mem), + params->result.u64_result, + "Unexpected remote AMO result: got=%lu expected=%lu\n", + *((uint64_t *)remote_window.mem), + params->result.u64_result); + cr_assert_eq(fetch.u64_fetch, params->dst.u64_dst, + "Unexpected fetch AMO result: got=%lu expected=%lu\n", + fetch.u64_fetch, params->dst.u64_dst); + + /* Compare PCIe FADD to the NIC fetch add/sum values. */ + if (params->amo_remap_to_pcie_fadd != C_AMO_OP_SUM) { + cr_assert_eq(*((uint64_t *)remote_window.mem), + *((uint64_t *)nic_remote_window.mem), + "Unexpected remote AMO result: got=%lu expected=%lu\n", + *((uint64_t *)remote_window.mem), + *((uint64_t *)nic_remote_window.mem)); + cr_assert_eq(fetch.u64_fetch, nic_fetch.u64_fetch, + "Unexpected fetch AMO result: got=%lu expected=%lu\n", + fetch.u64_fetch, nic_fetch.u64_fetch); + } + } + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_CPU_FTCH_AMO_REQS, + &new_cpu_fetch_cntr, NULL, true); + cr_assert(ret == 0); + + cr_assert(cur_cpu_fetch_cntr + 1 == new_cpu_fetch_cntr); + + mr_destroy(&nic_remote_window); + mr_destroy(&remote_window); + +teardown: + cxit_teardown_rma(); +} diff --git a/prov/cxi/test/auth_key.c b/prov/cxi/test/auth_key.c new file mode 100644 index 00000000000..57c30d37a15 --- /dev/null +++ b/prov/cxi/test/auth_key.c @@ -0,0 +1,2940 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + */ +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +void *memdup(const void *src, size_t n) +{ + void *dest; + + dest = malloc(n); + if (dest == NULL) + return NULL; + + return memcpy(dest, src, n); +} + +TestSuite(auth_key, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test fi_getinfo() verification of hints argument. 
*/ +Test(auth_key, invalid_auth_key_size_domain_attr_hints) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key_size = 1; + hints->domain_attr->auth_key = memdup(&auth_key, 1); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, missing_auth_key_size_domain_attr_hints) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key = memdup(&auth_key, 1); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, invalid_auth_key_size_ep_attr_hints) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->ep_attr->auth_key_size = 1; + hints->ep_attr->auth_key = memdup(&auth_key, 1); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, missing_auth_key_size_ep_attr_hints) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->ep_attr->auth_key = memdup(&auth_key, 1); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Verify fi_getinfo() correctly echoes back a valid auth_key hint using the + * default svc_id. 
+ */ +Test(auth_key, valid_default_domain_auth_key_hint) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_not_null(info->domain_attr->auth_key, "NULL domain auth_key"); + cr_assert_eq(hints->domain_attr->auth_key_size, + info->domain_attr->auth_key_size, + "fi_getinfo returned auth_key_size does not match hints"); + + ret = memcmp(hints->domain_attr->auth_key, info->domain_attr->auth_key, + hints->domain_attr->auth_key_size); + cr_assert_eq(ret, 0, "fi_getinfo returned auth_key does not match hints"); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +/* Verify fi_getinfo() correctly echos back a valid auth_key hint using the + * default svc_id. + */ +Test(auth_key, valid_default_ep_auth_key_hint) +{ + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->ep_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->ep_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_not_null(info->ep_attr->auth_key, "NULL ep auth_key"); + cr_assert_eq(hints->ep_attr->auth_key_size, + info->ep_attr->auth_key_size, + "fi_getinfo returned auth_key_size does not match hints"); + + ret = memcmp(hints->ep_attr->auth_key, info->ep_attr->auth_key, + hints->ep_attr->auth_key_size); + cr_assert_eq(ret, 0, "fi_getinfo returned auth_key does not match hints"); + + /* Since hints domain auth_key is NULL, CXI provider should echo the + * hints ep auth_key into info domain auth_key. This is the behavior + * some MPICH versions expect. + */ + cr_assert_not_null(info->domain_attr->auth_key, "NULL domain auth_key"); + cr_assert_eq(hints->ep_attr->auth_key_size, + info->domain_attr->auth_key_size, + "fi_getinfo returned auth_key_size does not match hints"); + + ret = memcmp(hints->ep_attr->auth_key, info->domain_attr->auth_key, + hints->ep_attr->auth_key_size); + cr_assert_eq(ret, 0, "fi_getinfo returned auth_key does not match hints"); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +/* Verify fi_getinfo() rejects a svc_id which has not been allocated thus making + * the auth_key invalid. 
+ */ +Test(auth_key, invalid_user_defined_domain_svc_id_hint) +{ + struct cxi_auth_key auth_key = { + .svc_id = 0xffff, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Verify fi_getinfo() rejects a svc_id which has not been allocated thus making + * the auth_key invalid. + */ +Test(auth_key, invalid_user_defined_ep_svc_id_hint) +{ + struct cxi_auth_key auth_key = { + .svc_id = 0xffff, + .vni = 1, + }; + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->ep_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->ep_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +/* Verify fi_domain() rejects an invalid auth_key. */ +Test(auth_key, invalid_user_defined_domain_svc_id) +{ + struct cxi_auth_key auth_key = { + .svc_id = 0xffff, + .vni = 1, + }; + int ret; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + /* Override auth_key with bad auth_key. */ + if (info->domain_attr->auth_key) + free(info->domain_attr->auth_key); + info->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + info->domain_attr->auth_key_size = sizeof(auth_key); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_domain failed: %d", ret); + + fi_close(&fab->fid); + fi_freeinfo(info); +} + +/* Verify fi_endpoint() rejects an invalid auth_key. */ +Test(auth_key, invalid_user_defined_ep_svc_id) +{ + struct cxi_auth_key auth_key = { + .svc_id = 0xffff, + .vni = 1, + }; + int ret; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_ep *ep; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + /* Override auth_key with bad auth_key. 
*/ + if (info->domain_attr->auth_key) + free(info->domain_attr->auth_key); + info->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + info->domain_attr->auth_key_size = sizeof(auth_key); + + ret = fi_endpoint(dom, info, &ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_endpoint failed: %d", ret); + + fi_close(&dom->fid); + fi_close(&fab->fid); + fi_freeinfo(info); +} + +/* Valid service ID but invalid VNI for the service ID. */ +Test(auth_key, valid_user_defined_svc_id_invalid_vni_hints) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + uint16_t valid_vni = 0x120; + struct fi_info *hints; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 0x123, + }; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = valid_vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + auth_key.svc_id = svc_desc.svc_id; + hints->ep_attr->auth_key_size = sizeof(auth_key); + hints->ep_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->ep_attr->auth_key, "memdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +/* Valid service ID but invalid VNI for the service ID. */ +Test(auth_key, valid_user_defined_svc_id_invalid_vni_dom_attr) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + uint16_t valid_vni = 0x120; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 0x123, + }; + struct fid_fabric *fab; + struct fid_domain *dom; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = valid_vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + /* Override auth_key with bad auth_key. 
*/ + auth_key.svc_id = svc_desc.svc_id; + + if (info->domain_attr->auth_key) + free(info->domain_attr->auth_key); + info->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + info->domain_attr->auth_key_size = sizeof(auth_key); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_domain failed: %d", ret); + + fi_close(&fab->fid); + fi_freeinfo(info); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +/* Valid service ID but invalid VNI for the service ID. */ +Test(auth_key, valid_user_defined_svc_id_invalid_vni_ep_attr) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + uint16_t valid_vni = 0x120; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 0x123, + }; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_ep *ep; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = valid_vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + /* Override auth_key with bad auth_key. 
 */
+	auth_key.svc_id = svc_desc.svc_id;
+
+	if (info->domain_attr->auth_key)
+		free(info->domain_attr->auth_key);
+	info->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key));
+	info->domain_attr->auth_key_size = sizeof(auth_key);
+
+	ret = fi_endpoint(dom, info, &ep, NULL);
+	cr_assert_eq(ret, -FI_EINVAL, "fi_endpoint failed: %d", ret);
+
+	fi_close(&dom->fid);
+	fi_close(&fab->fid);
+	fi_freeinfo(info);
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret);
+	cxil_close_device(dev);
+}
+
+static void alloc_endpoint(struct fi_info *info, struct fid_fabric **fab,
+		struct fid_domain **dom, struct fid_av **av,
+		struct fid_cq **cq, struct fid_ep **ep)
+{
+	int ret;
+	struct fi_cq_attr cq_attr = {
+		.format = FI_CQ_FORMAT_TAGGED,
+	};
+	struct fi_av_attr av_attr = {};
+
+	ret = fi_fabric(info->fabric_attr, fab, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret);
+
+	ret = fi_domain(*fab, info, dom, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret);
+
+	ret = fi_cq_open(*dom, &cq_attr, cq, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open failed: %d", ret);
+
+	ret = fi_av_open(*dom, &av_attr, av, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret);
+
+	ret = fi_endpoint(*dom, info, ep, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_endpoint failed: %d", ret);
+
+	ret = fi_ep_bind(*ep, &(*av)->fid, 0);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind failed: %d", ret);
+
+	ret = fi_ep_bind(*ep, &(*cq)->fid, FI_TRANSMIT | FI_RECV);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind failed: %d", ret);
+
+	ret = fi_enable(*ep);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_enable failed: %d", ret);
+}
+
+Test(auth_key, valid_user_defined_svc_id_valid_vni_verify_vni_enforcement)
+{
+	int ret;
+	struct cxil_dev *dev;
+	struct cxi_svc_fail_info fail_info = {};
+	struct cxi_svc_desc svc_desc = {};
+	struct fi_info *hints;
+	struct fi_info *default_info;
+	struct fi_info *user_info;
+	struct cxi_auth_key auth_key = {};
+	uint16_t valid_vni = 0x1234;
+	struct fid_fabric *default_fab;
+	struct fid_domain *default_dom;
+	struct fid_av *default_av;
+	struct fid_cq *default_cq;
+	struct fid_ep *default_ep;
+	struct fid_fabric *user_fab;
+	struct fid_domain *user_dom;
+	struct fid_av *user_av;
+	struct fid_cq *user_cq;
+	struct fid_ep *user_ep;
+	char buf[256];
+	fi_addr_t target_default_ep;
+	struct fi_cq_tagged_entry event;
+	struct fi_cq_err_entry error;
+
+	/* Allocate infos for RDMA test. default_info uses the provider-assigned
+	 * default auth_key while user_info uses the user-defined auth_key.
+	 */
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		"0", FI_SOURCE, NULL, &default_info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Need to allocate a service to be used by libfabric.
+	 */
+	ret = cxil_open_device(0, &dev);
+	cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret);
+
+	svc_desc.restricted_vnis = 1;
+	svc_desc.enable = 1;
+	svc_desc.num_vld_vnis = 1;
+	svc_desc.vnis[0] = valid_vni;
+
+	ret = cxil_alloc_svc(dev, &svc_desc, &fail_info);
+	cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret);
+	svc_desc.svc_id = ret;
+
+	hints = fi_allocinfo();
+	cr_assert_not_null(hints, "fi_allocinfo failed");
+
+	hints->domain_attr->mr_mode = FI_MR_ENDPOINT;
+
+	hints->fabric_attr->prov_name = strdup("cxi");
+	cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed");
+
+	auth_key.svc_id = svc_desc.svc_id;
+	auth_key.vni = valid_vni;
+	hints->domain_attr->auth_key_size = sizeof(auth_key);
+	hints->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key));
+	cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed");
+
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		"255", FI_SOURCE, hints, &user_info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Allocate endpoints using different service IDs and VNIs. */
+	alloc_endpoint(default_info, &default_fab, &default_dom, &default_av,
+		&default_cq, &default_ep);
+	alloc_endpoint(user_info, &user_fab, &user_dom, &user_av,
+		&user_cq, &user_ep);
+
+	/* Insert the default EP address into the user AVs. */
+	ret = fi_av_insert(user_av, default_info->src_addr, 1,
+		&target_default_ep, 0, NULL);
+	cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret);
+
+	/* These two endpoints should not be able to talk due to operating in
+	 * different VNIs. This should result in an I/O error at the initiator.
+	 */
+	ret = fi_recv(default_ep, buf, sizeof(buf), NULL, FI_ADDR_UNSPEC, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret);
+
+	ret = fi_send(user_ep, buf, sizeof(buf), NULL, target_default_ep, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret);
+
+	do {
+		ret = fi_cq_read(user_cq, &event, 1);
+	} while (ret == -FI_EAGAIN);
+
+	cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_read failed: %d", ret);
+
+	ret = fi_cq_readerr(user_cq, &error, 0);
+	cr_assert_eq(ret, 1, "fi_cq_readerr failed: %d", ret);
+
+	/* Since these tests are loopback on the same NIC, RC_PTLTE_NOT_FOUND
+	 * is returned instead of RC_VNI_NOT_FOUND since the VNI is valid.
+	 * Non-loopback should return RC_VNI_NOT_FOUND.
+	 */
+	cr_assert_eq(error.prov_errno, C_RC_PTLTE_NOT_FOUND,
+		"Bad error.prov_errno: got=%d expected=%d",
+		error.prov_errno, C_RC_PTLTE_NOT_FOUND);
+
+	fi_close(&user_ep->fid);
+	fi_close(&user_cq->fid);
+	fi_close(&user_av->fid);
+	fi_close(&user_dom->fid);
+	fi_close(&user_fab->fid);
+	fi_close(&default_ep->fid);
+	fi_close(&default_cq->fid);
+	fi_close(&default_av->fid);
+	fi_close(&default_dom->fid);
+	fi_close(&default_fab->fid);
+	fi_freeinfo(user_info);
+	fi_freeinfo(hints);
+	fi_freeinfo(default_info);
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret);
+	cxil_close_device(dev);
+}
+
+/* Use the Slingshot plugin environment variables to generate an auth_key. Only
+ * a single entry per environment variable is specified.
+ */ +Test(auth_key, ss_plugin_env_vars_single_entry) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 288, + }; + char svc_id_str[256]; + struct fid_fabric *fab; + struct fid_domain *dom; + struct cxip_nic_attr *nic_attr; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = auth_key.vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + auth_key.svc_id = ret; + + ret = setenv("SLINGSHOT_VNIS", "288", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_DEVICES", "cxi0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + sprintf(svc_id_str, "%d", auth_key.svc_id); + ret = setenv("SLINGSHOT_SVC_IDS", svc_id_str, 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, auth_key.svc_id, + "fi_getinfo returned auth_key does not match Slingshot env vars"); + cr_assert_eq(nic_attr->default_vni, auth_key.vni, + "fi_getinfo returned auth_key does not match Slingshot env vars"); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + fi_close(&dom->fid); + fi_close(&fab->fid); + fi_freeinfo(info); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +/* Use the Slingshot plugin environment variables to generate an auth_key. + * Multiple values per environment variable are specified. + */ +Test(auth_key, ss_plugin_env_vars_multiple_entries) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct cxi_auth_key auth_key = { + .vni = 288, + }; + char svc_id_str[256]; + struct fid_fabric *fab; + struct fid_domain *dom; + struct cxip_nic_attr *nic_attr; + + /* Need to allocate a service to be used by libfabric. 
*/ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = auth_key.vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + auth_key.svc_id = ret; + + ret = setenv("SLINGSHOT_VNIS", "288,999", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_DEVICES", "cxi1,cxi15,cxi4,cxi0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + sprintf(svc_id_str, "1024,1025,1026,%d", auth_key.svc_id); + ret = setenv("SLINGSHOT_SVC_IDS", svc_id_str, 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, auth_key.svc_id, + "fi_getinfo returned auth_key does not match Slingshot env vars"); + cr_assert_eq(nic_attr->default_vni, auth_key.vni, + "fi_getinfo returned auth_key does not match Slingshot env vars"); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + fi_close(&dom->fid); + fi_close(&fab->fid); + fi_freeinfo(info); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +#define DEFAULT_SERVICE_ID 1U + +/* Use the Slingshot plugin environment variables to define auth_keys for a + * cxi device which does not exist. + */ +Test(auth_key, ss_plugin_env_vars_no_nic) +{ + struct fi_info *info; + int ret; + struct cxip_nic_attr *nic_attr; + + ret = setenv("SLINGSHOT_VNIS", "288,999", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_DEVICES", "cxi1,cxi15,cxi4", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_SVC_IDS", "1024,1025,1026", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, DEFAULT_SERVICE_ID, + "Unexpected svc_id: %d", nic_attr->default_rgroup_id); + + fi_freeinfo(info); +} + +/* Define valid Slingshot plugin environment variables and verify that user + * provided auth_key is honored before using Slingshot plugin environment + * variables to generate auth_key. + */ +Test(auth_key, ss_plugin_auth_key_priority) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct fi_info *hints; + char svc_id_str[256]; + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + struct fid_fabric *fab; + struct fid_domain *dom; + + /* Need to allocate a service to be used by libfabric. 
*/ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = auth_key.vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + ret = setenv("SLINGSHOT_VNIS", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + ret = setenv("SLINGSHOT_DEVICES", "cxi0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + sprintf(svc_id_str, "%d", auth_key.svc_id); + ret = setenv("SLINGSHOT_SVC_IDS", svc_id_str, 1); + cr_assert_eq(ret, 0, "setenv failed: %d", errno); + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + hints->domain_attr->auth_key = memdup(&auth_key, sizeof(auth_key)); + cr_assert_not_null(hints->domain_attr->auth_key, "memdup failed"); + + hints->domain_attr->auth_key_size = sizeof(auth_key); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = memcmp(hints->domain_attr->auth_key, info->domain_attr->auth_key, + hints->domain_attr->auth_key_size); + cr_assert_eq(ret, 0, "fi_getinfo returned auth_key does not match hints"); + cr_assert_eq(info->domain_attr->auth_key_size, sizeof(auth_key)); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + fi_close(&dom->fid); + fi_close(&fab->fid); + fi_freeinfo(info); + fi_freeinfo(hints); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +/* Restrict the auth_key to a specific UID. */ +Test(auth_key, uid_valid_service) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + uid_t test_uid = 65530; + uint64_t test_vni = 12345; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_members = 1; + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = test_vni; + svc_desc.members[0].type = CXI_SVC_MEMBER_UID; + svc_desc.members[0].svc_member.uid = test_uid; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + auth_key.svc_id = svc_desc.svc_id; + auth_key.vni = test_vni; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + /* Ensure that returned auth_key does not contain allocated service ID + * since this is restricted to specific UID. + * + * Return auth_key hint should be NULL. NIC attr should not contain the + * service ID and VNI. 
+	 */
+	cr_assert_eq(info->domain_attr->auth_key, NULL);
+	cr_assert_eq(info->domain_attr->auth_key_size, 0);
+
+	nic_attr = info->nic->prov_attr;
+	cr_assert_neq(nic_attr->default_rgroup_id, auth_key.svc_id);
+	cr_assert_neq(nic_attr->default_vni, auth_key.vni);
+
+	fi_freeinfo(info);
+
+	ret = seteuid(test_uid);
+	cr_assert_eq(ret, 0, "seteuid failed: %d", errno);
+
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		NULL, FI_SOURCE, NULL, &info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Ensure that returned auth_key does contain the allocated service ID
+	 * since the effective UID now matches the restricted UID.
+	 *
+	 * Return auth_key hint should be NULL. NIC attr should contain the
+	 * service ID and VNI.
+	 */
+	cr_assert_eq(info->domain_attr->auth_key, NULL);
+	cr_assert_eq(info->domain_attr->auth_key_size, 0);
+
+	nic_attr = info->nic->prov_attr;
+	cr_assert_eq(nic_attr->default_rgroup_id, auth_key.svc_id);
+	cr_assert_eq(nic_attr->default_vni, auth_key.vni);
+
+	ret = fi_fabric(info->fabric_attr, &fab, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret);
+
+	ret = fi_domain(fab, info, &dom, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret);
+
+	fi_close(&dom->fid);
+	fi_close(&fab->fid);
+	fi_freeinfo(info);
+
+	/* Make sure a non-root user cannot destroy the service. */
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_neq(ret, 0, "cxil_destroy_svc did not fail");
+
+	ret = seteuid(0);
+	cr_assert_eq(ret, 0, "seteuid failed: %d", errno);
+
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret);
+	cxil_close_device(dev);
+}
+
+/* Restrict the auth_key to a specific GID. */
+Test(auth_key, gid_valid_service)
+{
+	int ret;
+	struct cxil_dev *dev;
+	struct cxi_svc_fail_info fail_info = {};
+	struct cxi_svc_desc svc_desc = {};
+	struct fi_info *info;
+	struct fid_fabric *fab;
+	struct fid_domain *dom;
+	uid_t test_gid = 32766;
+	uint64_t test_vni = 12345;
+	struct cxi_auth_key auth_key = {};
+	struct cxip_nic_attr *nic_attr;
+
+	/* Need to allocate a service to be used by libfabric. */
+	ret = cxil_open_device(0, &dev);
+	cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret);
+
+	svc_desc.restricted_members = 1;
+	svc_desc.restricted_vnis = 1;
+	svc_desc.enable = 1;
+	svc_desc.num_vld_vnis = 1;
+	svc_desc.vnis[0] = test_vni;
+	svc_desc.members[0].type = CXI_SVC_MEMBER_GID;
+	svc_desc.members[0].svc_member.gid = test_gid;
+
+	ret = cxil_alloc_svc(dev, &svc_desc, &fail_info);
+	cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret);
+	svc_desc.svc_id = ret;
+
+	auth_key.svc_id = svc_desc.svc_id;
+	auth_key.vni = test_vni;
+
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		NULL, FI_SOURCE, NULL, &info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Ensure that returned auth_key does not contain allocated service ID
+	 * since this is restricted to a specific GID.
+	 *
+	 * Return auth_key hint should be NULL. NIC attr should not contain the
+	 * service ID and VNI.
+	 */
+	cr_assert_eq(info->domain_attr->auth_key, NULL);
+	cr_assert_eq(info->domain_attr->auth_key_size, 0);
+
+	nic_attr = info->nic->prov_attr;
+	cr_assert_neq(nic_attr->default_rgroup_id, auth_key.svc_id);
+	cr_assert_neq(nic_attr->default_vni, auth_key.vni);
+
+	fi_freeinfo(info);
+
+	ret = setegid(test_gid);
+	cr_assert_eq(ret, 0, "setegid failed: %d", errno);
+
+	ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0",
+		NULL, FI_SOURCE, NULL, &info);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret);
+
+	/* Ensure that returned auth_key does contain allocated service ID
+	 * since the effective GID now matches the restricted GID.
+	 *
+	 * Return auth_key hint should be NULL. NIC attr should contain the
+	 * service ID and VNI.
+	 */
+	cr_assert_eq(info->domain_attr->auth_key, NULL);
+	cr_assert_eq(info->domain_attr->auth_key_size, 0);
+
+	nic_attr = info->nic->prov_attr;
+	cr_assert_eq(nic_attr->default_rgroup_id, auth_key.svc_id);
+	cr_assert_eq(nic_attr->default_vni, auth_key.vni);
+
+	ret = fi_fabric(info->fabric_attr, &fab, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret);
+
+	ret = fi_domain(fab, info, &dom, NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret);
+
+	fi_close(&dom->fid);
+	fi_close(&fab->fid);
+	fi_freeinfo(info);
+
+	ret = setegid(0);
+	cr_assert_eq(ret, 0, "setegid failed: %d", errno);
+
+	ret = cxil_destroy_svc(dev, svc_desc.svc_id);
+	cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret);
+	cxil_close_device(dev);
+}
+
+/* Verify that the priority between UID, GID, and unrestricted services is
+ * honored.
+ */
+Test(auth_key, uid_gid_default_service_id_priority)
+{
+	int ret;
+	struct cxil_dev *dev;
+	struct cxi_svc_fail_info fail_info = {};
+	struct cxi_svc_desc svc_desc = {};
+	struct fi_info *info;
+	uid_t test_uid = 65530;
+	uint64_t test_uid_vni = 12345;
+	uid_t test_gid = 32766;
+	uint64_t test_gid_vni = 12344;
+	struct cxi_auth_key uid_auth_key = {};
+	struct cxi_auth_key gid_auth_key = {};
+	struct cxip_nic_attr *nic_attr;
+
+	/* Need to allocate a service to be used by libfabric. */
+	ret = cxil_open_device(0, &dev);
+	cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret);
+
+	svc_desc.restricted_members = 1;
+	svc_desc.restricted_vnis = 1;
+	svc_desc.enable = 1;
+	svc_desc.num_vld_vnis = 1;
+	svc_desc.vnis[0] = test_uid_vni;
+	svc_desc.members[0].type = CXI_SVC_MEMBER_UID;
+	svc_desc.members[0].svc_member.uid = test_uid;
+
+	ret = cxil_alloc_svc(dev, &svc_desc, &fail_info);
+	cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret);
+
+	uid_auth_key.svc_id = ret;
+	uid_auth_key.vni = test_uid_vni;
+
+	svc_desc.vnis[0] = test_gid_vni;
+	svc_desc.members[0].type = CXI_SVC_MEMBER_GID;
+	svc_desc.members[0].svc_member.gid = test_gid;
+
+	ret = cxil_alloc_svc(dev, &svc_desc, &fail_info);
+	cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret);
+
+	gid_auth_key.svc_id = ret;
+	gid_auth_key.vni = test_gid_vni;
+
+	/* Since UID and GID have not changed, auth_key with default service ID
+	 * should be returned.
+ */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, DEFAULT_SERVICE_ID, + "Default service ID was not returned: svc_id=%d", + nic_attr->default_rgroup_id); + + fi_freeinfo(info); + + /* Changing GID should result in GID auth_key being returned. */ + ret = setegid(test_gid); + cr_assert_eq(ret, 0, "setegid failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, gid_auth_key.svc_id); + cr_assert_eq(nic_attr->default_vni, gid_auth_key.vni); + + fi_freeinfo(info); + + /* Changing the UID should result in UID auth_key being returned. */ + ret = seteuid(test_uid); + cr_assert_eq(ret, 0, "seteuid failed: %d", errno); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->auth_key, NULL); + cr_assert_eq(info->domain_attr->auth_key_size, 0); + + nic_attr = info->nic->prov_attr; + cr_assert_eq(nic_attr->default_rgroup_id, uid_auth_key.svc_id); + cr_assert_eq(nic_attr->default_vni, uid_auth_key.vni); + + fi_freeinfo(info); + + ret = seteuid(0); + cr_assert_eq(ret, 0, "seteuid failed: %d", errno); + + ret = setegid(0); + cr_assert_eq(ret, 0, "setegid failed: %d", errno); + + ret = cxil_destroy_svc(dev, gid_auth_key.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + + ret = cxil_destroy_svc(dev, uid_auth_key.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + + cxil_close_device(dev); +} + +/* Test disabling the default service ID. */ +Test(auth_key, default_service_id_disabled) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + + /* Disable the default service ID. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + ret = cxil_get_svc(dev, DEFAULT_SERVICE_ID, &svc_desc); + cr_assert_eq(ret, 0, "cxil_get_svc failed: %d", ret); + cr_assert_eq(svc_desc.enable, 1, + "Default service ID unexpectedly disabled"); + + svc_desc.enable = 0; + + ret = cxil_update_svc(dev, &svc_desc, &fail_info); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + /* With the default service ID disabled, NULL auth_key should be + * returned. 
+ */ + cr_assert_null(info->domain_attr->auth_key, "Domain auth_key not NULL"); + cr_assert_null(info->ep_attr->auth_key, "EP auth_key not NULL"); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_neq(ret, FI_SUCCESS, "fi_domain did not fail"); + + fi_close(&fab->fid); + fi_freeinfo(info); + + /* Restore default service. */ + svc_desc.enable = 1; + ret = cxil_update_svc(dev, &svc_desc, &fail_info); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + cxil_close_device(dev); +} + +#define DEFAULT_MAX_EP_AUTH_KEY 4 + +Test(auth_key, max_ep_auth_key_null_hints) +{ + int ret; + struct fi_info *info; + struct fi_info *tmp; + int i = 0; + size_t expected_ep_auth_key; + + ret = setenv("FI_CXI_COMPAT", "0", 1); + cr_assert(ret == 0); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + tmp = info; + while (tmp) { + /* The first 2 fi_info's should have max_ep_auth_key == 1*/ + if (i < 2) + expected_ep_auth_key = 1; + else + expected_ep_auth_key = DEFAULT_MAX_EP_AUTH_KEY; + + cr_assert_eq(tmp->domain_attr->max_ep_auth_key, + expected_ep_auth_key, + "Invalid max_ep_auth_key: expected=%ld got=%ld info_count=%d", + expected_ep_auth_key, + tmp->domain_attr->max_ep_auth_key, i); + tmp = tmp->next; + i++; + } + + fi_freeinfo(info); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, zero_max_ep_auth_key_null_hint) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + struct fi_info *tmp; + int i = 0; + size_t expected_ep_auth_key; + + ret = setenv("FI_CXI_COMPAT", "0", 1); + cr_assert(ret == 0); + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->max_ep_auth_key = 0; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + tmp = info; + while (tmp) { + /* The first 2 fi_info's should have max_ep_auth_key == 1*/ + if (i < 2) + expected_ep_auth_key = 1; + else + expected_ep_auth_key = DEFAULT_MAX_EP_AUTH_KEY; + + cr_assert_eq(tmp->domain_attr->max_ep_auth_key, + expected_ep_auth_key, + "Invalid max_ep_auth_key: expected=%ld got=%ld info_count=%d", + expected_ep_auth_key, + tmp->domain_attr->max_ep_auth_key, i); + tmp = tmp->next; + i++; + } + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +/* Test fi_getinfo() verification of hints argument. 
*/ +Test(auth_key, valid_max_ep_auth_key_null_hint) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + struct fi_info *tmp; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->max_ep_auth_key = 1; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + tmp = info; + while (tmp) { + cr_assert_eq(tmp->domain_attr->max_ep_auth_key, + hints->domain_attr->max_ep_auth_key, + "Invalid max_ep_auth_key: expected=%ld got=%ld", + hints->domain_attr->max_ep_auth_key, + tmp->domain_attr->max_ep_auth_key); + tmp = tmp->next; + } + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +/* Test fi_getinfo() verification of hints argument. */ +Test(auth_key, invalid_max_ep_auth_key_null_hint) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->max_ep_auth_key = 12345678; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); +} + +TestSuite(av_auth_key, .timeout = CXIT_DEFAULT_TIMEOUT); + +static void open_av_auth_key(struct fi_info *info, struct fid_fabric **fab, + struct fid_domain **dom, struct fid_av **av) +{ + int ret; + struct fi_av_attr av_attr = {}; + + ret = fi_fabric(info->fabric_attr, fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(*fab, info, dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + ret = fi_av_open(*dom, &av_attr, av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret); +} + +static void close_av_auth_key(struct fid_fabric *fab, struct fid_domain *dom, + struct fid_av *av) +{ + int ret; + + ret = fi_close(&av->fid); + cr_assert_eq(ret, FI_SUCCESS); + + ret = fi_close(&dom->fid); + cr_assert_eq(ret, FI_SUCCESS); + + ret = fi_close(&fab->fid); + cr_assert_eq(ret, FI_SUCCESS); +} + +Test(av_auth_key, insert_without_av_auth_key_set) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, lookup_without_av_auth_key_set) +{ + 
struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + size_t size = sizeof(auth_key); + fi_addr_t addr_key = 0; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_lookup_auth_key(av, addr_key, &auth_key, &size); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_lookup_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +/* Insert multiple auth_keys. */ +#define NUM_VNIS 4U +Test(av_auth_key, insert_lookup_valid_auth_key) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxi_auth_key lookup_auth_key = {}; + size_t auth_key_size; + fi_addr_t addr_key; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + int i; + + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = NUM_VNIS; + + for (i = 0; i < NUM_VNIS; i++) + svc_desc.vnis[i] = 123 + i; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = NUM_VNIS; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + for (i = 0; i < NUM_VNIS; i++) { + auth_key.vni = svc_desc.vnis[i]; + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), + &addr_key, 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_av_insert_auth_key failed: %d", ret); + + auth_key_size = sizeof(lookup_auth_key); + ret = fi_av_lookup_auth_key(av, addr_key, &lookup_auth_key, + &auth_key_size); + cr_assert_eq(ret, FI_SUCCESS, + "fi_av_lookup_auth_key failed: %d", ret); + + cr_assert_eq(auth_key_size, sizeof(lookup_auth_key), + "Invalid auth_key_size returned"); + cr_assert_eq(lookup_auth_key.vni, auth_key.vni, + "Incorrect auth_key.vni returned"); + } + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); + + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +Test(av_auth_key, insert_invalid_null_auth_key) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo 
failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, NULL, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, insert_invalid_null_fi_addr) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), NULL, 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, insert_invalid_flags) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0x123); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, insert_invalid_vni) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, 
FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + auth_key.vni = 0x1234; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, insert_max_ep_auth_key_bounds_check) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, -FI_ENOSPC, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, lookup_null_auth_key) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + fi_addr_t addr_key = 0; + size_t auth_key_size = sizeof(auth_key); + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + auth_key.vni = 0x1234; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_lookup_auth_key(av, addr_key, NULL, &auth_key_size); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_lookup_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, lookup_null_auth_key_size) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + fi_addr_t addr_key = 0; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + 
auth_key.vni = 0x1234; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_lookup_auth_key(av, addr_key, &auth_key, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_lookup_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, remove) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_remove(av, &addr_key, 1, FI_AUTH_KEY); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_remove failed: %d", ret); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, valid_insert_auth_key_addr) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_addr addr = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + size_t addr_key_size = sizeof(addr); + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_insert(av, &addr, 1, &addr_key, FI_AUTH_KEY, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + + ret = fi_av_lookup(av, addr_key, &addr, &addr_key_size); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_lookup failed: %d", ret); + + cr_assert_eq(addr.vni, auth_key.vni, + "Invalid auth_key vni: expected=%u got=%u", + auth_key.vni, addr.vni); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, miss_auth_key_insert_flag) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_addr addr = {}; + 
struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_insert(av, &addr, 1, &addr_key, 0, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, invalid_user_id_flag) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxi_auth_key auth_key = {}; + struct cxip_addr addr = {}; + struct cxip_nic_attr *nic_attr; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = info->nic->prov_attr; + auth_key.vni = nic_attr->default_vni; + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert_auth_key(av, &auth_key, sizeof(auth_key), &addr_key, + 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_insert_auth_key failed: %d", ret); + + ret = fi_av_insert(av, &addr, 1, &addr_key, FI_AV_USER_ID, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, invalid_user_id_auth_key_flags) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct cxip_addr addr = {}; + fi_addr_t addr_key; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert(av, &addr, 1, &addr_key, + (FI_AV_USER_ID | FI_AUTH_KEY), NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, null_auth_key_addr) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct 
cxip_addr addr = {}; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 1; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + ret = fi_av_insert(av, &addr, 1, NULL, FI_AUTH_KEY, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_av_insert failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, invalid_multiple_auth_keys_per_ep_with_directed_recv_cap) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct fid_ep *ep; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 2; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + info->caps |= FI_DIRECTED_RECV; + ret = fi_endpoint(dom, info, &ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_endpoint failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(av_auth_key, invalid_multiple_auth_keys_per_ep_with_directed_recv_rx_cap) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_av *av; + struct fid_ep *ep; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = 2; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + "255", FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + open_av_auth_key(info, &fab, &dom, &av); + + info->rx_attr->caps |= FI_DIRECTED_RECV; + ret = fi_endpoint(dom, info, &ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "fi_endpoint failed: %d", ret); + + close_av_auth_key(fab, dom, av); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +#define NUM_VNIS 4U +#define NUM_TX_EPS NUM_VNIS + +static struct cxil_dev *dev; +static struct cxi_svc_desc svc_desc = { + .restricted_vnis = 1, + .enable = 1, + .num_vld_vnis = NUM_VNIS, + .vnis = {1234, 1235, 1236, 1237}, +}; + +static struct fid_fabric *fab; +static struct fid_domain *dom; +static struct fid_cq *cq; +static struct fid_av *av; +static volatile uint64_t rx_mr_buf; +static struct fid_mr *rx_mr; +static struct fid_ep *rx_ep; +static fi_addr_t auth_keys[NUM_VNIS]; +static fi_addr_t init_addrs[NUM_TX_EPS]; + +static char *rx_ep_pid = "0"; +static char *tx_ep_pids[] = {"128", "129", "130", "131"}; +static unsigned int nic_addr; + +static struct fid_domain *tx_dom; 
+static struct fid_cq *tx_cq; +static struct fid_av *tx_av; +static struct fid_ep *tx_ep[NUM_TX_EPS]; +static volatile uint64_t tx_mr_buf[NUM_TX_EPS]; +static struct fid_mr *tx_mr[NUM_TX_EPS]; +static fi_addr_t target_addr; + +static void av_auth_key_test_tx_ep_init(unsigned int num_vnis) +{ + struct fi_info *hints; + static struct fi_info *info; + int ret; + struct fi_cq_attr cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + }; + struct fi_av_attr av_attr = { + .type = FI_AV_TABLE, + }; + int i; + struct cxi_auth_key key = {}; + char node[64]; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps |= FI_SOURCE | FI_SOURCE_ERR | FI_MSG | FI_SEND | FI_RECV | + FI_RMA | FI_ATOMIC; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED | + FI_MR_PROV_KEY; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_domain(fab, info, &tx_dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + fi_freeinfo(info); + + ret = fi_cq_open(tx_dom, &cq_attr, &tx_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open failed: %d", ret); + + ret = fi_av_open(tx_dom, &av_attr, &tx_av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret); + + sprintf(node, "%u", nic_addr); + ret = fi_av_insertsvc(tx_av, node, rx_ep_pid, &target_addr, 0, NULL); + cr_assert_eq(ret, 1, "fi_av_insertsvc failed: %d", ret); + + for (i = 0; i < num_vnis; i++) { + key.vni = svc_desc.vnis[i]; + key.svc_id = svc_desc.svc_id; + + hints->ep_attr->auth_key = (void *)&key; + hints->ep_attr->auth_key_size = sizeof(key); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + "cxi0", tx_ep_pids[i], FI_SOURCE, hints, + &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + ret = fi_endpoint(tx_dom, info, &tx_ep[i], NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_endpoint failed: %d", ret); + + ret = fi_ep_bind(tx_ep[i], &tx_cq->fid, FI_TRANSMIT | FI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind CQ failed: %d", ret); + + ret = fi_ep_bind(tx_ep[i], &tx_av->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind AV failed: %d", ret); + + ret = fi_enable(tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_enable failed: %d", ret); + + ret = fi_mr_reg(tx_dom, (void *)&tx_mr_buf[i], + sizeof(tx_mr_buf[i]), + FI_WRITE | FI_READ | FI_REMOTE_WRITE | FI_REMOTE_READ, + 0, 0, 0, &tx_mr[i], NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_mr_bind(tx_mr[i], &tx_ep[i]->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed: %d", ret); + + ret = fi_mr_enable(tx_mr[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed: %d", ret); + + fi_freeinfo(info); + } + + hints->ep_attr->auth_key = NULL; + fi_freeinfo(hints); +} + +static void av_auth_key_test_rx_ep_init(bool source_err, unsigned int num_vnis, + bool directed_recv, bool av_user_id) +{ + struct fi_info *hints; + static struct fi_info *info; + struct cxi_svc_fail_info fail_info = {}; + int ret; + struct fi_cq_attr cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + }; + struct fi_av_attr av_attr = { + .type = FI_AV_TABLE, + }; + int i; + struct cxi_auth_key key = {}; + size_t key_size; + char node[64]; + + /* Need to allocate a service to be used by libfabric. 
*/ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + nic_addr = dev->info.nic_addr; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + if (av_user_id) { + av_attr.flags = FI_AV_USER_ID; + hints->caps |= FI_AV_USER_ID; + } + + hints->caps |= FI_SOURCE | FI_SOURCE_ERR | FI_MSG | FI_SEND | FI_RECV | + FI_RMA | FI_ATOMIC; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED | + FI_MR_PROV_KEY; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->max_ep_auth_key = num_vnis; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints->fabric_attr->prov_name, "strdup failed"); + + if (directed_recv) + hints->caps |= FI_DIRECTED_RECV; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + rx_ep_pid, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + + ret = fi_fabric(info->fabric_attr, &fab, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_fabric failed: %d", ret); + + ret = fi_domain(fab, info, &dom, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_domain failed: %d", ret); + + ret = fi_cq_open(dom, &cq_attr, &cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open failed: %d", ret); + + ret = fi_av_open(dom, &av_attr, &av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret); + + ret = fi_endpoint(dom, info, &rx_ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_endpoint failed: %d", ret); + + ret = fi_ep_bind(rx_ep, &cq->fid, FI_TRANSMIT | FI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind CQ failed: %d", ret); + + ret = fi_ep_bind(rx_ep, &av->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind AV failed: %d", ret); + + for (i = 0; i < num_vnis; i++) { + key.vni = svc_desc.vnis[i]; + key_size = sizeof(key); + + ret = fi_av_insert_auth_key(av, &key, key_size, &auth_keys[i], + 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_av_insert_auth_key failed: %d", ret); + + if (source_err) + continue; + + sprintf(node, "%u", nic_addr); + init_addrs[i] = auth_keys[i]; + ret = fi_av_insertsvc(av, node, tx_ep_pids[i], &init_addrs[i], + FI_AUTH_KEY, NULL); + cr_assert_eq(ret, 1, "fi_av_insertsvc failed: %d", ret); + } + + ret = fi_enable(rx_ep); + cr_assert_eq(ret, FI_SUCCESS, "fi_enable failed: %d", ret); + + ret = fi_mr_reg(dom, (void *)&rx_mr_buf, sizeof(rx_mr_buf), + FI_WRITE | FI_READ | FI_REMOTE_WRITE | FI_REMOTE_READ, + 0, 0, 0, &rx_mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_mr_bind(rx_mr, &rx_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed: %d", ret); + + ret = fi_mr_enable(rx_mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed: %d", ret); + + fi_freeinfo(info); +} + +static void av_auth_key_tx_ep_fini(unsigned int num_vnis) +{ + int i; + int ret; + + for (i = 0; i < num_vnis; i++) { + ret = fi_close(&tx_mr[i]->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + ret = fi_close(&tx_ep[i]->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close EP failed: %d", ret); + } + + ret = fi_close(&tx_av->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close AV failed: %d", ret); + + ret = fi_close(&tx_cq->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close CQ failed: %d", ret); + + ret = fi_close(&tx_dom->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close dom failed: %d", ret); +} + +static void 
av_auth_key_test_rx_ep_fini(void) +{ + int ret; + + ret = fi_close(&rx_mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + ret = fi_close(&rx_ep->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close EP failed: %d", ret); + + ret = fi_close(&av->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close AV failed: %d", ret); + + ret = fi_close(&cq->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close CQ failed: %d", ret); + + ret = fi_close(&dom->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close dom failed: %d", ret); + + ret = fi_close(&fab->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close fab failed: %d", ret); + + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} + +TestSuite(data_transfer_av_auth_key, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(data_transfer_av_auth_key, successful_inject_transfer_source) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + fi_addr_t src_addr; + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_inject(tx_ep[i], NULL, 0, target_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(src_addr, init_addrs[i], "Bad source addr"); + + ret = fi_inject(rx_ep, NULL, 0, src_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(tx_ep[i], NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(tx_cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(src_addr, target_addr, "Bad source addr"); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, successful_rdzv_transfer_source) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + fi_addr_t src_addr; + void *buf; + size_t buf_size = 1024 * 1024; + + buf = malloc(buf_size); + cr_assert(buf != NULL); + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. 
+ */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_send(tx_ep[i], buf, buf_size, NULL, target_addr, + tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, buf, buf_size, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(src_addr, init_addrs[i], "Bad source addr"); + + do { + ret = fi_cq_read(tx_cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + + ret = fi_send(rx_ep, buf, buf_size, NULL, src_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(tx_ep[i], buf, buf_size, NULL, FI_ADDR_UNSPEC, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(tx_cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(src_addr, target_addr, "Bad source addr"); + + do { + ret = fi_cq_read(cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); + + free(buf); +} + +Test(data_transfer_av_auth_key, successful_transfer_source_err) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + struct fi_cq_err_entry error = {}; + fi_addr_t src_addr; + + av_auth_key_test_rx_ep_init(true, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_send(tx_ep[i], NULL, 0, NULL, target_addr, tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_readfrom failed: %d", ret); + + ret = fi_cq_readerr(cq, &error, 0); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(error.err, FI_EADDRNOTAVAIL, "Bad error.err"); + cr_assert_eq(error.src_addr, auth_keys[i], + "Bad error.src_addr: got=%lx expected=%lx", + error.src_addr, auth_keys[i]); + + do { + ret = fi_cq_read(tx_cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, single_auth_key_with_directed_recv) +{ + int ret; + int i; + struct fi_cq_tagged_entry event; + fi_addr_t src_addr; + fi_addr_t from_src_addr; + struct cxip_addr addr; + size_t addr_size = sizeof(struct cxip_addr); + + av_auth_key_test_rx_ep_init(false, 1, true, false); + av_auth_key_test_tx_ep_init(1); + + ret = fi_getname(&rx_ep->fid, &addr, &addr_size); + cr_assert_eq(ret, FI_SUCCESS, "fi_getname failed: %d", ret); + + /* Insert a AV entry for the RX EP. */ + src_addr = auth_keys[0]; + ret = fi_av_insert(av, &addr, 1, &src_addr, FI_AUTH_KEY, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + + /* Queue FI_DIRECTED_RECV to match only the RX EP. 
*/ + ret = fi_recv(rx_ep, NULL, 0, NULL, src_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + /* Queue a zero byte message which should not match. */ + ret = fi_send(tx_ep[0], NULL, 0, NULL, target_addr, tx_ep[0]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + /* Arbitrary amount of loops to ensure no recv events. */ + for (i = 0; i < 100; i++) { + ret = fi_cq_readfrom(cq, &event, 1, &from_src_addr); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read failed: %d", ret); + } + + /* Post matching send. */ + ret = fi_send(rx_ep, NULL, 0, NULL, src_addr, rx_ep); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + /* Two events should occur: a send and a recv. */ + for (i = 0; i < 2; i++) { + do { + ret = fi_cq_readfrom(cq, &event, 1, &from_src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + + if (event.flags & FI_RECV) + cr_assert_eq(src_addr, from_src_addr, + "Bad source addr"); + } + + av_auth_key_tx_ep_fini(1); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, av_user_id_source_err_missing_auth_key_user_id) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + struct fi_cq_err_entry error = {}; + fi_addr_t src_addr; + + av_auth_key_test_rx_ep_init(true, NUM_VNIS, false, true); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_send(tx_ep[i], NULL, 0, NULL, target_addr, tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_readfrom failed: %d", ret); + + ret = fi_cq_readerr(cq, &error, 0); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(error.err, FI_EADDRNOTAVAIL, "Bad error.err"); + cr_assert_eq(error.src_addr, FI_ADDR_UNSPEC, + "Bad error.src_addr: got=%lx expected=%lx", + error.src_addr, FI_ADDR_UNSPEC); + + do { + ret = fi_cq_read(tx_cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, av_user_id_source_err_auth_key_user_id) +{ + int i; + int ret; + struct fi_cq_tagged_entry event; + struct fi_cq_err_entry error = {}; + fi_addr_t src_addr; + fi_addr_t user_id[NUM_VNIS] = {0x1234, 0x1235, 0x1236, 0x1237}; + + av_auth_key_test_rx_ep_init(true, NUM_VNIS, false, true); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + for (i = 0; i < NUM_VNIS; i++) { + ret = fi_av_set_user_id(av, auth_keys[i], user_id[i], + FI_AUTH_KEY); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_set_user_id failed: %d", + ret); + } + + /* Each TX EP has been configured for a different VNI. Send from each + * TX EP to the RX EP. The RX EP is configured with all VNIs. 
+ */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_send(tx_ep[i], NULL, 0, NULL, target_addr, tx_ep[i]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(rx_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + do { + ret = fi_cq_readfrom(cq, &event, 1, &src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_readfrom failed: %d", ret); + + ret = fi_cq_readerr(cq, &error, 0); + cr_assert_eq(ret, 1, "fi_cq_readfrom failed: %d", ret); + cr_assert_eq(error.err, FI_EADDRNOTAVAIL, "Bad error.err"); + cr_assert_eq(error.src_addr, user_id[i], + "Bad error.src_addr: got=%lx expected=%lx", + error.src_addr, user_id[i]); + + do { + ret = fi_cq_read(tx_cq, &event, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, rma_write_successful_transfer) +{ + int i; + int ret; + volatile uint64_t rma_value; + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Issue ping-pong + * RMA from each TX MR to RX MR. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + rma_value = i + 1; + + ret = fi_write(tx_ep[i], + (void *) &rma_value, sizeof(rma_value), NULL, + target_addr, 0, fi_mr_key(rx_mr), NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_write failed: %d", ret); + + while (rx_mr_buf != rma_value) {} + + ret = fi_write(rx_ep, + (void *) &rma_value, sizeof(rma_value), NULL, + init_addrs[i], 0, fi_mr_key(tx_mr[i]), NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_write failed: %d", ret); + + while (tx_mr_buf[i] != rma_value) {} + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, rma_read_successful_transfer) +{ + int i; + int ret; + volatile uint64_t rma_value; + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Issue ping-pong + * RMA from each TX MR to RX MR. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + rx_mr_buf = i + 1; + + ret = fi_read(tx_ep[i], + (void *) &rma_value, sizeof(rma_value), NULL, + target_addr, 0, fi_mr_key(rx_mr), NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read failed: %d", ret); + + while (rx_mr_buf != rma_value) {} + + tx_mr_buf[i] = i + 1; + ret = fi_read(rx_ep, + (void *) &rma_value, sizeof(rma_value), NULL, + init_addrs[i], 0, fi_mr_key(tx_mr[i]), NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read failed: %d", ret); + + while (tx_mr_buf[i] != rma_value) {} + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, amo_inject_successful_transfer) +{ + int i; + int ret; + uint64_t amo_value = 1; + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Issue ping-pong + * AMO from each TX MR to RX MR. 
+ */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_inject_atomic(tx_ep[i], &amo_value, 1, target_addr, 0, + fi_mr_key(rx_mr), FI_UINT64, FI_SUM); + cr_assert_eq(ret, FI_SUCCESS, "fi_atomic failed: %d", ret); + + while (rx_mr_buf != i + 1) {} + + ret = fi_inject_atomic(rx_ep, &amo_value, 1, init_addrs[i], 0, + fi_mr_key(tx_mr[i]), FI_UINT64, FI_SUM); + cr_assert_eq(ret, FI_SUCCESS, "fi_atomic failed: %d", ret); + + while (tx_mr_buf[i] != 1) {} + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} + +Test(data_transfer_av_auth_key, amo_successful_transfer_opt_disabled) +{ + int i; + int ret; + uint64_t amo_value = 1; + + ret = setenv("FI_CXI_OPTIMIZED_MRS", "0", 1); + cr_assert(ret == 0); + + av_auth_key_test_rx_ep_init(false, NUM_VNIS, false, false); + av_auth_key_test_tx_ep_init(NUM_TX_EPS); + + /* Each TX EP has been configured for a different VNI. Issue ping-pong + * AMO from each TX MR to RX MR. + */ + for (i = 0; i < NUM_TX_EPS; i++) { + ret = fi_atomic(tx_ep[i], &amo_value, 1, NULL, + target_addr, 0, fi_mr_key(rx_mr), FI_UINT64, + FI_SUM, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_atomic failed: %d", ret); + + while (rx_mr_buf != i + 1) {} + + ret = fi_atomic(rx_ep, &amo_value, 1, NULL, + init_addrs[i], 0, fi_mr_key(tx_mr[i]), + FI_UINT64, FI_SUM, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_atomic failed: %d", ret); + + while (tx_mr_buf[i] != 1) {} + } + + av_auth_key_tx_ep_fini(NUM_TX_EPS); + av_auth_key_test_rx_ep_fini(); +} diff --git a/prov/cxi/test/av.c b/prov/cxi/test/av.c new file mode 100644 index 00000000000..84de0a6fdf1 --- /dev/null +++ b/prov/cxi/test/av.c @@ -0,0 +1,557 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2015-2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +static struct cxip_addr *test_addrs; +fi_addr_t *test_fi_addrs; +#define AV_COUNT 1024 +int naddrs = AV_COUNT * 10; + +static char *nic_to_amac(uint32_t nic) +{ + struct ether_addr mac = {}; + + mac.ether_addr_octet[5] = nic; + mac.ether_addr_octet[4] = nic >> 8; + mac.ether_addr_octet[3] = nic >> 16; + + return ether_ntoa(&mac); +} + +/* This allocates memory for naddrs FSAs (test_addrs), and naddrs tokens + * (test_fi_addrs), and initializes the FSAs to unique addresses. + */ +static void +test_addrs_init(void) +{ + int i; + + test_addrs = malloc(naddrs * sizeof(struct cxip_addr)); + cr_assert(test_addrs != NULL); + + test_fi_addrs = calloc(naddrs, sizeof(fi_addr_t)); + cr_assert(test_fi_addrs != NULL); + + for (i = 0; i < naddrs; i++) { + test_addrs[i].nic = i; + test_addrs[i].pid = i + 1; + } +} + +/* Clean up the FSA and token memory. + */ +static void +test_addrs_fini(void) +{ + free(test_fi_addrs); + free(test_addrs); +} + +/* This creates an AV with 'count' objects, and peeks at internals to ensure + * that the structure is sound. If 'count' is 0, this should default to + * cxip_av_dev_sz. + */ +static void +test_create(size_t count) +{ + cxit_av_attr.count = count; + cxit_create_av(); + + /* Should allocate a structure */ + cr_assert(cxit_av != NULL, + "cxit_av=%p", cxit_av); + + cxit_destroy_av(); +} + +/* This inserts 'count' FSAs, looks up all of them, then removes all of them. It + * repeats this 'iters' times without destroying the AV. 
+ */ +static void +__test_insert(int count, int iters) +{ + int j, i, ret; + struct cxip_addr addr; + size_t addrlen; + + /* Can't test addresses we haven't set up */ + cr_assert(naddrs >= count, "Invalid test case"); + + cxit_create_av(); + test_addrs_init(); + + for (j = 0; j < iters; j++) { + /* Insert addresses */ + for (i = 0; i < count; i++) { + ret = fi_av_insert(cxit_av, &test_addrs[i], 1, + &test_fi_addrs[i], 0, NULL); + /* Should have inserted 1 item */ + cr_assert(ret == 1, + "fi_av_insert() iter=%d, idx=%d, ret=%d\n", + j, i, ret); + /* Returned tokens should match insertion order */ + cr_assert(test_fi_addrs[i] == i, + "fi_av_insert() iter=%d, idx=%d, index=%ld\n", + j, i, test_fi_addrs[i]); + } + + /* Lookup addresses */ + for (i = 0; i < count; i++) { + addrlen = sizeof(struct cxip_addr); + ret = fi_av_lookup(cxit_av, test_fi_addrs[i], &addr, + &addrlen); + /* Should succeed */ + cr_assert(ret == FI_SUCCESS, + "fi_av_lookup() iter=%d, idx=%d, ret=%d", + j, i, ret); + /* Address should match what we expect */ + cr_assert(addr.nic == test_addrs[i].nic, + "fi_av_lookup() iter=%d, count=%d, i=%d, index=%ld, nic=%d, exp=%d", + j, count, i, test_fi_addrs[i], addr.nic, + test_addrs[i].nic); + cr_assert(addr.pid == test_addrs[i].pid, + "fi_av_lookup() iter=%d, idx=%d, pid=%d", + j, i, addr.pid); + } + + /* Spot-check. If we remove an arbitrary entry, and then insert + * a new address, it should always fill the hole left by the + * removal. + */ + + /* Remove an arbitrary item in the middle */ + i = count / 2; + ret = fi_av_remove(cxit_av, &test_fi_addrs[i], 1, 0); + cr_assert(ret == FI_SUCCESS, + "fi_av_remove() mid iter=%d, idx=%d, ret=%d\n", + j, i, ret); + + /* Insert an address */ + ret = fi_av_insert(cxit_av, &test_addrs[i], 1, + &test_fi_addrs[i], 0, NULL); + cr_assert(ret == 1, + "fi_av_insert() mid iter=%d, idx=%d, ret=%d\n", + j, i, ret); + cr_assert(test_fi_addrs[i] == i, + "fi_av_insert() mid iter=%d, idx=%d, index=%ld\n", + j, i, test_fi_addrs[i]); + + addrlen = sizeof(struct cxip_addr); + ret = fi_av_lookup(cxit_av, test_fi_addrs[i], &addr, + &addrlen); + cr_assert(ret == FI_SUCCESS, + "fi_av_lookup() mid iter=%d, idx=%d, ret=%d", + j, i, ret); + cr_assert(addr.nic == test_addrs[i].nic, + "fi_av_lookup() mid iter=%d, count=%d, i=%d, index=%ld, nic=%d, exp=%d", + j, count, i, test_fi_addrs[i], addr.nic, + test_addrs[i].nic); + cr_assert(addr.pid == test_addrs[i].pid, + "fi_av_lookup() mid iter=%d, idx=%d, pid=%d", + j, i, addr.pid); + + /* Remove all of the entries */ + for (i = 0; i < count; i++) { + ret = fi_av_remove(cxit_av, &test_fi_addrs[i], 1, 0); + /* Should succeed */ + cr_assert(ret == 0, + "fi_av_remove() iter=%d, idx=%d, ret=%d", + j, i, ret); + } + } + + test_addrs_fini(); + cxit_destroy_av(); +} + +/* Wrapper for insert test. + * + * The first call in each group only fills half of the initially allocated + * space. + * + * The second call fills the entire initially allocated space. + * + * The third call requires multiple memory reallocations to expand the memory as + * this inserts. 
+ */ +static void +test_insert(void) +{ + int iters = 1; + + __test_insert(AV_COUNT / 2, iters); + __test_insert(AV_COUNT, iters); + __test_insert(naddrs, iters); + + iters = 3; + + __test_insert(AV_COUNT / 2, iters); + __test_insert(AV_COUNT, iters); + __test_insert(naddrs, iters); +} + +TestSuite(av, .init = cxit_setup_av, .fini = cxit_teardown_av, + .timeout = CXIT_DEFAULT_TIMEOUT); + +ReportHook(TEST_CRASH)(struct criterion_test_stats *stats) +{ + printf("signal = %d\n", stats->signal); +} + +/* Test AV creation syntax error */ +Test(av, av_open_invalid) +{ + int ret; + + ret = fi_av_open(cxit_domain, NULL, NULL, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV all NULL = %d", ret); + + ret = fi_av_open(cxit_domain, &cxit_av_attr, NULL, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV NULL av = %d", ret); + + ret = fi_av_open(cxit_domain, NULL, &cxit_av, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV NULL av_attr = %d", ret); + + cxit_av_attr.type = 99; + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV bad type = %d", ret); + cxit_av_attr.type = 0; + + /* NOTE: FI_READ means read-only */ + cxit_av_attr.flags = FI_READ; + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV FI_READ with no name = %d", + ret); + cxit_av_attr.flags = 0; + + cxit_av_attr.rx_ctx_bits = CXIP_EP_MAX_CTX_BITS + 1; + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + cr_assert(ret == -FI_EINVAL, "fi_av_open AV too many bits = %d", ret); + cxit_av_attr.rx_ctx_bits = 0; +} + +/* Test AV bind not supported */ +Test(av, av_bind_invalid) +{ + int ret; + + cxit_create_av(); + + ret = fi_av_bind(cxit_av, NULL, 0); + cr_assert(ret == -FI_ENOSYS, "fi_av_bind() = %d", ret); + + cxit_destroy_av(); +} + +/* Test AV control not supported */ +Test(av, av_control_invalid) +{ + int ret; + + cxit_create_av(); + + ret = fi_control(&cxit_av->fid, 0, NULL); + cr_assert(ret == -FI_ENOSYS, "fi_control() = %d", ret); + + cxit_destroy_av(); +} + +/* Test AV open_ops not supported */ +Test(av, av_open_ops_invalid) +{ + int ret; + + cxit_create_av(); + + ret = fi_open_ops(&cxit_av->fid, NULL, 0, NULL, NULL); + cr_assert(ret == -FI_ENOSYS, "fi_open_ops() = %d", ret); + + cxit_destroy_av(); +} + +/* Test basic AV table creation */ +Test(av, table_create) +{ + cxit_av_attr.type = FI_AV_TABLE; + test_create(0); + test_create(1024); +} + +/* Test basic AV map creation */ +Test(av, map_create) +{ + cxit_av_attr.type = FI_AV_MAP; + test_create(0); + test_create(1024); +} + +/* Test basic AV default creation */ +Test(av, unspecified_create) +{ + cxit_av_attr.type = FI_AV_UNSPEC; + test_create(0); + test_create(1024); +} + +/* Test basic AV table insert */ +Test(av, table_insert) +{ + cxit_av_attr.count = AV_COUNT; + cxit_av_attr.type = FI_AV_TABLE; + naddrs = cxit_av_attr.count * 10; + + test_insert(); +} + +/* Test basic AV map insert */ +Test(av, map_insert) +{ + cxit_av_attr.count = AV_COUNT; + cxit_av_attr.type = FI_AV_MAP; + naddrs = cxit_av_attr.count * 10; + + test_insert(); +} + +/* Test address conversion to string */ +Test(av, straddr) +{ + uint32_t addr = 0xabcd1234; + size_t len = 0; + char *buf = NULL; + const char *tmp_buf; + + cxit_create_av(); + + tmp_buf = fi_av_straddr(cxit_av, &addr, buf, &len); + cr_assert_null(tmp_buf, "fi_av_straddr() buffer not null %p", tmp_buf); + + buf = malloc(len); + cr_assert(buf != NULL); + + tmp_buf = fi_av_straddr(cxit_av, &addr, buf, &len); + 
cr_assert_not_null(tmp_buf, "fi_av_straddr() buffer is null"); + cr_assert_str_eq(tmp_buf, buf, + "fi_av_straddr() buffer failure: '%s' != '%s'", tmp_buf, buf); + + free(buf); + + cxit_destroy_av(); +} + +Test(av, insertsvc) +{ + int i, ret; + struct cxip_addr addr; + size_t addrlen; + char pid_str[256]; + + cxit_create_av(); + test_addrs_init(); + + ret = fi_av_insertsvc(cxit_av, NULL, pid_str, &test_fi_addrs[0], 0, + NULL); + cr_assert(ret == -FI_EINVAL); + + ret = fi_av_insertsvc(cxit_av, nic_to_amac(test_addrs[0].nic), NULL, + &test_fi_addrs[0], 0, NULL); + cr_assert(ret == -FI_EINVAL); + + ret = fi_av_insertsvc(cxit_av, NULL, NULL, &test_fi_addrs[0], 0, NULL); + cr_assert(ret == -FI_EINVAL); + + /* Insert addresses */ + for (i = 0; i < naddrs; i++) { + ret = sprintf(pid_str, "%d", test_addrs[i].pid); + cr_assert(ret > 0); + + ret = fi_av_insertsvc(cxit_av, nic_to_amac(test_addrs[i].nic), + pid_str, &test_fi_addrs[i], 0, NULL); + /* Should have inserted 1 item */ + cr_assert(ret == 1, + "fi_av_insertsvc() idx=%d, ret=%d\n", + i, ret); + /* Returned tokens should match insertion order */ + cr_assert(test_fi_addrs[i] == i, + "fi_av_insertsvc() idx=%d, fi_addr=%ld\n", + i, test_fi_addrs[i]); + } + + /* Lookup addresses */ + for (i = 0; i < naddrs; i++) { + addrlen = sizeof(struct cxip_addr); + ret = fi_av_lookup(cxit_av, test_fi_addrs[i], &addr, + &addrlen); + /* Should succeed */ + cr_assert(ret == FI_SUCCESS, + "fi_av_lookup() idx=%d, ret=%d", + i, ret); + /* Address should match what we expect */ + cr_assert(addr.nic == test_addrs[i].nic, + "fi_av_lookup() naddrs=%d, i=%d, index=%ld, nic=%d, exp=%d", + naddrs, i, test_fi_addrs[i], addr.nic, + test_addrs[i].nic); + cr_assert(addr.pid == test_addrs[i].pid, + "fi_av_lookup() idx=%d, pid=%d", + i, addr.pid); + } + + /* Spot-check. If we remove an arbitrary entry, and then insert + * a new address, it should always fill the hole left by the + * removal. + */ + + /* Remove an arbitrary item in the middle */ + i = naddrs / 2; + ret = fi_av_remove(cxit_av, &test_fi_addrs[i], 1, 0); + cr_assert(ret == FI_SUCCESS, + "fi_av_remove() mid idx=%d, ret=%d\n", + i, ret); + + /* Insert an address */ + ret = fi_av_insert(cxit_av, &test_addrs[i], 1, + &test_fi_addrs[i], 0, NULL); + cr_assert(ret == 1, + "fi_av_insert() mid idx=%d, ret=%d\n", + i, ret); + cr_assert(test_fi_addrs[i] == i, + "fi_av_insert() mid idx=%d, index=%ld\n", + i, test_fi_addrs[i]); + + addrlen = sizeof(struct cxip_addr); + ret = fi_av_lookup(cxit_av, test_fi_addrs[i], &addr, + &addrlen); + cr_assert(ret == FI_SUCCESS, + "fi_av_lookup() mid idx=%d, ret=%d", + i, ret); + cr_assert(addr.nic == test_addrs[i].nic, + "fi_av_lookup() mid naddrs=%d, i=%d, index=%ld, nic=%d, exp=%d", + naddrs, i, test_fi_addrs[i], addr.nic, + test_addrs[i].nic); + cr_assert(addr.pid == test_addrs[i].pid, + "fi_av_lookup() mid idx=%d, pid=%d", + i, addr.pid); + + /* Remove all of the entries */ + for (i = 0; i < naddrs; i++) { + ret = fi_av_remove(cxit_av, &test_fi_addrs[i], 1, 0); + /* Should succeed */ + cr_assert(ret == 0, + "fi_av_remove() idx=%d, ret=%d", + i, ret); + } + + test_addrs_fini(); + cxit_destroy_av(); +} + +static double diff_timespec(const struct timespec *time1, + const struct timespec *time0) { + return (time1->tv_sec - time0->tv_sec) + + (time1->tv_nsec - time0->tv_nsec) / 1000000000.0; +} + +/* Verify that reserve lookup is O(1). 
*/ +Test(av, reverse_lookup) +{ + int i; + int ret; + struct cxip_av *av; + struct cxip_addr addr = {}; + struct timespec start; + struct timespec end; + double timestamp1; + double timestamp2; + fi_addr_t fi_addr; + + cxit_create_av(); + + av = container_of(cxit_av, struct cxip_av, av_fid.fid); + + /* Insert lots of addresses into the AV. */ + for (i = 0; i < 10000; i++) { + addr.nic = i; + + ret = fi_av_insert(cxit_av, &addr, 1, NULL, 0, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + } + + /* Verify that reverse lookup is not linear. Verify this by checking + * that the two lookup times are within 5% of each other. + */ + addr.nic = 0; + clock_gettime(CLOCK_MONOTONIC, &start); + fi_addr = cxip_av_lookup_fi_addr(av, &addr); + clock_gettime(CLOCK_MONOTONIC, &end); + + cr_assert_neq(fi_addr, FI_ADDR_NOTAVAIL, + "cxip_av_lookup_fi_addr failed"); + timestamp1 = diff_timespec(&end, &start); + + addr.nic = i - 1; + clock_gettime(CLOCK_MONOTONIC, &start); + fi_addr = cxip_av_lookup_fi_addr(av, &addr); + clock_gettime(CLOCK_MONOTONIC, &end); + + cr_assert_neq(fi_addr, FI_ADDR_NOTAVAIL, + "cxip_av_lookup_fi_addr failed"); + timestamp2 = diff_timespec(&end, &start); + + cr_assert((timestamp1 * 1.05) > timestamp2, "O(1) verification failed"); + + cxit_destroy_av(); +} + +Test(av, av_user_id_invalid_insert_with_symmetric) +{ + int ret; + struct cxip_addr addr = {}; + fi_addr_t fi_addr = 0; + + cxit_av_attr.flags |= FI_SYMMETRIC; + cxit_create_av(); + + ret = fi_av_insert(cxit_av, &addr, 1, &fi_addr, FI_AV_USER_ID, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Bad fi_av_insert rc: %d", ret); + + cxit_destroy_av(); +} + +Test(av, av_user_id_invalid_null_fi_addr) +{ + int ret; + struct cxip_addr addr = {}; + + cxit_create_av(); + + ret = fi_av_insert(cxit_av, &addr, 1, NULL, FI_AV_USER_ID, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Bad fi_av_insert rc: %d", ret); + + cxit_destroy_av(); +} + +Test(av, invalid_fi_av_user_id_flag) +{ + int ret; + struct cxip_addr addr = {}; + fi_addr_t fi_addr = 0; + + cxit_av_attr.flags = FI_AV_USER_ID; + cxit_create_av(); + + ret = fi_av_insert(cxit_av, &addr, 1, &fi_addr, FI_AV_USER_ID, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Bad fi_av_insert rc: %d", ret); + + cxit_destroy_av(); +} diff --git a/prov/cxi/test/avset.c b/prov/cxi/test/avset.c new file mode 100644 index 00000000000..9dbe691b58c --- /dev/null +++ b/prov/cxi/test/avset.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(avset, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* + * Simple test to ensure that any attempt to close the AV before closing any AV + * Set will fail with -FI_EBUSY. 
+ */ +Test(avset, av_set_refcnt) +{ + // Make sure open AV sets preclude closing AV + struct fi_av_set_attr attr = {.flags=FI_UNIVERSE}; + struct fid_av_set *set; + int ret; + + ret = fi_av_set(cxit_av, &attr, &set, NULL); + cr_expect_eq(ret, 0, "fi_av_set failed, ret=%d", ret); + + ret = fi_close(&cxit_av->fid); + cr_expect_eq(ret, -FI_EBUSY, "premature AV close failed, ret=%d", ret); + + ret = fi_close(&set->fid); + cr_expect_eq(ret, 0, "fi_close(set) failed, ret=%d", ret); +} + +/* + * Test of AVSet operations + * + * We choose by-two and by-three spans to explore union, intersection, diff + */ +static bool is_div_2(fi_addr_t addr) +{ + return (addr & 1) == 0; +} + +static bool is_div_3(fi_addr_t addr) +{ + return ((addr / 3) * 3) == addr; +} + +static bool is_not2_and_3(fi_addr_t addr) +{ + return !is_div_2(addr) && is_div_3(addr); +} + +static bool is_2_and_3(fi_addr_t addr) +{ + return is_div_2(addr) && is_div_3(addr); +} + +static bool is_2_or_3(fi_addr_t addr) +{ + return is_div_2(addr) || is_div_3(addr); +} + +static bool is_2_and_not14(fi_addr_t addr) +{ + return is_div_2(addr) && addr != 14; +} + +static int _comp_fi_addr(const void *a, const void *b) +{ + // for sorting unsigned + if (*(fi_addr_t *)a < *(fi_addr_t *)b) return -1; + if (*(fi_addr_t *)a > *(fi_addr_t *)b) return 1; + return 0; +} + +static int check_av_set(const char *name, struct fid_av_set *set, int max, + bool (*func)(fi_addr_t), bool is_ordered) +{ + // ensure all elements of set satisfy expectations + struct cxip_av_set *cxi_set; + fi_addr_t *local; + int locidx = 0; + int errors = 0; + int i; + + cxi_set = container_of(set, struct cxip_av_set, av_set_fid); + + // Create the expected result + local = calloc(max, sizeof(fi_addr_t)); + cr_assert_not_null(local, "calloc failure"); + for (i = 0; i < max; i++) { + if ((*func)(i)) + local[locidx++] = i; + } + + // If set is not ordered, sort into order to test + if (! 
is_ordered) + qsort(cxi_set->fi_addr_ary, cxi_set->fi_addr_cnt, + sizeof(fi_addr_t), _comp_fi_addr); + + // Traverse maximum span, ensuring that allowed addr is the next addr + if (locidx != cxi_set->fi_addr_cnt) { + errors++; + } else { + for (i = 0; i < locidx; i++) { + if (local[i] != cxi_set->fi_addr_ary[i]) { + errors++; + break; + } + } + } + if (errors) { + printf("%s: bad set:\n", name); + printf(" exp act\n"); + for (i = 0; i < locidx && i < cxi_set->fi_addr_cnt; i++) { + printf(" %3ld %3ld\n", local[i], cxi_set->fi_addr_ary[i]); + } + for ( ; i < locidx; i++) { + printf(" %3ld ---\n", local[i]); + } + for ( ; i < cxi_set->fi_addr_cnt; i++) { + printf(" --- %3ld\n", cxi_set->fi_addr_ary[i]); + } + } + free(local); + return errors; +} + +enum { + ordered = true, + unordered = false +}; + +Test(avset, basics) +{ + // Test basic set operations + struct fi_av_set_attr attr2 = { + .count = 20, .start_addr = 0, .end_addr = 19, .stride = 2 + }; + struct fi_av_set_attr attr3 = { + .count = 20, .start_addr = 0, .end_addr = 19, .stride = 3 + }; + struct fid_av_set *set2; + struct fid_av_set *setX; + int errors; + int i, ret; + + errors = 0; + + // Expand the AV, so we have enough addresses to test + for (i = 0; i < 20; i++) { + struct cxip_addr fake_addr = { .nic = i, .pid = 0xff }; + int inserted; + + inserted = fi_av_insert(cxit_av, (void *)&fake_addr, + 1, NULL, 0, NULL); + cr_expect_eq(inserted, 1, + "fi_av_insert[%2d] failed, inserted=%d", + i, inserted); + } + + // Create a stride of every second element + ret = fi_av_set(cxit_av, &attr2, &set2, NULL); + cr_expect_eq(ret, 0, "1 fi_av_set set2 failed, ret=%d", ret); + errors += check_av_set("1 two", set2, 20, is_div_2, ordered); + + // Create a stride of every third element + ret = fi_av_set(cxit_av, &attr3, &setX, NULL); + cr_expect_eq(ret, 0, "1 fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("1 three", setX, 20, is_div_3, ordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "1 fi_close(setX) failed, ret=%d", ret); + + // 3 union 2 + ret = fi_av_set(cxit_av, &attr3, &setX, NULL); + cr_expect_eq(ret, 0, "2 fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("2 dst", setX, 20, is_div_3, ordered); + + ret = fi_av_set_union(setX, set2); + cr_expect_eq(ret, 0, "2 fi_av_set set_union failed, ret=%d", ret); + errors += check_av_set("2 union", setX, 20, is_2_or_3, unordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "2 fi_close(setX) failed, ret=%d", ret); + + // 3 diff 2 + ret = fi_av_set(cxit_av, &attr3, &setX, NULL); + cr_expect_eq(ret, 0, "3 fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("3 dst", setX, 20, is_div_3, ordered); + + ret = fi_av_set_diff(setX, set2); + cr_expect_eq(ret, 0, "3 fi_av_set set_diff failed, ret=%d", ret); + errors += check_av_set("3 diff", setX, 20, is_not2_and_3, ordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "3 fi_close(setX) failed, ret=%d", ret); + + // 3 intersect 2 + ret = fi_av_set(cxit_av, &attr3, &setX, NULL); + cr_expect_eq(ret, 0, "4 fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("4 dst", setX, 20, is_div_3, ordered); + + ret = fi_av_set_intersect(setX, set2); + cr_expect_eq(ret, 0, "4 fi_av_set set_intersect failed, ret=%d", ret); + errors += check_av_set("4 intersect", setX, 20, is_2_and_3, ordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "4 fi_close(setX) failed, ret=%d", ret); + + // remove address 14 + ret = fi_av_set(cxit_av, &attr2, &setX, NULL); + cr_expect_eq(ret, 0, "5 
fi_av_set setX failed, ret=%d", ret); + errors += check_av_set("5 dst", setX, 20, is_div_2, ordered); + + ret = fi_av_set_remove(setX, 14); + cr_expect_eq(ret, 0, "5 fi_av_set fi_av_set_remove failed, ret=%d", ret); + errors += check_av_set("4 remove", setX, 20, is_2_and_not14, ordered); + + ret = fi_close(&setX->fid); + cr_expect_eq(ret, 0, "4 fi_close(setX) failed, ret=%d", ret); + + // clean up + ret = fi_close(&set2->fid); + cr_expect_eq(ret, 0, "fi_close(set2) failed, ret=%d", ret); + + cr_expect_eq(errors, 0, "Errors detected"); +} + + diff --git a/prov/cxi/test/cntr.c b/prov/cxi/test/cntr.c new file mode 100644 index 00000000000..9ab420b9993 --- /dev/null +++ b/prov/cxi/test/cntr.c @@ -0,0 +1,720 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(cntr, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = 5); + +Test(cntr, mod) +{ + int ret; + int i; + uint64_t val = 0; + uint64_t errval = 0; + struct fid_cntr *tmp_cntr; + struct fi_cntr_attr attr = { + .wait_obj = FI_WAIT_NONE, + }; + + ret = fi_cntr_open(cxit_domain, &attr, &tmp_cntr, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (send)"); + + ret = fi_cntr_add(tmp_cntr, 1); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_read(tmp_cntr) != 1) + sched_yield(); + + /* fi_cntr_wait() is invalid with FI_WAIT_NONE */ + ret = fi_cntr_wait(tmp_cntr, 1, -1); + cr_assert(ret == -FI_EINVAL); + + fi_close(&tmp_cntr->fid); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + /* Test invalid values */ + ret = fi_cntr_add(cxit_write_cntr, FI_CXI_CNTR_SUCCESS_MAX + 1); + cr_assert(ret == -FI_EINVAL); + + ret = fi_cntr_set(cxit_write_cntr, FI_CXI_CNTR_SUCCESS_MAX + 1); + cr_assert(ret == -FI_EINVAL); + + ret = fi_cntr_adderr(cxit_write_cntr, FI_CXI_CNTR_FAILURE_MAX + 1); + cr_assert(ret == -FI_EINVAL); + + ret = fi_cntr_seterr(cxit_write_cntr, FI_CXI_CNTR_FAILURE_MAX + 1); + cr_assert(ret == -FI_EINVAL); + + for (i = 0; i < 10; i++) { + val += 10; + ret = fi_cntr_add(cxit_write_cntr, 10); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_read(cxit_write_cntr) != val) + sched_yield(); + + errval += 30; + ret = fi_cntr_adderr(cxit_write_cntr, 30); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cxit_write_cntr) != errval) + sched_yield(); + + val = 5; + ret = fi_cntr_set(cxit_write_cntr, val); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_read(cxit_write_cntr) != val) + sched_yield(); + + errval = 15; + ret = fi_cntr_seterr(cxit_write_cntr, errval); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cxit_write_cntr) != errval) + sched_yield(); + } +} + +/* Test RMA with counters */ +Test(cntr, write) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = 0x1f; + struct fi_cq_tagged_entry cqe; + int writes = 10; + int i; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + for (i = 0; i < send_len; i++) + send_buf[i] = 0xab + i; + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + for (i = 0; i < writes; i++) { + int off = i * send_len; + + ret = fi_inject_write(cxit_ep, send_buf + off, send_len, + cxit_ep_fi_addr, off, key_val); + cr_assert(ret == FI_SUCCESS); + } + + while (fi_cntr_read(cxit_write_cntr) != writes) + sched_yield(); + + /* 
Validate sent data */ + for (int i = 0; i < writes * send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + /* Make sure no events were delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test all sizes of RMA transactions with counters */ +Test(cntr, write_sizes) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = 0x1f; + struct fi_cq_tagged_entry cqe; + int writes = 0; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "ret=%d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + writes++; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + while (fi_cntr_read(cxit_write_cntr) != writes) + sched_yield(); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test fi_read with counters */ +Test(cntr, read) +{ + int ret; + uint8_t *local; + int remote_len = 0x1000; + int local_len = 8; + uint64_t key_val = 0xa; + struct fi_cq_tagged_entry cqe; + struct mem_region remote; + + local = calloc(1, local_len); + cr_assert_not_null(local, "local alloc failed"); + + mr_create(remote_len, FI_REMOTE_READ, 0xc0, &key_val, &remote); + + cr_assert(!fi_cntr_read(cxit_read_cntr)); + + /* Get 8 bytes from the source buffer to the receive buffer */ + ret = fi_read(cxit_ep, local, local_len, NULL, cxit_ep_fi_addr, 0, + key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < local_len; i++) + cr_expect_eq(local[i], remote.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + local[i], remote.mem[i]); + + while (fi_cntr_read(cxit_read_cntr) != 1) + sched_yield(); + + mr_destroy(&remote); + free(local); +} + +/* Test send/recv counters */ +Test(cntr, ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + cr_assert(!fi_cntr_read(cxit_send_cntr)); + cr_assert(!fi_cntr_read(cxit_recv_cntr)); + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, 
send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + while (fi_cntr_read(cxit_send_cntr) != 1) + sched_yield(); + + while (fi_cntr_read(cxit_recv_cntr) != 1) + sched_yield(); + + free(send_buf); + free(recv_buf); +} + +int wait_for_cnt(struct fid_cntr *cntr, int cnt, + uint64_t (*cntr_read)(struct fid_cntr *cntr)) +{ + uint64_t cntr_value; + time_t timeout = time(NULL) + 3; + + while ((cntr_value = cntr_read(cntr)) != cnt) { + if (time(NULL) > timeout) { + printf("Timeout waiting for cnt:%d cntr_value:%lx\n", + cnt, cntr_value); + return -1; + } + sched_yield(); + } + + return 0; +} + +int wait_for_value(uint64_t compare_value, uint64_t *wb_buf) +{ + time_t timeout = time(NULL) + 2; + + while (compare_value != *wb_buf) { + if (time(NULL) > timeout) { + printf("Timeout waiting for compare_value:%lx wb:%lx\n", + compare_value, *wb_buf); + return -1; + } + sched_yield(); + } + + return 0; +} + +static void deferred_rma_test(enum fi_op_type op) +{ + int ret; + uint8_t *send_buf; + struct mem_region mem_window; + struct iovec iov = {}; + struct fi_rma_iov rma_iov = {}; + struct fi_op_rma rma = {}; + struct fi_deferred_work work = {}; + struct fid_cntr *trig_cntr = cxit_write_cntr; + + size_t xfer_size = 8; + uint64_t trig_thresh = 1; + uint64_t key = 0xbeef; + + uint64_t cxi_value; + struct fi_cxi_cntr_ops *cntr_ops; + struct cxip_cntr *cxi_cntr; + + ret = fi_open_ops(&trig_cntr->fid, FI_CXI_COUNTER_OPS, 0, + (void **)&cntr_ops, NULL); + cr_assert(ret == FI_SUCCESS); + cxi_cntr = container_of(&trig_cntr->fid, struct cxip_cntr, + cntr_fid.fid); + cr_assert_not_null(cxi_cntr, "cxi_cntr is null"); + + send_buf = calloc(1, xfer_size); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(xfer_size, FI_REMOTE_WRITE | FI_REMOTE_READ, 0xa0, &key, + &mem_window); + + iov.iov_base = send_buf; + iov.iov_len = xfer_size; + + rma_iov.key = key; + + rma.ep = cxit_ep; + rma.msg.msg_iov = &iov; + rma.msg.iov_count = 1; + rma.msg.addr = cxit_ep_fi_addr; + rma.msg.rma_iov = &rma_iov; + rma.msg.rma_iov_count = 1; + rma.flags = FI_CXI_CNTR_WB; + + work.threshold = trig_thresh; + work.triggering_cntr = trig_cntr; + work.completion_cntr = trig_cntr; + work.op_type = op; + work.op.rma = &rma; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + ret = fi_cntr_add(trig_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + ret = fi_cxi_gen_cntr_success(trig_thresh + 1, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + + 
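/* Fail the test if the counter writeback was not observed before the timeout; wait_for_value() returns 0 on success and -1 on timeout. */ + cr_assert_eq(ret, 0, "deferred RMA counter writeback timed out"); + 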
mr_destroy(&mem_window); + free(send_buf); +} + +Test(cntr, deferred_wb_rma_write) +{ + deferred_rma_test(FI_OP_WRITE); +} + +Test(cntr, deferred_wb_rma_read) +{ + deferred_rma_test(FI_OP_READ); +} + +Test(cntr, op_cntr_wb1) +{ + int ret; + struct fid_cntr *cntr; + uint64_t trig_thresh = 1; + uint64_t cxi_value; + struct cxip_cntr *cxi_cntr; + + ret = fi_cntr_open(cxit_domain, NULL, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + cxi_cntr = container_of(&cntr->fid, struct cxip_cntr, cntr_fid.fid); + + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + ret = fi_cntr_add(cntr, trig_thresh); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + fi_cntr_read(cntr); + + ret = fi_cxi_gen_cntr_success(trig_thresh, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + + ret = fi_close(&cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close cntr"); +} + +Test(cntr, op_cntr_wb2) +{ + int ret; + void *mmio_addr; + size_t mmio_len; + uint64_t cxi_value; + uint64_t threshold = 1; + struct fid_cntr *cntr; + struct cxip_cntr *cxi_cntr; + struct fi_cxi_cntr_ops *cntr_ops; + struct c_ct_writeback *wb_buf = NULL; + int wb_len = sizeof(*wb_buf); + + ret = fi_cntr_open(cxit_domain, NULL, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, + (void **)&cntr_ops, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + cxi_cntr = container_of(&cntr->fid, struct cxip_cntr, cntr_fid.fid); + + ret = fi_cntr_add(cntr, threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + ret = cntr_ops->get_mmio_addr(&cntr->fid, &mmio_addr, &mmio_len); + cr_assert(ret == FI_SUCCESS); + + ret = fi_cxi_gen_cntr_success(threshold, &cxi_value); + cr_assert(ret == FI_SUCCESS); + fi_cntr_read(cntr); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + cr_assert(ret == 0); + + cr_assert(fi_cxi_cntr_wb_read(cxi_cntr->wb) == threshold); + + fi_cxi_cntr_set(mmio_addr, 0); + fi_cxi_gen_cntr_success(0, &cxi_value); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + cr_assert(ret == 0); + + threshold = 10; + ret = fi_cntr_add(cntr, threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + ret = fi_cxi_gen_cntr_success(threshold, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, threshold, fi_cntr_read); + cr_assert(ret == 0); + + fi_cxi_cntr_set(mmio_addr, 0); + fi_cxi_gen_cntr_success(0, &cxi_value); + ret = wait_for_value(cxi_value, (uint64_t *)cxi_cntr->wb); + cr_assert(ret == 0); + + /* Change to a new writeback buffer */ + wb_buf = aligned_alloc(s_page_size, wb_len); + cr_assert_not_null(wb_buf, "wb_buf alloc failed"); + ret = cntr_ops->set_wb_buffer(&cntr->fid, wb_buf, wb_len); + cr_assert(ret == FI_SUCCESS); + + /* Use the new wb buffer */ + threshold = 20; + ret = fi_cntr_add(cntr, threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + ret = fi_cxi_gen_cntr_success(threshold, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, threshold, fi_cntr_read); + cr_assert(ret == 0); + + // Use instead of fi_cxi_cntr_set() + *(uint64_t*)(fi_cxi_get_cntr_reset_addr(mmio_addr)) = 0; + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + ret = fi_close(&cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close cntr"); + + free(wb_buf); +} + +Test(cntr, counter_ops) +{ + int ret; + int cnt; + uint64_t *addr; + uint64_t 
cxi_value; + struct fid_cntr *cntr; + struct fi_cxi_cntr_ops *cntr_ops; + struct cxip_cntr *cxi_cntr; + + struct c_ct_writeback *wb_buf = NULL; + int wb_len = sizeof(*wb_buf); + void *mmio_addr; + size_t mmio_len; + + ret = fi_cntr_open(cxit_domain, NULL, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, + (void **)&cntr_ops, NULL); + cr_assert(ret == FI_SUCCESS); + + cxi_cntr = container_of(&cntr->fid, struct cxip_cntr, cntr_fid.fid); + + wb_buf = aligned_alloc(s_page_size, wb_len); + cr_assert_not_null(wb_buf, "wb_buf alloc failed"); + + ret = cntr_ops->set_wb_buffer(&cntr->fid, wb_buf, wb_len); + cr_assert(ret == FI_SUCCESS); + + /* enables counter */ + ret = fi_cntr_set(cntr, 0); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + ret = cntr_ops->get_mmio_addr(&cntr->fid, &mmio_addr, &mmio_len); + cr_assert(ret == FI_SUCCESS); + + cr_assert(fi_cxi_cntr_wb_read(cxi_cntr->wb) == 0); + + cnt = 10; + ret = fi_cntr_add(cntr, cnt); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, cnt, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cxi_cntr_wb_read(wb_buf) == cnt); + + fi_cxi_cntr_set(mmio_addr, 0); + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cntr_read(cntr) == 0, "read:%ld", fi_cntr_read(cntr)); + + ret = fi_cxi_cntr_set(mmio_addr, 15); + cr_assert(ret != FI_SUCCESS, "fi_cxi_cntr_set should fail:%d", ret); + + cnt = 5; + ret = fi_cntr_add(cntr, cnt); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, cnt, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cxi_cntr_wb_read(wb_buf) == cnt); + + fi_cxi_cntr_set(mmio_addr, 0); + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cntr_read(cntr) == 0, "read:%ld", fi_cntr_read(cntr)); + + fi_cxi_cntr_seterr(mmio_addr, 0); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, 0, fi_cntr_readerr); + cr_assert(ret == 0); + + cnt = 1; + ret = fi_cxi_cntr_adderr(mmio_addr, cnt); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, cnt, fi_cntr_readerr); + cr_assert(ret == 0); + cr_assert(fi_cntr_readerr(cntr) == cnt); + cr_assert(fi_cxi_cntr_wb_readerr(wb_buf) == cnt); + + fi_cxi_cntr_set(mmio_addr, 0); + cr_assert(ret == FI_SUCCESS); + + fi_cxi_cntr_seterr(mmio_addr, 0); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, 0, fi_cntr_readerr); + cr_assert(ret == 0); + + cnt = 50; + ret = fi_cxi_cntr_add(mmio_addr, cnt); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_cnt(cntr, cnt, fi_cntr_read); + cr_assert(ret == 0); + cr_assert(fi_cntr_read(cntr) == cnt, "cntr:%ld", fi_cntr_read(cntr)); + + fi_cxi_cntr_set(mmio_addr, 0); + cr_assert(ret == FI_SUCCESS); + ret = fi_cxi_gen_cntr_success(0, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_value(cxi_value, (uint64_t *)wb_buf); + cr_assert(ret == 0); + + // Use instead of fi_cxi_cntr_set() + *(uint64_t*)(fi_cxi_get_cntr_reset_addr(mmio_addr)) = 0; + ret = wait_for_cnt(cntr, 0, fi_cntr_read); + cr_assert(ret == 0); + + cnt = 12; + *(uint64_t*)(fi_cxi_get_cntr_adderr_addr(mmio_addr)) = cnt; + /* Error transition from 0 causes a writeback */ + while(fi_cxi_cntr_wb_readerr(wb_buf) != cnt) + sched_yield(); + + cr_assert(fi_cxi_cntr_wb_readerr(wb_buf) == cnt); + + addr = fi_cxi_get_cntr_reseterr_addr(mmio_addr); + *addr = 0; + ret = fi_cxi_gen_cntr_success(0, &cxi_value); + cr_assert(ret == FI_SUCCESS); + ret = wait_for_value(cxi_value, (uint64_t *)wb_buf); + cr_assert(ret == 
FI_SUCCESS);
+
+	cr_assert(fi_cntr_readerr(cntr) == 0);
+
+	ret = fi_close(&cntr->fid);
+	cr_assert(ret == FI_SUCCESS, "fi_close cntr");
+
+	free(wb_buf);
+}
+
+Test(cntr, cntr_wait_timeout)
+{
+	struct fid_cntr *cntr;
+	struct fi_cntr_attr attr = {
+		.wait_obj = FI_WAIT_UNSPEC,
+	};
+	int timeout = 2999;
+	uint64_t thresh = 0x1234;
+	int ret;
+
+	ret = fi_cntr_open(cxit_domain, &attr, &cntr, NULL);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_cntr_wait(cntr, thresh, timeout);
+	cr_assert(ret == -FI_ETIMEDOUT);
+
+	ret = fi_close(&cntr->fid);
+	cr_assert(ret == FI_SUCCESS);
+}
+
+Test(cntr, cntr_wait)
+{
+	struct fid_cntr *cntr;
+	struct fi_cntr_attr attr = {
+		.wait_obj = FI_WAIT_UNSPEC,
+	};
+	void *mmio_addr;
+	size_t mmio_len;
+	struct fi_cxi_cntr_ops *cntr_ops;
+	int timeout = 2000;
+	uint64_t thresh = 0x1234;
+	int ret;
+
+	ret = fi_cntr_open(cxit_domain, &attr, &cntr, NULL);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0,
+			  (void **)&cntr_ops, NULL);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = cntr_ops->get_mmio_addr(&cntr->fid, &mmio_addr, &mmio_len);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_cntr_wait(cntr, thresh, timeout);
+	cr_assert(ret == -FI_ETIMEDOUT);
+
+	fi_cxi_cntr_add(mmio_addr, thresh);
+
+	ret = fi_cntr_wait(cntr, thresh, timeout);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_close(&cntr->fid);
+	cr_assert(ret == FI_SUCCESS);
+}
+
+Test(cntr, cntr_wait_bad_threshold)
+{
+	struct fid_cntr *cntr;
+	struct fi_cntr_attr attr = {
+		.wait_obj = FI_WAIT_UNSPEC,
+	};
+	int timeout = 2000;
+	uint64_t thresh = (1ULL << 49);
+	int ret;
+
+	ret = fi_cntr_open(cxit_domain, &attr, &cntr, NULL);
+	cr_assert(ret == FI_SUCCESS);
+
+	ret = fi_cntr_wait(cntr, thresh, timeout);
+	cr_assert(ret == -FI_EINVAL);
+
+	ret = fi_close(&cntr->fid);
+	cr_assert(ret == FI_SUCCESS);
+}
diff --git a/prov/cxi/test/coll.c b/prov/cxi/test/coll.c
new file mode 100644
index 00000000000..b6cc6732253
--- /dev/null
+++ b/prov/cxi/test/coll.c
@@ -0,0 +1,2376 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
+ *
+ * Copyright (c) 2017-2019 Intel Corporation. All rights reserved.
+ * Copyright (c) 2020-2023 Hewlett Packard Enterprise Development LP
+ */
+
+/*
+ * NOTE: This is a standalone test that uses the COMM_KEY_RANK model, and thus
+ * consists of a single process driving multiple data objects sequentially to
+ * simulate network transfers. It can be run under NETSIM, and is part of the
+ * standard Jenkins validation integration with Git check-in, allowing this to
+ * serve as an automated regression test.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "cxip.h"
+#include "cxip_test_common.h"
+
+/* If not compiled with DEBUG=1, this is a no-op */
+#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__)
+
+#define MIN(a,b) (((a)<(b))?(a):(b))
+
+/***************************************/
+/**
+ * Sanity tests for proper integration with EP, enable/disable checks.
+ */
+
+TestSuite(coll_init, .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT);
+
+/* Test EP close without explicitly enabling collectives.
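+ * In addition to checking that ep_obj->coll.enabled is set at startup, this
+ * sanity-checks that the opaque struct cxip_coll_accumulator handed to users
+ * (e.g. with FI_CXI_PRE_REDUCED/FI_MORE in _allreduceop() further below) is
+ * at least as large as the internal struct cxip_coll_data, presumably so
+ * pre-reduced state can be carried in user-supplied buffers.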
+ */ +Test(coll_init, noop) +{ + struct cxip_ep *ep; + + cxit_setup_rma(); + ep = container_of(cxit_ep, struct cxip_ep, ep); + cr_assert(ep->ep_obj->coll.enabled, + "coll not enabled on startup\n"); + + cr_assert(sizeof(struct cxip_coll_accumulator) >= + sizeof(struct cxip_coll_data), + "sizeof(cxip_coll_accumulator=%ld <" + "sizeof(cxip_coll_data=%ld", + sizeof(struct cxip_coll_accumulator), + sizeof(struct cxip_coll_data)); + + cxit_teardown_rma(); +} + +/* Test EP close after explicitly enabling collectives. + */ +Test(coll_init, enable) +{ + struct cxip_ep *ep; + int ret; + + cxit_setup_rma(); + ep = container_of(cxit_ep, struct cxip_ep, ep); + + ret = cxip_coll_enable(ep); + cr_assert(ret == 0, "cxip_coll_enable failed: %d\n", ret); + cr_assert(ep->ep_obj->coll.enabled, + "coll not enabled after enabling\n"); + cxit_teardown_rma(); +} + +/* Test EP close after disabling collectives. + */ +Test(coll_init, disable) +{ + struct cxip_ep *ep; + int ret; + + cxit_setup_rma(); + ep = container_of(cxit_ep, struct cxip_ep, ep); + + ret = cxip_coll_enable(ep); + cr_assert(ret == 0, "cxip_coll_enable failed: %d\n", ret); + ret = cxip_coll_disable(ep->ep_obj); + cr_assert(ret == 0, "cxip_coll_disable failed: %d\n", ret); + cr_assert(!ep->ep_obj->coll.enabled, + "coll enabled after disabling\n"); + cxit_teardown_rma(); +} + +/* Test EP close after disabling/re-enabling collectives. + */ +Test(coll_init, reenable) +{ + struct cxip_ep *ep; + int ret; + + cxit_setup_rma(); + ep = container_of(cxit_ep, struct cxip_ep, ep); + + ret = cxip_coll_enable(ep); + cr_assert(ret == 0, "cxip_coll_enable failed: %d\n", ret); + ret = cxip_coll_disable(ep->ep_obj); + cr_assert(ret == 0, "cxip_coll_disable failed: %d\n", ret); + ret = cxip_coll_enable(ep); + cr_assert(ret == 0, "cxip_coll_enable failed: %d\n", ret); + cr_assert(ep->ep_obj->coll.enabled, + "coll not enabled after enabling\n"); + cxit_teardown_rma(); +} + +/***************************************/ +/** + * JOIN testing. 
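+ *
+ * The join tests below share a common pattern (see the helpers that follow):
+ * _create_netsim_collective() builds one COMM_KEY_RANK av_set per simulated
+ * rank and calls cxip_join_collective() for each, and _wait_for_join() then
+ * polls the collective EQ until FI_JOIN_COMPLETE has been seen for every
+ * rank, draining any error through fi_eq_readerr() when fi_eq_read()
+ * returns -FI_EAVAIL.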
+ */ +TestSuite(coll_join, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT); + +struct cxip_addr caddr_base; +void insert_out(struct cxip_addr *addr, struct cxip_addr *addr_out) +{ + *addr = caddr_base; +} + +/* expand AV and create av_sets for collectives */ +static void _create_av_set(int count, int rank, bool rx_discard, + struct fid_av_set **av_set_fid) +{ + struct cxip_ep *ep; + struct cxip_comm_key comm_key = { + .keytype = COMM_KEY_RANK, + .rank.rank = rank, + .rank.hwroot_idx = 0, + .rank.rx_discard = rx_discard + }; + struct fi_av_set_attr attr = { + .count = 0, + .start_addr = FI_ADDR_NOTAVAIL, + .end_addr = FI_ADDR_NOTAVAIL, + .stride = 1, + .comm_key_size = sizeof(comm_key), + .comm_key = (void *)&comm_key, + .flags = 0, + }; + struct cxip_addr caddr; + int i, ret; + + ep = container_of(cxit_ep, struct cxip_ep, ep); + + /* lookup initiator caddr as set in test framework */ + ret = cxip_av_lookup_addr(ep->ep_obj->av, cxit_ep_fi_addr, &caddr); + cr_assert(ret == 0, "bad lookup on address %ld: %d\n", + cxit_ep_fi_addr, ret); + caddr_base = caddr; + + /* create empty av_set */ + ret = fi_av_set(&ep->ep_obj->av->av_fid, &attr, av_set_fid, NULL); + cr_assert(ret == 0, "av_set creation failed: %d\n", ret); + + /* add source address as multiple av entries */ + for (i = count - 1; i >= 0; i--) { + fi_addr_t fi_addr; + + ret = fi_av_insert(&ep->ep_obj->av->av_fid, &caddr, 1, + &fi_addr, 0, NULL); + cr_assert(ret == 1, "%d cxip_av_insert failed: %d\n", i, ret); + ret = fi_av_set_insert(*av_set_fid, fi_addr); + cr_assert(ret == 0, "%d fi_av_set_insert failed: %d\n", i, ret); + caddr.nic++; + } +} + +void _create_netsim_collective(int count, bool discard, int exp) +{ + int i, ret; + + /* replace the insertion/lookup model */ + cxip_av_addr_out = insert_out; + + TRACE("========================\n%s: entry\n", __func__); + TRACE("%s: count=%d\n", __func__, count); + cxit_coll_mc_list.count = count; + cxit_coll_mc_list.av_set_fid = calloc(cxit_coll_mc_list.count, + sizeof(struct fid_av_set *)); + cxit_coll_mc_list.mc_fid = calloc(cxit_coll_mc_list.count, + sizeof(struct fid_mc *)); + + for (i = 0; i < cxit_coll_mc_list.count; i++) { + TRACE("%s: ==== create %d\n", __func__, i); + TRACE("create av_set rank %d\n", i); + _create_av_set(cxit_coll_mc_list.count, i, discard, + &cxit_coll_mc_list.av_set_fid[i]); + TRACE("join collective\n"); + ret = cxip_join_collective(cxit_ep, FI_ADDR_NOTAVAIL, + cxit_coll_mc_list.av_set_fid[i], + 0, &cxit_coll_mc_list.mc_fid[i], + NULL); + TRACE("ret=%d\n", ret); + cr_assert(ret == exp, + "cxip_coll_enable failed: exp %s saw %s\n", + fi_strerror(-exp), fi_strerror(-ret)); + } + TRACE("%s: exit\n========================\n", __func__); +} + +void _destroy_netsim_collective(void) +{ + int i; + + for (i = cxit_coll_mc_list.count - 1; i >= 0; i--) { + TRACE("closing %d\n", i); + if (cxit_coll_mc_list.mc_fid[i]) + fi_close(&cxit_coll_mc_list.mc_fid[i]->fid); + if (cxit_coll_mc_list.av_set_fid[i]) + fi_close(&cxit_coll_mc_list.av_set_fid[i]->fid); + } + TRACE("cleanup\n"); + free(cxit_coll_mc_list.mc_fid); + free(cxit_coll_mc_list.av_set_fid); + cxit_coll_mc_list.mc_fid = NULL; + cxit_coll_mc_list.av_set_fid = NULL; +} + +static void _wait_for_join(int count, int eq_err, int prov_errno) +{ + struct cxip_ep *ep; + struct fid_cq *txcq, *rxcq; + struct fid_eq *eq; + struct fi_cq_err_entry cqd = {}; + struct fi_eq_err_entry eqd = {}; + uint32_t event; + int ret, err, provcnt; + + ep = container_of(cxit_ep, struct 
cxip_ep, ep); + rxcq = &ep->ep_obj->coll.rx_evtq->cq->util_cq.cq_fid; + txcq = &ep->ep_obj->coll.tx_evtq->cq->util_cq.cq_fid; + eq = &ep->ep_obj->coll.eq->util_eq.eq_fid; + provcnt = 0; + + do { + sched_yield(); + err = -FI_EINVAL; + ret = fi_eq_read(eq, &event, &eqd, sizeof(eqd), 0); + if (ret == -FI_EAVAIL) { + TRACE("=== error available!\n"); + ret = fi_eq_readerr(eq, &eqd, 0); + cr_assert(ret >= 0, + "-FI_EAVAIL but fi_eq_readerr()=%d\n", ret); + TRACE(" event = %d\n", event); + TRACE(" fid = %p\n", eqd.fid); + TRACE(" context = %p\n", eqd.context); + TRACE(" data = %lx\n", eqd.data); + TRACE(" err = %s (%d)\n", + fi_strerror(-eqd.err), eqd.err); + TRACE(" prov_err= %d\n", eqd.prov_errno); + TRACE(" err_data= %p\n", eqd.err_data); + TRACE(" err_size= %ld\n", eqd.err_data_size); + TRACE(" readerr = %d\n", ret); + err = eqd.err; + event = eqd.data; + if (eqd.prov_errno != prov_errno) { + TRACE("prov_err exp=%d saw=%d\n", + prov_errno, eqd.prov_errno); + provcnt++; + } + TRACE("===\n"); + } else if (ret >= 0) { + TRACE("=== EQ SUCCESS!\n"); + err = FI_SUCCESS; + } else { + err = ret; + } + if (err != -FI_EAGAIN) { + TRACE("eq_err = %d, err = %d\n", eq_err, err); + if (eq_err != err) { + cr_assert(eq_err == err, + "FAILED TEST: eq_err = '%s' saw '%s'\n", + fi_strerror(-eq_err), fi_strerror(-err)); + break; + } + if (event == FI_JOIN_COMPLETE) { + TRACE("FI_JOIN_COMPLETE seen\n"); + count--; + } + } + + ret = fi_cq_read(rxcq, &cqd, sizeof(cqd)); + if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(rxcq, &cqd, sizeof(cqd)); + break; + } + + ret = fi_cq_read(txcq, &cqd, sizeof(cqd)); + if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(txcq, &cqd, sizeof(cqd)); + break; + } + } while (count > 0); + TRACE("wait done\n"); + cr_assert(provcnt == 0, "Mismatched provider errors\n"); +} + +/* Basic test of single NETSIM join. + */ +Test(coll_join, join1) +{ + TRACE("=========================\n"); + TRACE("join1\n"); + _create_netsim_collective(1, true, FI_SUCCESS); + _wait_for_join(1, FI_SUCCESS, 0); + _destroy_netsim_collective(); +} + +/* Basic test of two NETSIM joins. + */ +Test(coll_join, join2) +{ + TRACE("=========================\n"); + TRACE("join2\n"); + _create_netsim_collective(2, true, FI_SUCCESS); + _wait_for_join(2, FI_SUCCESS, 0); + _destroy_netsim_collective(); +} + +/* Basic test of three NETSIM joins. + */ +Test(coll_join, join3) +{ + TRACE("=========================\n"); + TRACE("join3\n"); + _create_netsim_collective(3, true, FI_SUCCESS); + _wait_for_join(3, FI_SUCCESS, 0); + _destroy_netsim_collective(); +} + +/* Basic test of maximum NETSIM joins. 
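+ * NETSIM is limited to 32 simulated endpoints, which is why the largest join
+ * test stops here (the reduction tests below likewise note "max nodes == 32
+ * under NETSIM").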
+ */ +Test(coll_join, join32) +{ + TRACE("=========================\n"); + TRACE("join32\n"); + _create_netsim_collective(32, true, FI_SUCCESS); + _wait_for_join(32, FI_SUCCESS, 0); + _destroy_netsim_collective(); +} + +#if ENABLE_DEBUG +/* The following tests verify DEBUG-ONLY capabilities */ + +/* Confirm that -FI_EAGAIN is harmless on all zbcoll stages */ +Test(coll_join, retry_getgroup) { + int node; + + TRACE("=========================\n"); + TRACE("join retry getgroup\n"); + for (node = 0; node < 5; node++) { + cxip_trap_set(node, CXIP_TRAP_GETGRP, -FI_EAGAIN); + _create_netsim_collective(5, true, FI_SUCCESS); + _wait_for_join(5, FI_SUCCESS, 0); + _destroy_netsim_collective(); + cxip_trap_close(); + } +} + +Test(coll_join, retry_broadcast) { + int node; + + TRACE("=========================\n"); + TRACE("join retry broadcast\n"); + for (node = 0; node < 5; node++) { + cxip_trap_set(node, CXIP_TRAP_BCAST, -FI_EAGAIN); + _create_netsim_collective(5, true, FI_SUCCESS); + _wait_for_join(5, FI_SUCCESS, 0); + _destroy_netsim_collective(); + cxip_trap_close(); + } +} + +Test(coll_join, retry_reduce) { + int node; + + TRACE("=========================\n"); + TRACE("join retry reduce\n"); + for (node = 0; node < 5; node++) { + cxip_trap_set(node, CXIP_TRAP_REDUCE, -FI_EAGAIN); + _create_netsim_collective(5, true, FI_SUCCESS); + _wait_for_join(5, FI_SUCCESS, 0); + _destroy_netsim_collective(); + cxip_trap_close(); + } +} + +Test(coll_join, fail_ptlte) { + int node; + + TRACE("=========================\n"); + TRACE("join fail mixed errors\n"); + for (node = 0; node < 5; node++) { + cxip_trap_set(node, CXIP_TRAP_INITPTE, -FI_EFAULT); + _create_netsim_collective(5, true, FI_SUCCESS); + _wait_for_join(5, -FI_EAVAIL, CXIP_PROV_ERRNO_PTE); + _destroy_netsim_collective(); + cxip_trap_close(); + } +} +#endif + +/***************************************/ +/** + * Basic send/receive testing. + */ + +TestSuite(coll_put, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* 50-byte packet */ +struct fakebuf { + uint64_t count[6]; + uint16_t pad; +} __attribute__((packed)); + +/* Progression is needed because the test runs in a single execution thread with + * NETSIM. This waits for completion of PROGRESS_COUNT messages on the simulated + * (loopback) target. It needs to be called periodically during the test run, or + * the netsim resources run out and this gets blocked. + */ +#define PROGRESS_COUNT 10 +void _progress_put(struct cxip_cq *cq, int sendcnt, uint64_t *dataval) +{ + struct fi_cq_tagged_entry entry[PROGRESS_COUNT]; + struct fi_cq_err_entry err; + int i, ret; + + while (sendcnt > 0) { + do { + int cnt = MIN(PROGRESS_COUNT, sendcnt); + sched_yield(); + ret = fi_cq_read(&cq->util_cq.cq_fid, entry, cnt); + } while (ret == -FI_EAGAIN); + if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(&cq->util_cq.cq_fid, &err, 0); + memcpy(&entry[0], &err, sizeof(entry[0])); + } + for (i = 0; i < ret; i++) { + struct fakebuf *fb = entry[i].buf; + cr_assert(entry[i].len == sizeof(*fb), + "fb->len exp %ld, saw %ld\n", + sizeof(*fb), entry[i].len); + cr_assert(fb->count[0] == *dataval, + "fb->count[0] exp %ld, saw %ld\n", + fb->count[0], *dataval); + cr_assert(fb->count[5] == *dataval, + "fb->count[5] exp %ld, saw %ld\n", + fb->count[5], *dataval); + cr_assert(fb->pad == (uint16_t)*dataval, + "fb_pad exp %x, saw %x\n", + fb->pad, (uint16_t)*dataval); + (*dataval)++; + } + sendcnt -= ret; + } +} + +/* Put count packets, and verify them. 
This sends count packets from one + * NETSIM multicast resource to another. + */ +void _put_data(int count, int from_rank, int to_rank) +{ + struct cxip_coll_mc *mc_obj_send, *mc_obj_recv; + struct cxip_coll_reduction *reduction; + struct cxip_ep *ep; + struct fakebuf *buf; + void *buffers; + int sendcnt, cnt; + uint64_t dataval; + int i, j, ret; + + ep = container_of(cxit_ep, struct cxip_ep, ep); + + /* from and to (may be the same mc_obj) */ + mc_obj_send = container_of(cxit_coll_mc_list.mc_fid[from_rank], + struct cxip_coll_mc, mc_fid); + mc_obj_recv = container_of(cxit_coll_mc_list.mc_fid[to_rank], + struct cxip_coll_mc, mc_fid); + + TRACE("%s: mc_obj_send = %p\n", __func__, mc_obj_send); + TRACE("%s: mc_obj_recv = %p\n", __func__, mc_obj_recv); + + /* clear any prior values */ + TRACE("%s: reset mc_ctrs\n", __func__); + cxip_coll_reset_mc_ctrs(&mc_obj_send->mc_fid); + cxip_coll_reset_mc_ctrs(&mc_obj_recv->mc_fid); + + /* from_rank reduction */ + reduction = &mc_obj_send->reduction[0]; + + /* must persist until _progress called, for validation */ + buffers = calloc(PROGRESS_COUNT, sizeof(*buf)); + + buf = buffers; + sendcnt = 0; + dataval = 0; + TRACE("%s: iteration over %p\n", __func__, buf); + for (i = 0; i < count; i++) { + for (j = 0; j < 6; j++) + buf->count[j] = i; + buf->pad = i; + TRACE("call cxip_coll_send()\n"); + ret = cxip_coll_send(reduction, to_rank, buf, sizeof(*buf), + NULL); + cr_assert(ret == 0, "cxip_coll_send failed: %d\n", ret); + + buf++; + sendcnt++; + if (sendcnt >= PROGRESS_COUNT) { + _progress_put(ep->ep_obj->coll.rx_evtq->cq, sendcnt, + &dataval); + buf = buffers; + sendcnt = 0; + } + } + TRACE("call _progress_put\n"); + _progress_put(ep->ep_obj->coll.rx_evtq->cq, sendcnt, &dataval); + + /* check final counts */ + TRACE("check counts\n"); + if (count * sizeof(*buf) > + ep->ep_obj->coll.buffer_size - ep->ep_obj->rxc.min_multi_recv) { + cnt = ofi_atomic_get32(&mc_obj_recv->coll_pte->buf_swap_cnt); + cr_assert(cnt > 0, "Did not recirculate buffers\n"); + } + + TRACE("check atomic counts\n"); + cnt = ofi_atomic_get32(&mc_obj_send->send_cnt); + cr_assert(cnt == count, + "Expected mc_obj[%d] send_cnt == %d, saw %d", + from_rank, count, cnt); + + cnt = ofi_atomic_get32(&mc_obj_recv->coll_pte->recv_cnt); + cr_assert(cnt == count, + "Expected mc_obj raw recv_cnt == %d, saw %d", + count, cnt); + + cnt = ofi_atomic_get32(&mc_obj_recv->recv_cnt); + cr_assert(cnt == 0, + "Expected mc_obj[%d]->[%d] recv_cnt == %d, saw %d", + from_rank, to_rank, count, cnt); + cnt = ofi_atomic_get32(&mc_obj_recv->pkt_cnt); + cr_assert(cnt == 0, + "Expected mc_obj[%d]->[%d] pkt_cnt == %d, saw %d", + from_rank, to_rank, 0, cnt); + + TRACE("free buffers\n"); + free(buffers); +} + +/* Attempt to send from rank 0 to rank 3 (does not exist). + */ +Test(coll_put, put_bad_rank) +{ + struct cxip_coll_mc *mc_obj; + struct cxip_coll_reduction *reduction; + struct fakebuf buf; + int ret; + + _create_netsim_collective(2, false, FI_SUCCESS); + _wait_for_join(2, FI_SUCCESS, 0); + + mc_obj = container_of(cxit_coll_mc_list.mc_fid[0], + struct cxip_coll_mc, mc_fid); + reduction = &mc_obj->reduction[0]; + + ret = cxip_coll_send(reduction, 3, &buf, sizeof(buf), NULL); + cr_assert(ret == -FI_EINVAL, "cxip_coll_set bad error = %d\n", ret); + + _destroy_netsim_collective(); +} + +/* Basic test with one packet from rank 0 to rank 0. 
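+ * The single-rank case is pure loopback through one multicast object:
+ * _put_data() verifies the packet contents in _progress_put() and checks the
+ * raw PTE receive count against the number of packets sent.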
+ */ +Test(coll_put, put_one) +{ + _create_netsim_collective(1, false, FI_SUCCESS); + _wait_for_join(1, FI_SUCCESS, 0); + _put_data(1, 0, 0); + _destroy_netsim_collective(); +} + +/* Basic test with one packet from each rank to another rank. + * Exercises NETSIM rank-based target addressing. + */ +Test(coll_put, put_ranks) +{ + _create_netsim_collective(2, false, FI_SUCCESS); + _wait_for_join(2, FI_SUCCESS, 0); + TRACE("call _put_data()\n"); + _put_data(1, 0, 0); + _put_data(1, 0, 1); + _put_data(1, 1, 0); + _put_data(1, 1, 1); + _destroy_netsim_collective(); +} + +/* Test a lot of packets to force buffer rollover. + */ +Test(coll_put, put_many) +{ + _create_netsim_collective(1, false, FI_SUCCESS); + _wait_for_join(1, FI_SUCCESS, 0); + _put_data(4000, 0, 0); + _destroy_netsim_collective(); +} + +/* Progress the reduction packet send. + */ +void _progress_red_pkt(struct cxip_cq *cq, int sendcnt, uint64_t *dataval) +{ + struct fi_cq_tagged_entry entry[PROGRESS_COUNT]; + struct fi_cq_err_entry err; + int i, ret; + + while (sendcnt > 0) { + do { + int cnt = MIN(PROGRESS_COUNT, sendcnt); + sched_yield(); + ret = fi_cq_read(&cq->util_cq.cq_fid, entry, cnt); + } while (ret == -FI_EAGAIN); + if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(&cq->util_cq.cq_fid, &err, 0); + memcpy(&entry[0], &err, sizeof(entry[0])); + } + for (i = 0; i < ret; i++) + (*dataval)++; + sendcnt -= ret; + } +} + +/* Test red_pkt sends. With only one node, root sends to self. + */ +void _put_red_pkt(int count) +{ + struct cxip_coll_mc *mc_obj; + struct cxip_coll_reduction *reduction; + struct cxip_coll_data coll_data = {.red_cnt = 1}; + int sendcnt, cnt; + uint64_t dataval; + int i, ret; + + _create_netsim_collective(1, false, FI_SUCCESS); + _wait_for_join(1, FI_SUCCESS, 0); + + mc_obj = container_of(cxit_coll_mc_list.mc_fid[0], + struct cxip_coll_mc, mc_fid); + + /* clear counters */ + cxip_coll_reset_mc_ctrs(&mc_obj->mc_fid); + + sendcnt = 0; + dataval = 0; + coll_data.intval.ival[0] = dataval; + reduction = &mc_obj->reduction[0]; + reduction->coll_state = CXIP_COLL_STATE_NONE; + for (i = 0; i < count; i++) { + ret = cxip_coll_send_red_pkt(reduction, &coll_data, + false, false); + cr_assert(ret == FI_SUCCESS, + "Packet send from root failed: %d\n", ret); + + sendcnt++; + if (sendcnt >= PROGRESS_COUNT) { + _progress_red_pkt(mc_obj->ep_obj->coll.rx_evtq->cq, + sendcnt, &dataval); + sendcnt = 0; + } + } + _progress_red_pkt(mc_obj->ep_obj->coll.rx_evtq->cq, sendcnt, &dataval); + + cnt = ofi_atomic_get32(&mc_obj->send_cnt); + cr_assert(cnt == count, "Bad send counter on root: %d, exp %d\n", cnt, count); + cnt = ofi_atomic_get32(&mc_obj->recv_cnt); + cr_assert(cnt == count, "Bad recv counter on root: %d, exp %d\n", cnt, count); + cnt = ofi_atomic_get32(&mc_obj->pkt_cnt); + cr_assert(cnt == count, "Bad pkt counter on root: %d, exp %d\n", cnt, count); + + _destroy_netsim_collective(); +} + +/* Test of a single red_pkt from root to root. + */ +Test(coll_put, put_red_pkt_one) +{ + _put_red_pkt(1); +} + +/* Test of a many red_pkts from root to root. + */ +Test(coll_put, put_red_pkt_many) +{ + _put_red_pkt(4000); +} + +/* Test of the reduction packet code distribution under NETSIM. + * Exercises distribution root->leaves, leaves->root, single packet. 
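+ * With the 5 simulated ranks used below, the expected counters are: the root
+ * send fans out to the 4 leaves (root send_cnt == 4, each leaf recv_cnt == 1),
+ * and the return direction is 4 individual leaf sends (each leaf
+ * send_cnt == 1, root recv_cnt == 4).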
+ */ +Test(coll_put, put_red_pkt_distrib) +{ + struct cxip_coll_mc *mc_obj[5]; + struct cxip_cq *rx_cq; + struct cxip_coll_reduction *reduction; + struct cxip_coll_data coll_data = {.red_cnt = 1}; + struct fi_cq_data_entry entry; + int i, cnt, ret; + + _create_netsim_collective(5, false, FI_SUCCESS); + _wait_for_join(5, FI_SUCCESS, 0); + + for (i = 0; i < 5; i++) { + mc_obj[i] = container_of(cxit_coll_mc_list.mc_fid[i], + struct cxip_coll_mc, mc_fid); + mc_obj[i]->reduction[0].coll_state = CXIP_COLL_STATE_NONE; + cxip_coll_reset_mc_ctrs(&mc_obj[i]->mc_fid); + } + + rx_cq = mc_obj[0]->ep_obj->coll.rx_evtq->cq; + + coll_data.intval.ival[0] = 0; + reduction = &mc_obj[0]->reduction[0]; + ret = cxip_coll_send_red_pkt(reduction, &coll_data, + false, false); + cr_assert(ret == FI_SUCCESS, + "Packet send from root failed: %d\n", ret); + cnt = ofi_atomic_get32(&mc_obj[0]->send_cnt); + cr_assert(cnt == 4, "Bad send counter on root: %d\n", cnt); + for (i = 1; i < 5; i++) { + do { + sched_yield(); + ret = fi_cq_read(&rx_cq->util_cq.cq_fid, &entry, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1, "Bad CQ response[%d]: %d\n", i, ret); + cnt = ofi_atomic_get32(&mc_obj[i]->recv_cnt); + cr_assert(cnt == 1, + "Bad recv counter on leaf[%d]: %d\n", i, cnt); + } + + /* Send data from leaf (!0) to root */ + for (i = 0; i < 5; i++) + cxip_coll_reset_mc_ctrs(&mc_obj[i]->mc_fid); + for (i = 1; i < 5; i++) { + coll_data.intval.ival[0] = i; + reduction = &mc_obj[i]->reduction[0]; + ret = cxip_coll_send_red_pkt(reduction, &coll_data, + false, false); + cr_assert(ret == FI_SUCCESS, + "Packet send from leaf[%d] failed: %d\n", i, ret); + cnt = ofi_atomic_get32(&mc_obj[i]->send_cnt); + cr_assert(cnt == 1, + "Bad send counter on leaf[%d]: %d\n", i, cnt); + do { + sched_yield(); + ret = fi_cq_read(&rx_cq->util_cq.cq_fid, &entry, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1, "Bad CQ response[%d]: %d\n", i, ret); + } + + cnt = ofi_atomic_get32(&mc_obj[0]->recv_cnt); + cr_assert(cnt == 4, + "Bad recv counter on root: %d\n", cnt); + + _destroy_netsim_collective(); +} + +/***************************************/ +/** + * Test reduction concurrency. 
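+ *
+ * Each injected reduction is tracked with a user_context. Completions can
+ * arrive in any order, so _allreduce_wait() takes whatever the TX CQ reports
+ * (copying fi_cq_readerr() data into the context on error), parks contexts
+ * it was not asked for on done_list, and checks done_list first on the next
+ * call, so a specific context can be waited on later. Roughly (a sketch,
+ * not executable test code):
+ *
+ *   cxip_allreduce(ep, &data, 1, NULL, &rslt, NULL, (fi_addr_t)mc,
+ *                  FI_UINT64, FI_BOR, 0, &ctx);  // may return -FI_EAGAIN
+ *   _allreduce_wait(rx_cq, tx_cq, NULL);         // progress, queue strays
+ *   _allreduce_wait(rx_cq, tx_cq, &ctx);         // wait for this context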
+ */ +TestSuite(coll_reduce, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .disabled = false, .timeout = 2*CXIT_DEFAULT_TIMEOUT); + +/* Simulated user context, specifically to return error codes */ +struct user_context { + struct dlist_entry entry; + int node; // reduction simulated node (MC object) + int seqno; // reduction sequence number + int red_id; // reduction ID + int errcode; // reduction error code + int hw_rc; // reduction hardware failure code + uint64_t expval; // expected reduction value +}; + +static struct dlist_entry done_list; +static int dlist_initialized; +static int max_queue_depth; +static int queue_depth; +static int rx_count; +static int tx_count; + +static ssize_t _allreduce_poll(struct fid_cq *rx_cq_fid, + struct fid_cq *tx_cq_fid, + struct fi_cq_data_entry *entry) +{ + ssize_t ret; + + /* poll once for RX and TX, report only TX event */ + sched_yield(); + ret = fi_cq_read(rx_cq_fid, entry, 1); + if (ret == FI_SUCCESS) + rx_count++; + ret = fi_cq_read(tx_cq_fid, entry, 1); + if (ret == FI_SUCCESS) + tx_count++; + return ret; +} + +static void _allreduce_wait(struct fid_cq *rx_cq_fid, struct fid_cq *tx_cq_fid, + struct user_context *context) +{ + struct dlist_entry *done; + struct fi_cq_data_entry entry; + struct fi_cq_err_entry err_entry; + struct user_context *ctx; + int ret; + + /* initialize the static locals on first use */ + if (! dlist_initialized) { + dlist_init(&done_list); + dlist_initialized = 1; + } + + /* search for prior detection of context (on queue) */ + dlist_foreach(&done_list, done) { + if ((void *)context == (void *)done) { + dlist_remove(done); + return; + } + } + + do { + /* Wait for a tx CQ completion event, rx CQ may get behind */ + do { + ret = _allreduce_poll(rx_cq_fid, tx_cq_fid, &entry); + } while (context && ret == -FI_EAGAIN); + + ctx = NULL; + if (ret == -FI_EAVAIL) { + /* tx CQ posted an error, copy to user context */ + ret = fi_cq_readerr(tx_cq_fid, &err_entry, 1); + cr_assert(ret == 1, "fi_cq_readerr failed: %d\n", ret); + ctx = err_entry.op_context; + ctx->errcode = err_entry.err; + ctx->hw_rc = err_entry.prov_errno; + cr_assert(err_entry.err != 0, + "Failure with good return\n"); + queue_depth--; + } else if (ret == 1) { + /* tx CQ posted a normal completion */ + ctx = entry.op_context; + ctx->errcode = 0; + ctx->hw_rc = 0; + queue_depth--; + } else { + /* We should only see a 'no-event' error */ + cr_assert(ret == -FI_EAGAIN, "Improper return %d\n", ret); + } + + /* context we are looking for, NULL matches no-event */ + if (ctx == context) + return; + + /* if we did see a ctx == context, record it */ + if (ctx) + dlist_insert_tail(&ctx->entry, &done_list); + + } while (context); + +} + +/* extract and verify mcs and cqs across NETSIM collective group */ +void _resolve_group(const char *label, int nodes, + struct cxip_coll_mc **mc_obj, + struct fid_cq **rx_cq_fid, + struct fid_cq **tx_cq_fid) +{ + struct cxip_ep_obj *ep_obj; + int node; + + /* scan mc_fid[], convert to mc_obj[], and extract ep_obj pointer */ + ep_obj = NULL; + for (node = 0; node < nodes; node++) { + mc_obj[node] = container_of(cxit_coll_mc_list.mc_fid[node], + struct cxip_coll_mc, mc_fid); + /* all mc_obj[] must have the same ep_obj */ + if (!ep_obj) + ep_obj = mc_obj[node]->ep_obj; + cr_assert(mc_obj[node]->ep_obj == ep_obj, + "%s Mismatched endpoints\n", label); + } + cr_assert(ep_obj != NULL, + "%s Did not find an endpoint object\n", label); + /* extract rx and tx cq fids */ + *rx_cq_fid = &ep_obj->coll.rx_evtq->cq->util_cq.cq_fid; + *tx_cq_fid = 
&ep_obj->coll.tx_evtq->cq->util_cq.cq_fid; +} + +/** + * @brief Exercise the collective state machine. + * + * This is a single-threaded test, intended for use with NETSIM. + * + * We initiate the collective in sequence, beginning with 'start_node', and + * wrapping around. If start_node is zero, the root node initiates first, + * otherwise a leaf node initiates first. + * + * We perform 'concur' reductions concurrently. When we hit the maximum of + * concurrent injections, the reduction attempt should return -FI_EAGAIN. When + * this happens, we poll to see if a completion has occurred, then try again. + * Since we don't know the order of completions, we wait for ANY completion, + * which is then saved in a queue. We can then (later) look for a specific + * completion, which searches the queue before waiting for new completions. + * + * We inject an error by specifying a 'bad' node in the range of nodes. If + * bad_node is outside the range (e.g. -1), no errors will be injected. The + * error injection is done by choosing to send the wrong reduction operation + * code for the bad node, which causes the entire reduction to fail. + * + * We perform 'concur' reductions to exercise the round-robin reduction ID + * handling and blocking. This should be tested for values > 8. + * + * We generate different results for each concurrent reduction, to ensure that + * there is no mixing of the packets in each reduction channel. + * + * @param start_node - node (rank) to start the reduction + * @param bad_node - node to inject a bad reduction, or -1 to succeed + * @param concur - number of reductions to start before polling + */ +void _allreduce(int start_node, int bad_node, int concur) +{ + struct cxip_coll_mc **mc_obj; + struct user_context **context; + struct cxip_intval **rslt; + struct cxip_intval *data; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, first, last, base; + char label[128]; + uint64_t result; + ssize_t size; + int i, node, ret; + + TRACE("\n===== %s rank=%d bad=%d concur=%d\n", + __func__, start_node, bad_node, concur); + concur = MAX(concur, 1); + nodes = cxit_coll_mc_list.count; + context = calloc(nodes, sizeof(**context)); + mc_obj = calloc(nodes, sizeof(**mc_obj)); + rslt = calloc(nodes, sizeof(**rslt)); + data = calloc(nodes, sizeof(*data)); + start_node %= nodes; + snprintf(label, sizeof(label), "{%2d,%2d,%2d}", + start_node, bad_node, concur); + + _resolve_group(label, nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + for (node = 0; node < nodes; node++) { + context[node] = calloc(concur, sizeof(struct user_context)); + rslt[node] = calloc(concur, sizeof(struct cxip_intval)); + } + + /* Inject all of the collectives */ + first = 0; + last = 0; + base = 1; + result = 0; + + /* last advances from 0 to concur */ + while (last < concur) { + uint64_t undone = (1 << nodes) - 1; + + /* use different values on each concurrency */ + base <<= 1; + if (base > 16) + base = 1; + + /* FI_EAGAIN results will force reordering */ + result = 0; + while (undone) { + /* Polls once if we have free reduction IDs */ + _allreduce_wait(rx_cq_fid, tx_cq_fid, NULL); + /* Initiates a single BAND reduction across the nodes */ + for (i = 0; i < nodes; i++) { + enum fi_op op; + uint64_t mask; + + node = (start_node + i) % nodes; + mask = 1LL << node; + op = (node == bad_node) ? FI_BAND : FI_BOR; + + /* Don't repeat nodes that succeeded */ + if (! 
(mask & undone)) + continue; + + /* Each node contributes a bit */ + data[node].ival[0] = (base << node); + result |= data[node].ival[0]; + context[node][last].node = node; + context[node][last].seqno = last; + + cxip_capture_red_id(&context[node][last].red_id); + size = cxip_allreduce(cxit_ep, + &data[node], 1, NULL, + &rslt[node][last], NULL, + (fi_addr_t)mc_obj[node], + FI_UINT64, op, 0, + &context[node][last]); + if (size == -FI_EAGAIN) + continue; + + /* Completed this one */ + undone &= ~mask; + + /* Event queue should be one deeper */ + if (ret != -FI_EAGAIN && + max_queue_depth < ++queue_depth) + max_queue_depth = queue_depth; + } + } + + /* record the final expected result */ + for (node = 0; node < nodes; node++) + context[node][last].expval = result; + + /* Ensure these all used the same reduction ID */ + ret = 0; + for (node = 1; node < nodes; node++) + if (context[0][last].red_id != + context[node][last].red_id) + ret = -1; + if (ret) + cr_assert(true, "%s reduction ID mismatch\n", label); + + last++; + } + + /* Wait for all reductions to complete */ + while (first < last) { + struct user_context *ctx; + int red_id0, fi_err0, rc_err0; + uint64_t expval, actval; + + /* If there was a bad node, all reductions should fail */ + rc_err0 = (bad_node < 0) ? 0 : CXIP_COLL_RC_OP_MISMATCH; + for (node = 0; node < nodes; node++) { + _allreduce_wait(rx_cq_fid, tx_cq_fid, + &context[node][first]); + ctx = &context[node][first]; + + /* Use the root values as definitive */ + if (node == 0) { + red_id0 = ctx->red_id; + fi_err0 = ctx->errcode; + expval = ctx->expval; + } + actval = rslt[node][first].ival[0]; + + /* Test values */ + if (ctx->node != node || + ctx->seqno != first || + ctx->red_id != red_id0 || + ctx->errcode != fi_err0 || + ctx->hw_rc != rc_err0 || + (!fi_err0 && expval != actval)) { + TRACE("%s =====\n", label); + TRACE(" node %3d, exp %3d\n", + ctx->node, node); + TRACE(" seqno %3d, exp %3d\n", + ctx->seqno, first); + TRACE(" red_id %3d, exp %3d\n", + ctx->red_id, red_id0); + TRACE(" errcode %3d, exp %3d\n", + ctx->errcode, fi_err0); + TRACE(" hw_rc %3d, exp %3d\n", + ctx->hw_rc, rc_err0); + TRACE(" value %08lx, exp %08lx\n", + actval, expval); + cr_assert(true, "%s context failure\n", + label); + } + } + first++; + } + cr_assert(!rx_count && !tx_count, + "rx_count=%d tx_count=%d should be 0\n", rx_count, tx_count); + + for (node = 0; node < nodes; node++) { + TRACE("tmout[%d] = %d\n", node, + ofi_atomic_get32(&mc_obj[node]->tmout_cnt)); + } + + /* make sure we got them all */ + cr_assert(dlist_empty(&done_list), "Pending contexts\n"); + cr_assert(queue_depth == 0, "queue_depth = %d\n", queue_depth); + TRACE("completed\n"); + + for (node = 0; node < nodes; node++) { + free(rslt[node]); + free(context[node]); + } + free(context); + free(rslt); + free(data); + free(mc_obj); +} + +void _reduce_test_set(int concur) +{ + _create_netsim_collective(31, true, FI_SUCCESS); + _wait_for_join(31, FI_SUCCESS, 0); + /* success with each of the nodes starting */ + _allreduce(0, -1, concur); + _allreduce(1, -1, concur); + _allreduce(2, -1, concur); + _allreduce(3, -1, concur); + _allreduce(4, -1, concur); + _allreduce(27, -1, concur); + _allreduce(28, -1, concur); + _allreduce(29, -1, concur); + _allreduce(30, -1, concur); + /* failure with root starting */ + _allreduce(0, 0, concur); + _allreduce(0, 1, concur); + /* failure with leaf starting */ + _allreduce(1, 0, concur); + _allreduce(1, 1, concur); + _destroy_netsim_collective(); +} + +Test(coll_reduce, concur1) +{ + _reduce_test_set(1); 
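+	/* A single reduction in flight at a time; per the _allreduce() comment
+	 * above, concurrency values > 8 (e.g. concurN below) are what exercise
+	 * round-robin reduction ID reuse.
+	 */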
+} + +Test(coll_reduce, concur2) +{ + _reduce_test_set(2); +} + +Test(coll_reduce, concur3) +{ + _reduce_test_set(3); +} + +Test(coll_reduce, concur8) +{ + _reduce_test_set(8); +} + +Test(coll_reduce, concurN) +{ + _reduce_test_set(29); +} + +/***************************************/ +/* Collective operation testing */ +#define REDUCE_NODES 10 + +void setup_coll(void) +{ + cxit_setup_rma(); + _create_netsim_collective(REDUCE_NODES, true, FI_SUCCESS); + _wait_for_join(REDUCE_NODES, FI_SUCCESS, 0); +} + +void teardown_coll(void) { + _destroy_netsim_collective(); + cxit_teardown_rma(); +} + +TestSuite(coll_reduce_ops, .init = setup_coll, .fini = teardown_coll, + .disabled = false, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test barrier */ +Test(coll_reduce_ops, barrier) +{ + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, node; + ssize_t size; + struct user_context *context; + + nodes = cxit_coll_mc_list.count; + mc_obj = calloc(nodes, sizeof(**mc_obj)); + context = calloc(nodes, sizeof(*context)); + _resolve_group("barrier", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + + /* test bad parameters */ + cr_assert(-FI_EINVAL == cxip_barrier(NULL, 0L, NULL)); + cr_assert(-FI_EINVAL == cxip_barrier(cxit_ep, 0L, NULL)); + + /* 'parallel' injection across nodes */ + for (node = 0; node < nodes; node++) { + size = cxip_barrier(cxit_ep, (fi_addr_t)mc_obj[node], + &context[node]); + cr_assert(size == FI_SUCCESS, + "cxip_barrier[%d]=%ld\n", node, size); + } + + /* 'parallel' wait for all to complete */ + for (node = 0; node < nodes; node++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node]); + + free(context); + free(mc_obj); +} + +/* Test broadcast */ +Test(coll_reduce_ops, broadcast) +{ + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, node, root; + fi_addr_t fi_root; + struct cxip_intval *data; + struct user_context *context; + ssize_t size; + int i, err; + + nodes = cxit_coll_mc_list.count; + mc_obj = calloc(nodes, sizeof(**mc_obj)); + context = calloc(nodes, sizeof(*context)); + data = calloc(nodes, sizeof(*data)); + _resolve_group("broadcast", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + + /* test bad parameters */ + cr_assert(-FI_EINVAL == cxip_broadcast(NULL, NULL, 0L, NULL, + 0L, -1L, -1L, -1L, NULL)); + cr_assert(-FI_EINVAL == cxip_broadcast(cxit_ep, NULL, 0L, NULL, + 0L, -1L, -1L, -1L, NULL)); + cr_assert(-FI_EINVAL == cxip_broadcast(cxit_ep, data, 0L, NULL, + 0L, -1L, -1L, -1L, NULL)); + cr_assert(-FI_EINVAL == cxip_broadcast(cxit_ep, data, 4L, NULL, + 0L, -1L, -1L, -1L, NULL)); + + /* repeat for each node serving as root */ + for (root = 0; root < nodes; root++) { + /* set root data to be different from other data */ + memset(data, -1, nodes*sizeof(*data)); + for (i = 0; i < 4; i++) + data[root].ival[i] = root; + /* convert root rank to root fi_addr */ + fi_root = (fi_addr_t)root; + /* 'parallel' injection across nodes */ + for (node = 0; node < nodes; node++) { + size = cxip_broadcast(cxit_ep, &data[node], 4, NULL, + (fi_addr_t)mc_obj[node], + fi_root, FI_UINT64, 0L, + &context[node]); + cr_assert(size == FI_SUCCESS, + "cxip_broadcast[%d]=%ld\n", node, size); + } + /* 'parallel' wait for all to complete */ + for (node = 0; node < nodes; node++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node]); + /* ensure broadcast worked */ + err = 0; + for (node = 0; node < nodes; node++) { + for (i = 0; i < 4; i++) { + if (data[node].ival[i] != root) + err++; + } + } + if (err) { + printf("FAILED on node=%d, ival=%d\n", node, i); + 
for (node = 0; node < nodes; node++) { + printf("root=%d node=%2d [", root, node); + for (i = 0; i < 4; i++) { + printf("%016lx ", data[node].ival[i]); + } + printf("]\n"); + } + cr_assert(1, "failed\n"); + } + } + + free(data); + free(context); + free(mc_obj); +} + +/* Test reduce */ +Test(coll_reduce_ops, reduce) +{ + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, node, root; + fi_addr_t fi_root; + struct cxip_intval *data, rslt; + struct user_context *context; + uint64_t testval; + ssize_t size; + int i; + + /* test bad parameters */ + cr_assert(-FI_EINVAL == cxip_reduce(NULL, NULL, 0L, NULL, NULL, NULL, + 0L, -1L, -1L, -1L, 0L, NULL)); + cr_assert(-FI_EINVAL == cxip_reduce(cxit_ep, NULL, 0L, NULL, NULL, NULL, + 0L, -1L, -1L, -1L, 0L, NULL)); + + nodes = cxit_coll_mc_list.count; + mc_obj = calloc(nodes, sizeof(**mc_obj)); + context = calloc(nodes, sizeof(*context)); + data = calloc(nodes, sizeof(*data)); + _resolve_group("reduce", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + + /* repeat for each node serving as root */ + for (root = 0; root < nodes; root++) { + /* set root data to be different from other data */ + memset(data, -1, nodes*sizeof(*data)); + /* convert root rank to root fi_addr */ + fi_root = (fi_addr_t)root; + /* 'parallel' injection across nodes */ + for (node = 0; node < nodes; node++) { + data[node].ival[0] = (1L << node); + data[node].ival[1] = (1L << node) << 1; + data[node].ival[2] = (1L << node) << 2; + data[node].ival[3] = (1L << node) << 3; + size = cxip_reduce(cxit_ep, &data[node], 4, NULL, + (node == root) ? &rslt : NULL, NULL, + (fi_addr_t)mc_obj[node], + fi_root, FI_UINT64, FI_BOR, 0L, + &context[node]); + cr_assert(size == FI_SUCCESS, + "cxip_broadcast[%d]=%ld\n", node, size); + } + /* 'parallel' wait for all to complete */ + for (node = 0; node < nodes; node++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node]); + /* ensure reduce worked */ + testval = (1L << nodes) - 1; + for (i = 0; i < 4; i++) { + cr_assert(rslt.ival[i] == testval, + "ival[%d] %016lx != %016lx\n", + i, rslt.ival[i], testval); + testval <<= 1; + } + } + + free(data); + free(context); + free(mc_obj); +} + +/* Perform reduction operation with data, wait for result */ +int _allreduceop(enum fi_op opcode, + enum fi_datatype typ, + uint64_t flags, + void *data, + void *rslt, + size_t count, + struct user_context *context) +{ + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + int nodes, node, datawidth, rsltwidth, ret; + ssize_t size; + + datawidth = (flags & FI_CXI_PRE_REDUCED) ? + sizeof(struct cxip_coll_accumulator) : + sizeof(struct cxip_intval); + rsltwidth = (flags & FI_MORE) ? 
+ sizeof(struct cxip_coll_accumulator) : + sizeof(struct cxip_intval); + nodes = cxit_coll_mc_list.count; + mc_obj = calloc(nodes, sizeof(**mc_obj)); + _resolve_group("reduce", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + /* 'parallel' injection across nodes */ + ret = 0; + for (node = 0; node < nodes; node++) { + size = cxip_allreduce(cxit_ep, + (char *)data + (node*datawidth), count, NULL, + (char *)rslt + (node*rsltwidth), NULL, + (fi_addr_t)mc_obj[node], + typ, opcode, flags, + &context[node]); + if (size != FI_SUCCESS) { + printf("%s cxip_allreduce()[%d]=%ld\n", + __func__, node, size); + ret = 1; + goto done; + } + } + + /* 'parallel' wait for all to complete */ + if (!(flags & FI_MORE)) { + for (node = 0; node < nodes; node++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[node]); + } + +done: + free(mc_obj); + return ret; +} + +/* Signaling NaN generation, for testing. + * Linux feature requires GNU_SOURCE. + * This generates a specific sNaN value. + */ +static inline double _snan64(void) +{ + return _bits2dbl(0x7ff4000000000000); +} + +/* Returns true if this is a signalling NAN */ +static inline bool _is_snan64(double d) +{ + /* This detection is universal IEEE */ + return isnan(d) && !(_dbl2bits(d) & 0x0008000000000000); +} + +/* Converts a signalling NAN to a non-signalling NAN */ +static void _quiesce_nan(double *d) +{ + if (isnan(*d)) + *d = NAN; +} + +/* random generation for doubles */ +static inline double _frand(double range) +{ + return ((double)rand()/(double)RAND_MAX) * range; +} + +/* float equality measure, accommodates snan */ +static inline bool _feq(double a, double b) +{ + if (_is_snan64(a) && _is_snan64(b)) + return true; + if (_is_snan64(a) || _is_snan64(b)) + return false; + if (isnan(a) && isnan(b)) + return true; + if (isnan(a) || isnan(b)) + return false; + return (a == b); +} + +/* returns true if a is preferred, false if b is preferred. + * preference is determined by prefer_nan and prefer_min. + * if (a==b), a is preferred. + */ +static inline bool _fcmp(double a, double b, bool prefer_min, bool prefer_nan) +{ + if (prefer_nan) { + /* leftmost snan places first */ + if (_is_snan64(a)) + return false; + /* rightmost snan places second */ + if (_is_snan64(b)) + return true; + /* leftmost nan places third */ + if (isnan(a)) + return false; + /* rightmost nan places last */ + if (isnan(b)) + return true; + } + /* right argument is nan, give preference to left (possibly nan) */ + if (isnan(b)) + return false; + /* left argument is nan and right argument is not, use right */ + if (isnan(a)) + return true; + /* neither argument is nan, return left or right by preference */ + return (a > b) ? 
prefer_min : !prefer_min; +} + +/* Sanity test for the above */ +Test(coll_reduce_ops, fcmp) +{ + cr_assert(!_fcmp(1.0, 2.0, true, true)); + cr_assert( _fcmp(1.0, 2.0, false, true)); + cr_assert(!_fcmp(1.0, 2.0, true, false)); + cr_assert( _fcmp(1.0, 2.0, false, false)); + cr_assert( _fcmp(2.0, NAN, true, true)); + cr_assert( _fcmp(2.0, NAN, false, true)); + cr_assert(!_fcmp(2.0, NAN, true, false)); + cr_assert(!_fcmp(2.0, NAN, false, false)); + cr_assert(!_fcmp(NAN, NAN, true, true)); + cr_assert(!_fcmp(NAN, NAN, false, true)); + cr_assert(!_fcmp(NAN, NAN, true, false)); + cr_assert(!_fcmp(NAN, NAN, false, false)); + cr_assert( _fcmp(2.0, _snan64(), true, true)); + cr_assert( _fcmp(2.0, _snan64(), false, true)); + cr_assert(!_fcmp(2.0, _snan64(), true, false)); + cr_assert(!_fcmp(2.0, _snan64(), false, false)); + cr_assert( _fcmp(NAN, _snan64(), true, true)); + cr_assert( _fcmp(NAN, _snan64(), false, true)); + cr_assert(!_fcmp(NAN, _snan64(), true, false)); + cr_assert(!_fcmp(NAN, _snan64(), false, false)); + cr_assert(!_fcmp(_snan64(), _snan64(), true, true)); + cr_assert(!_fcmp(_snan64(), _snan64(), false, true)); + cr_assert(!_fcmp(_snan64(), _snan64(), true, false)); + cr_assert(!_fcmp(_snan64(), _snan64(), false, false)); +} + +/* finds MIN(a, b) with two NAN models */ +static inline double _fmin(double a, double b, bool prefer_nan) +{ + return (!_fcmp(a, b, true, prefer_nan)) ? a : b; +} + +/* finds MAX(a, b) with two NAN models */ +static inline double _fmax(double a, double b, bool prefer_nan) +{ + return (!_fcmp(a, b, false, prefer_nan)) ? a : b; +} + +/* Prediction of results takes into account the two NAN models and accounts + * for the distinction between NAN and sNAN. After collective processing, the + * sNAN will be quiesced, so after accounting for its effect, we need to + * quiesce it here for comparison. 
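+ *
+ * For example, with prefer_nan forced off (the NETCASSINI-5959 workaround in
+ * the _predict_* helpers below), a lone NaN loses: _fmin(2.0, NAN, false) and
+ * _fmax(2.0, NAN, false) both yield 2.0, and only an all-NaN input produces a
+ * NaN result. A signalling NaN input comes back quiesced to a quiet NaN,
+ * which is why the predictions run _quiesce_nan() over their results.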
+ */ + +/* computes fmin result */ +static void _predict_fmin(int nodes, struct cxip_fltval *data, + struct cxip_fltval *check, bool prefer_nan) +{ + int i, j; + + prefer_nan = false; // NETCASSINI-5959 + memcpy(check, &data[0], sizeof(*check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check->fval[j] = + _fmin(data[i].fval[j], check->fval[j], + prefer_nan); + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + _quiesce_nan(&check->fval[j]); +} + +/* computes fmax result */ +static void _predict_fmax(int nodes, struct cxip_fltval *data, + struct cxip_fltval *check, bool prefer_nan) +{ + int i, j; + + prefer_nan = false; // NETCASSINI-5959 + memcpy(check, &data[0], sizeof(*check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check->fval[j] = + _fmax(data[i].fval[j], check->fval[j], + prefer_nan); + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + _quiesce_nan(&check->fval[j]); +} + +/* computes minmax result */ +static void _predict_fminmax(int nodes, struct cxip_fltminmax *data, + struct cxip_fltminmax *check, bool prefer_nan) +{ + double a, b; + int i; + + prefer_nan = false; // NETCASSINI-5959 + memcpy(check, &data[0], sizeof(*check)); + for (i = 1; i < nodes; i++) { + a = data[i].fminval; + b = check->fminval; + if (_feq(a, b)) { + /* if equal, choose lowest index */ + if (data[i].fminidx < check->fminidx) + check->fminidx = data[i].fminidx; + } else if (!_fcmp(a, b, true, prefer_nan)) { + check->fminval = a; + check->fminidx = i; + } + a = data[i].fmaxval; + b = check->fmaxval; + if (_feq(a, b)) { + /* if equal, choose lowest index */ + if (data[i].fmaxidx < check->fmaxidx) + check->fmaxidx = data[i].fmaxidx; + } else if (!_fcmp(a, b, false, prefer_nan)) { + check->fmaxval = a; + check->fmaxidx = i; + } + } + for (i = 0; i < nodes; i++) { + _quiesce_nan(&check->fminval); + _quiesce_nan(&check->fmaxval); + } +} + +/* Routines to dump error messages on failure */ +static int _dump_ival(int nodes, int i0, int j0, + struct cxip_intval *rslt, + struct cxip_intval *check) +{ + int i, j; + + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + printf("[%2d][%2d] rslt=%016lx expect=%016lx%s\n", + i, j, rslt[i].ival[j], check->ival[j], + (i==i0 && j==j0) ? "<-failed" : ""); + return 1; +} + +static int _dump_fval(int nodes, int i0, int j0, + struct cxip_fltval *rslt, + struct cxip_fltval *check) +{ + int i, j; + + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + printf("[%2d][%2d] rslt=%016g expect=%016g%s\n", + i, j, rslt[i].fval[j], check->fval[j], + (i==i0 && j==j0) ? "<-failed" : ""); + return 1; +} + +static int _dump_iminmax(int nodes, int i0, + struct cxip_iminmax *rslt, + struct cxip_iminmax *check) +{ + int i; + + for (i = 0; i < nodes; i++) { + printf("[%2d] iminval=%16lx expect=%16lx%s\n", + i, rslt[i].iminval, check->iminval, + (i==i0) ? "<-failed" : ""); + printf("[%2d] iminidx=%16ld expect=%16ld%s\n", + i, rslt[i].iminidx, check->iminidx, + (i==i0) ? "<-failed" : ""); + printf("[%2d] imaxval=%16lx expect=%16lx%s\n", + i, rslt[i].imaxval, check->imaxval, + (i==i0) ? "<-failed" : ""); + printf("[%2d] imaxidx=%16ld expect=%16ld%s\n", + i, rslt[i].imaxidx, check->imaxidx, + (i==i0) ? "<-failed" : ""); + } + return 1; +} + +static int _dump_fminmax(int nodes, int i0, + struct cxip_fltminmax *rslt, + struct cxip_fltminmax *check) +{ + int i; + + for (i = 0; i < nodes; i++) { + printf("[%2d] fminval=%16g expect=%16g%s\n", + i, rslt[i].fminval, check->fminval, + (i==i0) ? 
"<-failed" : ""); + printf("[%2d] fminidx=%16ld expect=%16ld%s\n", + i, rslt[i].fminidx, check->fminidx, + (i==i0) ? "<-failed" : ""); + printf("[%2d] fmaxval=%16g expect=%16g%s\n", + i, rslt[i].fmaxval, check->fmaxval, + (i==i0) ? "<-failed" : ""); + printf("[%2d] fmaxidx=%16ld expect=%16ld%s\n", + i, rslt[i].fmaxidx, check->fmaxidx, + (i==i0) ? "<-failed" : ""); + } + return 1; +} + +/* compares collective integer rslt with computed check */ +static int _check_ival(int nodes, struct cxip_intval *rslt, + struct cxip_intval *check) +{ + int i, j, ret; + + ret = 0; + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + if (rslt[i].ival[j] != check->ival[j]) + ret += _dump_ival(nodes, i, j, rslt, check); + return ret; +} + +/* compares collective double rslt with computed check */ +static int _check_fval(int nodes, struct cxip_fltval *rslt, + struct cxip_fltval *check) +{ + int i, j; + + for (i = 0; i < nodes; i++) + for (j = 0; j < 4; j++) + if (!_feq(rslt[i].fval[j], check->fval[j])) + return _dump_fval(nodes, i, j, rslt, check); + return 0; +} + +/* compares collective integer minmax rslt with computed check */ +static int _check_iminmax(int nodes, struct cxip_iminmax *rslt, + struct cxip_iminmax *check) +{ + int i; + + for (i = 0; i < nodes; i++) { + if (rslt[i].iminval != check->iminval || + rslt[i].iminidx != check->iminidx || + rslt[i].imaxval != check->imaxval || + rslt[i].imaxidx != check->imaxidx) + return _dump_iminmax(nodes, i, rslt, check); + } + return 0; +} + +/* compares collective double minmax rslt with computed check */ +static int _check_fminmax(int nodes, struct cxip_fltminmax *rslt, + struct cxip_fltminmax *check) +{ + int i; + + for (i = 0; i < nodes; i++) + if (!_feq(rslt[i].fminval, check->fminval) || + !_feq(rslt[i].fmaxval, check->fmaxval) || + rslt[i].fminidx != check->fminidx || + rslt[i].fmaxidx != check->fmaxidx) + return _dump_fminmax(nodes, i, rslt, check); + return 0; +} + +/* compares returned RC code with expected value */ +static int _check_rc(int nodes, struct user_context *context, int rc) +{ + int i, ret; + + ret = 0; + for (i = 0; i < nodes; i++) + if (context[i].hw_rc != rc) { + printf("hw_rc[%d]=%d!=%d\n", i, context[i].hw_rc, rc); + ret = 1; + } + return ret; +} + +/* keeps code easier to read */ +#define STDINTSETUP \ + struct user_context *context; \ + struct cxip_intval *data; \ + struct cxip_intval *rslt; \ + struct cxip_intval check; \ + int i, j, ret, nodes; \ + nodes = cxit_coll_mc_list.count; \ + data = calloc(nodes, sizeof(*data)); \ + rslt = calloc(nodes, sizeof(*rslt)); \ + context = calloc(nodes, sizeof(*context)); \ + +#define STDILOCSETUP \ + struct user_context *context; \ + struct cxip_iminmax *data; \ + struct cxip_iminmax *rslt; \ + struct cxip_iminmax check; \ + int i, ret, nodes; \ + nodes = cxit_coll_mc_list.count; \ + data = calloc(nodes, sizeof(*data)); \ + rslt = calloc(nodes, sizeof(*rslt)); \ + context = calloc(nodes, sizeof(*context)); + +#define STDFLTSETUP \ + struct user_context *context; \ + struct cxip_fltval *data; \ + struct cxip_fltval *rslt; \ + struct cxip_fltval check; \ + int i, ret, nodes; \ + nodes = cxit_coll_mc_list.count; \ + data = calloc(nodes, sizeof(*data)); \ + rslt = calloc(nodes, sizeof(*rslt)); \ + context = calloc(nodes, sizeof(*context)); + +#define STDFLOCSETUP \ + struct user_context *context; \ + struct cxip_fltminmax *data; \ + struct cxip_fltminmax *rslt; \ + struct cxip_fltminmax check; \ + int i, ret, nodes; \ + nodes = cxit_coll_mc_list.count; \ + data = calloc(nodes, sizeof(*data)); 
\ + rslt = calloc(nodes, sizeof(*rslt)); \ + context = calloc(nodes, sizeof(*context)); + +#define STDCLEANUP \ + free(context); \ + free(rslt); \ + free(data); + +/* Test binary OR */ +Test(coll_reduce_ops, bor) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = 1 << i; + data[i].ival[1] = i << 2*i; + data[i].ival[2] = i; + data[i].ival[3] = 2*i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] |= data[i].ival[j]; + + ret = _allreduceop(FI_BOR, FI_UINT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Test binary AND */ +Test(coll_reduce_ops, band) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = ~(1 << i); + data[i].ival[1] = ~(i << 2*i); + data[i].ival[2] = ~i; + data[i].ival[3] = ~(2*i); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] &= data[i].ival[j]; + + ret = _allreduceop(FI_BAND, FI_UINT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed = %d\n", ret); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Test binary XOR */ +Test(coll_reduce_ops, bxor) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = 1 << i; + data[i].ival[1] = ~(i << i); + data[i].ival[2] = i; + data[i].ival[3] = ~i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] ^= data[i].ival[j]; + + ret = _allreduceop(FI_BXOR, FI_UINT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests int64 minimum */ +Test(coll_reduce_ops, imin) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = rand(); + data[i].ival[1] = -rand(); + data[i].ival[2] = rand(); + data[i].ival[3] = -rand(); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] = MIN(check.ival[j], data[i].ival[j]); + + ret = _allreduceop(FI_MIN, FI_INT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests int64 maximum */ +Test(coll_reduce_ops, imax) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = rand(); + data[i].ival[1] = -rand(); + data[i].ival[2] = rand(); + data[i].ival[3] = -rand(); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] = MAX(check.ival[j], data[i].ival[j]); + + ret = _allreduceop(FI_MAX, FI_INT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + 
cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests int64 SUM */ +Test(coll_reduce_ops, isum) +{ + STDINTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].ival[0] = rand(); + data[i].ival[1] = -rand(); + data[i].ival[2] = rand(); + data[i].ival[3] = -rand(); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.ival[j] += data[i].ival[j]; + + ret = _allreduceop(FI_SUM, FI_INT64, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests int64 minmaxloc */ +Test(coll_reduce_ops, iminmaxloc) +{ + STDILOCSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].iminval = rand(); + data[i].iminidx = i; + data[i].imaxval = rand(); + data[i].imaxidx = i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) { + if (check.iminval > data[i].iminval) { + check.iminval = data[i].iminval; + check.iminidx = data[i].iminidx; + } + if (check.imaxval < data[i].imaxval) { + check.imaxval = data[i].imaxval; + check.imaxidx = data[i].imaxidx; + } + } + + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_INT64, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop() failed = %d\n", ret); + ret = _check_iminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + STDCLEANUP +} + +/* Tests double sum */ +Test(coll_reduce_ops, fsum) +{ + STDFLTSETUP + int j; + + /* max nodes == 32 under NETSIM */ + data[0].fval[0] = 1.0e-53; + data[0].fval[1] = 1.0e-53; + data[0].fval[2] = 1.0e-53; + data[0].fval[3] = 1.0e-53; + for (i = 1; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) + for (j = 0; j < 4; j++) + check.fval[j] += data[i].fval[j]; + + ret = _allreduceop(FI_SUM, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop() failed\n"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INEXACT); + cr_assert(!ret, "rc failed\n"); + + /* Note: inexact computation is guaranteed by the small value included + * in the data set. There is a hidden trick when performing the + * comparison that relies on the prediction and the NETSIM allreduce + * operation both occuring in the same order, due to the nature of the + * simulated endpoints. In a real collective, ordering will be random, + * and the results will vary according to the ordering. 
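+ *
+ * As a concrete illustration (standard IEEE-754 double behavior): with the
+ * 1.0e-53 seed value above, (1.0e-53 + 1.0) rounds to exactly 1.0, since
+ * 1.0e-53 is far below 1.0 * DBL_EPSILON (about 2.2e-16), so the tiny term is
+ * absorbed whenever it is added to an O(1) partial sum; summing the same
+ * terms in a different order can therefore change the low-order bits of the
+ * result.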
+ */ + STDCLEANUP +} + +/* Test double minimum -- this should be exact */ +Test(coll_reduce_ops, fmin) +{ + STDFLTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + + /* normal floating point */ + _predict_fmin(nodes, data, &check, true); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + data[1].fval[1] = NAN; + _predict_fmin(nodes, data, &check, true); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + cr_assert(!ret, "rc failed NAN\n"); + + data[1].fval[1] = _snan64(); + _predict_fmin(nodes, data, &check, true); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double maximum -- this should be exact */ +Test(coll_reduce_ops, fmax) +{ + STDFLTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + + _predict_fmax(nodes, data, &check, true); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + data[1].fval[1] = NAN; + _predict_fmax(nodes, data, &check, true); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + cr_assert(!ret, "rc failed NAN\n"); + + data[1].fval[1] = _snan64(); + _predict_fmax(nodes, data, &check, true); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double minmax with index -- should be exact */ +Test(coll_reduce_ops, fminmaxloc) +{ + STDFLOCSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fminval = _frand(1.0); + data[i].fminidx = i; + data[i].fmaxval = _frand(1.0); + data[i].fmaxidx = i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) { + if (check.fminval > data[i].fminval) { + check.fminval = data[i].fminval; + check.fminidx = data[i].fminidx; + } + if (check.fmaxval < data[i].fmaxval) { + check.fmaxval = data[i].fmaxval; + check.fmaxidx = data[i].fmaxidx; + } + } + + _predict_fminmax(nodes, data, &check, true); + ret 
= _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + /* NAN is given preference over number */ + data[1].fminval = NAN; + data[3].fmaxval = NAN; + _predict_fminmax(nodes, data, &check, true); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed NAN\n"); + + /* SNAN is given preference over NAN */ + data[1].fminval = NAN; + data[2].fminval = _snan64(); + data[3].fmaxval = NAN; + _predict_fminmax(nodes, data, &check, true); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double minimum ignoring NAN -- should be exact */ +Test(coll_reduce_ops, fminnum) +{ + STDFLTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + + _predict_fmin(nodes, data, &check, false); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + /* number is given preference over NAN */ + data[1].fval[1] = NAN; + _predict_fmin(nodes, data, &check, false); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + cr_assert(!ret, "rc failed NAN\n"); + + /* number is given preference over NAN */ + data[1].fval[1] = _snan64(); + _predict_fmin(nodes, data, &check, false); + ret = _allreduceop(FI_MIN, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double maximum ignoring NAN -- should be exact */ +Test(coll_reduce_ops, fmaxnum) +{ + STDFLTSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fval[0] = _frand(1.0); + data[i].fval[1] = -_frand(1.0); + data[i].fval[2] = _frand(1.0); + data[i].fval[3] = -_frand(1.0); + } + + _predict_fmax(nodes, data, &check, false); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + /* number is given preference over NAN */ + data[1].fval[1] = NAN; + 
_predict_fmax(nodes, data, &check, false); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_OVERFLOW); + cr_assert(!ret, "rc failed NAN\n"); + + /* SNAN is given preference over number */ + data[1].fval[1] = _snan64(); + _predict_fmax(nodes, data, &check, false); + ret = _allreduceop(FI_MAX, FI_DOUBLE, 0L, data, rslt, 4, + context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fval(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +/* Test double minmax with index ignoring NAN -- should be exact */ +Test(coll_reduce_ops, fminmaxnumloc) +{ + STDFLOCSETUP + /* max nodes == 32 under NETSIM */ + for (i = 0; i < nodes; i++) { + data[i].fminval = _frand(1.0); + data[i].fminidx = i; + data[i].fmaxval = _frand(1.0); + data[i].fmaxidx = i; + } + memcpy(&check, &data[0], sizeof(check)); + for (i = 1; i < nodes; i++) { + if (check.fminval > data[i].fminval) { + check.fminval = data[i].fminval; + check.fminidx = data[i].fminidx; + } + if (check.fmaxval < data[i].fmaxval) { + check.fmaxval = data[i].fmaxval; + check.fmaxidx = data[i].fmaxidx; + } + } + + _predict_fminmax(nodes, data, &check, false); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed normal"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed normal\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed normal\n"); + + /* NAN is given preference over number */ + data[1].fminval = NAN; + data[3].fmaxval = NAN; + _predict_fminmax(nodes, data, &check, false); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed NAN"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed NAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed NAN\n"); + + /* SNAN is given preference over NAN */ + data[1].fminval = NAN; + data[2].fminval = _snan64(); + data[3].fmaxval = NAN; + _predict_fminmax(nodes, data, &check, false); + ret = _allreduceop(FI_CXI_MINMAXLOC, FI_DOUBLE, 0L, data, rslt, 1, + context); + cr_assert(!ret, "_allreduceop failed sNAN"); + ret = _check_fminmax(nodes, rslt, &check); + cr_assert(!ret, "compare failed sNAN\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_FLT_INVALID); + cr_assert(!ret, "rc failed sNAN\n"); + STDCLEANUP +} + +Test(coll_reduce_ops, prereduce) +{ + STDINTSETUP + struct cxip_coll_mc **mc_obj; + struct fid_cq *rx_cq_fid, *tx_cq_fid; + struct cxip_coll_accumulator *accum1, accum2; + struct cxip_intval rawdata; + + mc_obj = calloc(nodes, sizeof(**mc_obj)); + _resolve_group("prereduce", nodes, mc_obj, &rx_cq_fid, &tx_cq_fid); + + accum1 = calloc(nodes, sizeof(*accum1)); + memset(&check, 0, sizeof(check)); + ret = -1; + for (i = 0; i < nodes; i++) { + /* reset accum2 for next node */ + memset(&accum2, 0, sizeof(accum2)); + /* reduce over 128 threads */ + for (j = 0; j < 128; j++) { + rawdata.ival[0] = rand(); + rawdata.ival[1] = -rand(); + rawdata.ival[2] = rand(); + rawdata.ival[3] = -rand(); + + /* total contributions from all nodes/threads */ + check.ival[0] += rawdata.ival[0]; + check.ival[1] += 
rawdata.ival[1]; + check.ival[2] += rawdata.ival[2]; + check.ival[3] += rawdata.ival[3]; + + /* FI_MORE interleaved into accum1[], accum2 */ + ret = cxip_allreduce(NULL, &rawdata, 4, NULL, + (j & 1) ? &accum2 : &accum1[i], NULL, (fi_addr_t)mc_obj[i], + FI_INT64, FI_SUM, + FI_MORE, NULL); + + } + /* Fold accum2 into accum1[] */ + ret = cxip_allreduce(NULL, &accum2, 4, NULL, &accum1[i], NULL, + (fi_addr_t)mc_obj[i], FI_INT64, FI_SUM, + FI_MORE | FI_CXI_PRE_REDUCED, NULL); + } + /* after all accumulators loaded, reduce them across nodes */ + for (i = 0; i < nodes; i++) { + ret = cxip_allreduce(cxit_ep, &accum1[i], 4, NULL, &rslt[i], + NULL, (fi_addr_t)mc_obj[i], FI_INT64, + FI_SUM, FI_CXI_PRE_REDUCED, &context[i]); + } + /* wait for all reductions to post completion */ + for (i = 0; i < nodes; i++) + _allreduce_wait(rx_cq_fid, tx_cq_fid, &context[i]); + cr_assert(!ret, "_allreduceop() failed\n"); + + /* validate results */ + ret = _check_ival(nodes, rslt, &check); + cr_assert(!ret, "compare failed\n"); + ret = _check_rc(nodes, context, CXIP_COLL_RC_SUCCESS); + cr_assert(!ret, "rc failed\n"); + + free(accum1); + free(mc_obj); + STDCLEANUP +} diff --git a/prov/cxi/test/cq.c b/prov/cxi/test/cq.c new file mode 100644 index 00000000000..592190c7bf5 --- /dev/null +++ b/prov/cxi/test/cq.c @@ -0,0 +1,615 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(cq, .init = cxit_setup_cq, .fini = cxit_teardown_cq, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic CQ creation */ +Test(cq, simple) +{ + cxit_create_cqs(); + cr_assert(cxit_tx_cq != NULL); + cr_assert(cxit_rx_cq != NULL); + + cxit_destroy_cqs(); +} + +static void req_populate(struct cxip_req *req, fi_addr_t *addr) +{ + *addr = 0xabcd0; + req->flags = FI_SEND; + req->context = 0xabcd2; + req->data = 0xabcd4; + req->tag = 0xabcd5; + req->buf = 0xabcd6; + req->data_len = 0xabcd7; + req->discard = false; +} + +Test(cq, read_fmt_context) +{ + int ret; + struct cxip_req req; + struct fi_cq_entry entry; + fi_addr_t req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete(&req); + ret = fi_cq_read(cxit_tx_cq, &entry, 1); + cr_assert(ret == 1); + cr_assert((uint64_t)entry.op_context == req.context); + + cxit_destroy_cqs(); +} + +Test(cq, read_fmt_msg) +{ + int ret; + struct cxip_req req; + struct fi_cq_msg_entry entry; + fi_addr_t req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_MSG; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete(&req); + ret = fi_cq_read(cxit_tx_cq, &entry, 1); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + + cxit_destroy_cqs(); +} + +Test(cq, read_fmt_data) +{ + int ret; + struct cxip_req req; + struct fi_cq_data_entry entry; + fi_addr_t req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_DATA; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + 
cxip_cq_req_complete(&req); + ret = fi_cq_read(cxit_tx_cq, &entry, 1); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert((uint64_t)entry.buf == req.buf); + cr_assert(entry.data == req.data); + + cxit_destroy_cqs(); +} + +Test(cq, read_fmt_tagged) +{ + int ret; + struct cxip_req req; + struct fi_cq_tagged_entry entry; + fi_addr_t req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete(&req); + ret = fi_cq_read(cxit_tx_cq, &entry, 1); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert((uint64_t)entry.buf == req.buf); + cr_assert(entry.data == req.data); + cr_assert(entry.tag == req.tag); + + cxit_destroy_cqs(); +} + +Test(cq, readfrom_fmt_context) +{ + int ret; + struct cxip_req req; + struct fi_cq_entry entry; + fi_addr_t addr = 0, req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete_addr(&req, req_addr); + ret = fi_cq_readfrom(cxit_tx_cq, &entry, 1, &addr); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(addr == req_addr); + + cxit_destroy_cqs(); +} + +Test(cq, readfrom_fmt_msg) +{ + int ret; + struct cxip_req req; + struct fi_cq_msg_entry entry; + fi_addr_t addr = 0, req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_MSG; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete_addr(&req, req_addr); + ret = fi_cq_readfrom(cxit_tx_cq, &entry, 1, &addr); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert(addr == req_addr); + + cxit_destroy_cqs(); +} + +Test(cq, readfrom_fmt_data) +{ + int ret; + struct cxip_req req; + struct fi_cq_data_entry entry; + fi_addr_t addr = 0, req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_DATA; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete_addr(&req, req_addr); + ret = fi_cq_readfrom(cxit_tx_cq, &entry, 1, &addr); + cr_assert(ret == 1); + + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert((uint64_t)entry.buf == req.buf); + cr_assert(entry.data == req.data); + cr_assert(addr == req_addr); + + cxit_destroy_cqs(); +} + +Test(cq, readfrom_fmt_tagged) +{ + int ret; + struct cxip_req req; + struct fi_cq_tagged_entry entry; + fi_addr_t addr = 0, req_addr; + struct cxip_cq *cxi_cq; + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_create_cqs(); + + req_populate(&req, &req_addr); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + cxip_cq_req_complete_addr(&req, req_addr); + ret = fi_cq_readfrom(cxit_tx_cq, &entry, 1, &addr); + cr_assert(ret == 1); 
+ + cr_assert((uint64_t)entry.op_context == req.context); + cr_assert(entry.flags == req.flags); + cr_assert(entry.len == req.data_len); + cr_assert((uint64_t)entry.buf == req.buf); + cr_assert(entry.data == req.data); + cr_assert(entry.tag == req.tag); + cr_assert(addr == req_addr); + + cxit_destroy_cqs(); +} + +Test(cq, cq_open_null_attr) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct cxip_cq *cxi_cq = NULL; + + /* Open a CQ with a NULL attribute object pointer */ + ret = fi_cq_open(cxit_domain, NULL, &cxi_open_cq, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open with NULL attr"); + cr_assert_not_null(cxi_open_cq); + + /* Validate that the default attributes were set */ + cxi_cq = container_of(cxi_open_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cxi_cq->attr.size, CXIP_CQ_DEF_SZ); + cr_assert_eq(cxi_cq->attr.flags, 0); + cr_assert_eq(cxi_cq->attr.format, FI_CQ_FORMAT_CONTEXT); + cr_assert_eq(cxi_cq->attr.wait_obj, FI_WAIT_NONE); + cr_assert_eq(cxi_cq->attr.signaling_vector, 0); + cr_assert_eq(cxi_cq->attr.wait_cond, FI_CQ_COND_NONE); + cr_assert_null((void *)cxi_cq->attr.wait_set); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert(ret == FI_SUCCESS); + cxi_open_cq = NULL; +} + +struct cq_format_attr_params { + enum fi_cq_format in_format; + enum fi_cq_format out_format; + int status; +}; + +ParameterizedTestParameters(cq, cq_attr_format) +{ + size_t param_sz; + + static struct cq_format_attr_params params[] = { + {.in_format = FI_CQ_FORMAT_CONTEXT, + .out_format = FI_CQ_FORMAT_CONTEXT, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_MSG, + .out_format = FI_CQ_FORMAT_MSG, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_DATA, + .out_format = FI_CQ_FORMAT_DATA, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_TAGGED, + .out_format = FI_CQ_FORMAT_TAGGED, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_UNSPEC, + .out_format = FI_CQ_FORMAT_CONTEXT, + .status = FI_SUCCESS}, + {.in_format = FI_CQ_FORMAT_UNSPEC - 1, + .out_format = -1, /* Unchecked in failure case */ + .status = -FI_ENOSYS} + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct cq_format_attr_params, params, + param_sz); +} + +ParameterizedTest(struct cq_format_attr_params *param, cq, cq_attr_format) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_attr cxit_cq_attr = {0}; + struct cxip_cq *cxi_cq = NULL; + + cxit_cq_attr.format = param->in_format; + cxit_cq_attr.wait_obj = FI_WAIT_NONE; /* default */ + cxit_cq_attr.size = 0; /* default */ + + /* Open a CQ with a NULL attribute object pointer */ + ret = fi_cq_open(cxit_domain, &cxit_cq_attr, &cxi_open_cq, NULL); + cr_assert_eq(ret, param->status, + "fi_cq_open() status mismatch %d != %d with format %d. 
%s", + ret, param->status, cxit_cq_attr.format, + fi_strerror(-ret)); + + if (ret != FI_SUCCESS) { + /* Test Complete */ + return; + } + + /* Validate that the format attribute */ + cr_assert_not_null(cxi_open_cq, + "fi_cq_open() cxi_open_cq is NULL with format %d", + cxit_cq_attr.format); + cxi_cq = container_of(cxi_open_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cxi_cq->attr.format, param->out_format); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert(ret == FI_SUCCESS); +} + +struct cq_wait_attr_params { + enum fi_wait_obj in_wo; + enum fi_wait_obj out_wo; + int status; +}; + +ParameterizedTestParameters(cq, cq_attr_wait) +{ + size_t param_sz; + + static struct cq_wait_attr_params params[] = { + {.in_wo = FI_WAIT_NONE, + .status = FI_SUCCESS}, + {.in_wo = FI_WAIT_FD, + .status = FI_SUCCESS}, + {.in_wo = FI_WAIT_SET, + .status = -FI_ENOSYS}, + {.in_wo = FI_WAIT_MUTEX_COND, + .status = -FI_ENOSYS}, + {.in_wo = FI_WAIT_UNSPEC, + .status = FI_SUCCESS}, + {.in_wo = FI_WAIT_NONE - 1, + .status = -FI_ENOSYS} + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct cq_wait_attr_params, params, + param_sz); +} + +ParameterizedTest(struct cq_wait_attr_params *param, cq, cq_attr_wait) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_attr cxit_cq_attr = {0}; + + cxit_cq_attr.wait_obj = param->in_wo; + cxit_cq_attr.format = FI_CQ_FORMAT_UNSPEC; /* default */ + cxit_cq_attr.size = 0; /* default */ + + /* Open a CQ with a NULL attribute object pointer */ + ret = fi_cq_open(cxit_domain, &cxit_cq_attr, &cxi_open_cq, NULL); + cr_assert_eq(ret, param->status, + "fi_cq_open() status mismatch %d != %d with wait obj %d. %s", + ret, param->status, cxit_cq_attr.wait_obj, + fi_strerror(-ret)); + + if (ret != FI_SUCCESS) { + /* Test Complete */ + return; + } + + ret = fi_close(&cxi_open_cq->fid); + cr_assert(ret == FI_SUCCESS); +} + +struct cq_size_attr_params { + size_t in_sz; + size_t out_sz; +}; + +ParameterizedTestParameters(cq, cq_attr_size) +{ + size_t param_sz; + + static struct cq_size_attr_params params[] = { + {.in_sz = 0, + .out_sz = CXIP_CQ_DEF_SZ}, + {.in_sz = 1 << 9, + .out_sz = 1 << 9}, + {.in_sz = 1 << 6, + .out_sz = 1 << 6} + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct cq_size_attr_params, params, + param_sz); +} + +ParameterizedTest(struct cq_size_attr_params *param, cq, cq_attr_size) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_attr cxit_cq_attr = {0}; + + cxit_cq_attr.format = FI_CQ_FORMAT_UNSPEC; /* default */ + cxit_cq_attr.wait_obj = FI_WAIT_NONE; /* default */ + cxit_cq_attr.size = param->in_sz; + + /* Open a CQ with a NULL attribute object pointer */ + ret = fi_cq_open(cxit_domain, &cxit_cq_attr, &cxi_open_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_cq_open() status mismatch %d != %d with size %ld. %s", + ret, FI_SUCCESS, cxit_cq_attr.size, + fi_strerror(-ret)); + cr_assert_not_null(cxi_open_cq); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert(ret == FI_SUCCESS); +} + +Test(cq, cq_open_null_domain, .signal = SIGSEGV) +{ + struct fid_cq *cxi_open_cq = NULL; + + /* + * Attempt to open a CQ with a NULL domain pointer + * Expect a SIGSEGV since the fi_cq_open implementation attempts to + * use the domain pointer before checking. 
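+	 * Criterion's .signal attribute makes the test pass only if that
+	 * signal is actually raised.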
+ */ + fi_cq_open(NULL, NULL, &cxi_open_cq, NULL); +} + +Test(cq, cq_open_null_cq) +{ + /* Attempt to open a CQ with a NULL cq pointer */ + int ret; + + ret = fi_cq_open(cxit_domain, NULL, NULL, NULL); + cr_assert(ret == -FI_EINVAL, "fi_cq_open with NULL cq"); +} + +Test(cq, cq_readerr_null_cq, .signal = SIGSEGV) +{ + struct fi_cq_err_entry err_entry; + + /* Attempt to read an err with a CQ with a NULL cq pointer */ + fi_cq_readerr(NULL, &err_entry, (uint64_t)0); +} + +Test(cq, cq_readerr_no_errs) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_err_entry err_entry; + + /* Open a CQ */ + ret = fi_cq_open(cxit_domain, NULL, &cxi_open_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open with NULL attr"); + cr_assert_not_null(cxi_open_cq); + + /* Attempt to read an err with a CQ with a NULL buff pointer */ + ret = fi_cq_readerr(cxi_open_cq, &err_entry, (uint64_t)0); + /* Expect no completions to be available */ + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_readerr returned %d", ret); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert_eq(ret, FI_SUCCESS); +} + +void err_entry_comp(struct fi_cq_err_entry *a, + struct fi_cq_err_entry *b, + size_t size) +{ + uint8_t *data_a, *data_b; + + data_a = (uint8_t *)a; + data_b = (uint8_t *)b; + + for (int i = 0; i < size; i++) + if (data_a[i] != data_b[i]) + cr_expect_fail("Mismatch at offset %d. %02X - %02X", + i, data_a[i], data_b[i]); +} + +Test(cq, cq_readerr_err) +{ + int ret; + struct fid_cq *cxi_open_cq = NULL; + struct fi_cq_err_entry err_entry, fake_entry; + struct cxip_cq *cxi_cq; + uint8_t *data_fake, *data_err; + + /* initialize the entries with data */ + data_fake = (uint8_t *)&fake_entry; + data_err = (uint8_t *)&err_entry; + for (int i = 0; i < sizeof(fake_entry); i++) { + data_fake[i] = (uint8_t)i; + data_err[i] = (uint8_t)0xa5; + } + fake_entry.prov_errno = 18; + fake_entry.err_data = err_entry.err_data = NULL; + fake_entry.err_data_size = err_entry.err_data_size = 0; + + /* Open a CQ */ + ret = fi_cq_open(cxit_domain, NULL, &cxi_open_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open with NULL attr"); + cr_assert_not_null(cxi_open_cq); + + /* Add a fake error to the CQ's error ringbuffer */ + cxi_cq = container_of(cxi_open_cq, struct cxip_cq, util_cq.cq_fid); + ofi_cq_write_error(&cxi_cq->util_cq, &fake_entry); + + /* Attempt to read an err with a CQ with a NULL buff pointer */ + ret = fi_cq_readerr(cxi_open_cq, &err_entry, (uint64_t)0); + /* Expect 1 completion to be available */ + cr_assert_eq(ret, 1, "fi_cq_readerr returned %d", ret); + /* Expect the data to match the fake entry */ + err_entry_comp(&err_entry, &fake_entry, sizeof(fake_entry)); + printf("prov_errno: %s\n", + fi_cq_strerror(cxi_open_cq, err_entry.prov_errno, + NULL, NULL, 0)); + + ret = fi_close(&cxi_open_cq->fid); + cr_assert_eq(ret, FI_SUCCESS); +} + +Test(cq, cq_readerr_reperr) +{ + int ret; + struct fi_cq_err_entry err_entry = {0}; + struct cxip_req req = {0}; + size_t olen, err_data_size; + int err, prov_errno; + void *err_data; + struct cxip_cq *cxi_cq; + uint8_t err_buff[32] = {0}; + + /* initialize the input data */ + req.flags = 0x12340987abcd5676; + req.context = 0xa5a5a5a5a5a5a5a5; + req.data_len = 0xabcdef0123456789; + req.data = 0xbadcfe1032547698; + req.tag = 0xefcdab0192837465; + olen = 0x4545121290907878; + err = -3; + prov_errno = -2; + err_data = (void *)err_buff; + err_data_size = ARRAY_SIZE(err_buff); + + /* Open a CQ */ + cxit_create_cqs(); + cxi_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + req.cq = cxi_cq; + + 
/* Add an error to the CQ's error ringbuffer */ + ret = cxip_cq_req_error(&req, olen, err, prov_errno, + err_data, err_data_size, FI_ADDR_UNSPEC); + cr_assert_eq(ret, 0, "cxip_cq_report_error() error %d", ret); + + /* Attempt to read an err with a CQ with a NULL buff pointer */ + ret = fi_cq_readerr(cxit_tx_cq, &err_entry, (uint64_t)0); + cr_assert_eq(ret, 1, "fi_cq_readerr returned %d", ret); + + /* Expect the data to match the fake entry */ + cr_assert_eq(err_entry.err, err); + cr_assert_eq(err_entry.olen, olen); + cr_assert_eq(err_entry.len, req.data_len); + cr_assert_eq(err_entry.prov_errno, prov_errno); + cr_assert_eq(err_entry.flags, req.flags); + cr_assert_eq(err_entry.data, req.data); + cr_assert_eq(err_entry.tag, req.tag); + cr_assert_eq(err_entry.op_context, (void *)(uintptr_t)req.context); + cr_assert_eq(memcmp(err_entry.err_data, err_data, err_data_size), 0); + cr_assert_leq(err_entry.err_data_size, err_data_size, + "Size mismatch. %zd, %zd", + err_entry.err_data_size, err_data_size); + + cxit_destroy_cqs(); +} diff --git a/prov/cxi/test/criterion.yaml b/prov/cxi/test/criterion.yaml new file mode 100644 index 00000000000..9abd1514ba7 --- /dev/null +++ b/prov/cxi/test/criterion.yaml @@ -0,0 +1,100 @@ +# example input file for run_criterion_tests script + +# Set paths, prompt, and password for node under test +env: + libfabric_dir_on_node: /path/to/libfabric + pycxi_dir_on_node: /path/to/pycxi # required for cxiutil + node_prompt: '#' + node_password: + + +# These parameters apply to all tests +global_runtime_parameters: + - {DMA_FAULT_RATE: .1, + MALLOC_FAULT_RATE: .1, + FI_LOG_LEVEL: warn, + FI_CXI_FC_RECOVERY: 1} + + +# Test definitions +tests: + #------------------------------------------------------------------------------------------------------- + # EXAMPLE: + # - {description: "Meaningful description of test(s) to be included in tap report", + # filter: "tagged/*", # run all tagged tests (null = run all tests) + # runtime_parameters: { # include these params when running the test (null = no params) + # FI_CXI_PARAM_1: val, + # FI_CXI_PARAM_2: val}, + # csrs: [ # set these CSRs prior to running the test (null = no CSRs) + # [, , ], + # [, , ] + # ]} + #------------------------------------------------------------------------------------------------------- + + - {description: "Run with default settings", + filter: null, + runtime_parameters: null, + csrs: null} + + - {description: "Disable caching of FI_HMEM_SYSTEM", + filter: null, + runtime_parameters: { + FI_MR_CACHE_MONITOR: disable}, + csrs: null} + + - {description: "Run with RPut and SW Gets", + filter: "(tagged|msg)/*", + runtime_parameters: null, + csrs: [ + [get_ctrl, get_en, 0] + ]} + + - {description: "Run with constrained LE count", + filter: "tagged/fc*", + runtime_parameters: null, + csrs: [ + ["le_pools[]", max_alloc, 10] + ]} + + - {description: "Verify tag matching with rendezvous", + filter: "tagged_directed/*", + runtime_parameters: { + FI_CXI_DEVICE_NAME: "cxi1,cxi0", + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} + + - {description: "Run with software RX matching mode", + filter: null, + runtime_parameters: { + FI_CXI_RX_MATCH_MODE: '"software"', + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} + + - {description: "Run with FI_CXI_MSG_OFFLOAD disabled", + filter: null, + runtime_parameters: { + FI_CXI_MSG_OFFLOAD: 0, + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} + + - {description: "Verify fc_no_eq_space_expected_multi_recv", + 
filter: "tagged/fc_no_eq_space_expected_multi_recv", + runtime_parameters: { + FI_CXI_DEFAULT_CQ_SIZE: 64, + FI_CXI_DISABLE_CQ_HUGETLB: 1, + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} + + - {description: "Verify fc_no_eq_space_expected_multi_recv and FI_CXI_CQ_FILL_PERCENT", + filter: "tagged/fc_no_eq_space_expected_multi_recv", + runtime_parameters: { + FI_CXI_CQ_FILL_PERCENT: 20, + FI_CXI_DEFAULT_CQ_SIZE: 64, + FI_CXI_DISABLE_CQ_HUGETLB: 1, + FI_CXI_RDZV_GET_MIN: 0, + FI_CXI_RDZV_THRESHOLD: 2048}, + csrs: null} diff --git a/prov/cxi/test/ctrl.c b/prov/cxi/test/ctrl.c new file mode 100644 index 00000000000..4651a52a94a --- /dev/null +++ b/prov/cxi/test/ctrl.c @@ -0,0 +1,1086 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2022 Hewlett Packard Enterprise Development LP + */ +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_CTRL, fmt, ##__VA_ARGS__) + +TestSuite(ctrl, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/** + * @brief Test reversibility of N <-> (r,c), error conditions + * + * For a range of radix values, select a node number (N), convert to + * a (row,column) pair, and then convert back to node number. These + * should match, unless an invalid column (for a row) is specified, + * in which case we see an error. + */ +Test(ctrl, radix_tree_reversible) +{ + int radix, N, M, row, col, siz, rowold, rowwid; + + for (radix = 1; radix < 8; radix++) { + rowold = -1; + rowwid = 1; + for (N = 0; N < 256; N++) { + /* test reversibility */ + cxip_tree_rowcol(radix, N, &row, &col, &siz); + cxip_tree_nodeidx(radix, row, col, &M); + cr_assert(M == N, "M=%d != N=%d\n", M, N); + if (rowold != row) { + rowold = row; + rowwid *= radix; + } + /* test invalid column */ + col = rowwid + 1; + cxip_tree_nodeidx(radix, row, col, &M); + cr_assert(M == -1, + "radix=%d N=%d row=%d col=%d" + " M=%d != -1\n", + radix, N, row, col, M); + } + } +} + +/** + * @brief Test parent/child mapping. + * + * For a range of radix values, generate the relatives in the tree (one + * parent, multiple children), and confirm that these relatives have the + * expected position in the tree, which guarantees that we have no loops + * in the tree, and that every node has a parent (except the root), and + * is a child of its parent. 
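+ * The rels[] array filled in by cxip_tree_relatives() is expected to
+ * hold the parent index in rels[0] followed by the child indices, with
+ * the returned count covering both.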
+ */ +Test(ctrl, radix_tree_mapping) +{ + int *rels, parent, child; + int radix, nodes, N, M; + int count, i; + + /* Test radix zero case */ + M = cxip_tree_relatives(0, 0, 0, NULL); + cr_assert(M == 0); + + /* Test expected pattern of parent/child indices */ + for (radix = 1; radix < 8; radix++) { + /* only needs radix+1, but for test, provide extra space */ + rels = calloc(radix+2, sizeof(*rels)); + for (nodes = 0; nodes < 256; nodes++) { + count = 0; + parent = -1; + child = 1; + for (N = 0; N < nodes; N++) { + M = cxip_tree_relatives(radix, N, nodes, rels); + cr_assert(M >= 0); + cr_assert(M <= radix+1); + if (M > 0) { + /* test parent node index */ + cr_assert(rels[0] == parent, + "radix=%d nodes=%d index=%d" + " parent=%d != rels[0]=%d\n", + radix, nodes, N, parent, rels[0]); + /* test child node indices */ + for (i = 1; i < M; i++, child++) + cr_assert(rels[i] == child, + "radix=%d nodes=%d" + " index=%d child=%d" + " != rels[%d]=%d\n", + radix, nodes, N, + child, i, rels[i]); + } + count++; + if (N == 0 || count >= radix) { + count = 0; + parent++; + } + } + } + free(rels); + } +} + +/* Utility to show the node relatives */ +__attribute__((unused)) +static void dumpmap(struct cxip_zbcoll_obj *zb) +{ + int i, j; + + printf("MAP=======\n"); + for (i = 0; i < zb->simcount; i++) { + printf("%2d:", i); + for (j = 0; j < zb->state[i].num_relatives; j++) + printf(" %2d", zb->state[i].relatives[j]); + printf("\n"); + } + printf("\n"); +} + +/** + * @brief Test the valid and invalid cxip_zbcoll_obj configurations. + */ +Test(ctrl, zb_config) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + struct cxip_addr *caddrs; + fi_addr_t *fiaddrs; + int i, ret; + + int num_addrs = 5; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + caddrs = calloc(num_addrs, sizeof(*caddrs)); + cr_assert(caddrs); + fiaddrs = calloc(num_addrs, sizeof(*fiaddrs)); + cr_assert(fiaddrs); + + for (i = 0; i < num_addrs; i++) + caddrs[i] = ep_obj->src_addr; + ret = fi_av_insert(&ep_obj->av->av_fid, caddrs, num_addrs, fiaddrs, + 0L, NULL); + cr_assert(ret == num_addrs); + + /* test case, object but no tree */ + TRACE("case: no tree\n"); + ret = cxip_zbcoll_alloc(ep_obj, 0, NULL, ZB_NOSIM, &zb); + cr_assert(ret == 0, + "no tree: ret=%d\n", ret); + cr_assert(zb->simcount == 1, + "no tree: simcnt=%d\n", zb->simcount); + cr_assert(zb->num_caddrs == 1, + "no_tree: num_caddrs=%d\n", zb->num_caddrs); + cr_assert(memcmp(&zb->caddrs[0], &ep_obj->src_addr, sizeof(ep_obj->src_addr)) == 0); + cxip_zbcoll_free(zb); + + /* request simulation */ + TRACE("case: simulated\n"); + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, + "sim tree 4: ret=%d\n", ret); + cr_assert(zb->simcount == num_addrs, + "sim tree 4: cnt=%d\n", zb->simcount); + cxip_zbcoll_free(zb); + + /* exercise real setup, send-to-self-only */ + TRACE("case: real send-only\n"); + ret = cxip_zbcoll_alloc(ep_obj, 0, NULL, ZB_NOSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb != NULL); + cr_assert(zb->simcount == 1); + cr_assert(zb->state != NULL); + cr_assert(CXIP_ADDR_EQUAL(zb->caddrs[0], ep_obj->src_addr)); + + /* exercise real setup success, all caddrs are real */ + TRACE("case: real addresses root 0\n"); + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, fiaddrs, ZB_NOSIM, &zb); + cr_assert(ret == 0, "real tree0: ret=%s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == 1, "real tree0: simcnt=%d\n", 
zb->simcount); + cr_assert(zb->state[0].grp_rank == 0, "real tree0: grp_rank=%d\n", + zb->state[0].grp_rank); + cxip_zbcoll_free(zb); + + /* exercise real setup success, first caddr is not me */ + TRACE("case: real addresses root 1\n"); + caddrs[0].nic += 1; + ret = fi_av_insert(&ep_obj->av->av_fid, caddrs, num_addrs, fiaddrs, + 0L, NULL); + cr_assert(ret == num_addrs); + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, fiaddrs, ZB_NOSIM, &zb); + cr_assert(ret == 0, "real tree1: ret=%s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == 1, "real tree1: simcnt=%d\n", zb->simcount); + cr_assert(zb->state[0].grp_rank == 1, "real tree1: grp_rank=%d\n", + zb->state[0].grp_rank); + cxip_zbcoll_free(zb); + + /* exercise real setup failure, no caddr is me */ + TRACE("case: real addresses root N\n"); + for (i = 0; i < num_addrs; i++) + caddrs[i].nic += i + 1; + ret = fi_av_insert(&ep_obj->av->av_fid, caddrs, num_addrs, fiaddrs, + 0L, NULL); + cr_assert(ret == num_addrs); + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, fiaddrs, ZB_NOSIM, &zb); + cr_assert(ret == -FI_ECONNREFUSED, "real treeN: ret=%s\n", fi_strerror(-ret)); + cxip_zbcoll_free(zb); + + free(fiaddrs); +} + +/** + * @brief Send a single packet using a self to self send-only configuration. + */ +Test(ctrl, zb_send0) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + union cxip_match_bits mb = {.raw = 0}; + uint32_t dsc, err, ack, rcv, cnt; + int ret; + + cr_assert(sizeof(union cxip_match_bits) == 8); + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + /* Set up the send-only zbcoll */ + ret = cxip_zbcoll_alloc(ep_obj, 0, NULL, ZB_NOSIM, &zb); + + /* Test that if disabled, getgroup is no-op */ + ep_obj->zbcoll.disable = true; + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + + /* Legitimate send to self */ + cxip_zbcoll_reset_counters(ep_obj); + cxip_zbcoll_send(zb, 0, 0, mb.raw); + cnt = 0; + do { + usleep(1); + cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + ret = (dsc || err || (ack && rcv)); + cnt++; + } while (!ret && cnt < 1000); + cr_assert(cnt < 1000, "repeat count = %d >= %d\n", cnt, 1000); + cr_assert(dsc == 0, "dsc = %d, != 0\n", dsc); + cr_assert(err == 0, "err = %d, != 0\n", err); + cr_assert(ack == 1, "ack = %d, != 1\n", ack); + cr_assert(rcv == 1, "rcv = %d, != 1\n", rcv); + + /* Invalid send to out-of-range address index */ + cxip_zbcoll_reset_counters(ep_obj); + cxip_zbcoll_send(zb, 0, 1, mb.raw); + cnt = 0; + do { + usleep(1); + cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + ret = (err || dsc || (ack && rcv)); + cnt++; + } while (!ret && cnt < 1000); + cr_assert(cnt < 1000, "repeat count = %d < %d\n", cnt, 1000); + cr_assert(dsc == 0, "dsc = %d, != 0\n", dsc); + cr_assert(err == 1, "err = %d, != 1\n", err); + cr_assert(ack == 0, "ack = %d, != 0\n", ack); + cr_assert(rcv == 0, "rcv = %d, != 0\n", rcv); + + cxip_zbcoll_free(zb); +} + +/* utility to send from src to dst */ +static void _send(struct cxip_zbcoll_obj *zb, int srcidx, int dstidx) +{ + struct cxip_ep_obj *ep_obj; + union cxip_match_bits mb = {.zb_data=0}; + int ret, cnt; + uint32_t dsc, err, ack, rcv; + + /* send to dstidx simulated address */ + ep_obj = zb->ep_obj; + cxip_zbcoll_reset_counters(ep_obj); + cxip_zbcoll_send(zb, srcidx, dstidx, mb.raw); + + /* wait for errors, or completion */ + cnt = 0; + do { + usleep(1); + 
cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + ret = (err || dsc || (ack && rcv)); + cnt++; + } while (!ret && cnt < 1000); + cr_assert(cnt < 1000, "repeat count = %d\n", cnt); + + cr_assert(dsc == 0, "dsc = %d, != 0\n", dsc); + cr_assert(err == 0, "err = %d, != 0\n", err); + cr_assert(ack == 1, "ack = %d, != 1\n", ack); + cr_assert(rcv == 1, "rcv = %d, != 1\n", rcv); +} + +/** + * @brief Send a single packet from each src to dst in NETSIM simulation. + * + * Scales as O(N^2), so keep number of addresses small. + */ +Test(ctrl, zb_sendN) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + int srcidx, dstidx, ret; + + int num_addrs = 5; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb != NULL); + cr_assert(zb->simcount == num_addrs); + cr_assert(zb->state != NULL); + + /* Test that if disabled, getgroup is no-op */ + ep_obj->zbcoll.disable = true; + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + + for (srcidx = 0; srcidx < num_addrs; srcidx++) + for (dstidx = 0; dstidx < num_addrs; dstidx++) + _send(zb, srcidx, dstidx); + cxip_zbcoll_free(zb); +} + +/* Utility to wait until an ALLSIM collective has completed */ +static int _await_complete(struct cxip_zbcoll_obj *zb) +{ + uint32_t rep; + + /* We only wait for 1 sec */ + for (rep = 0; rep < 10000; rep++) { + usleep(100); + cxip_ep_zbcoll_progress(zb->ep_obj); + if (zb->error) + return zb->error; + if (!zb->busy) + break; + } + return (zb->busy) ? -FI_ETIMEDOUT : FI_SUCCESS; +} + +/* Utility to wait until a multi-zb collective has completed */ +static int _await_complete_all(struct cxip_zbcoll_obj **zb, int cnt) +{ + uint32_t i, rep; + + /* We only wait for 1 sec */ + for (rep = 0; rep < 10000; rep++) { + usleep(100); + cxip_ep_zbcoll_progress(zb[0]->ep_obj); + for (i = 0; i < cnt; i++) { + if (zb[i]->error) + return zb[i]->error; + if (zb[i]->busy) + break; + } + if (i == cnt) + break; + } + return (i < cnt) ? -FI_ETIMEDOUT : FI_SUCCESS; +} + +/* shuffle the array */ +void _shuffle_array32(uint32_t *array, size_t size) +{ + uint32_t i, j, t; + + for (i = 0; i < size-1; i++) { + j = i + rand() / (RAND_MAX / (size - i) + 1); + t = array[j]; + array[j] = array[i]; + array[i] = t; + } +} + +/* create a randomized shuffle array */ +void _addr_shuffle(struct cxip_zbcoll_obj *zb, bool shuffle) +{ + struct timespec tv; + int i; + + clock_gettime(CLOCK_MONOTONIC, &tv); + srand((unsigned int)tv.tv_nsec); + free(zb->shuffle); + zb->shuffle = calloc(zb->simcount, sizeof(uint32_t)); + if (!zb->shuffle) + return; + /* create ordered list */ + for (i = 0; i < zb->simcount; i++) + zb->shuffle[i] = i; + /* if requested, randomize */ + if (shuffle) + _shuffle_array32(zb->shuffle, zb->simcount); +} + +/*****************************************************************/ +/** + * @brief Test simulated getgroup. + * + * This exercises the basic getgroup operation, the user callback, and the + * non-concurrency lockout. It tests grpid wrap-around at the limit. + * + * This does not test error returns, which are not robustly simulated. 
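+ * Wrap-around is exercised by requesting twice as many group IDs as the
+ * maximum and freeing the oldest objects as the limit is reached, so
+ * released group IDs are seen to be reused in order.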
+ */ + +struct getgroup_data { + int count; +}; +static void getgroup_func(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct getgroup_data *data = (struct getgroup_data *)usrptr; + data->count++; +} + +/* Test getgroup single-zb simulation */ +Test(ctrl, zb_getgroup) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj **zb; + struct getgroup_data zbd = {}; + int i, j, ret; + uint32_t dsc, err, ack, rcv; + int max_zb = cxip_zbcoll_max_grps(true); + int num_zb = 2*max_zb; + int num_addrs = 9; + int cnt = 0; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + zb = calloc(num_zb, sizeof(struct cxip_zbcoll_obj *)); + cr_assert(zb, "zb out of memory\n"); + + TRACE("%s entry\n", __func__); + for (i = 0; i < num_zb; i++) { + /* Verify multiple allocations */ + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, + ZB_ALLSIM, &zb[i]); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", + fi_strerror(-ret)); + cr_assert(zb[i]->simcount == num_addrs, + "zb->simcount = %d, != %d\n", + zb[i]->simcount, num_addrs); + /* Add callback function */ + cxip_zbcoll_set_user_cb(zb[i], getgroup_func, &zbd); + /* Initialize the address shuffling */ + _addr_shuffle(zb[i], true); + TRACE("created zb[%d]\n", i); + } + for (i = j = 0; i < num_zb; i++) { + /* Free space if necessary */ + while ((i - j) >= max_zb) + cxip_zbcoll_free(zb[j++]); + _addr_shuffle(zb[i], true); + /* Test getgroup operation */ + TRACE("initiate getgroup %d\n", i); + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == FI_SUCCESS, "%d getgroup = %s\n", + i, fi_strerror(-ret)); + /* Test getgroup non-concurrency */ + TRACE("second initiate getgroup %d\n", i); + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == -FI_EAGAIN, "%d getgroup = %s\n", + i, fi_strerror(-ret)); + /* Poll until complete */ + TRACE("await completion %d\n", i); + ret = _await_complete(zb[i]); + cr_assert(ret == FI_SUCCESS, "%d getgroup = %s\n", + i, fi_strerror(-ret)); + /* Check user callback completion count result */ + cr_assert(zbd.count == i+1, "%d zbdcount = %d\n", + i, zbd.count); + /* Confirm expected grpid */ + cr_assert(zb[i]->grpid == (i % max_zb), + "%d grpid = %d, exp %d\n", + i, zb[i]->grpid, i % max_zb); + TRACE("second getgroup after completion\n"); + /* Attempt another getgroup on same zb */ + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == -FI_EINVAL, "%d getgroup = %s\n", + i, fi_strerror(-ret)); + /* Compute expected transfer count */ + cnt += 2 * (num_addrs - 1); + } + + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + cr_assert(dsc == 0 && err == 0, + "FAILED dsc=%d err=%d ack=%d rcv=%d cnt=%d\n", + dsc, err, ack, rcv, cnt); + /* cleanup */ + while (j < num_zb) + cxip_zbcoll_free(zb[j++]); + free(zb); +} + +/*****************************************************************/ +/** + * @brief Test simulated getgroup with multi-zb model. 
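+ * In this model each zb object is allocated with its own simulated rank
+ * and linked to the rank-0 object, so every object stands in for a
+ * single endpoint rather than one object simulating the whole group.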
+ */ + +void _getgroup_multi(int num_addrs, struct cxip_zbcoll_obj **zb, + int expect_grpid) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct getgroup_data zbd = {}; + int i, ret; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + /* allocate multiple zb objects, simrank = i */ + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, i, &zb[i]); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", + fi_strerror(-ret)); + cr_assert(zb[i]->simcount == num_addrs, + "zb->simcount = %d, != %d\n", + zb[i]->simcount, num_addrs); + ret = cxip_zbcoll_simlink(zb[0], zb[i]); + cr_assert(!ret, "link zb[%d] failed\n", i); + } + + for (i = 0; i < num_addrs; i++) + cxip_zbcoll_set_user_cb(zb[i], getgroup_func, &zbd); + + /* initiate getgroup across all of the zb objects */ + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == FI_SUCCESS, "getgroup[%d]=%s, exp success\n", + i, fi_strerror(-ret)); + } + + /* make a second attempt */ + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == -FI_EAGAIN, "getgroup[%d]=%s exp FI_EAGAIN\n", + i, fi_strerror(-ret)); + } + + /* Poll until all are complete */ + ret = _await_complete_all(zb, num_addrs); + cr_assert(ret == FI_SUCCESS, "getgroup = %s\n", + fi_strerror(-ret)); + + /* Ensure all objects have the same group ids */ + ret = 0; + for (i = 0; i < num_addrs; i++) { + if (zb[i]->grpid != expect_grpid) { + TRACE("zb[%d]->grpid = %d, exp %d\n", + i, zb[i]->grpid, expect_grpid); + ret++; + } + } + cr_assert(!ret, "Some zb objects have the wrong group id\n"); + + /* Make sure we can't take a second group id */ + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_getgroup(zb[i]); + cr_assert(ret == -FI_EINVAL, "getgroup[%d]=%s exp FI_EINVAL\n", + i, fi_strerror(-ret)); + } + +} + +void _free_getgroup_multi(int num_addrs, struct cxip_zbcoll_obj **zb) +{ + int i; + + for (i = 0; i < num_addrs; i++) + cxip_zbcoll_free(zb[i]); + free(zb); +} + +/* Test getgroup multi-zb simulation */ +Test(ctrl, zb_getgroup2) +{ + struct cxip_zbcoll_obj **zb1, **zb2; + int num_addrs = 9; // arbitrary + + zb1 = calloc(num_addrs, sizeof(struct cxip_zbcoll_obj *)); + cr_assert(zb1, "zb out of memory\n"); + zb2 = calloc(num_addrs, sizeof(struct cxip_zbcoll_obj *)); + cr_assert(zb2, "zb out of memory\n"); + + _getgroup_multi(num_addrs, zb1, 0); + _getgroup_multi(num_addrs, zb2, 1); + + _free_getgroup_multi(num_addrs, zb2); + _free_getgroup_multi(num_addrs, zb1); +} + +/*****************************************************************/ +/** + * @brief Test simulated barrier. + * + * This exercises the basic barrier operation, the user callback, and the + * non-concurrency lockout. + * + * This is done in a single thread, so it tests only a single barrier across + * multiple addrs. It randomizes the nid processing order, and performs multiple + * barriers to uncover any ordering issues. 
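+ * A second barrier issued while one is still in flight must be rejected
+ * with -FI_EAGAIN, which is how the non-concurrency lockout is checked.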
+ */ +struct barrier_data { + int count; +}; +static void barrier_func(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct barrier_data *data = (struct barrier_data *)usrptr; + + /* increment the user completion count */ + data->count++; +} + +/* Test barrier single-zb simulation */ +Test(ctrl, zb_barrier) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + struct barrier_data zbd; + int rep, ret; + + int num_addrs = 9; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == num_addrs, + "zb->simcount = %d, != %d\n", zb->simcount, num_addrs); + /* Initialize the addresses */ + _addr_shuffle(zb, true); + + /* Acquire a group id */ + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + ret = _await_complete(zb); + cr_assert(ret == 0, "getgroup done = %s\n", fi_strerror(-ret)); + + cxip_zbcoll_set_user_cb(zb, barrier_func, &zbd); + + memset(&zbd, 0, sizeof(zbd)); + for (rep = 0; rep < 20; rep++) { + /* Shuffle the addresses */ + _addr_shuffle(zb, true); + /* Perform a barrier */ + ret = cxip_zbcoll_barrier(zb); + cr_assert(ret == 0, "%d barrier = %s\n", + rep, fi_strerror(-ret)); + /* Try again immediately, should show BUSY */ + ret = cxip_zbcoll_barrier(zb); + cr_assert(ret == -FI_EAGAIN, "%d barrier = %s\n", + rep, fi_strerror(-ret)); + /* Poll until complete */ + ret = _await_complete(zb); + cr_assert(ret == FI_SUCCESS, "%d barrier = %s\n", + rep, fi_strerror(-ret)); + } + /* Confirm completion count */ + cr_assert(zbd.count == rep, "expected zbd.count=%d == rep=%d\n", + zbd.count, rep); + + uint32_t dsc, err, ack, rcv; + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + cr_assert(dsc == 0 && err == 0, + "FAILED dsc=%d err=%d ack=%d rcv=%d\n", + dsc, err, ack, rcv); + + cxip_zbcoll_free(zb); +} + +/* Test barrier multi-zb simulation */ +Test(ctrl, zb_barrier2) +{ + struct cxip_zbcoll_obj **zb1, **zb2; + struct barrier_data zbd1 = {}; + struct barrier_data zbd2 = {}; + int num_addrs = 17; // arbitrary + int i, ret; + + zb1 = calloc(num_addrs, sizeof(*zb1)); + cr_assert(zb1); + zb2 = calloc(num_addrs, sizeof(*zb2)); + cr_assert(zb2); + + _getgroup_multi(num_addrs, zb1, 0); + _getgroup_multi(num_addrs, zb2, 1); + + for (i = 0; i < num_addrs; i++) { + cxip_zbcoll_set_user_cb(zb1[i], barrier_func, &zbd1); + cxip_zbcoll_set_user_cb(zb2[i], barrier_func, &zbd2); + } + + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_barrier(zb1[i]); + cr_assert(!ret, "zb1 barrier[%d]=%s\n", i, fi_strerror(-ret)); + + ret = cxip_zbcoll_barrier(zb2[i]); + cr_assert(!ret, "zb2 barrier[%d]=%s\n", i, fi_strerror(-ret)); + } + + /* Poll until all are complete */ + ret = _await_complete_all(zb1, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb1 barrier = %s\n", + fi_strerror(-ret)); + ret = _await_complete_all(zb2, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb2 barrier = %s\n", + fi_strerror(-ret)); + + /* Validate data */ + cr_assert(zbd1.count == num_addrs, "zb1 count=%d != %d\n", + zbd1.count, num_addrs); + cr_assert(zbd2.count == num_addrs, "zb2 count=%d != %d\n", + zbd2.count, num_addrs); + + _free_getgroup_multi(num_addrs, zb2); + _free_getgroup_multi(num_addrs, zb1); +} + +/*****************************************************************/ +/** + * @brief Perform a simulated broadcast. 
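+ * On completion every simulated endpoint should hold the value supplied
+ * by the endpoint listed first in the shuffled processing order.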
+ * + * This exercises the basic broadcast operation, the user callback, and the + * non-concurrency lockout. The user callback captures all of the results and + * ensures they all match the broadcast value. + * + * This is done in a single thread, so it tests only a single broadcast across + * multiple addrs. It randomizes the nid processing order, and performs multiple + * broadcasts to uncover any ordering issues. + */ +struct bcast_data { + uint64_t *data; + int count; +}; + +static void bcast_func(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct bcast_data *data = (struct bcast_data *)usrptr; + int i; + + if (zb->simrank >= 0) { + data->data[zb->simrank] = *zb->state[zb->simrank].dataptr; + } else { + for (i = 0; i < zb->simcount; i++) + data->data[i] = *zb->state[i].dataptr; + } + data->count++; +} + +/* Test broadcast single-zb simulation */ +Test(ctrl, zb_broadcast) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + struct bcast_data zbd = {}; + int i, n, rep, ret; + uint64_t *data; + + int num_addrs = 25; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == num_addrs, + "zb->simcount = %d, != %d\n", zb->simcount, num_addrs); + _addr_shuffle(zb, true); + + data = calloc(num_addrs, sizeof(uint64_t)); + + /* Acquire a group id */ + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + ret = _await_complete(zb); + cr_assert(ret == 0, "getgroup done = %s\n", fi_strerror(-ret)); + + cxip_zbcoll_set_user_cb(zb, bcast_func, &zbd); + + memset(&zbd, 0, sizeof(zbd)); + zbd.data = calloc(num_addrs, sizeof(uint64_t)); + for (rep = 0; rep < 20; rep++) { + _addr_shuffle(zb, true); + n = zb->shuffle[0]; + memset(zbd.data, -1, num_addrs*sizeof(uint64_t)); + /* Perform a broadcast */ + for (i = 0; i < num_addrs; i++) + data[i] = (rand() & ((1 << 29) - 1)) | (1 << 28); + ret = cxip_zbcoll_broadcast(zb, data); + cr_assert(ret == 0, "%d bcast = %s\n", + rep, fi_strerror(-ret)); + /* Try again immediately, should fail */ + ret = cxip_zbcoll_broadcast(zb, data); + cr_assert(ret == -FI_EAGAIN, "%d bcast = %s\n", + rep, fi_strerror(-ret)); + /* Poll until complete */ + ret = _await_complete(zb); + cr_assert(ret == FI_SUCCESS, "%d bcast = %s\n", + rep, fi_strerror(-ret)); + /* Validate the data */ + for (i = 0; i < num_addrs; i++) + cr_assert(zbd.data[i] == data[n], "[%d] %ld != %ld\n", + i, zbd.data[i], data[n]); + } + cr_assert(zbd.count == rep, "zbd.count=%d rep=%d\n", + zbd.count, rep); + + uint32_t dsc, err, ack, rcv; + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + cr_assert(dsc == 0 && err == 0, + "FAILED dsc=%d err=%d ack=%d rcv=%d\n", + dsc, err, ack, rcv); + + free(zbd.data); + free(data); + cxip_zbcoll_free(zb); +} + +/* Test broadcast multi-zb simulation */ +Test(ctrl, zb_broadcast2) +{ + struct cxip_zbcoll_obj **zb1, **zb2; + uint64_t data1, data2; + struct bcast_data zbd1 = {}; + struct bcast_data zbd2 = {}; + int i, ret; + + int num_addrs = 11; // arbitrary + + zb1 = calloc(num_addrs, sizeof(*zb1)); + cr_assert(zb1); + zb2 = calloc(num_addrs, sizeof(*zb2)); + cr_assert(zb2); + zbd1.data = calloc(num_addrs, sizeof(*zbd1.data)); + cr_assert(zbd1.data); + zbd2.data = calloc(num_addrs, sizeof(*zbd2.data)); + cr_assert(zbd2.data); + + /* Acquire group ids */ + _getgroup_multi(num_addrs, zb1, 0); 
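+	/* the second zb group created on this endpoint expects grpid 1 */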
+ _getgroup_multi(num_addrs, zb2, 1); + + data1 = (rand() & ((1 << 29) - 1)) | (1 << 28); + data2 = (rand() & ((1 << 29) - 1)) | (1 << 28); + + for (i = 0; i < num_addrs; i++) { + cxip_zbcoll_set_user_cb(zb1[i], bcast_func, &zbd1); + cxip_zbcoll_set_user_cb(zb2[i], bcast_func, &zbd2); + } + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_broadcast(zb1[i], &data1); + cr_assert(!ret, "zb1 broadcast[%d]=%s\n", i, fi_strerror(-ret)); + + ret = cxip_zbcoll_broadcast(zb2[i], &data2); + cr_assert(!ret, "zb2 broadcast[%d]=%s\n", i, fi_strerror(-ret)); + } + + /* Poll until all are complete */ + ret = _await_complete_all(zb1, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb1 broadcast = %s\n", + fi_strerror(-ret)); + ret = _await_complete_all(zb2, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb2 broadcast = %s\n", + fi_strerror(-ret)); + + /* Validate data */ + cr_assert(zbd1.count == num_addrs, "count=%d != %d\n", + zbd1.count, num_addrs); + for (i = 0; i < num_addrs; i++) { + cr_assert(data1 == zbd1.data[i], + "data1=%ld != zbd1[%d]=%ld\n", + data1, i, zbd1.data[i]); + } + cr_assert(zbd2.count == num_addrs, "count=%d != %d\n", + zbd2.count, num_addrs); + for (i = 0; i < zbd2.count; i++) { + cr_assert(data2 == zbd2.data[i], + "data2=%ld != zbd2[%d]=%ld\n", + data2, i, zbd2.data[i]); + } + + _free_getgroup_multi(num_addrs, zb2); + _free_getgroup_multi(num_addrs, zb1); +} + +/*****************************************************************/ +/** + * @brief Perform a simulated reduce. + * + * This exercises the basic reduce operation, the user callback, and the + * non-concurrency lockout. The user callback captures all of the results and + * ensures they all match the reduce value. + * + * This is done in a single thread, so it tests only a single reduce across + * multiple addrs. It randomizes the nid processing order, and performs multiple + * reductions to uncover any ordering issues. 
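+ * The reduction checked here is a bitwise AND; every contribution has
+ * its two low bits and bit 28 forced on, so the expected result is
+ * never zero.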
+ */ +struct reduce_data { + uint64_t *data; + int count; +}; + +static void reduce_func(struct cxip_zbcoll_obj *zb, void *usrptr) +{ + struct reduce_data *data = (struct reduce_data *)usrptr; + int i; + + if (zb->simrank >= 0) { + data->data[zb->simrank] = *zb->state[zb->simrank].dataptr; + } else { + for (i = 0; i < zb->simcount; i++) + data->data[i] = *zb->state[i].dataptr; + } + data->count++; +} + +/* Test reduce single-zb simulation */ +Test(ctrl, zb_reduce) +{ + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb; + struct reduce_data zbd = {}; + int i, rep, ret; + uint64_t *data, rslt; + + int num_addrs = 25; + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + ret = cxip_zbcoll_alloc(ep_obj, num_addrs, NULL, ZB_ALLSIM, &zb); + cr_assert(ret == 0, "cxip_zbcoll_alloc() = %s\n", fi_strerror(-ret)); + cr_assert(zb->simcount == num_addrs, + "zb->simcount = %d, != %d\n", zb->simcount, num_addrs); + _addr_shuffle(zb, true); + + data = calloc(num_addrs, sizeof(uint64_t)); + + /* Acquire a group id */ + ret = cxip_zbcoll_getgroup(zb); + cr_assert(ret == 0, "getgroup = %s\n", fi_strerror(-ret)); + ret = _await_complete(zb); + cr_assert(ret == 0, "getgroup done = %s\n", fi_strerror(-ret)); + + cxip_zbcoll_set_user_cb(zb, reduce_func, &zbd); + + memset(&zbd, 0, sizeof(zbd)); + zbd.data = calloc(num_addrs, sizeof(uint64_t)); + + for (rep = 0; rep < 20; rep++) { + _addr_shuffle(zb, true); + memset(zbd.data, -1, num_addrs*sizeof(uint64_t)); + /* Perform a reduce */ + for (i = 0; i < num_addrs; i++) { + data[i] = (rand() & ((1 << 29) - 1)) | (1 << 28); + data[i] |= 3; + } + rslt = -1L; + for (i = 1; i < num_addrs; i++) { + rslt &= data[i]; + } + ret = cxip_zbcoll_reduce(zb, data); + cr_assert(ret == 0, "%d reduce = %s\n", + rep, fi_strerror(-ret)); + /* Try again immediately, should fail */ + ret = cxip_zbcoll_reduce(zb, data); + cr_assert(ret == -FI_EAGAIN, "%d reduce = %s\n", + rep, fi_strerror(-ret)); + /* Poll until complete */ + ret = _await_complete(zb); + cr_assert(ret == FI_SUCCESS, "%d reduce = %s\n", + rep, fi_strerror(-ret)); + /* Validate the data */ + for (i = 0; i < num_addrs; i++) + cr_assert(zbd.data[i] == rslt, "[%d] %lx != %lx\n", + i, zbd.data[i], rslt); + } + cr_assert(zbd.count == rep, "zbd.count=%d rep=%d\n", + zbd.count, rep); + + uint32_t dsc, err, ack, rcv; + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + cr_assert(dsc == 0 && err == 0, + "FAILED dsc=%d err=%d ack=%d rcv=%d\n", + dsc, err, ack, rcv); + + free(zbd.data); + free(data); + cxip_zbcoll_free(zb); +} + +/* Test reduce multi-zb simulation */ +Test(ctrl, zb_reduce2) +{ + struct cxip_zbcoll_obj **zb1, **zb2; + int num_addrs = 11; // arbitrary + uint64_t data1, data2; + struct reduce_data zbd1 = {}; + struct reduce_data zbd2 = {}; + int i, ret; + + zb1 = calloc(num_addrs, sizeof(*zb1)); + cr_assert(zb1); + zb2 = calloc(num_addrs, sizeof(*zb2)); + cr_assert(zb2); + zbd1.data = calloc(num_addrs, sizeof(*zbd1.data)); + cr_assert(zbd1.data); + zbd2.data = calloc(num_addrs, sizeof(*zbd2.data)); + cr_assert(zbd2.data); + + _getgroup_multi(num_addrs, zb1, 0); + _getgroup_multi(num_addrs, zb2, 1); + + data1 = (rand() & ((1 << 29) - 1)) | (1 << 28); + data2 = (rand() & ((1 << 29) - 1)) | (1 << 28); + + for (i = 0; i < num_addrs; i++) { + cxip_zbcoll_set_user_cb(zb1[i], reduce_func, &zbd1); + cxip_zbcoll_set_user_cb(zb2[i], reduce_func, &zbd2); + } + for (i = 0; i < num_addrs; i++) { + ret = cxip_zbcoll_reduce(zb1[i], &data1); + 
cr_assert(!ret, "zb1 reduce[%d]=%s\n", i, fi_strerror(-ret)); + + ret = cxip_zbcoll_reduce(zb2[i], &data2); + cr_assert(!ret, "zb2 reduce[%d]=%s\n", i, fi_strerror(-ret)); + } + + /* Poll until all are complete */ + ret = _await_complete_all(zb1, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb1 reduce = %s\n", + fi_strerror(-ret)); + ret = _await_complete_all(zb2, num_addrs); + cr_assert(ret == FI_SUCCESS, "zb2 reduce = %s\n", + fi_strerror(-ret)); + + /* Validate data */ + cr_assert(zbd1.count == num_addrs, "count=%d != %d\n", + zbd1.count, num_addrs); + for (i = 0; i < num_addrs; i++) { + cr_assert(data1 == zbd1.data[i], + "data1=%ld != zbd1[%d]=%ld\n", + data1, i, zbd1.data[i]); + } + cr_assert(zbd2.count == num_addrs, "count=%d != %d\n", + zbd2.count, num_addrs); + for (i = 0; i < zbd2.count; i++) { + cr_assert(data2 == zbd2.data[i], + "data2=%ld != zbd2[%d]=%ld\n", + data2, i, zbd2.data[i]); + } + + _free_getgroup_multi(num_addrs, zb2); + _free_getgroup_multi(num_addrs, zb1); +} diff --git a/prov/cxi/test/cuda.c b/prov/cxi/test/cuda.c new file mode 100644 index 00000000000..5398dcd98f3 --- /dev/null +++ b/prov/cxi/test/cuda.c @@ -0,0 +1,425 @@ +/* + * (C) Copyright 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +#define MAX_MSG_SIZE 1048576U +#define MAX_BUF_OFFSET 65536U + +unsigned int seed; + +static void cuda_init(void) +{ + enable_cxi_hmem_ops = 0; + seed = time(NULL); + srand(seed); +} + +TestSuite(cuda, .timeout = CXIT_DEFAULT_TIMEOUT, .init = cuda_init); + +static void cuda_message_runner(void *cuda_send_buf, void *cuda_recv_buf, + size_t buf_size, bool device_only_mem, + bool unexpected) +{ + int ret; + char *send_buf; + char *recv_buf; + struct fi_cq_tagged_entry cqe; + int i; + cudaError_t cuda_ret; + int j; + + cxit_setup_msg(); + + /* For device only memcpy, send and recv buffer as used for data + validation. 
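+	 * In that case, host staging buffers are allocated here: the random
+	 * payload is generated in host memory and copied to the device with
+	 * cudaMemcpy() before sending, and the received device buffer is
+	 * copied back to the host afterwards so it can be compared byte by
+	 * byte against what was sent.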
+ */ + if (device_only_mem) { + send_buf = malloc(buf_size); + cr_assert_neq(send_buf, NULL, "Failed to allocate memory"); + + recv_buf = calloc(1, buf_size); + cr_assert_neq(send_buf, NULL, "Failed to allocate memory"); + } else { + send_buf = cuda_send_buf; + recv_buf = cuda_recv_buf; + } + + for (j = 0; j < 2; j++) { + + ret = open("/dev/urandom", O_RDONLY); + cr_assert_neq(ret, -1, "open failed: %d", -errno); + read(ret, send_buf, buf_size); + close(ret); + + if (device_only_mem) { + cuda_ret = cudaMemcpy(cuda_send_buf, send_buf, buf_size, + cudaMemcpyHostToDevice); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMemcpy failed: %d", + cuda_ret); + } + + + if (unexpected) { + ret = fi_send(cxit_ep, cuda_send_buf, buf_size, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + ret = fi_recv(cxit_ep, cuda_recv_buf, buf_size, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + } else { + ret = fi_recv(cxit_ep, cuda_recv_buf, buf_size, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + ret = fi_send(cxit_ep, cuda_send_buf, buf_size, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + } + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + if (device_only_mem) { + cuda_ret = cudaMemcpy(recv_buf, cuda_recv_buf, buf_size, + cudaMemcpyDeviceToHost); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMemcpy failed: %d", + cuda_ret); + } + + for (i = 0; i < buf_size; i++) + cr_assert_eq(send_buf[i], recv_buf[i], + "Data corruption at byte %d seed %u iter %d", i, seed, j); + } + + if (device_only_mem) { + free(recv_buf); + free(send_buf); + } + + cxit_teardown_msg(); +} + +static void cuda_dev_memory_test(size_t buf_size, size_t buf_offset, + bool unexpected, bool hmem_dev_reg) +{ + cudaError_t cuda_ret; + void *cuda_send_buf; + void *cuda_recv_buf; + int ret; + + if (hmem_dev_reg) + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + else + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + /* cuda buffers will be used for RDMA. */ + cuda_ret = cudaMalloc(&cuda_send_buf, buf_size + buf_offset); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + cuda_ret = cudaMalloc(&cuda_recv_buf, buf_size + buf_offset); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + int attr_value = 1; + cuPointerSetAttribute(&attr_value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)cuda_send_buf); + + cuda_message_runner((void *)((char *)cuda_send_buf + buf_offset), + (void *)((char *)cuda_recv_buf + buf_offset), + buf_size, true, unexpected); + + cuda_ret = cudaFree(cuda_recv_buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); + + cuda_ret = cudaFree(cuda_send_buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); + +} + +/* Test messaging using rendezvous, device memory, and HMEM device memory + * registration for load/store access. 
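+ * The transfer size is drawn at random but constrained to exceed 64 KiB,
+ * which is intended to steer the message onto the rendezvous path, and a
+ * random offset of up to MAX_BUF_OFFSET bytes is applied to both device
+ * buffers to exercise unaligned GPU addresses.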
+ */ +Test(cuda, messaging_devMemory_rdvz_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, true); +} + +/* Test messaging using eager, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(cuda, messaging_devMemory_eager_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, true); +} + +/* Test messaging using IDC, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(cuda, messaging_devMemory_idc_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, true); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_rdvz_unexpected_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, true); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_eager_unexpected_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, true); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_idc_unexpected_hmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, true); +} + +/* Test messaging using rendezvous, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(cuda, messaging_devMemory_rdvz_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, false); +} + +/* Test messaging using eager, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(cuda, messaging_devMemory_eager_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, false); +} + +/* Test messaging using IDC, device memory, and without HMEM device memory + * registration for load/store access. 
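+ * Payloads are kept under 128 bytes, the range this suite uses to target
+ * the IDC send path; the eager and rendezvous variants above cover the
+ * larger sizes.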
+ */ +Test(cuda, messaging_devMemory_idc_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, false, false); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_rdvz_unexpected_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, false); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_eager_unexpected_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, false); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(cuda, messaging_devMemory_idc_unexpected_noHmemDevReg) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + cuda_dev_memory_test(buf_size, buf_offset, true, false); +} + +static void verify_dev_reg_handle(bool hmem_dev_reg) +{ + int ret; + void *buf; + cudaError_t cuda_ret; + struct fid_mr *fid_mr; + size_t buf_size = 1024; + struct cxip_mr *mr; + + cxit_setup_msg(); + + cuda_ret = cudaMalloc(&buf, buf_size); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaMalloc failed: %d", cuda_ret); + + ret = fi_mr_reg(cxit_domain, buf, buf_size, FI_READ, 0, 0x123, 0, + &fid_mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + mr = container_of(fid_mr, struct cxip_mr, mr_fid); + + cr_assert_eq(mr->md->handle_valid, hmem_dev_reg, + "Bad cxip_md handle_valid"); + cr_assert_eq(mr->md->info.iface, FI_HMEM_CUDA, + "Invalid CXIP MD iface: %d", mr->md->info.iface); + + ret = fi_close(&fid_mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + cuda_ret = cudaFree(buf); + cr_assert_eq(cuda_ret, cudaSuccess, "cudaFree failed: %d", cuda_ret); + + cxit_teardown_msg(); +} + +/* Verify MD handle is false. */ +Test(cuda, verify_noHmemDevReg) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(false); +} + +/* Verify MD handle is true. 
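+ * That is, with FI_CXI_DISABLE_HMEM_DEV_REGISTER set to "0", the CUDA
+ * buffer is expected to be registered for device load/store access and the
+ * resulting cxip_md should report handle_valid == true.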
*/ +Test(cuda, verify_hmemDevReg) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(true); +} diff --git a/prov/cxi/test/curl.c b/prov/cxi/test/curl.c new file mode 100644 index 00000000000..143a8b1fe92 --- /dev/null +++ b/prov/cxi/test/curl.c @@ -0,0 +1,546 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cxip.h" + +/* Parsed arguments */ +static bool autotest = false; +static bool verbose = false; +static char *cmd = "get"; +static char *data = NULL; +static int parallel = 100; +static char *server = "http://127.0.0.1:5000"; +static char *endpoint = "/test"; +static char serverpath[1024]; + +/* Measure timings */ +static inline void tmark(struct timespec *t0) +{ + clock_gettime(CLOCK_MONOTONIC, t0); +} + +static inline void tmeas(struct timespec *t0) +{ + struct timespec t1; + + clock_gettime(CLOCK_MONOTONIC, &t1); + if (t1.tv_nsec < t0->tv_nsec) { + t1.tv_nsec += 1000000000; + t1.tv_sec -= 1; + } + t0->tv_nsec = t1.tv_nsec - t0->tv_nsec; + t0->tv_sec = t1.tv_sec - t0->tv_sec; +} + +#define failtest(action, fmt, ...) \ + do { fprintf(stderr, fmt, ##__VA_ARGS__); action; } while (0) + +/** + * @brief Exercise the json value parser. + * + * @return int : error code + */ +int auto_test_cxip_json(void) +{ + /* Two test objects to parse */ + char json1[] = + "{" + "'string': 'string'," + "'double': 0.1234," + "'int64': 9000000000," + "'int': 2000000000," + "'bool': true," + "'object': {" + "'one': 1," + "'two': 2," + "}," + "'array': [0, 1, 2, 3]," + "'nestedarr': [" + "[0, 1, 2, 3]," + "[4, 5, 6, 7]" + "]," + "'nestedobj': [" + "{" + "'one': 1," + "'two': 2" + "}," + "{" + "'three': 3," + "'four': 4" + "}" + "]" + "}"; + char json2[] = "[0, 1, 2, 3]"; + + json_object *json_obj; + const char *key; + const char *string_val; + double double_val; + int64_t int64_val; + int int_val; + bool bool_val; + int i; + + /* Change embedded single quotes to double quotes */ + single_to_double_quote(json1); + single_to_double_quote(json2); + + /* Test parsing of json1 */ + if (!(json_obj = json_tokener_parse(json1))) + failtest(return 1, "json1 could not be parsed\n"); + + key = "string"; + if (cxip_json_string(key, json_obj, &string_val)) + failtest(return 1, "'%s' key not found\n", key); + if (strcmp(string_val, "string")) + failtest(return 1, "'%s' returned '%s'\n", key, string_val); + + key = "double"; + if (cxip_json_double(key, json_obj, &double_val)) + failtest(return 1, "'%s' key not found\n", key); + if (double_val != 0.1234) + failtest(return 1, "'%s' returned %f\n", key, double_val); + + key = "int64"; + if (cxip_json_int64(key, json_obj, &int64_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int64_val != 9000000000) + failtest(return 1, "'%s' returned 0x%lx\n", key, int64_val); + + key = "int"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 2000000000) + failtest(return 1, "'%s' returned 0x%x\n", key, int_val); + + key = "bool"; + if (cxip_json_bool(key, json_obj, &bool_val)) + failtest(return 1, "'%s' key not found\n", key); + if (bool_val != true) + failtest(return 1, "'%s' returned %d\n", key, bool_val); + + key = "object.one"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, 
"'%s' key not found\n", key); + if (int_val != 1) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "object.two"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 2) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "nestedobj[0].one"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 1) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "nestedobj[0].two"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 2) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "nestedobj[1].three"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 3) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + key = "nestedobj[1].four"; + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != 4) + failtest(return 1, "'%s' returned %d\n", key, int_val); + + for (i = 0; i < 4; i++) { + char key[256]; + snprintf(key, sizeof(key), "array[%d]", i); + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != i) + failtest(return 1, "'%s' returned %d\n", key, int_val); + } + + for (i = 0; i < 8; i++) { + char key[256]; + snprintf(key, sizeof(key), "nestedarr[%d][%d]", i/4, i%4); + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != i) + failtest(return 1, "'%s' returned %d\n", key, int_val); + } + json_object_put(json_obj); + + /* Test parsing of json2 */ + if (!(json_obj = json_tokener_parse(json2))) + failtest(return 1, "json2 could not be parsed\n"); + for (i = 0; i < 4; i++) { + char key[256]; + snprintf(key, sizeof(key), "[%d]", i); + if (cxip_json_int(key, json_obj, &int_val)) + failtest(return 1, "'%s' key not found\n", key); + if (int_val != i) + failtest(return 1, "'%s' returned %d\n", key, int_val); + } + json_object_put(json_obj); + + if (verbose) + printf("PASSED JSON tests\n"); + return 0; +} + +/** + * @brief Simple completion callback. + * + * This expects an (int) usrptr to be registered with the CURL initiation, + * and simply increments it every time a CURL operation completes. + * + * @param handle : CURL operation handle + */ +static void complete(struct cxip_curl_handle *handle) +{ + int *counter = (int *)handle->usrptr; + + (*counter)++; +} + +/** + * @brief Exercise the CURL code. + * + * The flask_testsrv.py code must be running to perform this test. It will + * pass with a warning message if the server is not found. 
+ * + * @return int : error code + */ +int auto_test_cxip_curl(void) +{ + struct cxip_curl_handle *handle; + struct timespec t0, t1; + json_object *json_obj; + int op, ret; + char tag[256]; + int counter; + + /* confirm that the server is running : status is 0 if no server */ + ret = cxip_curl_perform(serverpath, NULL, NULL, 0, CURL_GET, false, + complete, &counter); + do { + ret = cxip_curl_progress(&handle); + } while (ret == -FI_EAGAIN); + if (ret) { + fprintf(stderr, "cxip_curl_perform() returned %s\n", + fi_strerror(-ret)); + return ret; + } + if (!handle) { + fprintf(stderr, "cxip_curl_perform() returned no handle\n"); + return -1; + } + if (handle->status == 0) { + fprintf(stderr, "SERVER at %s is not running\n", serverpath); + cxip_curl_free(handle); + return 0; + } + cxip_curl_free(handle); + + /* Walk through all of the test-supported operations */ + for (op = CURL_GET; op < CURL_MAX; op++) { + const char *opname = cxip_curl_opname(op); + bool reordered = false; + int nextseqid = 0; + int seqid; + int i, err = 0; + + /* reset the callback counter to zero on every opcode */ + counter = 0; + + if (verbose) + printf("\nOperation %s\n", cxip_curl_opname(op)); + + /* Run 'parallel' operations concurrently */ + tmark(&t0); + for (i = 0; i < parallel; i++) { + sprintf(tag, "{\"seqid\": %d}", i); + ret = cxip_curl_perform(serverpath, tag, NULL, 0, + op, false, complete, &counter); + if (ret != 0) + fprintf(stderr, "cxip_curl_perform(%d) = %s\n", + i, fi_strerror(ret)); + } + tmeas(&t0); + + /* Wait for all initiated operations to finish */ + tmark(&t1); + while (i-- > 0) { + do { + sched_yield(); + ret = cxip_curl_progress(&handle); + } while (ret == -FI_EAGAIN); + if (ret) { + /* should not happen, as we are counting */ + fprintf(stderr, "cxip_curl_progress() %s\n", + fi_strerror(-ret)); + err++; + continue; + } + if (!handle) { + /* should NEVER happen with good return */ + fprintf(stderr, + "cxip_curl_progress() no handle\n"); + err++; + continue; + } + if (handle->status != 200) { + /* our test server should generate 200 */ + fprintf(stderr, "status=%ld\n", handle->status); + err++; + goto free_handle; + } + if (!handle->response) { + /* CURL should not return a NULL response */ + fprintf(stderr, "NULL response\n"); + err++; + goto free_handle; + } + + /* Test server should return: + * { + * "operation": , + * "data": {"seqid": } + * } + */ + const char *str; + json_obj = json_tokener_parse(handle->response); + if (! 
json_obj) { + fprintf(stderr, "%s: JSON unparseable\n", + opname); + err++; + goto free_handle; + } + + if (cxip_json_string("operation", json_obj, &str)) { + fprintf(stderr, "no 'operation' field\n"); + err++; + goto free_json; + } + + if (strcmp(str, opname)) { + fprintf(stderr, "op=%s exp %s\n", str, opname); + err++; + goto free_json; + } + + /* For GET, seqid is is meaningless */ + if (op == CURL_GET) + goto free_json; + + if (cxip_json_int("data.seqid", json_obj, &seqid)) { + fprintf(stderr, "op=%s no seqid\n", opname); + err++; + goto free_json; + } + + /* This confirms that CURL does not order responses */ + if (seqid != nextseqid) + reordered = true; +free_json: + json_object_put(json_obj); +free_handle: + cxip_curl_free(handle); + nextseqid++; + } + tmeas(&t1); + + /* Should be no strays */ + ret = cxip_curl_progress(&handle); + if (ret != -FI_ENODATA) { + fprintf(stderr, "op=%s stray handles\n", opname); + err++; + } + + /* Callback counter should match number of calls */ + if (counter != parallel) { + fprintf(stderr, "op=%s count=%d, exp %d\n", + opname, counter, parallel); + err++; + } + + if (verbose) { + printf(" iterations(%d)\n", parallel); + printf(" counter (%d)\n", counter); + printf(" reordered (%s)\n", reordered ? "true" : "false"); + printf(" errors (%d)\n", err); + printf(" issue (%ld.%09lds)\n", t0.tv_sec, t0.tv_nsec); + printf(" response (%ld.%09lds)\n", t1.tv_sec, t1.tv_nsec); + } + + if (err) + failtest(return 1, "FAILED CURL tests\n"); + } + if (verbose) + printf("\n"); + + printf("PASSED CURL tests\n"); + return 0; +} + +/** + * @brief Perform a manual (command-line arguments) test + * + * @return int : error code + */ +int do_test(void) +{ + struct cxip_curl_handle *handle; + struct timespec t0; + enum curl_ops op; + int ret; + + if (!strcasecmp(cmd, "get")) + op = CURL_GET; + else if (!strcasecmp(cmd, "put")) + op = CURL_PUT; + else if (!strcasecmp(cmd, "post")) + op = CURL_POST; + else if (!strcasecmp(cmd, "patch")) + op = CURL_PATCH; + else if (!strcasecmp(cmd, "delete")) + op = CURL_DELETE; + else { + fprintf(stderr, "Bad HTTP operation \"%s\"", cmd); + return 1; + } + + tmark(&t0); + ret = cxip_curl_perform(serverpath, data, NULL, 0, op, verbose, 0, 0); + if (ret) { + fprintf(stderr, "cxip_curl_perform() returned %d\n", ret); + return ret; + } + + do { + sched_yield(); + ret = cxip_curl_progress(&handle); + } while (ret == -FI_EAGAIN); + tmeas(&t0); + + if (ret) + failtest(return 1, "cxip_curl_progress() ret %d\n", ret); + if (!handle) + failtest(return 1, "cxip_curl_progress() ret no handle\n"); + if (!handle->status) { + fprintf(stderr, "SERVER at %s is not running\n", serverpath); + return 0; + } + + printf("\n"); + printf("endpoint = %s\n", handle->endpoint); + printf("time = %ld.%09lds\n", t0.tv_sec, t0.tv_nsec); + printf("status = %ld\n", handle->status); + printf("request------------\n%s\n", handle->request); + printf("response-----------\n%s\n", handle->response); + + return 0; +} + +int main(int argc, char **argv) { + static char *opts = "c:d:e:p:r:s:hv"; + static struct option lopts[] = { + {"help", no_argument, NULL, 'h'}, + {"auto", no_argument, NULL, 'a'}, + {"verbose", no_argument, NULL, 'v'}, + {"command", required_argument, NULL, 'c'}, + {"data", required_argument, NULL, 'd'}, + {"parallel", required_argument, NULL, 'p'}, + {"server", required_argument, NULL, 's'}, + {"endpoint", required_argument, NULL, 'e'}, + {0, 0, 0, 0} + }; + static const char *help = + "\nExercise cxip_curl module:\n" + " --auto Perform automated test suite\n" + " 
-c, --command Define HTTP command\n" + " -d, --data Define HTTP payload (json)\n" + " -p, --parallel Define level of auto-test parallism\n" + " -s, --server REST server address" + " (default \"%s\")\n" + " -e, --endpoint REST server endpoint\n" + " -v, --verbose Verbose operation\n" + " -h, --help Display help\n"; + + int ret = 1; + + while (1) { + int idx, c; + + c = getopt_long(argc, argv, opts, lopts, &idx); + if (c == -1) + break; + switch (c) { + case 0: // long option, all map to single characters + break; + case 'a': + autotest = 1; + break; + case 'c': + cmd = strdup(optarg); + break; + case 'd': + data = strdup(optarg); + break; + case 'p': + parallel = atoi(optarg); + break; + case 's': + server = strdup(optarg); + break; + case 'e': + endpoint = strdup(optarg); + break; + case 'v': + verbose = true; + break; + case 'h': + ret = 0; + // fall through + default: + printf(help, server); + return ret; + } + } + + snprintf(serverpath, sizeof(serverpath), "%s%s", server, endpoint); + + if (cxip_curl_init()) { + fprintf(stderr, "CURL could not be initialized\n"); + return ret; + } + + if (autotest) { + ret = auto_test_cxip_json() | + auto_test_cxip_curl(); + } else { + ret = do_test(); + } + + cxip_curl_fini(); + return ret; +} diff --git a/prov/cxi/test/cxip_test_common.c b/prov/cxi/test/cxip_test_common.c new file mode 100644 index 00000000000..999a12627df --- /dev/null +++ b/prov/cxi/test/cxip_test_common.c @@ -0,0 +1,1105 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018,2020-2022 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cxip_test_common.h" + +struct fi_info *cxit_fi_hints; +struct fi_info *cxit_fi; +struct fid_fabric *cxit_fabric; +struct fid_domain *cxit_domain; +struct fi_cxi_dom_ops *dom_ops; +struct fid_ep *cxit_ep; +struct fid_ep *cxit_tx_alias_ep; +struct cxip_addr cxit_ep_addr; +fi_addr_t cxit_ep_fi_addr; +struct fi_eq_attr cxit_eq_attr = {}; +struct fid_eq *cxit_eq; +struct fi_cq_attr cxit_tx_cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + .size = 16384 +}; +struct fi_cq_attr cxit_rx_cq_attr = { .format = FI_CQ_FORMAT_TAGGED }; +uint64_t cxit_eq_bind_flags = 0; +uint64_t cxit_tx_cq_bind_flags = FI_TRANSMIT; +uint64_t cxit_rx_cq_bind_flags = FI_RECV; +struct fid_cq *cxit_tx_cq, *cxit_rx_cq; +struct fi_cntr_attr cxit_cntr_attr = {}; +struct fid_cntr *cxit_send_cntr, *cxit_recv_cntr; +struct fid_cntr *cxit_read_cntr, *cxit_write_cntr; +struct fid_cntr *cxit_rem_cntr; +struct fi_av_attr cxit_av_attr; +struct fid_av *cxit_av; +struct cxit_coll_mc_list cxit_coll_mc_list = { .count = 5 }; +char *cxit_node, *cxit_service; +uint64_t cxit_flags; +int cxit_n_ifs; +struct fid_av_set *cxit_av_set; +struct fid_mc *cxit_mc; +bool cxit_prov_key; +int s_page_size; +bool enable_cxi_hmem_ops = 1; + +/* Get _SC_PAGESIZE */ +static void cxit_set_page_size(void) +{ + if (!s_page_size) + s_page_size = sysconf(_SC_PAGESIZE); +} + +int cxit_dom_read_cntr(unsigned int cntr, uint64_t *value, + struct timespec *ts, bool sync) +{ + int ret; + struct timespec start; + struct timespec delta; + + /* Map counters if not already mapped */ + ret = dom_ops->cntr_read(&cxit_domain->fid, cntr, value, &start); + if (ret || !sync) + goto done; + + /* Wait for an update to occur to read latest counts */ + do { + usleep(100); + ret = dom_ops->cntr_read(&cxit_domain->fid, cntr, value, + &delta); + } while (!ret && delta.tv_sec == start.tv_sec && + delta.tv_nsec == 
start.tv_nsec); + +done: + if (ts && !ret) + *ts = sync ? delta : start; + + return ret; +} + +static ssize_t copy_from_hmem_iov(void *dest, size_t size, + enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset) +{ + size_t cpy_size = MIN(size, hmem_iov->iov_len); + + assert(iface == FI_HMEM_SYSTEM); + assert(hmem_iov_count == 1); + assert(hmem_iov_offset == 0); + + memcpy(dest, hmem_iov->iov_base, cpy_size); + + return cpy_size; +} + +static ssize_t copy_to_hmem_iov(enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset, const void *src, + size_t size) +{ + size_t cpy_size = MIN(size, hmem_iov->iov_len); + + assert(iface == FI_HMEM_SYSTEM); + assert(hmem_iov_count == 1); + assert(hmem_iov_offset == 0); + + memcpy(hmem_iov->iov_base, src, cpy_size); + + return cpy_size; +} + +struct fi_hmem_override_ops cxi_hmem_ops = { + .copy_from_hmem_iov = copy_from_hmem_iov, + .copy_to_hmem_iov = copy_to_hmem_iov, +}; + +void cxit_create_fabric_info(void) +{ + int ret; + + if (cxit_fi) + return; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &cxit_fi); + cr_assert(ret == FI_SUCCESS, "fi_getinfo"); + cxit_fi->ep_attr->tx_ctx_cnt = cxit_fi->domain_attr->tx_ctx_cnt; + cxit_fi->ep_attr->rx_ctx_cnt = cxit_fi->domain_attr->rx_ctx_cnt; + + /* Add in FI_SOURCE and FI_SOURCE_ERR to include all capabilities */ + cxit_fi->caps |= FI_SOURCE | FI_SOURCE_ERR; + cxit_fi->rx_attr->caps |= FI_SOURCE | FI_SOURCE_ERR; +} + +void cxit_destroy_fabric_info(void) +{ + fi_freeinfo(cxit_fi); + cxit_fi = NULL; +} + +void cxit_create_fabric(void) +{ + int ret; + + if (cxit_fabric) + return; + + ret = fi_fabric(cxit_fi->fabric_attr, &cxit_fabric, NULL); + cr_assert(ret == FI_SUCCESS, "fi_fabric"); +} + +void cxit_destroy_fabric(void) +{ + int ret; + + ret = fi_close(&cxit_fabric->fid); + cr_assert(ret == FI_SUCCESS, "fi_close fabric"); + cxit_fabric = NULL; +} + +void cxit_create_domain(void) +{ + int ret; + + if (cxit_domain) + return; + + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + cr_assert(ret == FI_SUCCESS, "fi_domain"); + + /* Should be able to open either v1 - v6 */ + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_1, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v1"); + cr_assert(dom_ops->cntr_read != NULL, "v1 function returned"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_2, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v2"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL, "V2 functions returned"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v3"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL && + dom_ops->enable_hybrid_mr_desc != NULL, + "V3 functions returned"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_6, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v6"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL && + dom_ops->enable_hybrid_mr_desc != NULL && + dom_ops->ep_get_unexp_msgs != NULL && + dom_ops->get_dwq_depth != NULL && + dom_ops->enable_mr_match_events != NULL, + "V3 functions returned"); + + if (enable_cxi_hmem_ops) { + ret = fi_set_ops(&cxit_domain->fid, FI_SET_OPS_HMEM_OVERRIDE, 0, + &cxi_hmem_ops, 
NULL); + cr_assert(ret == FI_SUCCESS, "fi_set_ops"); + } +} + +void cxit_destroy_domain(void) +{ + int ret; + + ret = fi_close(&cxit_domain->fid); + cr_assert(ret == FI_SUCCESS, "fi_close domain. %d", ret); + cxit_domain = NULL; +} + +void cxit_create_ep(void) +{ + int ret; + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + cr_assert_not_null(cxit_ep); +} + +void cxit_destroy_ep(void) +{ + int ret; + + if (cxit_ep != NULL) { + ret = fi_close(&cxit_ep->fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint = %d", ret); + cxit_ep = NULL; + } +} + +void cxit_create_eq(void) +{ + struct fi_eq_attr attr = { + .size = 32, + .flags = FI_WRITE, + .wait_obj = FI_WAIT_NONE + }; + int ret; + + ret = fi_eq_open(cxit_fabric, &attr, &cxit_eq, NULL); + cr_assert(ret == FI_SUCCESS, "fi_eq_open failed %d", ret); + cr_assert_not_null(cxit_eq, "fi_eq_open returned NULL eq"); +} + +void cxit_destroy_eq(void) +{ + int ret; + + ret = fi_close(&cxit_eq->fid); + cr_assert(ret == FI_SUCCESS, "fi_close EQ failed %d", ret); + cxit_eq = NULL; +} + +void cxit_bind_eq(void) +{ + int ret; + + /* NOTE: ofi implementation does not allow any flags */ + ret = fi_ep_bind(cxit_ep, &cxit_eq->fid, cxit_eq_bind_flags); + cr_assert(!ret, "fi_ep_bind EQ"); +} + +void cxit_create_cqs(void) +{ + int ret; + + ret = fi_cq_open(cxit_domain, &cxit_tx_cq_attr, &cxit_tx_cq, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open (TX)"); + + ret = fi_cq_open(cxit_domain, &cxit_rx_cq_attr, &cxit_rx_cq, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open (RX)"); +} + +void cxit_destroy_cqs(void) +{ + int ret; + + ret = fi_close(&cxit_rx_cq->fid); + cr_assert(ret == FI_SUCCESS, "fi_close RX CQ"); + cxit_rx_cq = NULL; + + ret = fi_close(&cxit_tx_cq->fid); + cr_assert(ret == FI_SUCCESS, "fi_close TX CQ"); + cxit_tx_cq = NULL; +} + +void cxit_bind_cqs(void) +{ + int ret; + + ret = fi_ep_bind(cxit_ep, &cxit_tx_cq->fid, cxit_tx_cq_bind_flags); + cr_assert(!ret, "fi_ep_bind TX CQ"); + + ret = fi_ep_bind(cxit_ep, &cxit_rx_cq->fid, cxit_rx_cq_bind_flags); + cr_assert(!ret, "fi_ep_bind RX CQ"); +} + +void cxit_create_rem_cntrs(void) +{ + int ret; + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_rem_cntr, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (rem)"); +} + +void cxit_create_local_cntrs(void) +{ + int ret; + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_send_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (send)"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_recv_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (recv)"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_read_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (read)"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_write_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (write)"); +} + +void cxit_create_cntrs(void) +{ + cxit_create_local_cntrs(); + cxit_create_rem_cntrs(); +} + +void cxit_destroy_cntrs(void) +{ + int ret; + + if (cxit_send_cntr) { + ret = fi_close(&cxit_send_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close send_cntr"); + cxit_send_cntr = NULL; + } + + if (cxit_recv_cntr) { + ret = fi_close(&cxit_recv_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close recv_cntr"); + cxit_recv_cntr = NULL; + } + + if (cxit_read_cntr) { + ret = fi_close(&cxit_read_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close read_cntr"); + cxit_read_cntr = NULL; + } + + if (cxit_write_cntr) { + ret = fi_close(&cxit_write_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close 
write_cntr"); + cxit_write_cntr = NULL; + } + + if (cxit_rem_cntr) { + ret = fi_close(&cxit_rem_cntr->fid); + cr_assert(ret == FI_SUCCESS, "fi_close rem_cntr"); + cxit_rem_cntr = NULL; + } +} + +void cxit_bind_cntrs(void) +{ + int ret; + + if (cxit_send_cntr) { + ret = fi_ep_bind(cxit_ep, &cxit_send_cntr->fid, FI_SEND); + cr_assert(!ret, "fi_ep_bind send_cntr"); + } + + if (cxit_recv_cntr) { + ret = fi_ep_bind(cxit_ep, &cxit_recv_cntr->fid, FI_RECV); + cr_assert(!ret, "fi_ep_bind recv_cntr"); + } + + if (cxit_read_cntr) { + ret = fi_ep_bind(cxit_ep, &cxit_read_cntr->fid, FI_READ); + cr_assert(!ret, "fi_ep_bind read_cntr"); + } + + if (cxit_write_cntr) { + ret = fi_ep_bind(cxit_ep, &cxit_write_cntr->fid, FI_WRITE); + cr_assert(!ret, "fi_ep_bind write_cntr"); + } +} + +void cxit_create_av(void) +{ + int ret; + + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + cr_assert(ret == FI_SUCCESS, "fi_av_open"); +} + +void cxit_destroy_av(void) +{ + int ret; + + ret = fi_close(&cxit_av->fid); + cr_assert(ret == FI_SUCCESS, "fi_close AV. %d", ret); + cxit_av = NULL; +} + +void cxit_bind_av(void) +{ + int ret; + + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + cr_assert(!ret, "fi_ep_bind AV"); +} + +void cxit_init(void) +{ + struct slist_entry *entry, *prev __attribute__((unused)); + int ret; + struct fi_info *hints = cxit_allocinfo(); + struct fi_info *info; + + setlinebuf(stdout); + cxit_set_page_size(); + + /* Force provider init */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, hints, + &info); + cr_assert(ret == FI_SUCCESS); + + slist_foreach(&cxip_if_list, entry, prev) { + cxit_n_ifs++; + } + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +struct fi_info *cxit_allocinfo(void) +{ + struct fi_info *info; + char *odp_env; + char *prov_key_env; + + info = fi_allocinfo(); + cr_assert(info, "fi_allocinfo"); + + /* Always select CXI */ + info->fabric_attr->prov_name = strdup(cxip_prov_name); + + info->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + + /* Test with provider generated keys instead of client */ + prov_key_env = getenv("CXIP_TEST_PROV_KEY"); + if (prov_key_env && strtol(prov_key_env, NULL, 10)) { + cxit_prov_key = 1; + info->domain_attr->mr_mode |= FI_MR_PROV_KEY; + } else { + cxit_prov_key = 0; + } + + /* If remote ODP is enabled then test with ODP */ + odp_env = getenv("CXIP_TEST_ODP"); + if (odp_env && strtol(odp_env, NULL, 10)) + info->domain_attr->mr_mode &= ~FI_MR_ALLOCATED; + + return info; +} + +void cxit_setup_getinfo(void) +{ + cxit_init(); + + if (!cxit_fi_hints) + cxit_fi_hints = cxit_allocinfo(); +} + +void cxit_teardown_getinfo(void) +{ + fi_freeinfo(cxit_fi_hints); + cxit_fi_hints = NULL; +} + +void cxit_setup_fabric(void) +{ + cxit_setup_getinfo(); + cxit_create_fabric_info(); +} + +void cxit_teardown_fabric(void) +{ + cxit_destroy_fabric_info(); + cxit_teardown_getinfo(); +} + +void cxit_setup_domain(void) +{ + cxit_setup_fabric(); + cxit_create_fabric(); +} + +void cxit_teardown_domain(void) +{ + cxit_destroy_fabric(); + cxit_teardown_fabric(); +} + +void cxit_setup_ep(void) +{ + cxit_setup_domain(); + cxit_create_domain(); +} + +void cxit_teardown_ep(void) +{ + cxit_destroy_domain(); + cxit_teardown_domain(); +} + +void cxit_setup_enabled_ep_disable_fi_rma_event(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = 
FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_setup_ep(); + + cxit_fi->caps &= ~FI_RMA_EVENT; + cxit_fi->domain_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->tx_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->rx_attr->caps &= ~FI_RMA_EVENT; + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + + /* No FI_RMA_EVENT, don't create/bind remote counters */ + cxit_create_local_cntrs(); + cxit_bind_cntrs(); + + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_enabled_ep_mr_events(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_setup_ep(); + + /* Enable FI_CXI_MR_MATCH_EVENTS via domain */ + ret = dom_ops->enable_mr_match_events(&cxit_domain->fid, + true); + cr_assert_eq(ret, FI_SUCCESS); + + /* Disable RMA events to make sure MATCH_EVENTS on its own is + * sufficient to disallow atomic with FI_DELIVERY_COMPLETE. + */ + cxit_fi->caps &= ~FI_RMA_EVENT; + cxit_fi->domain_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->tx_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->rx_attr->caps &= ~FI_RMA_EVENT; + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + + /* No FI_RMA_EVENT, so only create local counters */ + cxit_create_local_cntrs(); + cxit_bind_cntrs(); + + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_enabled_ep(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_fi_hints->tx_attr->size = 512; + + cxit_setup_ep(); + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. 
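+	 * fi_getname() is expected to fill exactly sizeof(struct cxip_addr)
+	 * bytes; the addrlen assertion below guards against the provider
+	 * address format silently changing size.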
*/ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_enabled_ep_fd(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_rx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_tx_cq_attr.wait_obj = FI_WAIT_FD; + cxit_rx_cq_attr.wait_obj = FI_WAIT_FD; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_setup_ep(); + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_rma_disable_fi_rma_event(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_ep_disable_fi_rma_event(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + +void cxit_setup_rma_mr_events(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + bool disable = false; + + cxit_setup_enabled_ep_mr_events(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); + + /* Ensure if FI_MR_PROV_KEY cache will not be used */ + fi_control(&cxit_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, &disable); +} + +void cxit_bind_cqs_hybrid_mr_desc(void) +{ + int ret; + + ret = fi_ep_bind(cxit_ep, &cxit_tx_cq->fid, + cxit_tx_cq_bind_flags | FI_SELECTIVE_COMPLETION); + cr_assert(!ret, "fi_ep_bind TX CQ"); + + ret = fi_ep_bind(cxit_ep, &cxit_rx_cq->fid, + cxit_rx_cq_bind_flags | FI_SELECTIVE_COMPLETION); + cr_assert(!ret, "fi_ep_bind RX CQ"); +} + +void cxit_create_domain_hybrid_mr_desc(void) +{ + int ret; + + if (cxit_domain) + return; + + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + cr_assert(ret == FI_SUCCESS, "fi_domain"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&dom_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_open_ops v2"); + cr_assert(dom_ops->cntr_read != NULL && + dom_ops->topology != NULL && + dom_ops->enable_hybrid_mr_desc != NULL, + "V3 functions returned"); + + if (enable_cxi_hmem_ops) { + ret = fi_set_ops(&cxit_domain->fid, FI_SET_OPS_HMEM_OVERRIDE, 0, + &cxi_hmem_ops, NULL); + cr_assert(ret == FI_SUCCESS, "fi_set_ops"); + } + + ret = dom_ops->enable_hybrid_mr_desc(&cxit_domain->fid, true); + cr_assert(ret == FI_SUCCESS, "enable_hybrid_mr_desc failed"); +} + +void cxit_setup_ep_hybrid_mr_desc(void) +{ + cxit_setup_domain(); + 
cxit_create_domain_hybrid_mr_desc(); +} + +void cxit_setup_enabled_ep_hybrid_mr_desc(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + cxit_setup_ep_hybrid_mr_desc(); + + cxit_fi->caps &= ~FI_RMA_EVENT; + cxit_fi->domain_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->tx_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->rx_attr->caps &= ~FI_RMA_EVENT; + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs_hybrid_mr_desc(); + + /* No FI_RMA_EVENT, don't create/bind remote counters */ + cxit_create_local_cntrs(); + cxit_bind_cntrs(); + + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_rma_hybrid_mr_desc(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_ep_hybrid_mr_desc(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + +void cxit_setup_rma(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxip_trace_append = true; + cxip_trace_enable(true); + cxit_setup_enabled_ep(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + +void cxit_teardown_rma(void) +{ + /* Tear down RMA objects */ + cxit_destroy_ep(); /* EP must be destroyed before bound objects */ + + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_destroy_eq(); + cxit_teardown_ep(); +} + +/* Use FI_WAIT_FD CQ wait object */ +void cxit_setup_rma_fd(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_ep_fd(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + +#define CXI0_AMO_REMAP \ + "/sys/class/cxi/cxi0/device/properties/amo_remap_to_pcie_fadd" + +void set_amo_remap_to_pcie_fadd(int amo_remap_to_pcie_fadd) +{ + FILE *fd; + int ret; + + /* Assume open a single CXI device is present. 
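+	 * (i.e., only cxi0). The amo_remap_to_pcie_fadd property is written
+	 * directly through sysfs; tests that change it are expected to call
+	 * reset_amo_remap_to_pcie_fadd() afterwards, which writes -1 as the
+	 * reset value.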
*/ + fd = fopen(CXI0_AMO_REMAP, "w"); + cr_assert(fd != NULL, "Failed to open %s: %d\n", CXI0_AMO_REMAP, + -errno); + + ret = fprintf(fd, "%d", amo_remap_to_pcie_fadd); + cr_assert(ret >= 0, + "Failed to write AMO remap value: errno=%d\n", -errno); + + fclose(fd); +} + +void reset_amo_remap_to_pcie_fadd(void) +{ + set_amo_remap_to_pcie_fadd(-1); +} + +static void cxit_setup_tx_alias_rma_impl(bool delivery_complete) +{ + int ret; + struct cxip_ep *cxi_ep; + struct cxip_ep *cxi_alias_ep = NULL; + uint64_t op_flags; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_ep(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); + + /* Create TX alias EP */ + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + cr_assert(!(cxi_ep->tx_attr.op_flags & FI_RECV), "Bad op flags"); + + op_flags = cxi_ep->tx_attr.op_flags | FI_TRANSMIT; + if (delivery_complete) + op_flags |= FI_DELIVERY_COMPLETE; + ret = fi_ep_alias(cxit_ep, &cxit_tx_alias_ep, op_flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_alias"); + + cxi_alias_ep = container_of(&cxit_tx_alias_ep->fid, + struct cxip_ep, ep.fid); + cr_assert_not_null(cxi_alias_ep->ep_obj); +} + +void cxit_setup_tx_alias_rma(void) +{ + cxit_setup_tx_alias_rma_impl(false); +} + +void cxit_setup_tx_alias_rma_dc(void) +{ + cxit_setup_tx_alias_rma_impl(true); +} + +void cxit_teardown_tx_alias_rma(void) +{ + struct cxip_ep *cxi_ep; + int ret; + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_close(&cxit_tx_alias_ep->fid); + cr_assert(ret == FI_SUCCESS, "fi_close alias endpoint"); + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 0, + "EP reference count"); + + /* Tear down RMA objects */ + cxit_destroy_ep(); /* EP must be destroyed before bound objects */ + + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_destroy_eq(); + cxit_teardown_ep(); +} + +/* Everyone needs to wait sometime */ +int cxit_await_completion(struct fid_cq *cq, struct fi_cq_tagged_entry *cqe) +{ + int ret; + + do { + ret = fi_cq_read(cq, cqe, 1); + } while (ret == -FI_EAGAIN); + + return ret; +} + +void validate_tx_event(struct fi_cq_tagged_entry *cqe, uint64_t flags, + void *context) +{ + cr_assert(cqe->op_context == context, "TX CQE Context mismatch"); + cr_assert(cqe->flags == flags, "TX CQE flags mismatch"); + cr_assert(cqe->len == 0, "Invalid TX CQE length"); + cr_assert(cqe->buf == 0, "Invalid TX CQE address"); + cr_assert(cqe->data == 0, "Invalid TX CQE data"); + cr_assert(cqe->tag == 0, "Invalid TX CQE tag"); +} + +void validate_rx_event(struct fi_cq_tagged_entry *cqe, void *context, + size_t len, uint64_t flags, void *buf, uint64_t data, + uint64_t tag) +{ + cr_assert(cqe->op_context == context, "CQE Context mismatch"); + cr_assert(cqe->len == len, "Invalid CQE length"); + cr_assert(cqe->flags == flags, "CQE flags mismatch"); + cr_assert(cqe->buf == buf, "Invalid CQE address (%p %p)", + cqe->buf, buf); + cr_assert(cqe->data == data, "Invalid CQE data"); + cr_assert(cqe->tag == tag, "Invalid CQE tag"); +} + +void validate_rx_event_mask(struct fi_cq_tagged_entry *cqe, void *context, + size_t len, uint64_t flags, void *buf, + uint64_t data, uint64_t tag, uint64_t ignore) +{ + cr_assert(cqe->op_context == context, "CQE Context 
mismatch"); + cr_assert(cqe->len == len, "Invalid CQE length: (%lu %lu)", + cqe->len, len); + cr_assert(cqe->flags == flags, "CQE flags mismatch"); + cr_assert(cqe->buf == buf, "Invalid CQE address (%p %p)", + cqe->buf, buf); + cr_assert(cqe->data == data, "Invalid CQE data"); + cr_assert(((cqe->tag & ~ignore) == (tag & ~ignore)), "Invalid CQE tag"); +} + +void validate_multi_recv_rx_event(struct fi_cq_tagged_entry *cqe, void + *context, size_t len, uint64_t flags, + uint64_t data, uint64_t tag) +{ + cr_assert(cqe->op_context == context, "CQE Context mismatch"); + cr_assert(cqe->len == len, "Invalid CQE length"); + cr_assert((cqe->flags & ~FI_MULTI_RECV) == flags, + "CQE flags mismatch (%#llx %#lx)", + (cqe->flags & ~FI_MULTI_RECV), flags); + cr_assert(cqe->data == data, "Invalid CQE data"); + cr_assert(cqe->tag == tag, "Invalid CQE tag %#lx %#lx", cqe->tag, tag); +} + +int mr_create_ext(size_t len, uint64_t access, uint8_t seed, uint64_t *key, + struct fid_cntr *cntr, struct mem_region *mr) +{ + int ret; + + cr_assert_not_null(mr); + + if (len) { + mr->mem = calloc(1, len); + cr_assert_not_null(mr->mem, "Error allocating memory window"); + } else { + mr->mem = 0; + } + + for (size_t i = 0; i < len; i++) + mr->mem[i] = i + seed; + + ret = fi_mr_reg(cxit_domain, mr->mem, len, access, 0, *key, 0, &mr->mr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed %d", ret); + ret = fi_mr_bind(mr->mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind(ep) failed %d", ret); + + if (cxit_fi->caps & FI_RMA_EVENT && cntr) { + ret = fi_mr_bind(mr->mr, &cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind(cntr) failed %d", + ret); + } + + ret = fi_mr_enable(mr->mr); + if (!ret) + *key = fi_mr_key(mr->mr); + + return ret; +} + +int mr_create(size_t len, uint64_t access, uint8_t seed, uint64_t *key, + struct mem_region *mr) +{ + return mr_create_ext(len, access, seed, key, cxit_rem_cntr, mr); +} + +void mr_destroy(struct mem_region *mr) +{ + fi_close(&mr->mr->fid); + free(mr->mem); +} diff --git a/prov/cxi/test/cxip_test_common.h b/prov/cxi/test/cxip_test_common.h new file mode 100644 index 00000000000..d04e8fdf56d --- /dev/null +++ b/prov/cxi/test/cxip_test_common.h @@ -0,0 +1,141 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018,2020 Hewlett Packard Enterprise Development LP + */ + +#ifndef _CXIP_TEST_COMMON_H_ +#define _CXIP_TEST_COMMON_H_ + +#include "cxip.h" + +#define CXIT_DEFAULT_TIMEOUT 10 + +extern struct fi_info *cxit_fi_hints; +extern struct fi_info *cxit_fi; +extern struct fid_fabric *cxit_fabric; +extern struct fid_domain *cxit_domain; +extern struct fi_cxi_dom_ops *dom_ops; +extern struct fid_ep *cxit_ep; +extern struct fid_ep *cxit_tx_alias_ep; +extern struct cxip_addr cxit_ep_addr; +extern fi_addr_t cxit_ep_fi_addr; +extern struct fid_eq *cxit_eq; +extern struct fi_cq_attr cxit_tx_cq_attr, cxit_rx_cq_attr; +extern uint64_t cxit_tx_cq_bind_flags; +extern uint64_t cxit_rx_cq_bind_flags; +extern struct fid_cq *cxit_tx_cq, *cxit_rx_cq; +extern struct fi_cntr_attr cxit_cntr_attr; +extern struct fid_cntr *cxit_send_cntr, *cxit_recv_cntr; +extern struct fid_cntr *cxit_read_cntr, *cxit_write_cntr; +extern struct fid_cntr *cxit_rem_cntr; +extern struct fi_av_attr cxit_av_attr; +extern struct fid_av *cxit_av; +extern char *cxit_node, *cxit_service; +extern uint64_t cxit_flags; +extern int cxit_n_ifs; +extern struct fid_av_set *cxit_av_set; +extern struct fid_mc *cxit_mc; +extern FILE *cxit_mc_fifo; +extern bool 
cxit_prov_key; +extern int s_page_size; +extern bool enable_cxi_hmem_ops; + +extern bool cxip_trace_enable(bool enable); +extern void cxip_trace_flush(void); + +void cxit_init(void); +void cxit_create_fabric_info(void); +void cxit_destroy_fabric_info(void); +void cxit_create_fabric(void); +void cxit_destroy_fabric(void); +void cxit_create_domain(void); +void cxit_destroy_domain(void); +void cxit_create_ep(void); +void cxit_destroy_ep(void); +void cxit_create_eq(void); +void cxit_destroy_eq(void); +void cxit_create_cqs(void); +void cxit_destroy_cqs(void); +void cxit_bind_cqs(void); +void cxit_create_local_cntrs(void); +void cxit_create_rem_cntrs(void); +void cxit_create_cntrs(void); +void cxit_destroy_cntrs(void); +void cxit_bind_cntrs(void); +void cxit_create_av(void); +void cxit_destroy_av(void); +void cxit_bind_av(void); + +void cxit_setup_rma_disable_fi_rma_event(void); +struct fi_info *cxit_allocinfo(void); +void cxit_setup_getinfo(void); +void cxit_teardown_getinfo(void); +void cxit_setup_fabric(void); +void cxit_teardown_fabric(void); +void cxit_setup_domain(void); +void cxit_teardown_domain(void); +void cxit_setup_ep(void); +void cxit_teardown_ep(void); +#define cxit_setup_eq cxit_setup_ep +#define cxit_teardown_eq cxit_teardown_ep +#define cxit_setup_cq cxit_setup_ep +#define cxit_teardown_cq cxit_teardown_ep +#define cxit_setup_av cxit_setup_ep +#define cxit_teardown_av cxit_teardown_ep +void cxit_setup_enabled_ep(void); +void cxit_setup_enabled_ep_fd(void); +void cxit_setup_rma(void); +void cxit_setup_rma_fd(void); +void cxit_setup_rma_hybrid_mr_desc(void); +void cxit_setup_rma_mr_events(void); +#define cxit_setup_tagged cxit_setup_rma +#define cxit_setup_msg cxit_setup_rma +void cxit_teardown_rma(void); +#define cxit_teardown_tagged cxit_teardown_rma +#define cxit_teardown_msg cxit_teardown_rma +#define cxit_teardown_enabled_ep cxit_teardown_rma +#define cxit_teardown_rma_fd cxit_teardown_rma +void cxit_setup_tx_alias_rma(void); +void cxit_setup_tx_alias_rma_dc(void); +#define cxit_setup_tx_alias_tagged cxit_setup_tx_alias_rma +void cxit_teardown_tx_alias_rma(void); +#define cxit_teardown_tx_alias_tagged cxit_teardown_tx_alias_rma +int cxit_await_completion(struct fid_cq *cq, struct fi_cq_tagged_entry *cqe); +void validate_tx_event(struct fi_cq_tagged_entry *cqe, uint64_t flags, + void *context); +void validate_rx_event(struct fi_cq_tagged_entry *cqe, void *context, + size_t len, uint64_t flags, void *buf, uint64_t data, + uint64_t tag); +void validate_rx_event_mask(struct fi_cq_tagged_entry *cqe, void *context, + size_t len, uint64_t flags, void *buf, + uint64_t data, uint64_t tag, uint64_t ignore); +void validate_multi_recv_rx_event(struct fi_cq_tagged_entry *cqe, + void *context, size_t len, uint64_t flags, + uint64_t data, uint64_t tag); + +struct mem_region { + uint8_t *mem; + struct fid_mr *mr; +}; + +int mr_create_ext(size_t len, uint64_t access, uint8_t seed, uint64_t *key, + struct fid_cntr *cntr, struct mem_region *mr); +int mr_create(size_t len, uint64_t access, uint8_t seed, uint64_t *key, + struct mem_region *mr); +void mr_destroy(struct mem_region *mr); + +struct cxit_coll_mc_list { + int count; + struct fid_av_set **av_set_fid; + struct fid_mc **mc_fid; +}; +extern struct cxit_coll_mc_list cxit_coll_mc_list; + +void set_amo_remap_to_pcie_fadd(int amo_remap_to_pcie_fadd); +void reset_amo_remap_to_pcie_fadd(void); + +int cxit_dom_read_cntr(unsigned int cntr, uint64_t *value, + struct timespec *ts, bool sync); + +#endif diff --git a/prov/cxi/test/deferred_work.c 
b/prov/cxi/test/deferred_work.c new file mode 100644 index 00000000000..369e276ffad --- /dev/null +++ b/prov/cxi/test/deferred_work.c @@ -0,0 +1,1328 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +static void poll_counter_assert(struct fid_cntr *cntr, uint64_t expected_value, + unsigned int timeout) +{ + int ret; + struct timespec cur = {}; + struct timespec end; + uint64_t value; + + ret = clock_gettime(CLOCK_MONOTONIC, &end); + cr_assert_eq(ret, 0); + + end.tv_sec += timeout; + + while (true) { + ret = clock_gettime(CLOCK_MONOTONIC, &cur); + cr_assert_eq(ret, 0); + + value = fi_cntr_read(cntr); + if (value == expected_value) + break; + + if (cur.tv_sec > end.tv_sec) { + // cr_fail doesn't work so fake it + cr_assert_eq(value, expected_value, + "Counter failed to reach expected value: expected=%lu, got=%lu\n", + expected_value, value); + break; + } + + /* Progress TX side for rendezvous tests */ + fi_cq_read(cxit_tx_cq, NULL, 0); + } +} + +void deferred_msg_op_test(bool comp_event, size_t xfer_size, + uint64_t trig_thresh, bool is_tagged, uint64_t tag) +{ + int i; + int ret; + uint8_t *recv_buf; + uint8_t *send_buf; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + int err = 0; + fi_addr_t from; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_op_tagged tagged = {}; + struct fi_deferred_work work = {}; + uint64_t expected_rx_flags = + is_tagged ? FI_TAGGED | FI_RECV : FI_MSG | FI_RECV; + uint64_t expected_rx_tag = is_tagged ? tag : 0; + uint64_t expected_tx_flags = + is_tagged ? FI_TAGGED | FI_SEND : FI_MSG | FI_SEND; + + recv_buf = calloc(1, xfer_size); + cr_assert(recv_buf); + + send_buf = calloc(1, xfer_size); + cr_assert(send_buf); + + for (i = 0; i < xfer_size; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + if (is_tagged) + ret = fi_trecv(cxit_ep, recv_buf, xfer_size, NULL, + FI_ADDR_UNSPEC, tag, 0, NULL); + else + ret = fi_recv(cxit_ep, recv_buf, xfer_size, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send deferred op to self */ + iov.iov_base = send_buf; + iov.iov_len = xfer_size; + + work.threshold = trig_thresh; + work.triggering_cntr = cxit_send_cntr; + work.completion_cntr = cxit_send_cntr; + + if (is_tagged) { + tagged.ep = cxit_ep; + tagged.msg.msg_iov = &iov; + tagged.msg.iov_count = 1; + tagged.msg.addr = cxit_ep_fi_addr; + tagged.msg.tag = tag; + tagged.flags = comp_event ? FI_COMPLETION : 0; + + work.op_type = FI_OP_TSEND; + work.op.tagged = &tagged; + } else { + msg.ep = cxit_ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = comp_event ? FI_COMPLETION : 0; + + work.op_type = FI_OP_SEND; + work.op.msg = &msg; + } + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + /* Verify no target event has occurred. 
*/ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cntr_add(cxit_send_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, xfer_size, expected_rx_flags, NULL, 0, + expected_rx_tag); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + if (comp_event) { + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, expected_tx_flags, NULL); + } else { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", + ret); + } + + /* Validate sent data */ + for (i = 0; i < xfer_size; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + poll_counter_assert(cxit_send_cntr, work.threshold + 1, 5); + + free(send_buf); + free(recv_buf); +} + + +TestSuite(deferred_work, .init = cxit_setup_msg, .fini = cxit_teardown_msg, + .timeout = CXIT_DEFAULT_TIMEOUT); + + +Test(deferred_work, eager_message_comp_event) +{ + deferred_msg_op_test(true, 1024, 123546, false, 0); +} + +Test(deferred_work, rendezvous_message_comp_event) +{ + deferred_msg_op_test(true, 1024 * 1024, 123546, false, 0); +} + +Test(deferred_work, eager_message_no_comp_event) +{ + deferred_msg_op_test(false, 1024, 123546, false, 0); +} + +Test(deferred_work, rendezvous_message_no_comp_event, .timeout=60) +{ + deferred_msg_op_test(false, 1024 * 1024, 123546, false, 0); +} + +Test(deferred_work, tagged_eager_message_comp_event) +{ + deferred_msg_op_test(true, 1024, 123546, true, 987654321); +} + +Test(deferred_work, tagged_rendezvous_message_comp_event) +{ + deferred_msg_op_test(true, 1024 * 1024, 123546, true, 987654321); +} + +Test(deferred_work, tagged_eager_message_no_comp_event) +{ + deferred_msg_op_test(false, 1024, 123546, true, 987654321); +} + +Test(deferred_work, tagged_rendezvous_message_no_comp_event, .timeout=60) +{ + deferred_msg_op_test(false, 1024 * 1024, 123546, true, 987654321); +} + +Test(deferred_work, flush_work) +{ + int i; + int ret; + uint8_t *recv_buf; + uint8_t *send_buf; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_deferred_work msg_work = {}; + unsigned int trig_thresh; + size_t xfer_size = 1; + uint64_t key = 0xbeef; + struct mem_region mem_window; + struct fi_rma_iov rma_iov = {}; + struct fi_op_rma rma = {}; + struct fi_deferred_work rma_work = {}; + struct fi_ioc ioc = {}; + struct fi_rma_ioc rma_ioc = {}; + struct fi_op_atomic amo = {}; + struct fi_deferred_work amo_work = {}; + struct fi_op_cntr op_cntr = {}; + struct fi_deferred_work cntr_work = {}; + + recv_buf = calloc(1, xfer_size); + cr_assert(recv_buf); + + send_buf = calloc(1, xfer_size); + cr_assert(send_buf); + + ret = mr_create(xfer_size, FI_REMOTE_WRITE | FI_REMOTE_READ, 0xa0, &key, + &mem_window); + cr_assert_eq(ret, FI_SUCCESS, "mr_create failed %d", ret); + + for (i = 0; i < xfer_size; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret 
= fi_recv(cxit_ep, recv_buf, xfer_size, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Deferred send op to self to be cancelled. */ + msg.ep = cxit_ep; + iov.iov_base = send_buf; + iov.iov_len = xfer_size; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = FI_COMPLETION; + + msg_work.triggering_cntr = cxit_send_cntr; + msg_work.completion_cntr = cxit_send_cntr; + msg_work.op_type = FI_OP_SEND; + msg_work.op.msg = &msg; + + /* Deferred RMA op to be cancelled. */ + rma_iov.key = key; + + rma.ep = cxit_ep; + rma.msg.msg_iov = &iov; + rma.msg.iov_count = 1; + rma.msg.addr = cxit_ep_fi_addr; + rma.msg.rma_iov = &rma_iov; + rma.msg.rma_iov_count = 1; + rma.flags = FI_COMPLETION; + + rma_work.triggering_cntr = cxit_send_cntr; + rma_work.completion_cntr = cxit_send_cntr; + rma_work.op_type = FI_OP_READ; + rma_work.op.rma = &rma; + + /* Deferred AMO op to be cancelled. */ + ioc.addr = &send_buf; + ioc.count = 1; + + rma_ioc.key = key; + rma_ioc.count = 1; + + amo.ep = cxit_ep; + + amo.msg.msg_iov = &ioc; + amo.msg.iov_count = 1; + amo.msg.addr = cxit_ep_fi_addr; + amo.msg.rma_iov = &rma_ioc; + amo.msg.rma_iov_count = 1; + amo.msg.datatype = FI_UINT8; + amo.msg.op = FI_SUM; + + amo_work.triggering_cntr = cxit_send_cntr; + amo_work.completion_cntr = cxit_send_cntr; + amo_work.op_type = FI_OP_ATOMIC; + amo_work.op.atomic = &amo; + + /* Deferred counter op. */ + op_cntr.cntr = cxit_send_cntr; + op_cntr.value = 13546; + + cntr_work.op_type = FI_OP_CNTR_SET; + cntr_work.triggering_cntr = cxit_send_cntr; + cntr_work.op.cntr = &op_cntr; + + /* Queue up multiple trigger requests to be cancelled. */ + for (i = 0, trig_thresh = 12345; i < 12; i++, trig_thresh++) { + struct fi_deferred_work *work; + + if (i < 3) + work = &msg_work; + else if (i < 6) + work = &rma_work; + else if (i < 9) + work = &cntr_work; + else + work = &amo_work; + + work->threshold = trig_thresh; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + } + + /* Verify no source or target event has occurred. */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + /* Flush all work requests. */ + ret = fi_control(&cxit_domain->fid, FI_FLUSH_WORK, NULL); + cr_assert_eq(ret, FI_SUCCESS, "FI_FLUSH_WORK failed %d", ret); + + ret = fi_cntr_add(cxit_send_cntr, trig_thresh); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + /* Verify no source or target event has occurred. 
*/ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + poll_counter_assert(cxit_send_cntr, trig_thresh, 5); + + free(send_buf); + free(recv_buf); + mr_destroy(&mem_window); +} + +static void deferred_rma_test(enum fi_op_type op, size_t xfer_size, + uint64_t trig_thresh, uint64_t key, + bool comp_event) +{ + int ret; + struct mem_region mem_window; + struct fi_cq_tagged_entry cqe; + struct iovec iov = {}; + struct fi_rma_iov rma_iov = {}; + struct fi_op_rma rma = {}; + struct fi_deferred_work work = {}; + struct fid_cntr *trig_cntr = cxit_write_cntr; + struct fid_cntr *comp_cntr = cxit_read_cntr; + uint8_t *send_buf; + uint64_t expected_flags = + op == FI_OP_WRITE ? FI_RMA | FI_WRITE : FI_RMA | FI_READ; + + send_buf = calloc(1, xfer_size); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(xfer_size, FI_REMOTE_WRITE | FI_REMOTE_READ, 0xa0, &key, + &mem_window); + + iov.iov_base = send_buf; + iov.iov_len = xfer_size; + + rma_iov.key = key; + + rma.ep = cxit_ep; + rma.msg.msg_iov = &iov; + rma.msg.iov_count = 1; + rma.msg.addr = cxit_ep_fi_addr; + rma.msg.rma_iov = &rma_iov; + rma.msg.rma_iov_count = 1; + rma.flags = comp_event ? FI_COMPLETION : 0; + + work.threshold = trig_thresh; + work.triggering_cntr = trig_cntr; + work.completion_cntr = comp_cntr; + work.op_type = op; + work.op.rma = &rma; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + /* Verify no target event has occurred. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cntr_add(trig_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + if (comp_event) { + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&cqe, expected_flags, NULL); + } else { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", + ret); + } + + poll_counter_assert(trig_cntr, work.threshold, 5); + poll_counter_assert(comp_cntr, 1, 5); + + /* Validate RMA data */ + for (size_t i = 0; i < xfer_size; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%ld) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +Test(deferred_work, rma_write) +{ + deferred_rma_test(FI_OP_WRITE, 12345, 54321, 0xbeef, true); +} + +Test(deferred_work, rma_write_no_event) +{ + deferred_rma_test(FI_OP_WRITE, 12345, 54321, 0xbeef, false); +} + +Test(deferred_work, rma_read) +{ + deferred_rma_test(FI_OP_READ, 12345, 54321, 0xbeef, true); +} + +Test(deferred_work, rma_read_no_event) +{ + deferred_rma_test(FI_OP_READ, 12345, 54321, 0xbeef, false); +} + +static void deferred_amo_test(bool comp_event, bool fetch, bool comp) +{ + int ret; + struct mem_region mem_window; + struct fi_cq_tagged_entry cqe; + struct fi_ioc iov = {}; + struct fi_ioc fetch_iov = {}; + struct fi_ioc comp_iov = {}; + struct fi_rma_ioc rma_iov = {}; + struct fi_op_atomic amo = {}; + struct fi_op_fetch_atomic fetch_amo = {}; + struct fi_op_compare_atomic comp_amo = {}; + struct fi_msg_atomic *amo_msg; + struct fi_deferred_work work = {}; + struct fid_cntr 
*trig_cntr = cxit_write_cntr; + struct fid_cntr *comp_cntr = cxit_read_cntr; + uint64_t expected_flags; + uint64_t source_buf = 1; + uint64_t *target_buf; + uint64_t result; + uint64_t key = 0xbbb; + uint64_t trig_thresh = 12345; + uint64_t init_target_value = 0x7FFFFFFFFFFFFFFF; + uint64_t fetch_result = 0; + uint64_t compare_value = init_target_value; + + ret = mr_create(sizeof(*target_buf), FI_REMOTE_WRITE | FI_REMOTE_READ, + 0, &key, &mem_window); + assert(ret == FI_SUCCESS); + + target_buf = (uint64_t *)mem_window.mem; + *target_buf = init_target_value; + + result = source_buf + *target_buf; + + iov.addr = &source_buf; + iov.count = 1; + + rma_iov.key = key; + rma_iov.count = 1; + + if (fetch) { + amo_msg = &fetch_amo.msg; + fetch_amo.ep = cxit_ep; + fetch_amo.flags = comp_event ? FI_COMPLETION : 0; + work.op_type = FI_OP_FETCH_ATOMIC; + work.op.fetch_atomic = &fetch_amo; + expected_flags = FI_ATOMIC | FI_READ; + + fetch_iov.addr = &fetch_result; + fetch_iov.count = 1; + + fetch_amo.fetch.msg_iov = &fetch_iov; + fetch_amo.fetch.iov_count = 1; + } else if (comp) { + amo_msg = &comp_amo.msg; + comp_amo.ep = cxit_ep; + comp_amo.flags = comp_event ? FI_COMPLETION : 0; + work.op_type = FI_OP_COMPARE_ATOMIC; + work.op.compare_atomic = &comp_amo; + expected_flags = FI_ATOMIC | FI_READ; + + fetch_iov.addr = &fetch_result; + fetch_iov.count = 1; + + comp_iov.addr = &compare_value; + comp_iov.count = 1; + + comp_amo.fetch.msg_iov = &fetch_iov; + comp_amo.fetch.iov_count = 1; + comp_amo.compare.msg_iov = &comp_iov; + comp_amo.compare.iov_count = 1; + } else { + amo_msg = &amo.msg; + amo.ep = cxit_ep; + amo.flags = comp_event ? FI_COMPLETION : 0; + work.op_type = FI_OP_ATOMIC; + work.op.atomic = &amo; + expected_flags = FI_ATOMIC | FI_WRITE; + } + + amo_msg->msg_iov = &iov; + amo_msg->iov_count = 1; + amo_msg->addr = cxit_ep_fi_addr; + amo_msg->rma_iov = &rma_iov; + amo_msg->rma_iov_count = 1; + amo_msg->datatype = FI_UINT64; + amo_msg->op = comp ? FI_CSWAP : FI_SUM; + + work.threshold = trig_thresh; + work.triggering_cntr = trig_cntr; + work.completion_cntr = comp_cntr; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + /* Verify no target event has occurred. 
*/ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + ret = fi_cntr_add(trig_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + if (comp_event) { + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&cqe, expected_flags, NULL); + } else { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", + ret); + } + + poll_counter_assert(trig_cntr, work.threshold, 5); + poll_counter_assert(comp_cntr, 1, 5); + + /* Validate AMO data */ + if (comp) + cr_assert_eq(*target_buf, source_buf, "Invalid target result"); + else + cr_assert_eq(*target_buf, result, "Invalid target result"); + + if (fetch || comp) + cr_assert_eq(fetch_result, init_target_value, + "Invalid fetch result expected=%lu got=%lu", + init_target_value, fetch_result); + + mr_destroy(&mem_window); +} + +Test(deferred_work, amo_no_event) +{ + deferred_amo_test(false, false, false); +} + +Test(deferred_work, amo_event) +{ + deferred_amo_test(true, false, false); +} + +Test(deferred_work, fetch_amo_no_event) +{ + deferred_amo_test(false, true, false); +} + +Test(deferred_work, fetch_amo_event) +{ + deferred_amo_test(true, true, false); +} + +Test(deferred_work, compare_amo_no_event) +{ + deferred_amo_test(false, false, true); +} + +Test(deferred_work, compare_amo_event) +{ + deferred_amo_test(true, false, true); +} + +static void deferred_cntr(bool is_inc) +{ + struct fi_cntr_attr attr = {}; + struct fid_cntr *cntr; + struct fid_cntr *trig_cntr = cxit_write_cntr; + int ret; + uint64_t value = 123456; + uint64_t thresh = 1234; + struct fi_op_cntr op_cntr = {}; + struct fi_deferred_work work = {}; + + ret = fi_cntr_open(cxit_domain, &attr, &cntr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_open failed %d", ret); + + /* Ensure success value is non-zero to ensure success and increment + * work. + */ + ret = fi_cntr_add(cntr, 1); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + op_cntr.cntr = cntr; + op_cntr.value = value; + + work.op_type = is_inc ? FI_OP_CNTR_ADD : FI_OP_CNTR_SET; + work.triggering_cntr = trig_cntr; + work.threshold = thresh; + work.op.cntr = &op_cntr; + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + /* Trigger the operation. */ + ret = fi_cntr_add(trig_cntr, work.threshold); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_add failed %d", ret); + + poll_counter_assert(trig_cntr, work.threshold, 5); + poll_counter_assert(cntr, is_inc ? 1 + value : value, 5); + + ret = fi_close(&cntr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed %d", ret); +} + +Test(deferred_work, cntr_add) +{ + deferred_cntr(true); +} + +Test(deferred_work, cntr_set) +{ + deferred_cntr(false); +} + +static void deferred_recv_op_test(bool comp_event, size_t xfer_size, + uint64_t trig_thresh, bool is_tagged, + uint64_t tag) +{ + int i; + int ret; + uint8_t *recv_buf; + uint8_t *send_buf; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + int err = 0; + fi_addr_t from; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_op_tagged tagged = {}; + struct fi_deferred_work work = {}; + uint64_t expected_rx_flags = + is_tagged ? FI_TAGGED | FI_RECV : FI_MSG | FI_RECV; + uint64_t expected_rx_tag = is_tagged ? 
tag : 0; + uint64_t expected_tx_flags = + is_tagged ? FI_TAGGED | FI_SEND : FI_MSG | FI_SEND; + struct fi_cntr_attr attr = {}; + struct fid_cntr *recv_cntr; + + ret = fi_cntr_open(cxit_domain, &attr, &recv_cntr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_open failed %d", ret); + + recv_buf = calloc(1, xfer_size); + cr_assert(recv_buf); + + send_buf = calloc(1, xfer_size); + cr_assert(send_buf); + + for (i = 0; i < xfer_size; i++) + send_buf[i] = i + 0xa0; + + /* Recv deferred op */ + iov.iov_base = recv_buf; + iov.iov_len = xfer_size; + + work.threshold = trig_thresh; + work.triggering_cntr = recv_cntr; + work.completion_cntr = recv_cntr; + + if (is_tagged) { + tagged.ep = cxit_ep; + tagged.msg.msg_iov = &iov; + tagged.msg.iov_count = 1; + tagged.msg.tag = tag; + tagged.msg.addr = cxit_ep_fi_addr; + tagged.flags = comp_event ? FI_COMPLETION : 0; + + work.op_type = FI_OP_TRECV; + work.op.tagged = &tagged; + } else { + msg.ep = cxit_ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = comp_event ? FI_COMPLETION : 0; + + work.op_type = FI_OP_RECV; + work.op.msg = &msg; + } + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + if (is_tagged) + ret = fi_tsend(cxit_ep, send_buf, xfer_size, NULL, + cxit_ep_fi_addr, tag, NULL); + else + ret = fi_send(cxit_ep, send_buf, xfer_size, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for the async send event. In software endpoint mode, the RX CQ needs to + * be progressed in order to progress the TX CQ. + */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + } while (ret == -FI_EAGAIN); + + validate_tx_event(&tx_cqe, expected_tx_flags, NULL); + + /* Verify optional receive event. */ + if (comp_event) { + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, xfer_size, expected_rx_flags, + NULL, 0, expected_rx_tag); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + } else { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", + ret); + } + + /* Validate sent data */ + for (i = 0; i < xfer_size; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Need to progress the receive side so the completed transaction increments the counter. 
*/ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, "fi_cq_read unexpected value %d", ret); + + poll_counter_assert(recv_cntr, 1, 5); + + free(send_buf); + free(recv_buf); + fi_close(&recv_cntr->fid); +} + +Test(deferred_work, recv_eager_message_comp_event) +{ + deferred_recv_op_test(true, 1024, 0, false, 0); +} + +Test(deferred_work, recv_rendezvous_message_comp_event) +{ + deferred_recv_op_test(true, 1024 * 1024, 0, false, 0); +} + +Test(deferred_work, recv_eager_message_no_comp_event) +{ + deferred_recv_op_test(false, 1024, 0, false, 0); +} + +Test(deferred_work, recv_rendezvous_message_no_comp_event, .timeout=60) +{ + deferred_recv_op_test(false, 1024 * 1024, 0, false, 0); +} + +Test(deferred_work, recv_tagged_eager_message_comp_event) +{ + deferred_recv_op_test(true, 1024, 0, true, 987654321); +} + +Test(deferred_work, recv_tagged_rendezvous_message_comp_event) +{ + deferred_recv_op_test(true, 1024 * 1024, 0, true, 987654321); +} + +Test(deferred_work, recv_tagged_eager_message_no_comp_event) +{ + deferred_recv_op_test(false, 1024, 0, true, 987654321); +} + +Test(deferred_work, recv_tagged_rendezvous_message_no_comp_event, .timeout=60) +{ + deferred_recv_op_test(false, 1024 * 1024, 0, true, 987654321); +} + +static void deferred_recv_non_zero_thresh(bool is_tagged) +{ + int ret; + uint8_t *recv_buf; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_op_tagged tagged = {}; + struct fi_deferred_work work = {}; + struct fi_cntr_attr attr = {}; + struct fid_cntr *recv_cntr; + + ret = fi_cntr_open(cxit_domain, &attr, &recv_cntr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cntr_open failed %d", ret); + + recv_buf = calloc(1, 5); + cr_assert(recv_buf); + + /* Recv deferred op to self */ + iov.iov_base = recv_buf; + iov.iov_len = 5; + + work.threshold = 5; + work.triggering_cntr = recv_cntr; + work.completion_cntr = recv_cntr; + + if (is_tagged) { + tagged.ep = cxit_ep; + tagged.msg.msg_iov = &iov; + tagged.msg.iov_count = 1; + tagged.msg.tag = 456; + tagged.msg.addr = cxit_ep_fi_addr; + tagged.flags = FI_COMPLETION; + + work.op_type = FI_OP_TRECV; + work.op.tagged = &tagged; + } else { + msg.ep = cxit_ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = FI_COMPLETION; + + work.op_type = FI_OP_RECV; + work.op.msg = &msg; + } + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_neq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + free(recv_buf); + fi_close(&recv_cntr->fid); +} + +Test(deferred_work, recv_non_zero_thresh) +{ + deferred_recv_non_zero_thresh(false); +} + +Test(deferred_work, recv_tagged_non_zero_thresh) +{ + deferred_recv_non_zero_thresh(true); +} + +/* FI_INJECT with deferred work queue processing is not supported. 
*/ +void deferred_msg_inject_test(bool is_tagged) +{ + int ret; + uint8_t *send_buf; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + struct fi_op_tagged tagged = {}; + struct fi_deferred_work work = {}; + + send_buf = calloc(1, 20); + cr_assert(send_buf); + + /* Send deferred op to self */ + iov.iov_base = send_buf; + iov.iov_len = 20; + + work.threshold = 5; + work.triggering_cntr = cxit_send_cntr; + work.completion_cntr = cxit_send_cntr; + + if (is_tagged) { + tagged.ep = cxit_ep; + tagged.msg.msg_iov = &iov; + tagged.msg.iov_count = 1; + tagged.msg.addr = cxit_ep_fi_addr; + tagged.msg.tag = 0x0123; + tagged.flags = FI_INJECT | FI_COMPLETION; + + work.op_type = FI_OP_TSEND; + work.op.tagged = &tagged; + } else { + msg.ep = cxit_ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = cxit_ep_fi_addr; + msg.flags = FI_INJECT | FI_COMPLETION; + + work.op_type = FI_OP_SEND; + work.op.msg = &msg; + } + + ret = fi_control(&cxit_domain->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, -FI_EINVAL, "FI_INJECT did not fail %d", ret); + + free(send_buf); +} + +Test(deferred_work, tsend_inject) +{ + deferred_msg_inject_test(true); +} + +Test(deferred_work, send_inject) +{ + deferred_msg_inject_test(false); +} + +#define TLE_RESERVED 8U + +static int alloc_service(struct cxil_dev *dev, unsigned int tle_count) +{ + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = { + .enable = 1, + .limits = { + .type[CXI_RSRC_TYPE_PTE] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_TXQ] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_TGQ] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_EQ] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_CT] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_LE] = { + .max = 100, + .res = 100, + }, + .type[CXI_RSRC_TYPE_TLE] = { + .max = tle_count + TLE_RESERVED, + .res = tle_count + TLE_RESERVED, + }, + .type[CXI_RSRC_TYPE_AC] = { + .max = 8, + .res = 8, + }, + }, + }; + int ret; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, + "cxil_alloc_svc(): Failed. Expected Success! rc:%d", ret); + + return ret; +} + +struct deferred_work_resources { + struct fi_info *hints; + struct fi_info *info; + struct fid_fabric *fab; + struct fid_domain *dom; + struct fid_cq *cq; + struct fid_cntr *cntr; + struct fid_av *av; + struct fid_ep *ep; + fi_addr_t loopback; + struct cxil_dev *dev; + int service_id; +}; + +#define test_assert(test, fmt, ...) 
\ + do { \ + if (!(test)) { \ + fprintf(stderr, "%s:%d: " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__); \ + abort(); \ + } \ + } while (0) + +static void +deferred_work_resources_teardown(struct deferred_work_resources *res) +{ + test_assert((fi_close(&res->ep->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->cntr->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->cq->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->av->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->dom->fid) == FI_SUCCESS), "fi_close failed"); + test_assert((fi_close(&res->fab->fid) == FI_SUCCESS), "fi_close failed"); + fi_freeinfo(res->info); + fi_freeinfo(res->hints); +} + +static bool triggered_ops_limited() +{ + static bool first = true; + static bool limited = false; + + if (!first) + return limited; + + char *s = getenv("FI_CXI_ENABLE_TRIG_OP_LIMIT"); + + if (!s) /* variable not set/found */ + goto not_limited; + + char *endptr; + int i = strtol(s, &endptr, 10); + + if (endptr == s) /* no parsable integers */ + goto not_limited; + if (!i) /* set to 0 */ + goto not_limited; + + /* Some non-zero integer was parsed. + * It still could be 10zebras, but we will count it. + */ + + limited = true; + + not_limited: + + first = false; + + return limited; +} + +static void deferred_work_resources_init(struct deferred_work_resources *res, + int service_id) +{ + int ret; + struct cxi_auth_key auth_key = { + .vni = 1, + }; + struct fi_av_attr av_attr = {}; + + auth_key.svc_id = service_id; + + res->hints = fi_allocinfo(); + test_assert(res->hints, "fi_allocinfo failed"); + + res->hints->fabric_attr->prov_name = strdup("cxi"); + test_assert(res->hints->fabric_attr->prov_name, "strdup failed"); + + res->hints->domain_attr->mr_mode = + FI_MR_ENDPOINT | FI_MR_ALLOCATED | FI_MR_PROV_KEY; + res->hints->tx_attr->op_flags = FI_TRANSMIT_COMPLETE; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + "cxi0", NULL, FI_SOURCE, res->hints, + &res->info); + test_assert(ret == FI_SUCCESS, "fi_getinfo failed: %d\n", ret); + + ret = fi_fabric(res->info->fabric_attr, &res->fab, NULL); + test_assert(ret == FI_SUCCESS, "fi_fabric failed: %d\n", ret); + + res->info->domain_attr->auth_key = (void *)&auth_key; + res->info->domain_attr->auth_key_size = sizeof(auth_key); + + ret = fi_domain(res->fab, res->info, &res->dom, NULL); + test_assert(ret == FI_SUCCESS, "fi_domain failed: %d\n", ret); + + res->info->domain_attr->auth_key = NULL; + res->info->domain_attr->auth_key_size = 0; + + ret = fi_av_open(res->dom, &av_attr, &res->av, NULL); + test_assert(ret == FI_SUCCESS, "fi_av_open failed: %d\n", ret); + + ret = fi_cq_open(res->dom, NULL, &res->cq, NULL); + test_assert(ret == FI_SUCCESS, "fi_cq_open failed: %d\n", ret); + + ret = fi_cntr_open(res->dom, NULL, &res->cntr, NULL); + test_assert(ret == FI_SUCCESS, "fi_cntr_open failed: %d\n", ret); + + ret = fi_endpoint(res->dom, res->info, &res->ep, NULL); + test_assert(ret == FI_SUCCESS, "fi_endpoint failed: %d\n", ret); + + ret = fi_ep_bind(res->ep, &res->cq->fid, + FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION); + test_assert(ret == FI_SUCCESS, "fi_ep_bind failed: %d\n", ret); + + ret = fi_ep_bind(res->ep, &res->cntr->fid, + FI_SEND | FI_RECV | FI_READ | FI_WRITE); + test_assert(ret == FI_SUCCESS, "fi_ep_bind failed: %d\n", ret); + + ret = fi_ep_bind(res->ep, &res->av->fid, 0); + test_assert(ret == FI_SUCCESS, "fi_ep_bind failed: %d\n", ret); + + ret = fi_enable(res->ep); + test_assert(ret == 
FI_SUCCESS, "fi_enable failed: %d\n", ret); + + ret = fi_av_insert(res->av, res->info->src_addr, 1, &res->loopback, 0, + NULL); + test_assert(ret == 1, "fi_av_insert failed: %d\n", ret); +} + +TestSuite(deferred_work_trig_op_limit, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(deferred_work_trig_op_limit, enforce_limit_single_thread) +{ + struct deferred_work_resources res = {}; + unsigned int trig_op_count = 64; + unsigned int threshold = 1000; + char send_buf[256]; + char recv_buf[256]; + int ret; + int i; + struct fi_deferred_work work = {}; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + bool limited = triggered_ops_limited(); + + ret = cxil_open_device(0, &res.dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d\n", ret); + + res.service_id = alloc_service(res.dev, trig_op_count); + cr_assert_gt(res.service_id, 0, "alloc_service() failed: %d\n", + res.service_id); + + deferred_work_resources_init(&res, res.service_id); + + for (i = 0; i < trig_op_count; i++) { + ret = fi_recv(res.ep, recv_buf, sizeof(recv_buf), NULL, + res.loopback, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d\n", ret); + } + + iov.iov_base = send_buf; + iov.iov_len = sizeof(send_buf); + + work.threshold = threshold; + work.triggering_cntr = res.cntr; + work.completion_cntr = res.cntr; + + msg.ep = res.ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = res.loopback; + msg.flags = FI_TRANSMIT_COMPLETE; + + work.op_type = FI_OP_SEND; + work.op.msg = &msg; + + for (i = 0; i < trig_op_count; i++) { + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK iter %d failed %d", i, ret); + } + + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); + if (limited) + cr_assert_eq(ret, -FI_ENOSPC, "FI_QUEUE_WORK failed %d", ret); + else + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK failed %d", ret); + + cr_assert((fi_control(&res.dom->fid, FI_FLUSH_WORK, NULL) == FI_SUCCESS)); + + for (i = 0; i < trig_op_count; i++) { + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); + cr_assert_eq(ret, FI_SUCCESS, "FI_QUEUE_WORK iter %d failed %d", i, ret); + } + + cr_assert((fi_control(&res.dom->fid, FI_FLUSH_WORK, NULL) == FI_SUCCESS)); + + deferred_work_resources_teardown(&res); + + cr_assert((cxil_destroy_svc(res.dev, res.service_id) == 0)); + cxil_close_device(res.dev); +} + +static void run_multi_process_dwq_test(int service_id) +{ + struct deferred_work_resources res = {}; + int count = 4; + unsigned int threshold = 1000; + char send_buf[256]; + int ret; + int i; + struct fi_deferred_work work = {}; + struct iovec iov = {}; + struct fi_op_msg msg = {}; + bool limited = triggered_ops_limited(); + + deferred_work_resources_init(&res, service_id); + + iov.iov_base = send_buf; + iov.iov_len = sizeof(send_buf); + + work.threshold = threshold; + work.triggering_cntr = res.cntr; + work.completion_cntr = res.cntr; + + msg.ep = res.ep; + msg.msg.msg_iov = &iov; + msg.msg.iov_count = 1; + msg.msg.addr = res.loopback; + msg.flags = FI_TRANSMIT_COMPLETE; + + work.op_type = FI_OP_SEND; + work.op.msg = &msg; + + /* Continue trying to queue multiple TLEs and free them. 
*/ + for (i = 0; i < count; i++) { + while (true) { + ret = fi_control(&res.dom->fid, FI_QUEUE_WORK, &work); + test_assert(((ret == FI_SUCCESS) && limited) || (ret == -FI_ENOSPC), + "FI_QUEUE_WORK failed %d", ret); + + if (ret == -FI_ENOSPC) + break; + } + + test_assert((fi_control(&res.dom->fid, FI_FLUSH_WORK, NULL) == FI_SUCCESS), + "FI_FLUSH_WORK failed"); + } + + deferred_work_resources_teardown(&res); + + exit(EXIT_SUCCESS); +} + +#define TLE_POOLS 4U + +Test(deferred_work_trig_op_limit, enforce_limit_multi_process) +{ + struct deferred_work_resources res = {}; + int trig_op_count = 100; + int ret; + union c_cq_sts_max_tle_in_use max_in_use = {}; + pid_t pid = -1; + int status; + int i; + bool found_max_in_use = false; + int num_forks = 5; + bool limited = triggered_ops_limited(); + + ret = cxil_open_device(0, &res.dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d\n", ret); + + ret = cxil_map_csr(res.dev); + cr_assert_eq(ret, 0, "cxil_map_csr failed: %d\n", ret); + + res.service_id = alloc_service(res.dev, trig_op_count); + cr_assert_gt(res.service_id, 0, "alloc_service() failed: %d\n", + res.service_id); + + for (i = 0; i < TLE_POOLS; i++) { + ret = cxil_write_csr(res.dev, C_CQ_STS_MAX_TLE_IN_USE(i), + &max_in_use, sizeof(max_in_use)); + cr_assert_eq(ret, 0, "cxil_write_csr failed: %d\n", ret); + } + + for (i = 0; i < num_forks; i++) { + pid = fork(); + if (pid == 0) + run_multi_process_dwq_test(res.service_id); + } + + wait(&status); + + for (i = 0; i < TLE_POOLS; i++) { + ret = cxil_read_csr(res.dev, C_CQ_STS_MAX_TLE_IN_USE(i), + &max_in_use, sizeof(max_in_use)); + cr_assert_eq(ret, 0, "cxil_read_csr failed: %d\n", ret); + + fprintf(stderr, "%d max_in_use.max = %d\n", i, max_in_use.max); + + if (max_in_use.max >= trig_op_count && max_in_use.max < (trig_op_count + 8)) { + found_max_in_use = true; + break; + } + } + if (limited) + cr_assert_eq(found_max_in_use, true, "Triggered op limit exceeded\n"); + + while ((ret = cxil_destroy_svc(res.dev, res.service_id)) == -EBUSY) {} + cr_assert(ret == 0, "cxil_destroy_svc failed: %d\n", ret); + + cxil_close_device(res.dev); +} diff --git a/prov/cxi/test/domain.c b/prov/cxi/test/domain.c new file mode 100644 index 00000000000..f0dee9f0b8d --- /dev/null +++ b/prov/cxi/test/domain.c @@ -0,0 +1,421 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(domain, .init = cxit_setup_domain, .fini = cxit_teardown_domain, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic domain creation */ +Test(domain, simple) +{ + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + cxit_destroy_domain(); +} + +/* Test use of topology ops */ +Test(domain, topology) +{ + unsigned int group_num, switch_num, port_num; + int ret; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + ret = dom_ops->topology(&cxit_domain->fid, &group_num, &switch_num, + &port_num); + cr_assert_eq(ret, FI_SUCCESS, "topology failed: %d\n", ret); + + ret = dom_ops->topology(&cxit_domain->fid, NULL, &switch_num, + &port_num); + cr_assert_eq(ret, FI_SUCCESS, "null group topology failed: %d\n", ret); + + ret = dom_ops->topology(&cxit_domain->fid, &group_num, NULL, + &port_num); + cr_assert_eq(ret, FI_SUCCESS, "null switch topology failed: %d\n", ret); + + ret = dom_ops->topology(&cxit_domain->fid, &group_num, &switch_num, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "null port topology failed: 
%d\n", ret); + + cxit_destroy_domain(); +} + +Test(domain, enable_hybrid_mr_desc) +{ + int ret; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + ret = dom_ops->enable_hybrid_mr_desc(&cxit_domain->fid, true); + cr_assert_eq(ret, FI_SUCCESS, "enable_hybrid_mr_desc failed: %d\n", + ret); + + cxit_destroy_domain(); +} + +Test(domain, ep_get_unexp_msgs) +{ + size_t num_ux_ret; + size_t num_ux; + size_t addrlen = sizeof(cxit_ep_addr); + int ret; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); + + num_ux_ret = dom_ops->ep_get_unexp_msgs(cxit_ep, NULL, 0, + NULL, &num_ux); + cr_assert_eq(num_ux_ret, 0, "ep_get_unexp_msgs bad return\n"); + cr_assert_eq(num_ux, 0, "ep_get_unexp_msgs ux_count not 0\n"); + + /* Tear down RMA objects */ + cxit_destroy_ep(); + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_destroy_domain(); +} + +Test(domain, get_dwq_depth) +{ + int ret; + size_t depth; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + ret = dom_ops->get_dwq_depth(&cxit_domain->fid, &depth); + cr_assert_eq(ret, FI_SUCCESS, "get_dwq_depth failed: %d\n", + ret); + + cr_assert(depth > 0); + + cxit_destroy_domain(); +} + +Test(domain, enable_mr_match_events) +{ + int ret; + struct cxip_domain *cxip_dom; + struct cxip_mr *cxip_mr; + uint64_t key = 50; + struct mem_region region; + bool enable; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + cxip_dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(cxip_env.mr_match_events, + cxip_dom->mr_match_events, "Global setting failed"); + + if (!cxip_env.mr_match_events) { + enable = true; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_MR_MATCH_EVENTS, &enable); + cr_assert_eq(ret, FI_SUCCESS, + "enable_mr_match_events failed: %d", ret); + + cr_assert_eq(cxip_dom->mr_match_events, true, + "domain mr_match_events not set"); + } + + /* MR type established, setup RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert_eq(ret, FI_SUCCESS, "EP enable failed %d", ret); + + ret = mr_create(8, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key, ®ion); + cr_assert_eq(ret, FI_SUCCESS, "MR create failed %d", ret); + + cxip_mr = container_of(region.mr, struct cxip_mr, mr_fid); + cr_assert_eq(cxip_mr->count_events, true, + "MR match events not set"); + + mr_destroy(®ion); + + /* Tear down RMA objects */ + cxit_destroy_ep(); + cxit_destroy_av(); + cxit_destroy_cntrs(); + cxit_destroy_cqs(); + cxit_destroy_domain(); +} + +Test(domain, enable_optimized_mrs) +{ + int ret; + struct cxip_domain *cxip_dom; + bool optimized; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + cxip_dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(cxip_env.optimized_mrs, + cxip_dom->optimized_mrs, "Global setting failed"); + + /* Disable optimized MRs for the domain */ + 
ret = dom_ops->enable_optimized_mrs(&cxit_domain->fid, false); + optimized = false; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_OPTIMIZED_MRS, &optimized); + if (cxip_dom->is_prov_key) { + cr_assert_eq(ret, FI_SUCCESS, "Unexpected call failure"); + cr_assert_eq(cxip_dom->optimized_mrs, false, "Disable failed"); + } else { + cr_assert_eq(ret, -FI_EINVAL, "Client key check failed"); + cr_assert_eq(cxip_dom->optimized_mrs, cxip_env.optimized_mrs, + "Client key altered domain specific setting"); + } + + /* Enable optimized MRs for the domain */ + optimized = true; + ret = fi_control(&cxit_domain->fid, + FI_OPT_CXI_SET_OPTIMIZED_MRS, &optimized); + if (cxip_dom->is_prov_key) { + cr_assert_eq(ret, FI_SUCCESS, "Unexpected call failure"); + cr_assert_eq(cxip_dom->optimized_mrs, true, "Enable failed"); + } else { + cr_assert_eq(ret, -FI_EINVAL, "Client key check failed"); + cr_assert_eq(cxip_dom->optimized_mrs, cxip_env.optimized_mrs, + "Client key altered domain specific setting"); + } + + cxit_destroy_domain(); +} + +Test(domain, disable_prov_key_cache) +{ + int ret; + struct cxip_domain *cxip_dom; + bool enable = false; + + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + cxip_dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(cxip_env.prov_key_cache, + cxip_dom->prov_key_cache, "Global setting failed"); + + ret = fi_control(&cxit_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, + &enable); + + if (cxip_dom->is_prov_key) { + cr_assert_eq(ret, FI_SUCCESS, "Unexpected failure %d", ret); + cr_assert_eq(cxip_dom->prov_key_cache, false, "Update failed"); + } else { + cr_assert_eq(ret, -FI_EINVAL, "Unexpected success"); + cr_assert_eq(cxip_env.prov_key_cache, + cxip_dom->prov_key_cache, "Unexpected update"); + } + + cxit_destroy_domain(); +} + +static const char *_fi_coll_to_text(enum fi_collective_op coll) +{ + switch (coll) { + case FI_BARRIER: return "FI_BARRIER"; + case FI_BROADCAST: return "FI_BROADCAST"; + case FI_ALLTOALL: return "FI_ALLTOALL"; + case FI_ALLREDUCE: return "FI_ALLREDUCE"; + case FI_ALLGATHER: return "FI_ALLGATHER"; + case FI_REDUCE_SCATTER: return "FI_REDUCE_SCATTER"; + case FI_REDUCE: return "FI_REDUCE"; + case FI_SCATTER: return "FI_SCATTER"; + case FI_GATHER: return "FI_GATHER"; + default: return "NOCOLL"; + } +} + +static const char *_fi_op_to_text(enum fi_op op) +{ + switch ((int)op) { + case FI_MIN: return "FI_MIN"; + case FI_MAX: return "FI_MAX"; + case FI_SUM: return "FI_SUM"; + case FI_PROD: return "FI_PROD"; + case FI_LOR: return "FI_LOR"; + case FI_LAND: return "FI_LAND"; + case FI_BOR: return "FI_BOR"; + case FI_BAND: return "FI_BAND"; + case FI_LXOR: return "FI_LXOR"; + case FI_BXOR: return "FI_BXOR"; + case FI_ATOMIC_READ: return "FI_ATOMIC_READ"; + case FI_ATOMIC_WRITE: return "FI_ATOMIC_WRITE"; + case FI_CSWAP: return "FI_CSWAP"; + case FI_CSWAP_NE: return "FI_CSWAP_NE"; + case FI_CSWAP_LE: return "FI_CSWAP_LE"; + case FI_CSWAP_LT: return "FI_CSWAP_LT"; + case FI_CSWAP_GE: return "FI_CSWAP_GE"; + case FI_CSWAP_GT: return "FI_CSWAP_GT"; + case FI_MSWAP: return "FI_MSWAP"; + case FI_NOOP: return "FI_NOOP"; + default: return "NOOP"; + } +} + +static const char *_fi_datatype_to_text(enum fi_datatype datatype) +{ + switch ((int)datatype) { + case FI_INT8: return "FI_INT8"; + case FI_UINT8: return "FI_UINT8"; + case FI_INT16: return "FI_INT16"; + case FI_UINT16: return "FI_UINT16"; + case FI_INT32: return "FI_INT32"; + case FI_UINT32: return "FI_UINT32"; + case FI_INT64: return "FI_INT64"; + case FI_UINT64: 
return "FI_UINT64"; + case FI_FLOAT: return "FI_FLOAT"; + case FI_DOUBLE: return "FI_DOUBLE"; + case FI_FLOAT_COMPLEX: return "FI_FLOAT_COMPLEX"; + case FI_DOUBLE_COMPLEX: return "FI_DOUBLE_COMPLEX"; + case FI_LONG_DOUBLE: return "FI_LONG_DOUBLE"; + case FI_LONG_DOUBLE_COMPLEX: return "FI_LONG_DOUBLE_COMPLEX"; + case FI_VOID: return "FI_VOID"; + default: return "NOTYPE"; + } +} + +static void _test_coll_info(enum fi_collective_op coll, + enum fi_op op, + enum fi_datatype dtyp, + size_t count, size_t size, int exp) +{ + struct fi_collective_attr attr, *attrp; + const char *collname = _fi_coll_to_text(coll); + const char *opname = _fi_op_to_text(op); + const char *dtypname = _fi_datatype_to_text(dtyp); + int ret; + + memset(&attr, 0, sizeof(attr)); + attr.op = op; + attr.datatype = dtyp; + attrp = (op == -1) ? NULL : &attr; + ret = fi_query_collective(cxit_domain, coll, attrp, 0L); + cr_assert_eq(ret, exp, + "query(%s attr.op=%s %s)=%s expect=%s\n", + collname, opname, dtypname, + fi_strerror(ret), fi_strerror(exp)); + if (!attrp || ret) + return; + + cr_assert_eq(attr.datatype_attr.count, count, + "query(%s attr.op=%s %s)count=%ld expect=%ld\n", + collname, opname, dtypname, + attr.datatype_attr.count, count); + cr_assert_eq(attr.datatype_attr.size, size, + "query(%s attr.op=%s %s)size=%ld expect=%ld\n", + collname, opname, dtypname, + attr.datatype_attr.size, size); +} + +Test(domain, coll_info) +{ + cxit_create_domain(); + cr_assert(cxit_domain != NULL); + + _test_coll_info(FI_BARRIER, -1, -1, 0, 0, FI_SUCCESS); + _test_coll_info(FI_BARRIER, FI_NOOP, FI_VOID, 0, 0, FI_SUCCESS); + + _test_coll_info(FI_BROADCAST, -1, FI_VOID, 0, 0, -FI_EINVAL); + _test_coll_info(FI_BROADCAST, FI_SUM, FI_VOID, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_BROADCAST, FI_ATOMIC_WRITE, FI_UINT8, 32, 1, + FI_SUCCESS); + + _test_coll_info(FI_REDUCE, FI_ATOMIC_WRITE, -1, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_REDUCE, FI_BOR, FI_INT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_INT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_INT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_UINT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_UINT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_UINT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BOR, FI_UINT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_INT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_INT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_INT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_UINT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_UINT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_UINT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BAND, FI_UINT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_INT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_INT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_INT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_UINT8, 32, 1, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_UINT16, 16, 2, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_UINT32, 8, 4, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_BXOR, FI_UINT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_MIN, 
FI_UINT64, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_REDUCE, FI_MIN, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_MIN, FI_DOUBLE, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_MAX, FI_UINT64, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_REDUCE, FI_MAX, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_MAX, FI_DOUBLE, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_SUM, FI_UINT64, 0, 0, -FI_EOPNOTSUPP); + _test_coll_info(FI_REDUCE, FI_SUM, FI_INT64, 4, 8, FI_SUCCESS); + _test_coll_info(FI_REDUCE, FI_SUM, FI_DOUBLE, 4, 8, FI_SUCCESS); + + cxit_destroy_domain(); +} + +TestSuite(domain_cntrs, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic counter read */ +Test(domain_cntrs, cntr_read) +{ + int ret; + uint64_t value; + struct timespec ts; + + ret = dom_ops->cntr_read(&cxit_domain->fid, C_CNTR_LPE_SUCCESS_CNTR, + &value, &ts); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + printf("LPE_SUCCESS_CNTR: %lu\n", value); +} diff --git a/prov/cxi/test/ep.c b/prov/cxi/test/ep.c new file mode 100644 index 00000000000..dab6ed9a37b --- /dev/null +++ b/prov/cxi/test/ep.c @@ -0,0 +1,1914 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(ep, .init = cxit_setup_ep, .fini = cxit_teardown_ep, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic EP creation */ +Test(ep, simple) +{ + cxit_create_ep(); + + cxit_destroy_ep(); +} + +/* Test NULL parameter passed with EP creation */ +Test(ep, ep_null_info) +{ + int ret; + + ret = fi_endpoint(cxit_domain, NULL, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Failure with NULL info. %d", ret); +} + +/* Test NULL parameter passed with EP creation */ +Test(ep, ep_null_ep) +{ + int ret; + + ret = fi_endpoint(cxit_domain, cxit_fi, NULL, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Failure with NULL ep. %d", ret); +} + +struct ep_test_params { + void *context; + enum fi_ep_type type; + int retval; +}; + +static struct ep_test_params ep_ep_params[] = { + {.type = FI_EP_RDM, + .retval = FI_SUCCESS}, + {.type = FI_EP_UNSPEC, + .retval = FI_SUCCESS}, + {.type = FI_EP_MSG, + .retval = -FI_EINVAL}, + {.type = FI_EP_DGRAM, + .retval = -FI_EINVAL}, + {.type = FI_EP_SOCK_STREAM, + .retval = -FI_EINVAL}, + {.type = FI_EP_SOCK_DGRAM, + .retval = -FI_EINVAL}, + {.type = FI_EP_RDM, + .context = (void *)0xabcdef, + .retval = FI_SUCCESS}, +}; + +ParameterizedTestParameters(ep, fi_ep_types) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(ep_ep_params); + return cr_make_param_array(struct ep_test_params, ep_ep_params, + param_sz); +} + +ParameterizedTest(struct ep_test_params *param, ep, fi_ep_types) +{ + int ret; + struct cxip_ep *cep; + + cxit_fi->ep_attr->type = param->type; + cxit_ep = NULL; + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, param->context); + cr_assert_eq(ret, param->retval, + "fi_endpoint() error for type %d. 
%d != %d", + param->type, ret, param->retval); + + if (ret != FI_SUCCESS) + return; + + cr_assert_not_null(cxit_ep); + cr_expect_eq(cxit_ep->fid.fclass, FI_CLASS_EP); + cr_expect_eq(cxit_ep->fid.context, param->context); + cep = container_of(cxit_ep, struct cxip_ep, ep); + cr_expect_not_null(cep->ep_obj); + + cxit_destroy_ep(); +} + +/* Test Passive EP creation is not supported */ +Test(ep, passive_ep) +{ + int ret; + struct fid_pep *pep = NULL; + + ret = fi_passive_ep(cxit_fabric, cxit_fi, &pep, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "Failure with fi_passive_ep. %d", ret); + cr_assert_null(pep); +} + +Test(ep, ep_bind_null_bind_obj) +{ + int ret; + + cxit_create_ep(); + + ret = fi_ep_bind(cxit_ep, NULL, 0); + cr_assert_eq(ret, -FI_EINVAL); + + cxit_destroy_ep(); +} + +Test(ep, ep_bind_invalid_fclass) +{ + int ret; + + cxit_create_ep(); + cxit_create_av(); + + /* try to bind an unsupported class type */ + cxit_ep->fid.fclass = FI_CLASS_PEP; + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + cr_assert_eq(ret, -FI_EINVAL); + cxit_ep->fid.fclass = FI_CLASS_EP; + + cxit_destroy_av(); + cxit_destroy_ep(); +} + +Test(ep, ep_bind_av) +{ + struct cxip_ep *ep; + struct cxip_av *av; + + cxit_create_ep(); + cxit_create_av(); + + cxit_bind_av(); + + av = container_of(cxit_av, struct cxip_av, av_fid.fid); + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + + cr_assert_not_null(ep->ep_obj); + cr_assert_eq(ep->ep_obj->av, av); + + cxit_destroy_ep(); + cxit_destroy_av(); +} + +Test(ep, ep_bind_eq) +{ + int ret; + + /* order is not important */ + cxit_create_eq(); + cxit_create_ep(); + + ret = fi_ep_bind(cxit_ep, &cxit_eq->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_eq_bind EQ failed %d", ret); + + /* order is important */ + cxit_destroy_ep(); + cxit_destroy_eq(); +} + +Test(ep, ep_bind_mr) +{ + int ret; + + /* + * At the time of implementing this test MRs were not supported by the + * CXI provider. Fake attempting to register a MR with a EP using an AV + */ + cxit_create_ep(); + cxit_create_av(); + + cxit_av->fid.fclass = FI_CLASS_MR; + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + cr_assert_eq(ret, -FI_EINVAL, "Bind (fake) MR to EP. 
%d", ret); + cxit_av->fid.fclass = FI_CLASS_AV; + + cxit_destroy_ep(); + cxit_destroy_av(); +} + +Test(ep, ep_bind_cq) +{ + struct cxip_ep *ep; + struct cxip_cq *rx_cq, *tx_cq; + + cxit_create_ep(); + cxit_create_cqs(); + cr_assert_not_null(cxit_tx_cq); + cr_assert_not_null(cxit_rx_cq); + + cxit_bind_cqs(); + + rx_cq = container_of(cxit_rx_cq, struct cxip_cq, util_cq.cq_fid.fid); + tx_cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid.fid); + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + + cr_assert_not_null(ep->ep_obj); + cr_assert_eq(ep->ep.fid.fclass, FI_CLASS_EP); + cr_assert_eq(ep->ep_obj->txc.send_cq, tx_cq); + cr_assert_eq(ep->ep_obj->rxc.recv_cq, rx_cq); + + cxit_destroy_ep(); + cxit_destroy_cqs(); +} + +Test(ep, ep_bind_cq_eps) +{ + struct fid_ep *fid_ep2; + struct cxip_ep *ep; + struct cxip_ep *ep2; + int ret; + + cxit_create_ep(); + cxit_create_cqs(); + cr_assert_not_null(cxit_tx_cq); + cr_assert_not_null(cxit_rx_cq); + + cxit_bind_cqs(); + + /* Create second EP */ + ret = fi_endpoint(cxit_domain, cxit_fi, &fid_ep2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + cr_assert_not_null(fid_ep2); + + /* Bind same CQs to second EP */ + ret = fi_ep_bind(fid_ep2, &cxit_tx_cq->fid, cxit_tx_cq_bind_flags); + cr_assert(!ret, "fi_ep_bind TX CQ to 2nd EP"); + + ret = fi_ep_bind(fid_ep2, &cxit_rx_cq->fid, cxit_rx_cq_bind_flags); + cr_assert(!ret, "fi_ep_bind RX CQ to 2nd EP"); + + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + cr_assert_not_null(ep->ep_obj); + ep2 = container_of(fid_ep2, struct cxip_ep, ep.fid); + cr_assert_not_null(ep2->ep_obj); + + cr_assert_eq(ep->ep_obj->txc.send_cq, ep2->ep_obj->txc.send_cq, + "Send CQ mismatch"); + cr_assert_eq(ep->ep_obj->rxc.recv_cq, ep2->ep_obj->rxc.recv_cq, + "Receive CQ mismatch"); + + ret = fi_close(&fid_ep2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint"); + + cxit_destroy_ep(); + cxit_destroy_cqs(); +} + +Test(ep, ep_bind_cntr) +{ + int ret; + + cxit_create_ep(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_av(); + cxit_bind_av(); + + cxit_create_cntrs(); + cxit_bind_cntrs(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS); + + cxit_destroy_ep(); + cxit_destroy_cntrs(); + cxit_destroy_av(); + cxit_destroy_cqs(); +} + +Test(ep, ep_bind_stx_ctx) +{ + int ret; + struct fi_tx_attr *attr = NULL; + void *context = NULL; + + ret = fi_stx_context(cxit_domain, attr, NULL, context); + cr_assert_eq(ret, -FI_ENOSYS, + "TODO Add test for STX CTXs binding to the endpoint when implemented"); +} + +Test(ep, ep_bind_srx_ctx) +{ + int ret; + struct fi_rx_attr *attr = NULL; + void *context = NULL; + + ret = fi_srx_context(cxit_domain, attr, NULL, context); + cr_assert_eq(ret, -FI_ENOSYS, + "TODO Add test for SRX CTXs binding to the endpoint when implemented"); +} + +Test(ep, ep_bind_unhandled) +{ + int ret; + + cxit_create_ep(); + cxit_create_av(); + + /* Emulate a different type of object type */ + cxit_av->fid.fclass = -1; + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + cr_assert_eq(ret, -FI_EINVAL, "fi_ep_bind unhandled object. 
%d", ret); + cxit_av->fid.fclass = FI_CLASS_AV; + + cxit_destroy_ep(); + cxit_destroy_av(); +} + +Test(ep, cancel_ep) +{ + int ret; + + cxit_create_ep(); + + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_EOPBADSTATE); + + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS); + + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_ENOENT); + + ret = fi_cancel(&cxit_ep->fid, (void *)1); + cr_assert_eq(ret, -FI_ENOENT); + + cxit_destroy_ep(); + cxit_destroy_av(); + cxit_destroy_cqs(); +} + +Test(ep, cancel_unhandled) +{ + int ret; + + cxit_create_ep(); + + /* Emulate a different type of object type */ + cxit_ep->fid.fclass = FI_CLASS_PEP; + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_EINVAL); + cxit_ep->fid.fclass = FI_CLASS_EP; + + cxit_destroy_ep(); +} + +Test(ep, control_unhandled_obj) +{ + int ret; + + cxit_create_ep(); + + /* Emulate a different type of object type */ + cxit_ep->fid.fclass = FI_CLASS_PEP; + ret = fi_control(&cxit_ep->fid, -1, NULL); + cr_assert_eq(ret, -FI_EINVAL); + cxit_ep->fid.fclass = FI_CLASS_EP; + + cxit_destroy_ep(); +} + +Test(ep, control_unhandled_cmd) +{ + int ret; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, -1, NULL); + cr_assert_eq(ret, -FI_EINVAL); + + cxit_destroy_ep(); +} + +Test(ep, control_null_fid_alias) +{ + int ret; + struct fi_alias alias = {0}; + + cxit_create_ep(); + + /* A null alias.fid causes -FI_EINVAL */ + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_ALIAS. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_empty_alias) +{ + int ret; + struct fi_alias alias = {0}; + struct fid *alias_fid; + + cxit_create_ep(); + + /* Empty alias.flags causes -FI_EINVAL */ + alias.fid = &alias_fid; + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_ALIAS. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_bad_flags_alias) +{ + int ret; + struct fi_alias alias = {0}; + + cxit_create_ep(); + + /* Both Tx and Rx flags causes -FI_EINVAL */ + alias.flags = FI_TRANSMIT | FI_RECV; + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_ALIAS. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_tx_flags_alias) +{ + int ret; + struct fi_alias alias = {0}; + struct fid *alias_fid = NULL; + struct cxip_ep *cxi_ep, *alias_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + alias.fid = &alias_fid; + alias.flags = FI_TRANSMIT; + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_ALIAS. 
%d", ret); + cr_assert_not_null(alias_fid); + + /* verify alias vs cxit_ep */ + alias_ep = container_of(alias_fid, struct cxip_ep, ep.fid); + cr_assert_eq(alias_ep->ep_obj, cxi_ep->ep_obj, "EP Attr"); + cr_assert_eq(alias_ep->is_alias, 1, "EP is_alias"); + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 1, "EP refs 1"); + + /* close alias */ + ret = fi_close(alias_fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint"); + alias_fid = NULL; + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 0, "EP refs 0"); + + cxit_destroy_ep(); +} + +Test(ep, control_rx_flags_alias) +{ + int ret; + struct fi_alias alias = {0}; + struct fid *alias_fid = NULL; + struct cxip_ep *cxi_ep, *alias_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + alias.fid = &alias_fid; + alias.flags = FI_RECV; + ret = fi_control(&cxit_ep->fid, FI_ALIAS, &alias); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_ALIAS. %d", ret); + cr_assert_not_null(alias_fid); + + alias_ep = container_of(alias_fid, struct cxip_ep, ep.fid); + cr_assert_eq(alias_ep->ep_obj, cxi_ep->ep_obj, "EP Attr"); + cr_assert_eq(alias_ep->is_alias, 1, "EP is_alias"); + cr_assert_not_null(cxi_ep->ep_obj, "EP attr NULL"); + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 1, "EP refs 1"); + + /* close alias */ + ret = fi_close(alias_fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint"); + alias_fid = NULL; + cr_assert_eq(ofi_atomic_get32(&cxi_ep->ep_obj->ref), 0, "EP refs 0"); + + cxit_destroy_ep(); +} + +Test(ep, control_getopsflag_both_tx_rx) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_RECV; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_GETOPSFLAG TX/RX. %d", + ret); + + cxit_destroy_ep(); +} + +Test(ep, control_getopsflag_no_flags) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_RECV; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_GETOPSFLAG 0. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_getopsflag_tx) +{ + int ret; + uint64_t flags = FI_TRANSMIT; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_GETOPSFLAG TX. %d", ret); + cr_assert_eq(cxi_ep->tx_attr.op_flags, flags, + "fi_control FI_GETOPSFLAG Flag mismatch. %" PRIx64 " != %" + PRIx64 " ", cxi_ep->tx_attr.op_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_getopsflag_rx) +{ + int ret; + uint64_t flags = FI_RECV; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_GETOPSFLAG RX. %d", ret); + cr_assert_eq(cxi_ep->rx_attr.op_flags, flags, + "fi_control FI_GETOPSFLAG Flag mismatch. %" PRIx64 " != %" + PRIx64 " ", cxi_ep->rx_attr.op_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_both_tx_rx) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_RECV; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_SETOPSFLAG TX/RX. 
%d", + ret); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_no_flags) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_RECV; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, -FI_EINVAL, "fi_control FI_SETOPSFLAG 0. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_tx) +{ + int ret; + uint64_t flags = (FI_TRANSMIT | FI_MSG | FI_TRIGGER | + FI_DELIVERY_COMPLETE); + uint64_t tx_flags; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_SETOPSFLAG TX. %d", ret); + flags &= ~FI_TRANSMIT; + tx_flags = cxi_ep->tx_attr.op_flags; + cr_assert_eq(tx_flags, flags, + "fi_control FI_SETOPSFLAG TX Flag mismatch. %" PRIx64 + " != %" PRIx64, tx_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_tx_complete) +{ + int ret; + uint64_t flags = FI_TRANSMIT | FI_MSG | FI_TRIGGER | FI_AFFINITY; + uint64_t tx_flags; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_SETOPSFLAG TX. %d", ret); + flags &= ~FI_TRANSMIT; + flags |= FI_TRANSMIT_COMPLETE; + tx_flags = cxi_ep->tx_attr.op_flags; + cr_assert_eq(tx_flags, flags, + "fi_control FI_SETOPSFLAG TXcomp Flag mismatch. %" PRIx64 + " != %" PRIx64, tx_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_setopsflag_rx) +{ + int ret; + uint64_t flags = FI_RECV | FI_TAGGED | FI_NUMERICHOST | FI_EVENT; + uint64_t rx_flags; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_SETOPSFLAG RX. %d", ret); + flags &= ~FI_RECV; + rx_flags = cxi_ep->rx_attr.op_flags; + cr_assert_eq(rx_flags, flags, + "fi_control FI_SETOPSFLAG RX Flag mismatch. %" PRIx64 + " != %" PRIx64, rx_flags, flags); + + cxit_destroy_ep(); +} + +Test(ep, control_enable_nocq) +{ + int ret; + + cxit_create_ep(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert_eq(ret, -FI_ENOCQ, "fi_enable. %d", ret); + + cxit_destroy_ep(); + cxit_destroy_av(); +} + +Test(ep, control_enable_noav) +{ + int ret; + + cxit_create_ep(); + cxit_create_cqs(); + cxit_bind_cqs(); + + ret = fi_enable(cxit_ep); + cr_assert_eq(ret, -FI_ENOAV, "fi_enable. %d", ret); + + cxit_destroy_ep(); + cxit_destroy_cqs(); +} + +Test(ep, control_enable) +{ + int ret; + + cxit_create_ep(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert_eq(ret, FI_SUCCESS, "fi_enable. 
%d", ret); + + cxit_destroy_ep(); + cxit_destroy_av(); + cxit_destroy_cqs(); +} + +struct ep_ctrl_null_params { + int command; + int retval; +}; + +ParameterizedTestParameters(ep, ctrl_null_arg) +{ + size_t param_sz; + + static struct ep_ctrl_null_params ep_null_params[] = { + {.command = -1, + .retval = -FI_EINVAL}, + {.command = FI_SETOPSFLAG, + .retval = -FI_EINVAL}, + {.command = FI_ENABLE, + .retval = -FI_ENOAV}, + }; + + param_sz = ARRAY_SIZE(ep_null_params); + return cr_make_param_array(struct ep_ctrl_null_params, ep_null_params, + param_sz); +} + +ParameterizedTest(struct ep_ctrl_null_params *param, ep, ctrl_null_arg) +{ + int ret; + + cxit_create_ep(); + + ret = fi_control(&cxit_ep->fid, param->command, NULL); + cr_assert_eq(ret, param->retval, "fi_control type %d. %d != %d", + param->command, ret, param->retval); + + cxit_destroy_ep(); +} + +struct ep_getopt_args { + int level; + int optname; + size_t *optval; + size_t *optlen; + int retval; +}; + +static size_t optvalue; +static size_t optlength = sizeof(size_t); +static struct ep_getopt_args ep_null_params[] = { + {.level = -1, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_CM_DATA_SIZE, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = -1, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = NULL, + .optlen = NULL, + .retval = -FI_EINVAL}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = &optvalue, + .optlen = NULL, + .retval = -FI_EINVAL}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = &optvalue, + .optlen = &optlength, + .retval = FI_SUCCESS}, +}; + +ParameterizedTestParameters(ep, getopt_args) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(ep_null_params); + return cr_make_param_array(struct ep_getopt_args, ep_null_params, + param_sz); +} + +ParameterizedTest(struct ep_getopt_args *param, ep, getopt_args) +{ + int ret; + struct cxip_ep *cxi_ep; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_getopt(&cxit_ep->fid, param->level, param->optname, + (void *)param->optval, param->optlen); + cr_assert_eq(ret, param->retval, + "fi_getopt lvl %d name %d val %p len %p. %d != %d", + param->level, param->optname, param->optval, + param->optlen, ret, param->retval); + + if (ret == FI_SUCCESS) { + cr_assert_not_null(cxi_ep->ep_obj); + cr_assert_eq(*param->optval, cxi_ep->ep_obj->rxc.min_multi_recv, + "fi_getopt val mismatch. %zd != %zd", + *param->optval, + cxi_ep->ep_obj->rxc.min_multi_recv); + cr_assert_eq(*param->optlen, sizeof(size_t), + "fi_getopt len mismatch. 
%zd != %zd", + *param->optlen, sizeof(size_t)); + } + + cxit_destroy_ep(); +} + +struct ep_setopt_args { + int level; + int optname; + size_t optval; + size_t optlen; + int retval; +}; + +ParameterizedTestParameters(ep, setopt_args) +{ + size_t param_sz; + + static struct ep_setopt_args ep_null_params[] = { + {.level = -1, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_CM_DATA_SIZE, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = -1, + .retval = -FI_ENOPROTOOPT}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = 0, + .retval = -FI_EINVAL}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = 26, + .retval = FI_SUCCESS}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = 90001, + .retval = FI_SUCCESS}, + {.level = FI_OPT_ENDPOINT, + .optname = FI_OPT_MIN_MULTI_RECV, + .optval = 1<<24, + .retval = -FI_EINVAL}, + }; + + param_sz = ARRAY_SIZE(ep_null_params); + return cr_make_param_array(struct ep_setopt_args, ep_null_params, + param_sz); +} + +ParameterizedTest(struct ep_setopt_args *param, ep, setopt_args) +{ + int ret; + struct cxip_ep *cxi_ep; + void *val = NULL; + + if (param->optval != 0) + val = ¶m->optval; + + cxit_create_ep(); + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = fi_setopt(&cxit_ep->fid, param->level, param->optname, + val, param->optlen); + cr_assert_eq(ret, param->retval, + "fi_setopt lvl %d name %d val %zd. %d != %d", + param->level, param->optname, param->optval, + ret, param->retval); + + if (ret == FI_SUCCESS) { + cr_assert_not_null(cxi_ep->ep_obj); + cr_assert_eq(param->optval, cxi_ep->ep_obj->rxc.min_multi_recv, + "fi_setopt val mismatch. %zd != %zd", + param->optval, cxi_ep->ep_obj->rxc.min_multi_recv); + } + + cxit_destroy_ep(); +} + +Test(ep, rx_ctx_ep) +{ + int ret; + + cxit_create_ep(); + + /* RX context doesn't work with anything but scalable eps */ + ret = fi_rx_context(cxit_ep, 0, NULL, NULL, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "fi_rx_context bad ep. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, tx_ctx_ep) +{ + int ret; + + cxit_create_ep(); + + /* RX context doesn't work with anything but scalable eps */ + ret = fi_tx_context(cxit_ep, 0, NULL, NULL, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "fi_tx_context bad ep. %d", ret); + + cxit_destroy_ep(); +} + +Test(ep, stx_ctx_null_stx) +{ + int ret; + struct fi_tx_attr *attr = NULL; + void *context = NULL; + + ret = fi_stx_context(cxit_domain, attr, NULL, context); + /* TODO Fix when fi_stx_context is implemented, should be -FI_EINVAL */ + cr_assert_eq(ret, -FI_ENOSYS, "fi_stx_context null stx. %d", ret); +} + +Test(ep, stx_ctx) +{ + int ret; + struct fi_tx_attr *attr = NULL; + struct fid_stx *stx; + struct cxip_ep *ep; + void *context = &ret; + struct cxip_domain *dom; + struct cxip_txc *txc; + int refs; + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + refs = ofi_atomic_get32(&dom->ref); + + ret = fi_stx_context(cxit_domain, attr, &stx, context); + + /* TODO Fix when fi_stx_context is implemented, should be FI_SUCCESS */ + cr_assert_eq(ret, -FI_ENOSYS, "fi_stx_context failed. 
%d", ret); + if (ret == -FI_ENOSYS) + return; + + ep = container_of(stx, struct cxip_ep, ep); + txc = &ep->ep_obj->txc; + + /* Validate stx */ + cr_assert_eq(txc->domain, dom); + cr_assert_eq(ofi_atomic_inc32(&dom->ref), refs + 1); + cr_assert_eq(ep->ep.fid.fclass, FI_CLASS_TX_CTX); + cr_assert_eq(ep->ep.fid.context, context); + + ret = fi_close(&stx->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close stx_ep. %d", ret); +} + +Test(ep, srx_ctx_null_srx) +{ + int ret; + struct fi_rx_attr *attr = NULL; + void *context = NULL; + + ret = fi_srx_context(cxit_domain, attr, NULL, context); + /* TODO Fix when fi_srx_context is implemented, should be -FI_EINVAL */ + cr_assert_eq(ret, -FI_ENOSYS, "fi_srx_context null srx. %d", ret); +} + +Test(ep, srx_ctx) +{ + int ret; + struct fi_rx_attr *attr = NULL; + struct fid_ep *srx; + struct cxip_ep *srx_ep; + void *context = &ret; + struct cxip_domain *dom; + struct cxip_rxc *rxc; + int refs; + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + refs = ofi_atomic_get32(&dom->ref); + + ret = fi_srx_context(cxit_domain, attr, &srx, context); + /* TODO Fix when fi_srx_context is implemented, should be FI_SUCCESS */ + cr_assert_eq(ret, -FI_ENOSYS, "fi_stx_context failed. %d", ret); + if (ret == -FI_ENOSYS) + return; + + srx_ep = container_of(srx, struct cxip_ep, ep); + rxc = &srx_ep->ep_obj->rxc; + + /* Validate stx */ + cr_assert_eq(rxc->domain, dom); + cr_assert_eq(ofi_atomic_inc32(&dom->ref), refs + 1); + cr_assert_eq(srx_ep->ep.fid.fclass, FI_CLASS_RX_CTX); + cr_assert_eq(srx_ep->ep.fid.context, context); + cr_assert_eq(rxc->state, RXC_ENABLED); + cr_assert_eq(rxc->min_multi_recv, CXIP_EP_MIN_MULTI_RECV); + + ret = fi_close(&srx->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close srx_ep. %d", ret); +} + +TestSuite(ep_init, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(ep_init, auth_key) +{ + int ret; + struct cxi_auth_key auth_key = { + .svc_id = CXI_DEFAULT_SVC_ID, + .vni = 1, + }; + + /* Create fabric */ + cxit_setup_domain(); + + /* Try invalid auth key */ + cxit_fi->domain_attr->auth_key_size = 12345; + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + cr_assert_eq(ret, -FI_EINVAL); + + /* Set custom auth key in Domain */ + cxit_fi->domain_attr->auth_key = mem_dup(&auth_key, sizeof(auth_key)); + cxit_fi->domain_attr->auth_key_size = sizeof(auth_key); + + /* Create enabled Domain/EP */ + cxit_setup_rma(); + + cxit_teardown_rma(); + + /*---*/ + + cxit_setup_domain(); + cxit_create_domain(); + + /* Try invalid auth key */ + cxit_fi->ep_attr->auth_key_size = 12345; + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL); /* inconsistent error */ + + /* Set custom auth key in EP */ + auth_key.vni = 200; + + free(cxit_fi->ep_attr->auth_key); + cxit_fi->ep_attr->auth_key = mem_dup(&auth_key, sizeof(auth_key)); + cxit_fi->ep_attr->auth_key_size = sizeof(auth_key); + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL); + + /* Try mis-matched svc_id */ + auth_key.svc_id = 10; + auth_key.vni = 301; + + free(cxit_fi->ep_attr->auth_key); + cxit_fi->ep_attr->auth_key = mem_dup(&auth_key, sizeof(auth_key)); + cxit_fi->ep_attr->auth_key_size = sizeof(auth_key); + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL); + + cxit_destroy_domain(); + cxit_teardown_domain(); +} + +Test(ep_init, tclass) +{ + int ret; + + /* Create fabric */ + cxit_setup_domain(); + + /* Try invalid auth key */ + cxit_fi->domain_attr->tclass = 
FI_TC_DSCP; + + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + cr_assert_eq(ret, -FI_EINVAL, "ret is: %d\n", ret); + + /* Set custom TC in Domain */ + cxit_fi->domain_attr->tclass = FI_TC_LOW_LATENCY; + + /* Create enabled Domain/EP */ + cxit_setup_rma(); + + cxit_teardown_rma(); + + /*---*/ + + cxit_setup_domain(); + cxit_create_domain(); + + /* Try invalid auth key */ + cxit_fi->tx_attr->tclass = FI_TC_DSCP; + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + cr_assert_eq(ret, -FI_EINVAL, "ret is: %d\n", ret); + + /* Set custom TC in EP */ + cxit_fi->tx_attr->tclass = FI_TC_DEDICATED_ACCESS; + + /* Create enabled Domain/EP */ + cxit_setup_rma(); + + cxit_teardown_rma(); +} + +Test(ep, invalid_tx_attr_size) +{ + struct fid_ep *tmp_ep; + int ret; + + /* Invalid TX attr size. */ + cxit_fi->tx_attr->size = 1234567; + + ret = fi_endpoint(cxit_domain, cxit_fi, &tmp_ep, NULL); + cr_assert(ret != FI_SUCCESS, "fi_endpoint"); +} + +Test(ep, valid_tx_attr_size) +{ + struct fid_ep *tmp_ep; + int ret; + + /* Invalid TX attr size. */ + cxit_fi->tx_attr->size = 16384; + + ret = fi_endpoint(cxit_domain, cxit_fi, &tmp_ep, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + + ret = fi_close(&tmp_ep->fid); + cr_assert(ret == FI_SUCCESS, "fi_close EP"); +} + +Test(ep, valid_tx_attr_size_hints) +{ + struct fi_info *hints; + struct fi_info *info; + int ret; + unsigned int tx_size = 1024; + + hints = fi_allocinfo(); + cr_assert(hints != NULL, "fi_allocinfo"); + + hints->tx_attr->size = tx_size; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, hints, + &info); + cr_assert(ret == FI_SUCCESS); + + assert(info->tx_attr->size == tx_size); + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +TestSuite(ep_tclass, .init = cxit_setup_tx_alias_rma, + .fini = cxit_teardown_tx_alias_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Add control test for setting of EP tclass. + * + * Test same for alias EP. + * + * Parameterized for all TCLASS values and bad values. + */ +struct ep_tclass_params { + int tclass; + int retval; +}; + +static struct ep_tclass_params tclass_params[] = { + {.tclass = 0, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_UNSPEC, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_DSCP, + .retval = -FI_EINVAL}, + {.tclass = FI_TC_LABEL, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_BEST_EFFORT, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_LOW_LATENCY, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_DEDICATED_ACCESS, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_BULK_DATA, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_SCAVENGER, + .retval = FI_SUCCESS}, + {.tclass = FI_TC_NETWORK_CTRL, /* Not supported */ + .retval = -FI_EINVAL}, + {.tclass = FI_TC_NETWORK_CTRL + 1, /* Illegal */ + .retval = -FI_EINVAL}, +}; + +int set_ep_tclass(struct cxip_ep *ep, uint32_t tclass) +{ + int ret; + + ret = fi_set_val(&ep->ep.fid, FI_OPT_CXI_SET_TCLASS, + (void *)&tclass); + if (ret == FI_SUCCESS) { + if (tclass != FI_TC_UNSPEC) + cr_assert_eq(tclass, ep->tx_attr.tclass, + "update tclass mismatch. 
%d != %d", + tclass, ep->tx_attr.tclass); + else + cr_assert_neq(tclass, ep->tx_attr.tclass, + "FI_TC_UNSPEC tclass not updated"); + } + + return ret; +} + +ParameterizedTestParameters(ep_tclass, alias_set_tclass) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(tclass_params); + return cr_make_param_array(struct ep_tclass_params, + tclass_params, param_sz); +} + +/* Modify EP alias traffic class */ +ParameterizedTest(struct ep_tclass_params *param, ep_tclass, + alias_set_tclass) +{ + int ret; + struct cxip_ep *cxi_ep; + struct cxip_ep *alias_ep = NULL; + uint32_t orig_ep_tclass; + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + orig_ep_tclass = cxi_ep->tx_attr.tclass; + + alias_ep = container_of(&cxit_tx_alias_ep->fid, struct cxip_ep, ep.fid); + cr_assert_not_null(alias_ep->ep_obj); + + ret = set_ep_tclass(alias_ep, param->tclass); + cr_assert_eq(ret, param->retval, + "fi_set_val for TCLASS %d", param->tclass); + + /* make sure only the alias EP tclass changed */ + cr_assert_eq(orig_ep_tclass, cxi_ep->tx_attr.tclass, + "Original EP tclass changed"); +} + +ParameterizedTestParameters(ep_tclass, set_tclass) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(tclass_params); + return cr_make_param_array(struct ep_tclass_params, + tclass_params, param_sz); +} + +/* Modify standard EP traffic class parameters */ +ParameterizedTest(struct ep_tclass_params *param, ep_tclass, set_tclass) +{ + int ret; + struct cxip_ep *cxi_ep; + + cxi_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + ret = set_ep_tclass(cxi_ep, param->tclass); + cr_assert_eq(ret, param->retval, + "fi_set_val for TCLASS %d", param->tclass); +} + +TestSuite(ep_caps, .timeout = CXIT_DEFAULT_TIMEOUT); + +void verify_ep_msg_cap(uint64_t flags) +{ + struct cxip_ep *ep; + int ret; + + cxit_setup_ep(); + + /* Set info TX/RX attribute appropriately */ + if (!(flags & FI_SEND)) + cxit_fi->tx_attr->caps &= ~(FI_SEND | FI_SEND); + if (!(flags & FI_RECV)) + cxit_fi->rx_attr->caps &= ~(FI_MSG | FI_RECV); + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "EP enable"); + + ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + /* Requires knowledge of implementation */ + if (flags & FI_SEND) { + cr_assert(ep->ep_obj->txc.enabled, "TX Enabled"); + cr_assert(ep->ep_obj->txc.send_cq != NULL, "Send CQ"); + } + + if (flags & FI_RECV) { + cr_assert(ep->ep_obj->rxc.state == RXC_ENABLED || + ep->ep_obj->rxc.state == RXC_ENABLED_SOFTWARE, + "RX Enabled"); + cr_assert(ep->ep_obj->rxc.recv_cq != NULL, "Receive CQ"); + cr_assert(ep->ep_obj->rxc.rx_evtq.eq != NULL, "RX H/W EQ"); + cr_assert(ep->ep_obj->rxc.rx_cmdq != NULL, "RX TGT CMDQ"); + cr_assert(ep->ep_obj->rxc.tx_cmdq != NULL, "RX TX CMDQ"); + } else { + cr_assert(ep->ep_obj->rxc.state == RXC_ENABLED, "R/X enabled"); + cr_assert(ep->ep_obj->rxc.rx_evtq.eq == NULL, "RX H/W EQ"); + cr_assert(ep->ep_obj->rxc.rx_cmdq == NULL, "RX TGT CMDQ"); + cr_assert(ep->ep_obj->rxc.tx_cmdq == NULL, "RX TX CMDQ"); + } + + cxit_teardown_rma(); +} + +static void verify_ep_msg_ops(uint64_t flags) +{ + bool recv; + bool send; + uint8_t *recv_buf; + uint8_t *send_buf; + int recv_len = 512; + int send_len = 512; + struct iovec riovec; + struct iovec siovec; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + int ret; + + recv = !!(flags & FI_RECV); + send = !!(flags & FI_SEND); + + cxit_setup_ep(); + + /* Set info TX/RX 
attribute appropriately */ + if (!send) + cxit_fi->tx_attr->caps &= ~(FI_MSG | FI_SEND); + if (!recv) + cxit_fi->rx_attr->caps &= ~(FI_MSG | FI_RECV); + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert_not_null(recv_buf); + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert_not_null(send_buf); + + /* Verify can not call API functions */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_recv"); + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + ret = fi_recvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_recvv"); + + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_recvmsg"); + + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_send"); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + ret = fi_sendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_sendv"); + + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_sendmsg"); + + ret = fi_inject(cxit_ep, send_buf, 8, cxit_ep_fi_addr); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_inject"); + + ret = fi_senddata(cxit_ep, send_buf, send_len, NULL, 0xa5a5, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_senddata"); + + ret = fi_injectdata(cxit_ep, send_buf, 8, 0xa5a5, cxit_ep_fi_addr); + cr_assert_eq(ret, -FI_ENOSYS, "EP not enabled fi_injectdata"); + + /* Enable EP */ + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "EP enable"); + + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, recv ? FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_recv"); + + ret = fi_recvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, recv ? FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_recvv"); + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, recv ? FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_recvmsg"); + + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, send ? FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_send"); + + ret = fi_sendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, send ? 
FI_SUCCESS : -FI_ENOSYS, + "EP enabled fi_sendv"); + + cxit_teardown_rma(); +} + +Test(ep_caps, msg_tx_rx) +{ + struct fi_info *info; + int ret; + + /* No hints */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, NULL, &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + cr_assert_eq(info->tx_attr->caps & FI_MSG, FI_MSG, + "FI_MSG TX returned"); + cr_assert_eq(info->tx_attr->caps & FI_SEND, FI_SEND, + "FI_SEND TX returned"); + cr_assert_eq(info->rx_attr->caps & FI_MSG, FI_MSG, + "FI_MSG RX returned"); + cr_assert_eq(info->rx_attr->caps & FI_RECV, FI_RECV, + "FI_RECV RX returned"); + verify_ep_msg_cap(FI_SEND | FI_RECV); + fi_freeinfo(info); + + /* hints->caps set to 0 */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = 0; + cxit_fi_hints->tx_attr->caps = 0; + cxit_fi_hints->rx_attr->caps = 0; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + cr_assert_eq(info->tx_attr->caps & FI_MSG, FI_MSG, + "FI_MSG TX returned"); + cr_assert_eq(info->tx_attr->caps & FI_SEND, FI_SEND, + "FI_SEND TX returned"); + cr_assert_eq(info->rx_attr->caps & FI_MSG, FI_MSG, + "FI_MSG RX returned"); + cr_assert_eq(info->rx_attr->caps & FI_RECV, FI_RECV, + "FI_RECV RX returned"); + verify_ep_msg_cap(FI_SEND | FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); + + /* hints->caps set to FI_MSG | FI_SEND | FI_RECV */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_SEND | FI_RECV; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + verify_ep_msg_cap(FI_SEND | FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); + + /* hints->caps set to FI_MSG implies FI_SEND and FI_RECV */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + verify_ep_msg_cap(FI_SEND | FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, msg_tx) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to FI_MSG | FI_SEND is TX message only EP */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_SEND; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, FI_SEND, "FI_SEND returned"); + cr_assert_eq(info->caps & FI_RECV, 
0, "FI_RECV not returned"); + verify_ep_msg_cap(FI_SEND); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, msg_rx) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to FI_MSG | FI_RECV is RX message only EP */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_RECV; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_MSG, FI_MSG, "FI_MSG returned"); + cr_assert_eq(info->caps & FI_SEND, 0, "FI_SEND not returned"); + cr_assert_eq(info->caps & FI_RECV, FI_RECV, "FI_RECV returned"); + verify_ep_msg_cap(FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, msg_rx_only_ops) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to FI_MSG | FI_RECV is RX message only EP */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_RECV; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + verify_ep_msg_ops(FI_RECV); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +/* Verify FI_RMA API ops set */ +extern struct fi_ops_rma cxip_ep_rma_ops; +extern struct fi_ops_rma cxip_ep_rma_no_ops; + +static void verify_ep_rma_ops(uint64_t caps) +{ + int ret; + + cxit_setup_ep(); + + cxit_fi->caps = caps; + cxit_fi->tx_attr->caps = caps; + + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + /* Enable EP */ + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "EP enable"); + + /* Verify correct function table is set */ + if (caps & FI_RMA && ofi_rma_initiate_allowed(caps)) + cr_assert_eq(cxit_ep->rma, &cxip_ep_rma_ops, + "FI_RMA ops not set"); + else + cr_assert_eq(cxit_ep->rma, &cxip_ep_rma_no_ops, + "FI_RMA ops set"); + + cxit_teardown_rma(); +} + +/* Verify FI_ATOMIC API ops enable/disable */ +extern struct fi_ops_atomic cxip_ep_atomic_ops; +extern struct fi_ops_atomic cxip_ep_atomic_no_ops; + +static void verify_ep_amo_ops(uint64_t caps) +{ + int ret; + + cxit_setup_ep(); + + cxit_fi->caps = caps; + cxit_fi->tx_attr->caps = caps; + + cxit_create_ep(); + cxit_create_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + cxit_create_cntrs(); + cxit_bind_cntrs(); + cxit_create_av(); + cxit_bind_av(); + + /* Enable EP */ + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "EP enable"); + + /* Verify correct function table is set */ + if (caps & FI_ATOMIC && ofi_rma_initiate_allowed(caps)) + cr_assert_eq(cxit_ep->atomic, &cxip_ep_atomic_ops, + "FI_ATOMIC ops not set"); + else + cr_assert_eq(cxit_ep->atomic, &cxip_ep_atomic_no_ops, + "FI_ATOMIC ops set"); + + cxit_teardown_rma(); +} + +/* test_cap is the caps that should be set */ +static void verify_caps_only(struct fi_info *info, + uint64_t test_cap) +{ + if (!(test_cap & FI_TAGGED)) + cr_assert_eq(info->caps & FI_TAGGED, 0, "FI_TAGGED set"); + if (!(test_cap & FI_ATOMIC)) + cr_assert_eq(info->caps & FI_ATOMIC, 0, "FI_ATOMIC set"); + if (!(test_cap & FI_RMA)) + cr_assert_eq(info->caps & FI_RMA, 0, "FI_RMA set"); + if (!(test_cap & FI_COLLECTIVE)) + cr_assert_eq(info->caps & FI_COLLECTIVE, 0, + "FI_COLLECTIVE set"); +} + +Test(ep_caps, msg_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_MSG, don't enable other API */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG; + ret = 
fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_MSG); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, tagged_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_TAGGED, don't enable other API */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_TAGGED; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_TAGGED); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, rma_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_RMA, don't enable other API */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_RMA; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_RMA); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, atomic_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_ATOMIC, don't enable other API */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_ATOMIC; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_ATOMIC); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, coll_only) +{ + struct fi_info *info; + int ret; + + /* hints->caps set to for only FI_COLLECTIVE enables only FI_MSG */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_COLLECTIVE; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + verify_caps_only(info, FI_COLLECTIVE | FI_MSG); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(ep_caps, rma_initiator) +{ + verify_ep_rma_ops(FI_RMA | FI_READ | FI_WRITE); +} + +Test(ep_caps, rma_target_only) +{ + verify_ep_rma_ops(FI_RMA | FI_REMOTE_READ | FI_REMOTE_WRITE); +} + +Test(ep_caps, rma_amo_only) +{ + verify_ep_rma_ops(FI_ATOMIC | FI_READ | FI_WRITE); +} + +Test(ep_caps, rma_none) +{ + verify_ep_rma_ops(FI_MSG | FI_TAGGED); +} + +Test(ep_caps, amo_initiator) +{ + verify_ep_amo_ops(FI_ATOMIC | FI_READ | FI_WRITE); +} + +Test(ep_caps, amo_target_only) +{ + verify_ep_amo_ops(FI_ATOMIC | FI_REMOTE_READ | FI_REMOTE_WRITE); +} + +Test(ep_caps, amo_rma_only) +{ + verify_ep_amo_ops(FI_RMA | FI_READ | FI_WRITE); +} + +Test(ep_caps, amo_none) +{ + verify_ep_amo_ops(FI_MSG | FI_TAGGED); +} + +TestSuite(ep_locking, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(ep_locking, domain) +{ + struct cxip_domain *dom; + struct cxip_ep *ep; + struct cxip_cq *cq; + + cxit_setup_getinfo(); + + cxit_fi_hints->domain_attr->threading = FI_THREAD_DOMAIN; + cxit_setup_rma(); + + cr_assert_eq(cxit_fi->domain_attr->threading, FI_THREAD_DOMAIN, + "Threading"); + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(dom->trig_cmdq_lock.lock_type, OFI_LOCK_NONE, + "Domain trigger command lock"); + + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + cr_assert_eq(ep->ep_obj->lock.lock_type, OFI_LOCK_NONE, + "EP object lock"); + + cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, 
OFI_LOCK_NONE, + "TX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_NOOP, + "TX CQ entry lock"); + + cq = container_of(cxit_rx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_NONE, + "RX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_NOOP, + "RX CQ entry lock"); + + cxit_teardown_rma(); +} + +Test(ep_locking, completion) +{ + struct cxip_domain *dom; + struct cxip_ep *ep; + struct cxip_cq *cq; + + cxit_setup_getinfo(); + + cxit_fi_hints->domain_attr->threading = FI_THREAD_COMPLETION; + cxit_setup_rma(); + + cr_assert_eq(cxit_fi->domain_attr->threading, FI_THREAD_COMPLETION, + "Threading"); + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(dom->trig_cmdq_lock.lock_type, OFI_LOCK_SPINLOCK, + "Domain trigger command lock"); + + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + cr_assert_eq(ep->ep_obj->lock.lock_type, OFI_LOCK_NONE, + "EP object lock"); + + cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_NONE, + "TX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_NOOP, + "TX CQ entry lock"); + + cq = container_of(cxit_rx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_NONE, + "RX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_NOOP, + "RX CQ entry lock"); + + cxit_teardown_rma(); +} + +Test(ep_locking, safe) +{ + struct cxip_domain *dom; + struct cxip_ep *ep; + struct cxip_cq *cq; + + cxit_setup_getinfo(); + + cxit_fi_hints->domain_attr->threading = FI_THREAD_SAFE; + cxit_setup_rma(); + + cr_assert_eq(cxit_fi->domain_attr->threading, FI_THREAD_SAFE, + "Threading"); + + dom = container_of(cxit_domain, struct cxip_domain, + util_domain.domain_fid); + cr_assert_eq(dom->trig_cmdq_lock.lock_type, OFI_LOCK_SPINLOCK, + "Domain trigger command lock"); + + ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + cr_assert_eq(ep->ep_obj->lock.lock_type, OFI_LOCK_SPINLOCK, + "EP object lock"); + + cq = container_of(cxit_tx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_SPINLOCK, + "TX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_SPINLOCK, + "TX CQ entry lock"); + + cq = container_of(cxit_rx_cq, struct cxip_cq, util_cq.cq_fid); + cr_assert_eq(cq->ep_list_lock.lock_type, OFI_LOCK_SPINLOCK, + "RX CQ EP list lock"); + cr_assert_eq(cq->util_cq.cq_lock.lock_type, OFI_LOCK_SPINLOCK, + "RX CQ entry lock"); + + cxit_teardown_rma(); +} diff --git a/prov/cxi/test/eq.c b/prov/cxi/test/eq.c new file mode 100644 index 00000000000..00730982b22 --- /dev/null +++ b/prov/cxi/test/eq.c @@ -0,0 +1,37 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + */ + +/* Notes: + * + * This test is perfunctory at present. A fuller set of tests is available: + * + * virtualize.sh fabtests/unit/fi_eq_test + * + * TODO: current implementation does not support wait states. 
+ */ + +#include +#include + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(eq, .init = cxit_setup_eq, .fini = cxit_teardown_eq, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic CQ creation */ +Test(eq, simple) +{ + cxit_create_eq(); + cr_assert(cxit_eq != NULL); + cxit_destroy_eq(); +} + diff --git a/prov/cxi/test/fabric.c b/prov/cxi/test/fabric.c new file mode 100644 index 00000000000..4eb04e4cf0b --- /dev/null +++ b/prov/cxi/test/fabric.c @@ -0,0 +1,647 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2015-2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +static const char cxip_dom_fmt[] = "cxi%d"; + +static char *get_dom_name(int if_idx) +{ + char *dom; + int ret; + + ret = asprintf(&dom, cxip_dom_fmt, if_idx); + cr_assert(ret > 0); + + return dom; +} + +TestSuite(getinfo_env_vars, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(getinfo_env_vars, default_tx_size) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + struct fi_info *iter; + + ret = setenv("FI_CXI_DEFAULT_TX_SIZE", "17", 1); + cr_assert(ret == 0); + + hints = fi_allocinfo(); + cr_assert(hints); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert(hints->fabric_attr->prov_name); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + NULL, NULL, cxit_flags, hints, &info); + cr_assert(ret == FI_SUCCESS); + + iter = info; + while (iter) { + cr_assert(info->tx_attr->size == 17); + iter = iter->next; + } + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +Test(getinfo_env_vars, default_rx_size) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + struct fi_info *iter; + + ret = setenv("FI_CXI_DEFAULT_RX_SIZE", "17", 1); + cr_assert(ret == 0); + + hints = fi_allocinfo(); + cr_assert(hints); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert(hints->fabric_attr->prov_name); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + NULL, NULL, cxit_flags, hints, &info); + cr_assert(ret == FI_SUCCESS); + + iter = info; + while (iter) { + cr_assert(info->rx_attr->size == 17); + iter = iter->next; + } + + fi_freeinfo(info); + fi_freeinfo(hints); +} + +TestSuite(getinfo, .init = cxit_setup_getinfo, + .fini = cxit_teardown_getinfo, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test fabric selection with provider name */ +Test(getinfo, prov_name) +{ + int infos = 0; + + cxit_fi_hints->fabric_attr->prov_name = strdup(cxip_prov_name); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + + /* Make sure we have at least 1 FI for each IF */ + do { + cr_assert(!strcmp(cxit_fi->fabric_attr->prov_name, + cxip_prov_name)); + infos++; + } while ((cxit_fi = cxit_fi->next)); + cr_assert(infos >= cxit_n_ifs); +} + +/* Test fabric selection with domain name */ +Test(getinfo, dom_name) +{ + int infos = 0; + struct cxip_if *if_entry; + struct slist_entry *entry, *prev __attribute__ ((unused)); + + slist_foreach(&cxip_if_list, entry, prev) { + if_entry = container_of(entry, struct cxip_if, if_entry); + infos = 0; + + cxit_node = get_dom_name(if_entry->info->dev_id); + cxit_flags = FI_SOURCE; + printf("searching %s\n", cxit_node); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + + /* Make sure we have at least 1 FI for each IF */ + do { + cr_expect(!strcmp(cxit_fi->domain_attr->name, + cxit_node), 
+ "%s != %s\n", + cxit_fi->domain_attr->name, + cxit_fi_hints->domain_attr->name); + + cr_assert(!strcmp(cxit_fi->fabric_attr->prov_name, + cxip_prov_name)); + + cr_assert(!strcmp(cxit_fi->fabric_attr->name, + cxip_prov_name)); + + infos++; + } while ((cxit_fi = cxit_fi->next)); + cr_assert(infos >= 1); + + cxit_destroy_fabric_info(); + } + cr_assert(infos >= 1); +} + +/* Test fabric selection with fabric name */ +Test(getinfo, fab_name) +{ + int infos = 0; + struct slist_entry *entry, *prev __attribute__ ((unused)); + struct fi_info *fi; + + slist_foreach(&cxip_if_list, entry, prev) { + infos = 0; + + cxit_fi_hints->fabric_attr->name = strdup(cxip_prov_name); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + + fi = cxit_fi; + do { + /* Not all providers can be trusted to filter by fabric + * name */ + if (strcmp(fi->fabric_attr->prov_name, + cxip_prov_name)) + continue; + + cr_assert(!strcmp(fi->fabric_attr->name, + fi->fabric_attr->name)); + + infos++; + } while ((fi = fi->next)); + + cxit_destroy_fabric_info(); + } + cr_assert(infos); +} + +Test(getinfo, prov_version) +{ + cxit_fi_hints->fabric_attr->prov_name = strdup(cxip_prov_name); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + cr_assert(cxit_fi->fabric_attr != NULL); + + cr_assert(FI_MAJOR(cxit_fi->fabric_attr->prov_version) == + CXIP_MAJOR_VERSION, + "Major version wwrong, expected %d, version returned %d", + CXIP_MAJOR_VERSION, + FI_MAJOR(cxit_fi->fabric_attr->prov_version)); + cr_assert(FI_MINOR(cxit_fi->fabric_attr->prov_version) == + CXIP_MINOR_VERSION, + "Minor version wwrong, expected %d, version returned %d", + CXIP_MINOR_VERSION, + FI_MINOR(cxit_fi->fabric_attr->prov_version)); +} + +Test(getinfo, valid_av_auth_key) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps = FI_MSG | FI_TAGGED | FI_REMOTE_COMM; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_av_auth_key_not_null_domain_auth_key) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps = FI_MSG | FI_TAGGED | FI_REMOTE_COMM; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->domain_attr->auth_key = (void *)hints; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + hints->domain_attr->auth_key = NULL; + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_av_auth_key_not_null_ep_auth_key) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps = FI_MSG | FI_TAGGED | FI_REMOTE_COMM; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + 
hints->ep_attr->auth_key = (void *)hints; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + hints->ep_attr->auth_key = NULL; + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_av_auth_key_not_zero_ep_auth_key_size) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->caps = FI_MSG | FI_TAGGED | FI_REMOTE_COMM; + hints->domain_attr->auth_key_size = FI_AV_AUTH_KEY; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->ep_attr->auth_key_size = 1; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, valid_multiple_auth_keys_per_ep) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->domain_attr->max_ep_auth_key = 2; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->caps = FI_MSG; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + cr_assert_eq(info->domain_attr->max_ep_auth_key, + hints->domain_attr->max_ep_auth_key); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_multiple_auth_keys_per_ep) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->domain_attr->max_ep_auth_key = (1 << 16); + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->caps = FI_MSG; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +Test(getinfo, invalid_fi_directed_recv_with_multiple_auth_keys_per_ep) +{ + int ret; + struct fi_info *hints; + struct fi_info *info; + + hints = fi_allocinfo(); + cr_assert_not_null(hints, "fi_allocinfo failed"); + + hints->domain_attr->max_ep_auth_key = 2; + hints->domain_attr->mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + hints->caps = FI_MSG | FI_DIRECTED_RECV; + hints->fabric_attr->prov_name = strdup("cxi"); + cr_assert_not_null(hints, "strdup failed"); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, hints, &info); + cr_assert_eq(ret, -FI_ENODATA, "fi_getinfo failed: %d", ret); + + fi_freeinfo(hints); + fi_freeinfo(info); +} + +TestSuite(getinfo_infos, .timeout = CXIT_DEFAULT_TIMEOUT); + +#define MAX_INFOS 16 +#define FI_ADDR_CXI_COMPAT FI_ADDR_OPX + +struct info_check { + int mr_mode; + uint32_t format; + size_t max_ep_auth_key; +}; + +Test(getinfo_infos, nohints) +{ + int num_info; + int i; + int info_per_if = 0; + struct fi_info *fi_ptr; + char 
*dom_name; + char *odp; + char *compat; + struct info_check infos[MAX_INFOS]; + size_t max_ep_auth_key; + + cxit_init(); + cr_assert(!cxit_fi_hints, "hints not NULL"); + + cxit_create_fabric_info(); + cr_assert(cxit_fi != NULL); + + for (i = 0; i < MAX_INFOS; i++) { + infos[i].format = 0; + infos[i].mr_mode = -1; + } + + /* By default when no hints are specified, each interface + * should have 4 fi_info. + */ + for (i = 0; i < 2; i++) { + if (i < 1) + max_ep_auth_key = 1; + else + max_ep_auth_key = 4; + + infos[info_per_if].mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED | + FI_MR_PROV_KEY; + infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + info_per_if++; + + infos[info_per_if].mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + info_per_if++; + + /* Add ODP versions if enabled */ + odp = getenv("FI_CXI_ODP"); + if (odp && strtol(odp, NULL, 10)) { + infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].mr_mode = FI_MR_ENDPOINT | FI_MR_PROV_KEY; + infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + info_per_if++; + + infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].mr_mode = FI_MR_ENDPOINT; + infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + info_per_if++; + } + } + + /* If we are supporting compatibility with old constants, + * then fi_info are repeated with compatibility constants. + */ + compat = getenv("FI_CXI_COMPAT"); + if (!compat || strtol(compat, NULL, 10) == 1) { + for (i = 0; i < info_per_if; i++) { + infos[info_per_if + i].mr_mode = + infos[i].mr_mode; + infos[info_per_if + i].format = + FI_ADDR_CXI_COMPAT; + infos[info_per_if + i].max_ep_auth_key = + infos[i].max_ep_auth_key; + } + info_per_if += i; + } + cr_assert(info_per_if <= MAX_INFOS, "Too many infos"); + + fi_ptr = cxit_fi; + + while (fi_ptr) { + /* Only concerned with CXI */ + if (strcmp(fi_ptr->fabric_attr->prov_name, cxip_prov_name)) { + fi_ptr = fi_ptr->next; + continue; + } + + dom_name = fi_ptr->domain_attr->name; + num_info = 0; + + /* Each info for the same NIC as the same domain name */ + while (fi_ptr) { + /* Different interface detected */ + if (strcmp(dom_name, fi_ptr->domain_attr->name)) + break; + + num_info++; + cr_assert(num_info <= MAX_INFOS, + "too many fi_info %d", num_info); + + cr_assert(infos[num_info - 1].mr_mode == + fi_ptr->domain_attr->mr_mode, + "expected MR mode %x got %x", + infos[num_info - 1].mr_mode, + fi_ptr->domain_attr->mr_mode); + + cr_assert(infos[num_info - 1].format == + fi_ptr->addr_format, + "expected addr_fomrat %u got %u", + infos[num_info - 1].format, + fi_ptr->addr_format); + + fi_ptr = fi_ptr->next; + } + + cr_assert(num_info == info_per_if, + "Wrong number of fi_info %d got %d", + num_info, info_per_if); + } + cxit_destroy_fabric_info(); +} + +Test(getinfo_infos, hints) +{ + int num_info; + int i; + int info_per_if = 0; + struct fi_info *fi_ptr; + char *dom_name; + char *compat; + struct info_check infos[2]; + + cxit_setup_fabric(); + cr_assert(cxit_fi != NULL); + cr_assert(cxit_fi_hints != NULL); + + for (i = 0; i < 2; i++) { + infos[i].format = 0; + infos[i].mr_mode = -1; + } + + infos[0].format = FI_ADDR_CXI; + infos[0].mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + if (cxit_prov_key) + infos[0].mr_mode |= FI_MR_PROV_KEY; + info_per_if++; + + compat = getenv("FI_CXI_COMPAT"); + if (!compat || strtol(compat, NULL, 10) == 1) { + infos[1].format = FI_ADDR_CXI_COMPAT; + infos[1].mr_mode = infos[0].mr_mode; + 
info_per_if++; + } + + fi_ptr = cxit_fi; + + while (fi_ptr) { + /* Should only be CXI provider */ + cr_assert(!strcmp(fi_ptr->fabric_attr->prov_name, + cxip_prov_name), "non-cxi provider"); + + dom_name = fi_ptr->domain_attr->name; + num_info = 0; + + /* Each info for the same NIC as the same domain name */ + while (fi_ptr) { + /* Different interface detected */ + if (strcmp(dom_name, fi_ptr->domain_attr->name)) + break; + + num_info++; + cr_assert(num_info <= 2, "too many fi_info %d", + num_info); + + cr_assert(infos[num_info - 1].mr_mode == + fi_ptr->domain_attr->mr_mode, + "expected MR mode %x got %x", + infos[num_info - 1].mr_mode, + fi_ptr->domain_attr->mr_mode); + + cr_assert(infos[num_info - 1].format == + fi_ptr->addr_format, + "expected addr_fomrat %u got %u", + infos[num_info - 1].format, + fi_ptr->addr_format); + + fi_ptr = fi_ptr->next; + } + + cr_assert(num_info == info_per_if, + "Wrong number of fi_info %d got %d", + num_info, info_per_if); + } + cxit_teardown_fabric(); +} + +Test(getinfo_infos, hints_no_rma) +{ + int ret; + + cxit_setup_getinfo(); + cr_assert(cxit_fi == NULL); + cr_assert(cxit_fi_hints != NULL); + + /* Request info with hints capabilities that do not + * include RMA and make sure fi_info is returned + * even if FI_MR_ENDPOINT is not specified. + */ + cxit_fi_hints->domain_attr->mr_mode = 0; + cxit_fi_hints->caps = FI_MSG | FI_TAGGED | FI_SEND | FI_RECV; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &cxit_fi); + cr_assert(ret == FI_SUCCESS, "fi_getinfo()"); + cr_assert(cxit_fi != NULL, "no fi_info"); + + cr_assert(cxit_fi->domain_attr->mr_mode == 0, "MR mode not 0"); + cr_assert(cxit_fi->caps & (FI_MSG | FI_TAGGED | FI_SEND | FI_RECV), + "caps cleared"); + + fi_freeinfo(cxit_fi); + cxit_fi = NULL; + + /* Request info with hints capabilities that do not + * include RMA and but do include mr_mode bits. Make + * sure the mr_mode bits are cleared. + * TODO: When common code is patched to remove FI_MR_ENDPOINT, + * when RMA/ATOMIC is not required, add that mode to the hints. 
+ */ + cxit_fi_hints->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY; + cxit_fi_hints->caps = FI_MSG | FI_TAGGED | FI_SEND | FI_RECV; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &cxit_fi); + cr_assert(ret == FI_SUCCESS, "fi_getinfo()"); + cr_assert(cxit_fi != NULL, "no fi_info"); + + cr_assert(cxit_fi->domain_attr->mr_mode == 0, "MR mode not cleared"); + cr_assert(cxit_fi->caps & (FI_MSG | FI_TAGGED | FI_SEND | FI_RECV), + "caps cleared"); + + fi_freeinfo(cxit_fi); + cxit_fi = NULL; + + cxit_teardown_getinfo(); +} + +TestSuite(fabric, .init = cxit_setup_fabric, .fini = cxit_teardown_fabric, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic fabric creation */ +Test(fabric, simple) +{ + cxit_create_fabric(); + cr_assert(cxit_fabric != NULL); + + cxit_destroy_fabric(); +} diff --git a/prov/cxi/test/fi_info_test.sh b/prov/cxi/test/fi_info_test.sh new file mode 100644 index 00000000000..b0fba97d698 --- /dev/null +++ b/prov/cxi/test/fi_info_test.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# set -x + +SCRIPT=$(basename ${BASH_SOURCE[0]:-$0}) +FI_INFO="../../../util/fi_info" +ENODATA=61 +DEVICE="cxi1" + +usage() { +cat < $tapfile +TAP version 13 +1..1 +$ok - fi_info::test for interface not found +EOF2 +} + +# ################################################################ + +tapfile="" + +while getopts t:-: OPT; do # allow -t and -- with arg + # support long options: https://stackoverflow.com/a/28466267/519360 + if [ "$OPT" = "-" ]; then # long option: reformulate OPT and OPTARG + OPT="${OPTARG%%=*}" # extract long option name + OPTARG="${OPTARG#$OPT}" # extract long option argument (may be empty) + OPTARG="${OPTARG#=}" # if long option argument, remove assigning `=` + fi + case "$OPT" in + t | tap) + tapfile="$OPTARG" + ;; + h) + usage + ;; + \?) + exit 1 # bad short option (error reported by getopts) + ;; + *) + echo "Illegal option --$OPT" # bad long option + exit 1 + ;; + esac +done + +if [ -z "$tapfile" ]; then + usage +fi + +test="FI_CXI_DEVICE_NAME=\"${DEVICE}\" ${FI_INFO} -p cxi" + +echo "Running test: $test" +eval "$test" +ret=$? + +status=1 # bashism: 0 means it passed +if [ $ret -eq $ENODATA ] || [ $ret -eq -$ENODATA ]; then + status=0 +fi + +report $status $tapfile + +exit $status diff --git a/prov/cxi/test/flask_testsrv.py b/prov/cxi/test/flask_testsrv.py new file mode 100644 index 00000000000..f2071efa453 --- /dev/null +++ b/prov/cxi/test/flask_testsrv.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +# Copyright (c) 2021 Hewlett Packard Enterprise Development LP +help = f''' +Standalone REST server for local testing + +TARGET /test + Provides basic targets for GET, PUT, POST, PATCH, and DELETE. + "Content-Type: application/json" header should be specified. + Result is JSON data identifying the operation, and the supplied data. + If the supplied data contains a JSON tag named 'return_code', + the corresponding value will be used as the return code of the + response. 
+ Exercise using ./curltest --auto + +If --host is omitted, host is http://127.0.0.1 local address (if available) +If --host is 0.0.0.0, host is the current IP address of the node +''' + +import argparse +import textwrap +import sys +import json + +from argparse import ArgumentParser, HelpFormatter +from flask import Flask, request +from flask_restful import Api, Resource + +class RawFormatter(HelpFormatter): + def _fill_text(self, text, width, indent): + return "\n".join([textwrap.fill(line, width) for line in textwrap.indent(textwrap.dedent(text), indent).splitlines()]) + +# Test code for CURL regression test +class selftestResource(Resource): + def return_code(self, json): + if json is not None and "return_code" in json: + return json["return_code"] + return 200 + + def get(self): + info = { + 'operation': 'GET', + 'data': '' + } + return info, self.return_code(None) + + def put(self): + info = { + 'operation': 'PUT', + 'data': request.json + } + return info, self.return_code(request.json) + + def post(self): + info = { + 'operation': 'POST', + 'data': request.json + } + return info, self.return_code(request.json) + + def patch(self): + info = { + 'operation': 'PATCH', + 'data': request.json + } + return info, self.return_code(request.json) + + def delete(self): + info = { + 'operation': 'DELETE', + 'data': request.json + } + return info, self.return_code(request.json) + +def main(argv): + parser = argparse.ArgumentParser( + description=help, formatter_class=RawFormatter) + parser.add_argument('--host', default=None) + parser.add_argument('--port', default=None) + args = parser.parse_args() + + app = Flask(__name__) + api = Api(app); + api.add_resource(selftestResource, '/test') + app.run(debug=True, host=args.host, port=args.port) + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/prov/cxi/test/fork.c b/prov/cxi/test/fork.c new file mode 100644 index 00000000000..dc106889b15 --- /dev/null +++ b/prov/cxi/test/fork.c @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +#define SECRET 0xFFU +#define XFER_SIZE 257U +#define INIT_BUF_VALUE 0xAAU +#define INIT_BUF_OFFSET 127U +#define TGT_BUF_VALUE 0xFFU +#define TGT_BUF_OFFSET 3215U +#define RKEY 0x1U +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) + +/* Needs to be marked volatile to prevent hangs due to compiler optimization. 
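+ * The child spins on this flag in a sched_yield() loop until the parent's
+ * SIGUSR1 is delivered and the handler clears it; without volatile the
+ * compiler could cache the load and the child would never see the update.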
*/ +static volatile bool child_process_block = true; + +static void signal_handler(int sig) +{ + child_process_block = false; +} + +static void fork_test_runner(bool odp, bool huge_page, bool fork_safe) +{ + long page_size; + uint8_t *buf; + uint8_t *init_buf; + uint8_t *tgt_buf; + int ret; + struct fid_mr *mr; + int status; + struct fi_cq_tagged_entry cqe; + pid_t pid; + int i = 0; + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + uint64_t rkey; + bool again; + + if (odp) { + ret = setenv("FI_CXI_FORCE_ODP", "1", 1); + cr_assert_eq(ret, 0, "Failed to set FI_CXI_FORCE_ODP %d", + -errno); + } + + if (fork_safe) { + ret = setenv("CXI_FORK_SAFE", "1", 1); + cr_assert_eq(ret, 0, "Failed to set CXI_FORK_SAFE %d", -errno); + + if (huge_page) { + ret = setenv("CXI_FORK_SAFE_HP", "1", 1); + cr_assert_eq(ret, 0, "Failed to set CXI_FORK_SAFE %d", + -errno); + } + } + + cxit_setup_msg(); + + signal(SIGUSR1, signal_handler); + + /* Single map is used for page aliasing with child process and RDMA. */ + if (huge_page) { + page_size = 2 * 1024 * 1024; + flags |= MAP_HUGETLB | MAP_HUGE_2MB; + } else { + page_size = sysconf(_SC_PAGESIZE); + } + + buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE, flags, -1, 0); + cr_assert(buf != MAP_FAILED, "mmap failed"); + + memset(buf, 0, page_size); + + /* This secret is passed to the child process. Child process will verify + * it receives this secret. + */ + buf[0] = SECRET; + init_buf = buf + INIT_BUF_OFFSET; + tgt_buf = buf + TGT_BUF_OFFSET; + + /* Register the buffer. The behavior of the child buffer depends upon + * the following + * - If CXI_FORK_SAFE is set and copy-on-fork kernel support does not + * exist, madvise(MADV_DONTFORK) will be issued against the page. + * This will cause the child to segfault. + * - If CXI_FORK_SAFE is set and copy-on-fork kernel support exists, + * madvise(MADV_DONTFORK) will NOT be issued against the page. The + * child process will get its data and the parent process will + * not have data corruption. + * - If ODP is not used and kernel copy-on-fork is not supported, the + * child process will get its data, and the parent process will have + * data corruption. + * - If ODP is not used and the kernel supports copy-on-fork, the child + * process will get its data, and the parent process will not have + * data corruption. + * - If ODP is used, the child process will get its data, and the parent + * process will not have data corruption. + */ + ret = fi_mr_reg(cxit_domain, tgt_buf, XFER_SIZE, FI_REMOTE_WRITE, 0, + RKEY, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed %d", ret); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed %d", ret); + + ret = fi_mr_enable(mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed %d", ret); + + rkey = fi_mr_key(mr); + + again = true; + do { + pid = fork(); + if (pid >= 0) { + again = false; + break; + } + + cr_assert_eq(errno, EAGAIN, "fork() failed: %d", errno); + } while (again); + + if (pid == 0) { + while (child_process_block) + sched_yield(); + + /* If CXI_FORK_SAFE is set (i.e. fork_safe is true) and + * kernel copy-on-fork does not exist, this will segfault. + */ + if (buf[0] == SECRET) + _exit(EXIT_SUCCESS); + + /* This should never happen. */ + _exit(EXIT_FAILURE); + } + + /* Writing these buffers will trigger COW if copy-on-fork + * kernel support does not exist. If that is the case then unless + * madvise(MADV_DONTFORK) was called, parent process will get a new + * page. 
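+ * The NIC translation set up at registration time still references the
+ * original physical page in that case, so the RDMA below no longer targets
+ * the page the parent sees; that is the corruption the checks look for.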
+ */ + memset(init_buf, INIT_BUF_VALUE, XFER_SIZE); + memset(tgt_buf, TGT_BUF_VALUE, XFER_SIZE); + + ofi_sfence(); + + /* Unblock the child process. */ + kill(pid, SIGUSR1); + + ret = fi_write(cxit_ep, init_buf, XFER_SIZE, NULL, cxit_ep_fi_addr, 0, + rkey, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_write failed %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + if (cxil_is_copy_on_fork() || odp || fork_safe) { + for (i = 0; i < XFER_SIZE; i++) + cr_assert_eq(init_buf[i], tgt_buf[i], "data corruption with fork"); + } else { + for (i = 0; i < XFER_SIZE; i++) + cr_assert_neq(init_buf[i], tgt_buf[i], "Missing data corruption with fork"); + } + + waitpid(pid, &status, 0); + + if (!cxil_is_copy_on_fork() && fork_safe) { + cr_assert_eq(WIFSIGNALED(status), true, "Child was not terminated by signal: is_exit=%d exit=%d is_sig=%d sig=%d", + WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + cr_assert_eq(WTERMSIG(status), SIGSEGV, "Child signal was not SIGSEGV"); + } else { + cr_assert_eq(WIFEXITED(status), true, "Child was not terminated by exit: is_exit=%d exit=%d is_sig=%d sig=%d", + WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + cr_assert_eq(WEXITSTATUS(status), EXIT_SUCCESS, "Child process had data corruption"); + } + + fi_close(&mr->fid); + munmap(buf, page_size); + + cxit_teardown_msg(); +} + +TestSuite(fork, .timeout = CXIT_DEFAULT_TIMEOUT); + +/* No ODP, no fork safe variables, and system page size. On kernels before 5.12, + * parent process should have data corruption. Child process should not have + * data corruption and should not segfault. + */ +Test(fork, page_aliasing_no_odp_no_fork_safe_system_page_size) +{ + fork_test_runner(false, false, false); +} + +/* ODP, no fork safe variables, and system page size. Parent process should not + * have data corruption regardless of kernel version. Child process should not + * have data corruption and should not segfault. + */ +Test(fork, page_aliasing_odp_no_fork_safe_system_page_size) +{ + fork_test_runner(true, false, false); +} + +/* No ODP, no fork safe variables, and system page size. Parent process should + * not have data corruption regardless of kernel version. Child process should + * segfault if copy-on-fork kernel support does not exist (The parent would + * have called madvise MADV_DONTFORK if that is the case). + */ +Test(fork, page_aliasing_no_odp_fork_safe_system_page_size) +{ + fork_test_runner(false, false, true); +} + +/* No ODP, no fork safe variables, and 2MiB page size. On kernels before 5.12, + * parent process should have data corruption. Child process should not have + * data corruption and should not segfault. + */ +Test(fork, page_aliasing_no_odp_no_fork_safe_huge_page) +{ + fork_test_runner(false, true, false); +} + +/* ODP, no fork safe variables, and 2MiB page size. Parent process should not + * have data corruption regardless of kernel version. Child process should not + * have data corruption and should not segfault. + */ +Test(fork, page_aliasing_odp_no_fork_safe_huge_page) +{ + fork_test_runner(true, true, false); +} + +/* No ODP, with fork safe variables, and 2MiB page size. Parent process should + * not have data corruption regardless of kernel version. Child process should + * segfault if the kernel does not support copy-on-fork (since the parent + * would have called MADV_DONTFORK on virtual address range). 
+ */ +Test(fork, page_aliasing_no_odp_fork_safe_huge_page) +{ + fork_test_runner(false, true, true); +} + +static volatile bool block_threads = true; + +static void *child_memory_free_thread_runner(void *context) +{ + bool huge_page = (bool)context; + long page_size; + uint8_t *buf; + int ret; + struct fid_mr *mr; + int status; + pid_t pid; + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + bool again; + + while (block_threads) + sched_yield(); + + /* Single map is used for page aliasing with child process and RDMA. */ + if (huge_page) { + page_size = 2 * 1024 * 1024; + flags |= MAP_HUGETLB | MAP_HUGE_2MB; + } else { + page_size = sysconf(_SC_PAGESIZE); + } + + buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE, flags, -1, 0); + cr_assert(buf != MAP_FAILED, "mmap failed"); + + memset(buf, 0, page_size); + + ret = fi_mr_reg(cxit_domain, buf, XFER_SIZE, FI_REMOTE_WRITE, 0, + gettid(), 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed %d", ret); + + /* MR reg will result in cxil_map() being called. On kernels < 5.12, + * libcxi will call MADV_DONTFORK on the range. For the purposes of this + * test, we want the child to munmap this buffer to see if it deadlocks + * in the MR cache. Thus, we need to undo the MADV_DONTFORK. + */ + if (!cxil_is_copy_on_fork()) { + ret = madvise(buf, page_size, MADV_DOFORK); + cr_assert_eq(ret, 0, "madvise failed %d", ret); + } + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed %d", ret); + + ret = fi_mr_enable(mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed %d", ret); + + again = true; + do { + pid = fork(); + if (pid >= 0) { + again = false; + break; + } + + cr_assert_eq(errno, EAGAIN, "fork() failed: %d", errno); + } while (again); + + if (pid == 0) { + munmap(buf, page_size); + _exit(EXIT_SUCCESS); + } + + waitpid(pid, &status, 0); + + cr_assert_eq(WIFEXITED(status), true, "Child was not terminated by exit: is_exit=%d exit=%d is_sig=%d sig=%d", + WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + cr_assert_eq(WEXITSTATUS(status), EXIT_SUCCESS, "Child process had data corruption"); + + fi_close(&mr->fid); + munmap(buf, page_size); + + return NULL; +} + +#define THREAD_MAX 256U + +static void child_memory_free_runner(bool huge_page, int thread_count) +{ + pthread_t threads[THREAD_MAX]; + int i; + int ret; + + cr_assert(thread_count <= THREAD_MAX); + + /* For kernels < 5.12, CXI_FORK_SAFE needs to be set. If not set, the + * control event queue buffers would be subjected to copy-on-write. This + * may result in the parent threads deadlocking. + */ + ret = setenv("CXI_FORK_SAFE", "1", 1); + cr_assert_eq(ret, 0, "Failed to set CXI_FORK_SAFE %d", -errno); + + if (huge_page) { + ret = setenv("CXI_FORK_SAFE_HP", "1", 1); + cr_assert_eq(ret, 0, "Failed to set CXI_FORK_SAFE %d", -errno); + } + + cxit_setup_msg(); + + for (i = 0; i < thread_count; i++) { + ret = pthread_create(&threads[i], NULL, + child_memory_free_thread_runner, + (void *)huge_page); + cr_assert(ret == 0); + } + + block_threads = false; + + for (i = 0; i < thread_count; i++) + pthread_join(threads[i], NULL); + + cxit_teardown_msg(); +} + +/* The objective of this test is to see if child processes can deadlock on the + * MR cache lock if threads are forking while other threads are doing memory + * registration. 
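+ * If a fork lands while another thread holds the MR cache lock, the child
+ * inherits a locked mutex and its munmap(), intercepted by the memory
+ * monitor, could block forever; a clean child exit shows this is handled.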
+ */ +Test(fork, child_memory_free_system_page_size) +{ + child_memory_free_runner(false, 16); +} + +Test(fork, child_memory_free_huge_page_size) +{ + child_memory_free_runner(true, 16); +} diff --git a/prov/cxi/test/hip/hip_cntr_test.cpp b/prov/cxi/test/hip/hip_cntr_test.cpp new file mode 100644 index 00000000000..486af232e77 --- /dev/null +++ b/prov/cxi/test/hip/hip_cntr_test.cpp @@ -0,0 +1,95 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + */ + +#include "hip/hip_runtime.h" +#include +#include +#include +#include +#include +#include + +/* Example compile instructions. */ +// hipcc --amdgpu-target=gfx908 -I/libfabric/install/include -L/opt/rocm/lib64/ -L/opt/rocm/lib -L/libfabric/install/lib -lfabric -g -c hip_cntr_test.cpp +// hipcc --amdgpu-target=gfx908 -I/libfabric/install/include -L/opt/rocm/lib64/ -L/opt/rocm/lib -L/libfabric/install/lib -lfabric -g hip_cntr_test.o -o hip_cntr_test + +#define GPU_WB_SIZE 8U + +static struct fi_info *hints; +static struct fi_info *info; +static struct fid_fabric *fabric; +static struct fid_domain *domain; +static struct fid_cntr *cntr; +static struct fi_cxi_cntr_ops *cntr_ops; +static void *gpu_wb; + +void resource_init(void) +{ + int ret; + + ret = hipMalloc(&gpu_wb, GPU_WB_SIZE); + assert(ret == hipSuccess); + + hints = fi_allocinfo(); + assert(hints != NULL); + + /* Always select CXI provider */ + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + hints->fabric_attr->prov_name = strdup("cxi"); + assert(hints->fabric_attr->prov_name != NULL); + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), NULL, + NULL, 0, hints, &info); + assert(ret == FI_SUCCESS); + + ret = fi_fabric(info->fabric_attr, &fabric, NULL); + assert(ret == FI_SUCCESS); + + ret = fi_domain(fabric, info, &domain, NULL); + assert(ret == FI_SUCCESS); + + ret = fi_cntr_open(domain, NULL, &cntr, NULL); + assert(ret == FI_SUCCESS); + + ret = fi_open_ops(&cntr->fid, FI_CXI_COUNTER_OPS, 0, (void **)&cntr_ops, + NULL); + assert(ret == FI_SUCCESS); + + ret = cntr_ops->set_wb_buffer(&cntr->fid, gpu_wb, GPU_WB_SIZE); + assert(ret == FI_SUCCESS); +} + +void resource_free(void) +{ + fi_close(&cntr->fid); + fi_close(&domain->fid); + fi_close(&fabric->fid); + fi_freeinfo(info); + fi_freeinfo(hints); + hipFree(gpu_wb); +} + +int main(int argc, char *argv[]) +{ + int ret; + + resource_init(); + + ret = fi_cntr_adderr(cntr, 5); + assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cntr) != 5); + + ret = fi_cntr_add(cntr, 123); + assert(ret == FI_SUCCESS); + + while (fi_cntr_read(cntr) != 123); + while (fi_cntr_readerr(cntr) != 5); + + resource_free(); + + return 0; +} diff --git a/prov/cxi/test/lat.c b/prov/cxi/test/lat.c new file mode 100644 index 00000000000..23d324f72d1 --- /dev/null +++ b/prov/cxi/test/lat.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2018-2021 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +void *buf; + +void do_tsend(size_t len) +{ + int ret; + + ret = fi_tsend(cxit_ep, buf, len, NULL, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "ret is %d\n", ret); +} +void do_tsend_0() { do_tsend(0); } +void do_tsend_8() { do_tsend(8); } +void do_tsend_256() { do_tsend(256); } + +void do_trecv(size_t len) +{ + int ret; + + ret = fi_trecv(cxit_ep, buf, len, NULL, FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, 
FI_SUCCESS, "ret is %d\n", ret); +} +void do_trecv_0() { do_trecv(0); } +void do_trecv_8() { do_trecv(8); } +void do_trecv_256() { do_trecv(256); } + +void do_tsend_more(size_t len) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = len, + }; + struct fi_msg_tagged msg = { + .msg_iov = &iov, + .iov_count = 1, + .addr = cxit_ep_fi_addr, + }; + int ret; + + ret = fi_tsendmsg(cxit_ep, &msg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "ret is %d\n", ret); +} +void do_tsend_more_8() { do_tsend_more(8); } +void do_tsend_more_256() { do_tsend_more(256); } + +void do_trecv_more(size_t len) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = len, + }; + struct fi_msg_tagged msg = { + .msg_iov = &iov, + .iov_count = 1, + .addr = FI_ADDR_UNSPEC, + }; + int ret; + + ret = fi_trecvmsg(cxit_ep, &msg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "ret is %d\n", ret); +} +void do_trecv_more_8() { do_trecv_more(8); } +void do_trecv_more_256() { do_trecv_more(256); } + +TestSuite(latency, .init = cxit_setup_tagged, .fini = cxit_teardown_tagged, + .timeout = CXIT_DEFAULT_TIMEOUT); + +struct latency_params { + char *api; + void (*func)(); + bool flush_send; +}; + +ParameterizedTestParameters(latency, basic) +{ + size_t param_sz; + + static struct latency_params params[] = { + { + .api = "tsend (0-byte)", + .func = do_tsend_0, + .flush_send = false, + }, + { + .api = "trecv (0-byte)", + .func = do_trecv_0, + .flush_send = false, + }, + { + .api = "tsend (8-byte)", + .func = do_tsend_8, + .flush_send = false, + }, + { + .api = "trecv (8-byte)", + .func = do_trecv_8, + .flush_send = false, + }, + { + .api = "tsend (256-byte)", + .func = do_tsend_256, + .flush_send = false, + }, + { + .api = "trecv (256-byte)", + .func = do_trecv_256, + .flush_send = false, + }, + { + .api = "tsend_more (8b, no doorbell)", + .func = do_tsend_more_8, + .flush_send = true, + }, + { + .api = "trecv_more (8b, no doorbell)", + .func = do_trecv_more_8, + .flush_send = false, + }, + { + .api = "tsend_more (256b, no doorbell)", + .func = do_tsend_more_256, + .flush_send = true, + }, + { + .api = "trecv_more (256b, no doorbell)", + .func = do_trecv_more_256, + .flush_send = false, + }, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct latency_params, params, param_sz); +} + +/* Test API latency */ +ParameterizedTest(struct latency_params *params, latency, basic) +{ + int warmup = 10; + uint64_t loops = 200; + int i; + uint64_t start; + uint64_t end; + + buf = malloc(0x1000); + cr_assert(buf); + + for (i = 0; i < warmup; i++) + params->func(); + + start = ofi_gettime_ns(); + + for (i = 0; i < loops; i++) + params->func(); + + end = ofi_gettime_ns(); + + printf("%s latency: %lu ns\n", params->api, (end - start) / loops); + + /* Cleanup all outstanding more sends. 
*/ + if (params->flush_send) { + do_tsend_0(); + sleep(1); + fi_cq_read(cxit_tx_cq, NULL, 0); + } + + free(buf); +} diff --git a/prov/cxi/test/mem_reg.c b/prov/cxi/test/mem_reg.c new file mode 100644 index 00000000000..8f85d3651d4 --- /dev/null +++ b/prov/cxi/test/mem_reg.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(memReg, .timeout = CXIT_DEFAULT_TIMEOUT); + +static void hmem_dev_reg_test_runner(bool dev_reg, bool cache_enable) +{ + int ret; + void *buf; + size_t buf_size = 1234; + struct fid_mr *mr; + struct cxip_mr *cxi_mr; + + if (dev_reg) + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + else + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, + "Failed to set FI_CXI_DISABLE_HMEM_DEV_REGISTER %d", + -errno); + + if (cache_enable) + ret = setenv("FI_MR_CACHE_MONITOR", "memhooks", 1); + else + ret = setenv("FI_MR_CACHE_MONITOR", "disabled", 1); + cr_assert_eq(ret, 0, + "Failed to set FI_MR_CACHE_MONITOR %d", + -errno); + + buf = malloc(buf_size); + cr_assert_neq(buf, NULL, "Failed to alloc mem"); + + cxit_setup_msg(); + + ret = fi_mr_reg(cxit_domain, buf, buf_size, FI_READ | FI_WRITE, 0, 0, 0, + &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind failed: %d", ret); + + ret = fi_mr_enable(mr); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable failed: %d", ret); + + /* Have to examine the struct to determine if correct behavior is + * happening. 
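+ * handle_valid tracks whether the buffer was registered with the HMEM
+ * device interface and cached tracks whether the MD came from the MR cache,
+ * so both should follow the environment variables set above.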
+ */ + cxi_mr = container_of(mr, struct cxip_mr, mr_fid); + if (dev_reg) + cr_assert_eq(cxi_mr->md->handle_valid, true, + "Bad cxip_md handle_valid"); + else + cr_assert_eq(cxi_mr->md->handle_valid, false, + "Bad cxip_md host_addr"); + cr_assert_eq(cxi_mr->md->cached, cache_enable, "Bad cxip_md cached"); + + ret = fi_close(&mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed: %d", ret); + + cxit_teardown_msg(); + free(buf); +} + +Test(memReg, disableHmemDevRegisterEnabled_mrCacheEnabled) +{ + hmem_dev_reg_test_runner(true, true); +} + +Test(memReg, disableHmemDevRegisterEnabled_mrCacheDisabled) +{ + hmem_dev_reg_test_runner(true, false); +} + +Test(memReg, disableHmemDevRegisterDisabled_mrCacheEnabled) +{ + hmem_dev_reg_test_runner(false, true); +} + +Test(memReg, disableHmemDevRegisterDisabled_mrCacheDisabled) +{ + hmem_dev_reg_test_runner(false, false); +} + +static void system_mem_dev_reg_test_runner(bool system_mem_cache_enabled, + bool hmem_dev_reg_enabled) +{ + char *send_buf; + char *recv_buf; + size_t buf_size = 1234; + int ret; + struct fi_cq_tagged_entry cqe; + int i; + + if (system_mem_cache_enabled) + ret = setenv("FI_MR_CACHE_MONITOR", "memhooks", 1); + else + ret = setenv("FI_MR_CACHE_MONITOR", "disabled", 1); + cr_assert_eq(ret, 0, + "Failed to set FI_MR_CACHE_MONITOR %d", + -errno); + + if (hmem_dev_reg_enabled) + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + else + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, + "Failed to set FI_CXI_DISABLE_HMEM_DEV_REGISTER %d", + -errno); + + send_buf = calloc(1, buf_size); + cr_assert_neq(send_buf, NULL, "Failed to alloc mem"); + + recv_buf = calloc(1, buf_size); + cr_assert_neq(recv_buf, NULL, "Failed to alloc mem"); + + ret = open("/dev/urandom", O_RDONLY); + cr_assert_neq(ret, -1, "open failed: %d", -errno); + read(ret, send_buf + 1, buf_size - 1); + close(ret); + + cxit_setup_msg(); + + ret = fi_recv(cxit_ep, recv_buf + 1, buf_size - 1, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret); + + ret = fi_send(cxit_ep, send_buf + 1, buf_size - 1, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + for (i = 0; i < buf_size; i++) + cr_assert_eq(send_buf[i], recv_buf[i], + "Data corruption at byte %d", i); + + cxit_teardown_msg(); + + free(send_buf); + free(recv_buf); +} + +Test(memReg, systemMemNoCache_enableHmemDevRegister) +{ + system_mem_dev_reg_test_runner(false, true); +} + +Test(memReg, systemMemCache_enableHmemDevRegister) +{ + system_mem_dev_reg_test_runner(true, true); +} + +Test(memReg, systemMemNoCache_disableHmemDevRegister) +{ + system_mem_dev_reg_test_runner(false, false); +} + +Test(memReg, systemMemCache_disableHmemDevRegister) +{ + system_mem_dev_reg_test_runner(true, false); +} diff --git a/prov/cxi/test/mr.c b/prov/cxi/test/mr.c new file mode 100644 index 00000000000..fab3cbab7d7 --- /dev/null +++ b/prov/cxi/test/mr.c @@ -0,0 +1,974 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(mr, .init = cxit_setup_rma, .fini 
= cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(mr, opt_mrs, .timeout = 60) +{ + int opt_mr_cnt = 200; + struct mem_region opt_mrs[opt_mr_cnt]; + int i; + uint64_t key; + + for (i = 0; i < opt_mr_cnt; i++) { + key = i; + mr_create(0x1000, FI_REMOTE_WRITE, 0, &key, &opt_mrs[i]); + } + + + for (i = 0; i < opt_mr_cnt; i++) + mr_destroy(&opt_mrs[i]); +} + +Test(mr, invalid_fi_directed_recv_flag) +{ + int ret; + struct fi_mr_attr attr = {}; + struct iovec iov = {}; + struct fid_mr *mr; + + iov.iov_len = sizeof(ret); + iov.iov_base = (void *)&ret; + + attr.mr_iov = &iov; + attr.iov_count = 1; + attr.access = FI_REMOTE_READ | FI_REMOTE_WRITE; + attr.requested_key = 0x123; + + ret = fi_mr_regattr(cxit_domain, &attr, FI_DIRECTED_RECV, &mr); + cr_assert_eq(ret, -FI_EINVAL, "fi_mr_regattr failed: %d", ret); +} + +Test(mr, std_mrs, .timeout = 600, .disabled = true) +{ + int std_mr_cnt = 16*1024; + int mrs = 0; + struct mem_region std_mrs[std_mr_cnt]; + int i; + int ret; + uint64_t key; + + for (i = 0; i < std_mr_cnt; i++) { + mrs++; + key = i + 200; + ret = mr_create(8, FI_REMOTE_WRITE, 0, &key, &std_mrs[i]); + if (ret) { + printf("Standard MR limit: %d\n", mrs); + break; + } + } + + /* It's difficult to predict available resources. An idle system + * currently supports at least 13955 total standard MRs. This is + * roughly: + * 16k total LEs - + * 1000 (reserved for services) - + * 1400 (reserved for other pools) = + * 13984 + * + * An EP requires a few other LEs to implement messaging and other + * APIs. + */ + cr_assert(mrs >= 13955); + + /* Note: MR close is very slow in emulation due to + * cxil_invalidate_pte_le(). + */ + for (i = 0; i < mrs; i++) + mr_destroy(&std_mrs[i]); +} + +Test(mr, opt_mr_recycle, .timeout = 600, .disabled = false) +{ + int mr_cnt = 2*1024+1; // more than the total number of PTEs + struct mem_region mr; + int i; + int ret; + uint64_t key; + + for (i = 0; i < mr_cnt; i++) { + key = 0; + ret = mr_create(8, FI_REMOTE_WRITE, 0, &key, &mr); + cr_assert_eq(ret, FI_SUCCESS, "Failed to allocate MR %d\n", i); + + mr_destroy(&mr); + } +} + +/* Perform zero-byte Puts to zero-byte standard and optimized MRs. Validate + * remote counting events. + */ +Test(mr, mr_zero_len) +{ + struct mem_region mr; + struct fi_cq_tagged_entry cqe; + int ret; + uint64_t key; + + /* Optimized MR */ + key = 0; + + ret = mr_create(0, FI_REMOTE_WRITE, 0, &key, &mr); + cr_assert(ret == FI_SUCCESS); + + ret = fi_write(cxit_ep, NULL, 0, NULL, + cxit_ep_fi_addr, 0, key, NULL); + cr_assert(ret == FI_SUCCESS, "write failure %d", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + while (fi_cntr_read(cxit_rem_cntr) != 1) + ; + + mr_destroy(&mr); + + /* Standard MR */ + /* TODO: For FI_MR_PROV_KEY we will need to fully + * allocate optimized + */ + key = 200; + ret = mr_create(0, FI_REMOTE_WRITE, 0, &key, &mr); + cr_assert(ret == FI_SUCCESS); + + ret = fi_write(cxit_ep, NULL, 0, NULL, + cxit_ep_fi_addr, 0, key, NULL); + cr_assert(ret == FI_SUCCESS, "ret: %d\n", ret); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + while (fi_cntr_read(cxit_rem_cntr) != 2) + ; + + mr_destroy(&mr); +} + +/* Validate that unique keys are enforced. */ +Test(mr, mr_unique_key) +{ + char buf[256]; + struct fid_mr *mr1; + struct fid_mr *mr2; + int ret; + + /* MR keys are enforced by the domain. 
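+ * With client-specified keys, registering a second MR with the same key
+ * must fail with -FI_ENOKEY. With FI_MR_PROV_KEY the provider generates the
+ * keys itself, so the uniqueness check is skipped.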
*/ + if (cxit_prov_key) { + assert(1); + return; + } + + ret = fi_mr_reg(cxit_domain, buf, 256, FI_REMOTE_WRITE, 0, 0, 0, &mr1, + NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_reg(cxit_domain, buf, 256, FI_REMOTE_WRITE, 0, 0, 0, &mr2, + NULL); + cr_assert(ret == -FI_ENOKEY); + + ret = fi_close(&mr1->fid); + cr_assert(ret == FI_SUCCESS); +} + +/* Validate not recycling non-cached FI_MR_PROV_KEY */ +Test(mr, mr_recycle) +{ + char buf[256]; + struct fid_mr *mr1; + struct fid_mr *mr2; + struct fid_mr *mr3; + uint64_t rkey1 = 0; + uint64_t rkey2 = 0; + uint64_t rkey3 = 0; + int ret; + + /* Must be non-cached FI_MR_PROV_KEY; we rely on the fact + * rma EP are setup with a remote counter and bind it + * to the EP which forces non-cached for the MR. + */ + if (!cxit_prov_key) { + assert(1); + return; + } + + ret = fi_mr_reg(cxit_domain, buf, 256, + FI_REMOTE_READ | FI_REMOTE_WRITE, 0, rkey1, 0, + &mr1, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr1, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind MR1 failed %d", ret); + + ret = fi_mr_bind(mr1, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_bind MR1 counter failed %d", ret); + + ret = fi_mr_enable(mr1); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable MR1 failed %d", ret); + + rkey1 = fi_mr_key(mr1); + cr_assert_neq(rkey1, FI_KEY_NOTAVAIL, "MR1 KEY invalid %lx", rkey1); + + ret = fi_mr_reg(cxit_domain, buf, 256, + FI_REMOTE_READ | FI_REMOTE_WRITE, 0, rkey2, 0, + &mr2, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr2, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind MR2 failed %d", ret); + + ret = fi_mr_bind(mr2, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_bind MR2 counter failed %d", ret); + + ret = fi_mr_enable(mr2); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable MR2 failed %d", ret); + + rkey2 = fi_mr_key(mr2); + cr_assert_neq(rkey2, FI_KEY_NOTAVAIL, "MR2 KEY invalid %lx", rkey2); + cr_assert_neq(rkey2, rkey1, "MR Keys not unique"); + + ret = fi_close(&mr2->fid); + cr_assert_eq(ret, FI_SUCCESS, "close of MR2 %d", ret); + + ret = fi_mr_reg(cxit_domain, buf, 256, + FI_REMOTE_READ | FI_REMOTE_WRITE, 0, rkey3, 0, + &mr3, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr3, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind MR3 failed %d", ret); + + ret = fi_mr_bind(mr3, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert_eq(ret, FI_SUCCESS, + "fi_mr_bind MR3 counter failed %d", ret); + + ret = fi_mr_enable(mr3); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable MR3 failed %d", ret); + + rkey3 = fi_mr_key(mr3); + cr_assert_neq(rkey3, FI_KEY_NOTAVAIL, "MR3 KEY invalid %lx", rkey3); + + cr_assert_neq(rkey3, rkey1, "MR3 Key not unique"); + cr_assert_neq(rkey3, rkey2, "MR2 Key recycled"); + + ret = fi_close(&mr1->fid); + cr_assert_eq(ret, FI_SUCCESS, "close of MR1 %d", ret); + ret = fi_close(&mr3->fid); + cr_assert_eq(ret, FI_SUCCESS, "close of MR3 %d", ret); +} + +/* Validate that RKEY are not required for local MR */ +Test(mr, mr_no_local_rkey) +{ + char buf[256]; + struct fid_mr *mr1; + struct fid_mr *mr2; + uint64_t rkey = 0; + uint64_t no_rkey; + int ret; + + ret = fi_mr_reg(cxit_domain, buf, 256, FI_READ | FI_WRITE, 0, rkey, 0, + &mr1, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr1, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind mr1 failed %d", ret); + + ret = fi_mr_enable(mr1); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable mr1 failed %d", ret); + + no_rkey = 
fi_mr_key(mr1); + cr_assert_eq(no_rkey, FI_KEY_NOTAVAIL, "No RKEY check %ld", no_rkey); + + /* Verify second local MR with same client key value passed works */ + ret = fi_mr_reg(cxit_domain, buf, 256, FI_READ | FI_WRITE, 0, rkey, 0, + &mr2, NULL); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_bind(mr2, &cxit_ep->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_bind mr2 failed %d", ret); + + ret = fi_mr_enable(mr2); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_enable mr2 failed %d", ret); + + no_rkey = fi_mr_key(mr2); + cr_assert_eq(no_rkey, FI_KEY_NOTAVAIL, "No RKEY check %ld", no_rkey); + + ret = fi_close(&mr2->fid); + cr_assert(ret == FI_SUCCESS); + + ret = fi_close(&mr1->fid); + cr_assert(ret == FI_SUCCESS); +} + + +/* Test creating and destroying an MR that is never bound to an EP. */ +Test(mr, no_bind) +{ + int ret; + size_t buf_len = 0x1000; + void *buf; + struct fid_mr *mr; + + buf = malloc(buf_len); + cr_assert(buf); + + /* Optimized MR */ + + ret = fi_mr_reg(cxit_domain, buf, buf_len, FI_REMOTE_WRITE, + 0, 0, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS); + + fi_close(&mr->fid); + + /* Standard MR */ + + ret = fi_mr_reg(cxit_domain, buf, buf_len, FI_REMOTE_WRITE, + 0, 200, 0, &mr, NULL); + cr_assert_eq(ret, FI_SUCCESS); + + fi_close(&mr->fid); + + free(buf); +} + +TestSuite(mr_event, .init = cxit_setup_rma_mr_events, + .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(mr_event, counts) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fid_mr *mr; + struct cxip_mr *cxip_mr; + uint8_t *src_buf; + uint8_t *tgt_buf; + int src_len = 8; + int tgt_len = 4096; + uint64_t key_val = 200; + uint64_t orig_cnt; + int matches; + int accesses; + uint64_t operand1; + uint64_t result1; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + + /* Need remote counters */ + cxit_create_rem_cntrs(); + + src_buf = malloc(src_len); + cr_assert_not_null(src_buf, "src_buf alloc failed"); + + tgt_buf = calloc(1, tgt_len); + cr_assert_not_null(tgt_buf, "tgt_buf alloc failed"); + + /* Create MR */ + ret = fi_mr_reg(cxit_domain, tgt_buf, tgt_len, + FI_REMOTE_WRITE | FI_REMOTE_READ, 0, + key_val, 0, &mr, NULL); + cr_assert(ret == FI_SUCCESS); + + cxip_mr = container_of(mr, struct cxip_mr, mr_fid); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert(ret == FI_SUCCESS); + + cr_assert(cxit_rem_cntr != NULL); + ret = fi_mr_bind(mr, &cxit_rem_cntr->fid, FI_REMOTE_WRITE); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_enable(mr); + cr_assert(ret == FI_SUCCESS); + + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) + key_val = fi_mr_key(mr); + + /* Match counts do not apply to optimized MR */ + if (cxip_generic_is_mr_key_opt(key_val)) + goto done; + + orig_cnt = fi_cntr_read(cxit_rem_cntr); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ret = fi_write(cxit_ep, src_buf, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate remote counter was incremented correctly */ + while (orig_cnt + 1 != fi_cntr_read(cxit_rem_cntr)) + ; + + /* Validate match and access counts incremented */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) >= matches + 1, + "Match count not updated for RMA\n"); + 
cr_assert(ofi_atomic_get32(&cxip_mr->access_events) >= accesses + 1, + "RMA access count not updated\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == + ofi_atomic_get32(&cxip_mr->access_events), + "RMA matches do not equal accesses"); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, + key_val, FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_WRITE, NULL); + + /* Validate remote counter was incremented correctly */ + while (orig_cnt + 2 != fi_cntr_read(cxit_rem_cntr)) + ; + + /* Validate match and access counts incremented */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) >= matches + 1, + "Match count not updated for atomic"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) >= accesses + 1, + "Atomic access count not updated"); + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == + ofi_atomic_get32(&cxip_mr->access_events), + "Atomic matches do not equal accesses"); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, NULL, &result1, NULL, + cxit_ep_fi_addr, 0, key_val, FI_UINT64, + FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Validate remote counter was incremented correctly */ + while (orig_cnt + 3 != fi_cntr_read(cxit_rem_cntr)) + ; + + /* Validate match and access counts incremented */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) >= matches + 1, + "Fetch atomic match count not updated for atomic"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) >= accesses + 1, + "Fetch atomic access count not updated"); + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == + ofi_atomic_get32(&cxip_mr->access_events), + "Fetch atomic matches do not equal accesses"); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ioc.addr = &operand1; + ioc.count = 1; + result_ioc.addr = &result1; + result_ioc.count = 1; + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key_val; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Do a fetch with a flush */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_ATOMIC | FI_READ, NULL); + + /* Validate remote counter was incremented correctly, + * once for atomic and once for flush. 
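+ * The FI_DELIVERY_COMPLETE flush shows up as an additional zero-byte
+ * operation on the same MR, so two counter updates are expected here
+ * (orig_cnt + 5 after the three earlier operations).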
+ */ + while (orig_cnt + 5 != fi_cntr_read(cxit_rem_cntr)) + ; + + /* Validate match and access counts incremented */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) >= matches + 1, + "Fetch atomic/flush match count not updated for atomic"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) >= accesses + 1, + "Fetch atomic/flush access count not updated"); + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == + ofi_atomic_get32(&cxip_mr->access_events), + "Fetch atomic flush matches do not equal accesses"); + +done: + fi_close(&mr->fid); + + free(tgt_buf); + free(src_buf); +} + +Test(mr_event, not_found_counts) +{ + int ret; + struct fi_cq_err_entry err; + struct fi_cq_tagged_entry cqe; + struct fid_mr *mr; + struct cxip_mr *cxip_mr; + uint8_t *src_buf; + uint8_t *tgt_buf; + int src_len = 8; + int tgt_len = 4096; + uint64_t key_val = 200; + int matches; + int accesses; + uint64_t operand1; + uint64_t result1; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + + src_buf = malloc(src_len); + cr_assert_not_null(src_buf, "src_buf alloc failed"); + + tgt_buf = calloc(1, tgt_len); + cr_assert_not_null(tgt_buf, "tgt_buf alloc failed"); + + /* Create MR */ + ret = fi_mr_reg(cxit_domain, tgt_buf, tgt_len, + FI_REMOTE_WRITE | FI_REMOTE_READ, 0, + key_val, 0, &mr, NULL); + cr_assert(ret == FI_SUCCESS); + + cxip_mr = container_of(mr, struct cxip_mr, mr_fid); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_enable(mr); + cr_assert(ret == FI_SUCCESS); + + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) + key_val = fi_mr_key(mr); + + /* Match counts do not apply to optimized MR */ + if (cxip_generic_is_mr_key_opt(key_val)) + goto done; + + /* Use invalid key so that remote MR is not found */ + key_val++; + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + ret = fi_write(cxit_ep, src_buf, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts did not increment */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count updated for RMA\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count updated for RMA\n"); + + ret = fi_atomic(cxit_ep, &operand1, 1, 0, cxit_ep_fi_addr, 0, + key_val, FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts did not increment */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count updated for atomic\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count updated for atomic\n"); + + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, NULL, &result1, NULL, + 
cxit_ep_fi_addr, 0, key_val, FI_UINT64, + FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic fetch success %d", + ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts did not increment */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count updated for atomic fetch\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count updated for atomic fetch\n"); + + ioc.addr = &operand1; + ioc.count = 1; + result_ioc.addr = &result1; + result_ioc.count = 1; + rma_ioc.addr = 0; + rma_ioc.count = 1; + rma_ioc.key = key_val; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Do a fetch with a flush */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic flush success %d", + ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts did not increment */ + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count updated for atomic flush\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count updated for atomic flush\n"); + +done: + fi_close(&mr->fid); + + free(tgt_buf); + free(src_buf); +} + +Test(mr_event, bounds_err_counts) +{ + int ret; + struct fi_cq_err_entry err; + struct fi_cq_tagged_entry cqe; + struct fid_mr *mr; + struct cxip_mr *cxip_mr; + uint8_t *src_buf; + uint8_t *tgt_buf; + int src_len = 16; + int tgt_len = 8; + uint64_t key_val = 200; /* Force client key to be standard MR */ + int matches; + int accesses; + uint64_t operand1; + uint64_t result1; + struct fi_msg_atomic msg = {}; + struct fi_ioc ioc; + struct fi_ioc result_ioc; + struct fi_rma_ioc rma_ioc; + struct cxip_ep *cxi_ep; + + src_buf = malloc(src_len); + cr_assert_not_null(src_buf, "src_buf alloc failed"); + + tgt_buf = calloc(1, tgt_len); + cr_assert_not_null(tgt_buf, "tgt_buf alloc failed"); + + /* Create MR */ + ret = fi_mr_reg(cxit_domain, tgt_buf, tgt_len, + FI_REMOTE_WRITE | FI_REMOTE_READ, 0, + key_val, 0, &mr, NULL); + cr_assert(ret == FI_SUCCESS); + + cxip_mr = container_of(mr, struct cxip_mr, mr_fid); + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_enable(mr); + cr_assert(ret == FI_SUCCESS); + + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) + key_val = fi_mr_key(mr); + + /* Match counts do not apply to optimized MR */ + if (cxip_generic_is_mr_key_opt(key_val)) + goto done; + + /* Netsim does not generate EVENT_MATCH for bounds, + * while hardware does. TODO: Fix this in netsim. 
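+ * The is_netsim() checks below only bump the expected match/access counts
+ * on real hardware, so this test passes in both environments.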
+ */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + + matches = ofi_atomic_get32(&cxip_mr->match_events); + accesses = ofi_atomic_get32(&cxip_mr->access_events); + + /* src len is greater than remote MR len */ + ret = fi_write(cxit_ep, src_buf, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts increment */ + if (!is_netsim(cxi_ep->ep_obj)) { + matches++; + accesses++; + } + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count mismatch for RMA\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count mismatch for RMA\n"); + + /* Remote offset of 8 is greater than remote MR bounds */ + ret = fi_atomic(cxit_ep, &operand1, 1, NULL, cxit_ep_fi_addr, 8, + key_val, FI_UINT64, FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts increment */ + if (!is_netsim(cxi_ep->ep_obj)) { + matches++; + accesses++; + } + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count mismatch for atomic\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count mismatch for atomic\n"); + + /* Remote offset of 8 is greater than remote MR bounds */ + ret = fi_fetch_atomic(cxit_ep, &operand1, 1, NULL, &result1, NULL, + cxit_ep_fi_addr, 8, key_val, FI_UINT64, + FI_SUM, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected atomic fetch success %d", + ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* Validate match and access counts increment */ + if (!is_netsim(cxi_ep->ep_obj)) { + matches++; + accesses++; + } + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches, + "Match count mismatch atomic fetch\n"); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses, + "Access count mismatch for atomic fetch\n"); + + ioc.addr = &operand1; + ioc.count = 1; + result_ioc.addr = &result1; + result_ioc.count = 1; + + /* Remote offset of 8 is greater than remote MR bounds */ + rma_ioc.addr = 8; + rma_ioc.count = 1; + rma_ioc.key = key_val; + + msg.msg_iov = &ioc; + msg.iov_count = 1; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.datatype = FI_UINT64; + msg.op = FI_SUM; + + /* Do a fetch with a flush */ + ret = fi_fetch_atomicmsg(cxit_ep, &msg, &result_ioc, NULL, 1, + FI_DELIVERY_COMPLETE); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, 
"Unexpected atomic flush success %d", + ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1, "Unexpected fi_cq_readerr return %d", ret); + cr_assert_eq(err.err, FI_EIO, "Unexpected error value %d", err.err); + + /* For an atomic flush with FI_DELIVERY_COMPLETE using an + * out-of-bounds offset we expect both the atomic and zero + * by flush to generate events. + */ + if (!is_netsim(cxi_ep->ep_obj)) { + matches++; + accesses++; + } + cr_assert(ofi_atomic_get32(&cxip_mr->match_events) == matches + 1, + "Match count != %d for flush with atomic error", + matches + 1); + cr_assert(ofi_atomic_get32(&cxip_mr->access_events) == accesses + 1, + "Access count != %d for flush with atomic error", + accesses + 1); + +done: + fi_close(&mr->fid); + + free(tgt_buf); + free(src_buf); +} + +/* + * With FI_MR_PROV_KEY, test if all PID IDX mapping resources required by + * optimized MR are consumed, that falling back to standard MR is done. + * This test should run with and without MR cache disabled. + */ +TestSuite(mr_resources, .init = cxit_setup_domain, .fini = cxit_teardown_domain, + .timeout = 120); + +#define NUM_MR_TEST_EP 15 +#define NUM_MR_PER_EP 86 + +Test(mr_resources, opt_fallback) +{ + struct fid_domain *dom[NUM_MR_TEST_EP]; + struct fid_ep *ep[NUM_MR_TEST_EP]; + struct fid_av *av[NUM_MR_TEST_EP]; + struct fid_cq *cq[NUM_MR_TEST_EP]; + struct fid_mr **mr; + char buf[256]; + int ret; + int num_dom; + int num_mr; + int tot_mr; + + if (!cxit_prov_key) + return; + + mr = calloc(NUM_MR_TEST_EP * NUM_MR_PER_EP, + sizeof(struct fid_mr *)); + cr_assert(mr != NULL, "calloc"); + + for (num_dom = 0, tot_mr = 0; num_dom < NUM_MR_TEST_EP; num_dom++) { + + ret = fi_domain(cxit_fabric, cxit_fi, &dom[num_dom], NULL); + cr_assert(ret == FI_SUCCESS, "fi_domain"); + + ret = fi_endpoint(dom[num_dom], cxit_fi, &ep[num_dom], NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + + ret = fi_av_open(dom[num_dom], &cxit_av_attr, + &av[num_dom], NULL); + cr_assert(ret == FI_SUCCESS, "fi_av_open"); + + ret = fi_ep_bind(ep[num_dom], &av[num_dom]->fid, 0); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind AV"); + + ret = fi_cq_open(dom[num_dom], &cxit_tx_cq_attr, + &cq[num_dom], NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open"); + + ret = fi_ep_bind(ep[num_dom], &cq[num_dom]->fid, + FI_TRANSMIT); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind TX CQ"); + + ret = fi_ep_bind(ep[num_dom], &cq[num_dom]->fid, + FI_RECV); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind RX CQ"); + + ret = fi_enable(ep[num_dom]); + cr_assert(ret == FI_SUCCESS, "fi_enable"); + + /* Create only optimized MR for this EP */ + for (num_mr = 0; num_mr < NUM_MR_PER_EP; num_mr++, tot_mr++) { + + ret = fi_mr_reg(dom[num_dom], buf, 256, + FI_REMOTE_WRITE | FI_REMOTE_READ, + 0, 0, 0, &mr[tot_mr], NULL); + cr_assert(ret == FI_SUCCESS, "fi_mr_reg"); + + ret = fi_mr_bind(mr[tot_mr], &ep[num_dom]->fid, 0); + cr_assert(ret == FI_SUCCESS, "fi_mr_bind"); + + ret = fi_mr_enable(mr[tot_mr]); + cr_assert(ret == FI_SUCCESS, "fi_mr_enable"); + } + } + + /* + * Validate that sufficient MR were created to exhaust the PID IDX + * mappings of 2560. There are two mappings required for each MR + * and 4 PID IDX mappings required by each endpoint created. 
+ */ + cr_assert(4 * num_dom + tot_mr * 2 >= 2560, "Number of MR created"); + + for (num_mr = 0; num_mr < tot_mr; num_mr++) { + ret = fi_close(&mr[num_mr]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close MR"); + } + + for (num_dom = 0; num_dom < NUM_MR_TEST_EP; num_dom++) { + ret = fi_close(&ep[num_dom]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close EP"); + + ret = fi_close(&cq[num_dom]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close CQ"); + + ret = fi_close(&av[num_dom]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close AV"); + + ret = fi_close(&dom[num_dom]->fid); + cr_assert(ret == FI_SUCCESS, "fi_close Domain"); + } + + free(mr); +} diff --git a/prov/cxi/test/msg.c b/prov/cxi/test/msg.c new file mode 100644 index 00000000000..058761b2745 --- /dev/null +++ b/prov/cxi/test/msg.c @@ -0,0 +1,2169 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(msg, .init = cxit_setup_msg, .fini = cxit_teardown_msg, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic send/recv */ +Test(msg, ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic send/recv with data */ +Test(msg, pingdata) 
+{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_senddata(cxit_ep, send_buf, send_len, NULL, data, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV | FI_REMOTE_CQ_DATA), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == data, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic inject send */ +Test(msg, inject_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_inject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_inject failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_MSG | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate sent data */ + for 
(i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +/* Test basic injectdata */ +Test(msg, injectdata_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_injectdata(cxit_ep, send_buf, send_len, data, + cxit_ep_fi_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_inject failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, + FI_MSG | FI_RECV | FI_REMOTE_CQ_DATA, NULL, data, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendv/recvv */ +Test(msg, vping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + ret = fi_recvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + ret = fi_sendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + 
cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendmsg/recvmsg */ +Test(msg, msgping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, 
send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendmsg/recvmsg with two EP bound to same CQ */ +Test(msg, msgping_cq_share) +{ + int i, ret; + uint8_t *recv_buf, + *recv_buf2, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec riovec2; + struct iovec siovec; + struct fid_ep *fid_ep2; + struct cxip_addr ep2_addr; + fi_addr_t ep2_fi_addr; + size_t addrlen = sizeof(cxit_ep_addr); + int num_recv_comps = 0; + + /* Create a second EP bound to the same CQs as original */ + ret = fi_endpoint(cxit_domain, cxit_fi, &fid_ep2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint"); + cr_assert_not_null(fid_ep2); + + ret = fi_ep_bind(fid_ep2, &cxit_tx_cq->fid, cxit_tx_cq_bind_flags); + cr_assert(!ret, "fe_ep_bind TX CQ to 2nd EP"); + ret = fi_ep_bind(fid_ep2, &cxit_rx_cq->fid, cxit_rx_cq_bind_flags); + cr_assert(!ret, "fe_ep_bind RX CQ to 2nd EP"); + + ret = fi_ep_bind(fid_ep2, &cxit_av->fid, 0); + cr_assert(!ret, "fi_ep_bind AV to 2nd EP"); + + ret = fi_enable(fid_ep2); + cr_assert(ret == FI_SUCCESS, "fi_enable of 2nd EP"); + + ret = fi_getname(&fid_ep2->fid, &ep2_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "fi_getname for 2nd EP"); + cr_assert(addrlen == sizeof(ep2_addr), "addr length"); + + ret = fi_av_insert(cxit_av, (void *)&ep2_addr, 1, + &ep2_fi_addr, 0, NULL); + cr_assert(ret == 1); + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + recv_buf2 = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf2); + memset(recv_buf2, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer for first EP */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Post RX buffer for second EP */ + riovec2.iov_base = recv_buf2; + riovec2.iov_len = recv_len; + rmsg.msg_iov = &riovec2; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(fid_ep2, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Send 64 byte message to 2nd EP */ + smsg.addr = ep2_fi_addr; + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send to EP2 failed %d", ret); + + /* Wait for async events from single CQ bound to multiple EP + * to verify receive notification for each EP occurs. 
+ */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, + "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, + "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, + "Invalid source address"); + num_recv_comps++; + } + } while (num_recv_comps < 2); + cr_assert_eq(num_recv_comps, 2, "Not all completions received"); + + /* Wait for async events indicating data has been sent */ + for (i = 0; i < 2; i++) { + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + } + + /* Validate sent data to each receive buffer */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(recv_buf2[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf2[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + ret = fi_close(&fid_ep2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint2"); + + free(send_buf); + free(recv_buf); + free(recv_buf2); +} + +/* Test basic sendmsg/recvmsg with data */ +Test(msg, msgping_wdata) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + smsg.data = data; + + ret = fi_sendmsg(cxit_ep, &smsg, FI_REMOTE_CQ_DATA); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV | FI_REMOTE_CQ_DATA), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX 
CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == data, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic injectmsg */ +Test(msg, inject_msgping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + ret = fi_sendmsg(cxit_ep, &smsg, FI_INJECT); + cr_assert_eq(ret, FI_SUCCESS, "fi_sendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_MSG | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_MSG | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test send/recv sizes small to large */ +Test(msg, sizes) +{ + int i, j, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64*1024; /* 128k fails */ + int send_len = 64*1024; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + bool sent; + bool recved; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, 
recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + for (i = 0; i <= recv_len; i = (i ? i << 1 : 1)) { + recved = sent = false; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, i ? recv_buf : NULL, i, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send to self */ + ret = fi_send(cxit_ep, i ? send_buf : NULL, i, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Gather both events, ensure progress on both sides. */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + cr_assert_eq(recved, false); + recved = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + cr_assert_eq(sent, false); + sent = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + } while (!(sent && recved)); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == i, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (j = 0; j < i; j++) { + cr_expect_eq(recv_buf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, size:%d err=%d\n", + j, send_buf[j], recv_buf[j], i, err++); + } + } + + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test send/recv sizes large to small (this exercises MR caching) */ +Test(msg, sizes_desc) +{ + int i, j, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64*1024; /* 128k fails */ + int send_len = 64*1024; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + bool sent; + bool recved; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + for (i = recv_len; i >= 1; i >>= 1) { + recved = sent = false; + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, i, NULL, FI_ADDR_UNSPEC, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_send(cxit_ep, send_buf, i, NULL, cxit_ep_fi_addr, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Gather both events, ensure progress on both sides. 
*/ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + cr_assert_eq(recved, false); + recved = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + cr_assert_eq(sent, false); + sent = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + } while (!(sent && recved)); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == i, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (j = 0; j < i; j++) { + cr_expect_eq(recv_buf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, size:%d err=%d\n", + j, send_buf[j], recv_buf[j], i, err++); + } + } + + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test software posted receives greater than hardware limits */ +Test(msg, sw_max_recv, .timeout = CXIT_DEFAULT_TIMEOUT) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + char *rx_mode; + + /* Test is only valid in software only matching */ + rx_mode = getenv("FI_CXI_RX_MATCH_MODE"); + if (!rx_mode || strcmp(rx_mode, "software")) { + cr_assert(1); + return; + } + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + /* Only 64K buffer IDs are available */ + for (i = 0; i < 68000; i++) { + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + } + + /* Send 64 bytes to self */ + for (i = 0; i < 68000; i++) { + ret = fi_send(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* 
Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + } +} + +/* Test send/recv interoperability with tagged messaging */ +Test(msg, tagged_interop) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + uint8_t *trecv_buf, + *tsend_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + trecv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(trecv_buf); + memset(trecv_buf, 0, recv_len); + + tsend_buf = aligned_alloc(s_page_size, send_len); + cr_assert(tsend_buf); + + for (i = 0; i < send_len; i++) + tsend_buf[i] = i + 0xc1; + + /* Post tagged RX buffer */ + ret = fi_trecv(cxit_ep, trecv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Send 64 byte tagged message to self */ + ret = fi_tsend(cxit_ep, tsend_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + 
cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_TAGGED | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_TAGGED | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(trecv_buf[i], tsend_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, tsend_buf[i], trecv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(tsend_buf); + free(trecv_buf); + + free(send_buf); + free(recv_buf); +} + +#define RECV_CTX ((void *)0xabc0000000000000) +#define SEND_CTX ((void *)0xdef0000000000000) + +void do_multi_recv(uint8_t *send_buf, size_t send_len, + uint8_t *recv_buf, size_t recv_len, + bool send_first, size_t sends, size_t olen) +{ + int i, j, ret; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + uint64_t rxe_flags; + uint64_t txe_flags; + size_t sent = 0; + size_t recved = 0; + size_t err_recved = 0; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_err_entry err_cqe = {}; + size_t recved_len = 0; + bool dequeued = false; + + if (!sends) + sends = recv_len / send_len; + + memset(recv_buf, 0, recv_len); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = RECV_CTX; + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = SEND_CTX; + + if (send_first) { + for (i = 0; i < sends; i++) { + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_sendmsg failed %d", ret); + } + + /* Progress send to ensure it arrives unexpected */ + i = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + sent = true; + break; + } + cr_assert_eq(ret, -FI_EAGAIN, + "send failed %d", ret); + } while (i++ < 100000); + } + + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_recvmsg failed %d", ret); + + if (!send_first) { + sleep(1); + for (i = 0; i < sends; i++) { + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_sendmsg failed %d", ret); + } + } + + /* Gather both events, ensure progress on both sides. 
*/ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + rxe_flags = FI_MSG | FI_RECV; + + validate_multi_recv_rx_event(&rx_cqe, RECV_CTX, + send_len, rxe_flags, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, + "Invalid source address"); + + if (rx_cqe.flags & FI_MULTI_RECV) { + cr_assert(!dequeued); + dequeued = true; + } + + recved_len = rx_cqe.len; + + /* Validate sent data */ + uint8_t *rbuf = rx_cqe.buf; + + for (j = 0; j < recved_len; j++) { + cr_expect_eq(rbuf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + j, send_buf[j], rbuf[j], + err++); + cr_assert(err < 10); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + recved++; + } else if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + recved_len = err_cqe.len; + uint8_t *rbuf = recv_buf + ((sends-1) * send_len); + + /* The truncated transfer is always the last, which + * dequeued the multi-recv buffer. + */ + rxe_flags = FI_MSG | FI_RECV; + + cr_assert(err_cqe.op_context == RECV_CTX, + "Error RX CQE Context mismatch"); + cr_assert((err_cqe.flags & ~FI_MULTI_RECV) == rxe_flags, + "Error RX CQE flags mismatch"); + cr_assert(err_cqe.len == send_len - olen, + "Invalid Error RX CQE length, got: %ld exp: %ld", + err_cqe.len, recv_len); + cr_assert(err_cqe.buf == rbuf, + "Invalid Error RX CQE address (%p %p)", + err_cqe.buf, rbuf); + cr_assert(err_cqe.data == 0, + "Invalid Error RX CQE data"); + cr_assert(err_cqe.tag == 0, + "Invalid Error RX CQE tag"); + cr_assert(err_cqe.olen == olen, + "Invalid Error RX CQE olen, got: %ld exp: %ld", + err_cqe.olen, olen); + cr_assert(err_cqe.err == FI_ETRUNC, + "Invalid Error RX CQE code\n"); + cr_assert(err_cqe.prov_errno == C_RC_OK, + "Invalid Error RX CQE errno"); + cr_assert(err_cqe.err_data == NULL); + cr_assert(err_cqe.err_data_size == 0); + + if (err_cqe.flags & FI_MULTI_RECV) { + cr_assert(!dequeued); + dequeued = true; + } + + /* Validate sent data */ + for (j = 0; j < recved_len; j++) { + cr_expect_eq(rbuf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + j, send_buf[j], rbuf[j], + err++); + cr_assert(err < 10); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + err_recved++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + txe_flags = FI_MSG | FI_SEND; + sent++; + validate_tx_event(&tx_cqe, txe_flags, SEND_CTX); + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + } while (sent < sends || (recved + err_recved) < sends); +} + +struct msg_multi_recv_params { + size_t send_len; + size_t recv_len; + bool ux; + size_t sends; + size_t olen; +}; + +#define SHORT_SEND_LEN 128 +#define SHORT_SENDS 200 +#define LONG_SEND_LEN 4096 +#define LONG_SENDS 20 +#define SHORT_OLEN (3*1024) +#define LONG_OLEN 1024 + +static struct msg_multi_recv_params params[] = { +#if 1 + /* expected/unexp eager */ + {.send_len = SHORT_SEND_LEN, + .recv_len = SHORT_SENDS * SHORT_SEND_LEN, + .ux = false}, + {.send_len = SHORT_SEND_LEN, + .recv_len = SHORT_SENDS * SHORT_SEND_LEN, + .ux = true}, + + /* exp/unexp long */ + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN, + .ux = false}, + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN, + .ux = true}, +#endif + +#if 1 + /* exp/unexp overflow */ + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - LONG_OLEN), + .ux = false, + .sends 
= LONG_SENDS+1, + .olen = LONG_OLEN}, + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - LONG_OLEN), + .ux = true, + .sends = LONG_SENDS+1, + .olen = LONG_OLEN}, +#endif + +#if 1 + /* exp/unexp overflow */ + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - SHORT_OLEN), + .ux = false, + .sends = LONG_SENDS+1, + .olen = SHORT_OLEN}, + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - SHORT_OLEN), + .ux = true, + .sends = LONG_SENDS+1, + .olen = SHORT_OLEN}, +#endif +}; + +ParameterizedTestParameters(msg, multi_recv) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct msg_multi_recv_params, params, + param_sz); +} + +/* Test multi-recv messaging */ +ParameterizedTest(struct msg_multi_recv_params *param, msg, multi_recv) +{ + void *recv_buf; + void *send_buf; + + recv_buf = aligned_alloc(s_page_size, param->recv_len); + cr_assert(recv_buf); + + send_buf = aligned_alloc(s_page_size, param->send_len); + cr_assert(send_buf); + + do_multi_recv(send_buf, param->send_len, recv_buf, + param->recv_len, param->ux, param->sends, + param->olen); + + free(send_buf); + free(recv_buf); +} + +/* Test multi-recv cancel */ +Test(msg, multi_recv_cancel) +{ + int i, ret; + uint8_t *recv_buf; + int recv_len = 0x1000; + int recvs = 5; + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_err_entry err_cqe; + struct fi_msg rmsg = {}; + struct iovec riovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + for (i = 0; i < recvs; i++) { + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + for (i = 0; i < recvs; i++) { + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cancel failed %d", ret); + } + + for (i = 0; i < recvs; i++) { + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + if (ret == -FI_EAVAIL) + break; + + cr_assert_eq(ret, -FI_EAGAIN, + "unexpected event %d", ret); + } while (1); + + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + cr_assert(err_cqe.op_context == NULL, + "Error RX CQE Context mismatch"); + cr_assert(err_cqe.flags == (FI_MSG | FI_RECV | FI_MULTI_RECV), + "Error RX CQE flags mismatch"); + cr_assert(err_cqe.err == FI_ECANCELED, + "Invalid Error RX CQE code\n"); + cr_assert(err_cqe.prov_errno == 0, + "Invalid Error RX CQE errno"); + } +} + +/* Test out-of-order multi-receive transaction completion */ +Test(msg, multi_recv_ooo) +{ + int i, j, ret; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + uint64_t rxe_flags; + int bytes_sent = 0; + uint8_t *recv_buf; + uint8_t *send_buf; + size_t send_len = 8*1024; + int sends = 10; + size_t recv_len = send_len * 5 + 64 * 5; + int sent = 0; + int recved = 0; + struct fi_cq_tagged_entry tx_cqe[sends]; + struct fi_cq_tagged_entry rx_cqe[sends]; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + 
rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + sleep(1); + for (i = 0; i < sends; i++) { + /* Interleave long and short sends. They will complete in a + * different order than they were sent or received. + */ + if (i % 2) + siovec.iov_len = 64; + else + siovec.iov_len = 8*1024; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", + ret); + } + + for (i = 0; i < sends; i++) { + /* Gather both events, ensure progress on both sides. */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe[recved], 1, + &from); + if (ret == 1) { + recved++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe[sent], 1); + if (ret == 1) { + sent++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + } while (!(sent == sends && recved == sends)); + } + + for (i = 0; i < sends; i++) { + bytes_sent += rx_cqe[i].len; + rxe_flags = FI_MSG | FI_RECV; + if (bytes_sent > (recv_len - CXIP_EP_MIN_MULTI_RECV)) + rxe_flags |= FI_MULTI_RECV; + + cr_assert(rx_cqe[i].flags == rxe_flags, "CQE flags mismatch"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + validate_tx_event(&tx_cqe[i], FI_MSG | FI_SEND, NULL); + + /* Validate sent data */ + uint8_t *rbuf = rx_cqe[i].buf; + + for (j = 0; j < rx_cqe[i].len; j++) { + cr_expect_eq(rbuf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + j, send_buf[j], recv_buf[j], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + } + + free(send_buf); + free(recv_buf); +} + +Test(msg, fc_multi_recv, .timeout = 30) +{ + int i, j, k, ret, tx_ret; + uint8_t *send_bufs; + uint8_t *send_buf; + int send_len = 64; + uint8_t *recv_buf; + int recv_len = 64; + int mrecv_msgs = 10; + struct fi_msg rmsg = {}; + struct iovec riovec; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + int nsends_concurrent = 3; /* must be less than the LE pool min. 
*/ + int nsends = 20; + int sends = 0; + fi_addr_t from; + + cr_assert(!(nsends % mrecv_msgs)); + + send_bufs = aligned_alloc(s_page_size, send_len * nsends_concurrent); + cr_assert(send_bufs); + + recv_buf = aligned_alloc(s_page_size, recv_len * mrecv_msgs); + cr_assert(recv_buf); + + for (i = 0; i < nsends_concurrent - 1; i++) { + send_buf = send_bufs + (i % nsends_concurrent) * send_len; + memset(send_buf, i, send_len); + + tx_ret = fi_send(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, NULL); + } + + for (i = nsends_concurrent - 1; i < nsends; i++) { + send_buf = send_bufs + (i % nsends_concurrent) * send_len; + memset(send_buf, i, send_len); + + do { + tx_ret = fi_send(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, NULL); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + + /* Just progress */ + fi_cq_read(cxit_tx_cq, NULL, 0); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, FI_SUCCESS, "fi_tsend failed %d", tx_ret); + + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_MSG | FI_SEND, NULL); + + if (!(++sends % 1000)) + printf("%u Sends complete.\n", sends); + } + + for (i = 0; i < nsends_concurrent - 1; i++) { + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_MSG | FI_SEND, NULL); + + if (!(++sends % 1000)) + printf("%u Sends complete.\n", sends); + } + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len * mrecv_msgs; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + for (i = 0; i < nsends / mrecv_msgs; i++) { + memset(recv_buf, 0, recv_len * mrecv_msgs); + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", + ret); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + for (k = 0; k < mrecv_msgs; k++) { + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, + &from); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", + ret); + + validate_multi_recv_rx_event(&rx_cqe, NULL, recv_len, + FI_MSG | FI_RECV, 0, 0); + cr_assert(from == cxit_ep_fi_addr, + "Invalid source address"); + bool last_msg = (k == (mrecv_msgs - 1)); + bool dequeued = rx_cqe.flags & FI_MULTI_RECV; + + cr_assert(!(last_msg ^ dequeued)); + + for (j = 0; j < recv_len; j++) { + cr_assert_eq(recv_buf[k * recv_len + j], + (uint8_t)i * mrecv_msgs + k, + "data mismatch, recv: %d,%d element[%d], exp=%d saw=%d\n", + i, k, j, + (uint8_t)i * mrecv_msgs + k, + recv_buf[k * recv_len + j]); + } + } + } + + free(send_bufs); + free(recv_buf); +} + +static void test_fc_multi_recv(size_t xfer_len, bool progress_before_post) +{ + int ret; + char *recv_buf; + char *send_buf; + int i; + struct fi_msg rmsg = {}; + struct 
iovec riovec; + unsigned int send_events = 0; + unsigned int recv_events = 0; + struct fi_cq_tagged_entry cqe; + size_t min_mrecv = 0; + size_t opt_len = sizeof(size_t); + bool unlinked = false; + + /* Needs to exceed available LEs. */ + unsigned int num_xfers = 100; + + ret = fi_setopt(&cxit_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, + &min_mrecv, opt_len); + cr_assert(ret == FI_SUCCESS); + + recv_buf = calloc(num_xfers, xfer_len); + cr_assert(recv_buf); + + send_buf = calloc(num_xfers, xfer_len); + cr_assert(send_buf); + + for (i = 0; i < (num_xfers * xfer_len); i++) + send_buf[i] = (char)(rand() % 256); + + /* Fire off all the unexpected sends expect 1. Last send will be sent + * expectedly to verify that hardware has updates the manage local LE + * start and length fields accordingly. + */ + for (i = 0; i < num_xfers - 1; i++) { + do { + ret = fi_send(cxit_ep, &send_buf[i * xfer_len], + xfer_len, NULL, cxit_ep_fi_addr, NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_rx_cq, &cqe, 0); + fi_cq_read(cxit_tx_cq, &cqe, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + } + + /* Progress before post will cause all ULEs to be onloaded before the + * append occurs. + */ + if (progress_before_post) + fi_cq_read(cxit_rx_cq, &cqe, 0); + + /* Append late multi-recv buffer. */ + riovec.iov_base = recv_buf; + riovec.iov_len = num_xfers * xfer_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = cxit_ep_fi_addr; + rmsg.context = NULL; + + do { + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + + /* Wait for all send events. Since this test can be run with or without + * flow control, progressing the RX CQ may be required. + */ + while (send_events != (num_xfers - 1)) { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN || ret == 1); + if (ret == 1) + send_events++; + + /* Progress RXC. */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + } + + /* Wait for all receive events. */ + while (recv_events != (num_xfers - 1)) { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN || ret == 1); + if (ret == 1 && cqe.flags & FI_RECV) + recv_events++; + } + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Make last send expected. This ensures that hardware and/or software + * has correctly updated the LE start and length fields correctly. + */ + do { + ret = fi_send(cxit_ep, &send_buf[(num_xfers - 1) * xfer_len], + xfer_len, NULL, cxit_ep_fi_addr, NULL); + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + + /* Wait for all send events. Since this test can be run with or without + * flow control, progressing the RX CQ may be required. + */ + while (send_events != num_xfers) { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN || ret == 1); + if (ret == 1) + send_events++; + + /* Progress RXC. */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + } + + /* Process the last receive event and the multi-receive event signaling + * the provider is no longer using the buffer. + */ + while (recv_events != num_xfers && !unlinked) { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN || ret == 1); + if (ret == 1) { + if (cqe.flags & FI_RECV) + recv_events++; + if (cqe.flags & FI_MULTI_RECV) + unlinked = true; + } + } + + /* Data integrity check. 
If hardware/software mismanaged the multi-recv + * start and/or length fields on the expected send, data will be + * corrupted. + */ + for (i = 0; i < (num_xfers * xfer_len); i++) + cr_assert_eq(send_buf[i], recv_buf[i], + "Data miscompare: byte=%u", i); + + free(send_buf); + free(recv_buf); +} + +Test(msg, fc_multi_recv_rdzv, .timeout = 10) +{ + /* Transfer size needs to be large enough to trigger rendezvous. */ + test_fc_multi_recv(16384, false); +} + +Test(msg, fc_multi_recv_rdzv_onload_ules, .timeout = 10) +{ + /* Transfer size needs to be large enough to trigger rendezvous. */ + test_fc_multi_recv(16384, true); +} + +Test(msg, fc_no_eq_space_expected_multi_recv, .timeout = 10) +{ + test_fc_multi_recv(1, false); +} + +Test(msg, fc_no_eq_space_expected_multi_recv_onload_ules, .timeout = 10) +{ + test_fc_multi_recv(1, false); +} + +Test(msg, zero_byte_send_recv_iov) +{ + int ret; + struct fi_cq_tagged_entry cqe; + + ret = fi_recvv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recvv failed: %d", ret); + + ret = fi_sendv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_sendv failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +Test(msg, zero_byte_send_recv_msg) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + + rmsg.addr = cxit_ep_fi_addr; + + ret = fi_recvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_recvmsg failed: %d", ret); + + smsg.addr = cxit_ep_fi_addr; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_sendmsg failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +/* Verify that FI_AV_USER_ID is returned from fi_cq_readfrom(). */ +Test(msg, av_user_id) +{ + int ret; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + fi_addr_t user_id = 0xdeadbeef; + + /* Need to remove loopback address from AV and reinsert with + * FI_AV_USER_ID. + */ + ret = fi_av_remove(cxit_av, &cxit_ep_fi_addr, 1, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_remove failed: %d", ret); + + cxit_ep_fi_addr = user_id; + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + FI_AV_USER_ID, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + + ret = fi_recv(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + ret = fi_send(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + cr_assert_eq(from, user_id, "Invalid user id: expected=%#lx got=%#lx", + user_id, from); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +/* Verify that FI_AV_USER_ID is returned from fi_cq_readfrom(). 
*/ +Test(msg, av_user_id_domain_cap) +{ + int ret; + struct fid_cq *cq; + struct fid_av *av; + struct fid_ep *ep; + struct fi_cq_attr cxit_tx_cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + }; + struct fi_cq_tagged_entry cqe; + fi_addr_t from; + fi_addr_t dest_ep; + fi_addr_t user_id = 0xdeadbeef; + char addr[256]; + size_t addr_size = sizeof(addr); + struct fi_av_attr av_attr = { + .flags = FI_AV_USER_ID, + }; + + ret = fi_cq_open(cxit_domain, &cxit_tx_cq_attr, &cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cq_open failed: %d", ret); + + ret = fi_av_open(cxit_domain, &av_attr, &av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_open failed: %d", ret); + + ret = fi_endpoint(cxit_domain, cxit_fi, &ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_endpoint failed: %d", ret); + + ret = fi_ep_bind(ep, &cq->fid, FI_TRANSMIT | FI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind failed: %d", ret); + + ret = fi_ep_bind(ep, &av->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_ep_bind failed: %d", ret); + + ret = fi_enable(ep); + cr_assert_eq(ret, FI_SUCCESS, "fi_enable failed: %d", ret); + + ret = fi_getname(&ep->fid, addr, &addr_size); + cr_assert_eq(ret, FI_SUCCESS, "fi_getname failed: %d", ret); + + ret = fi_av_insert(av, addr, 1, &dest_ep, 0, NULL); + cr_assert_eq(ret, 1, "fi_av_insert failed: %d", ret); + + ret = fi_av_set_user_id(av, dest_ep, user_id, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_av_set_user_id failed: %d", ret); + + ret = fi_recv(ep, NULL, 0, NULL, dest_ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + ret = fi_send(ep, NULL, 0, NULL, dest_ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + do { + ret = fi_cq_readfrom(cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + if (cqe.flags & FI_SEND) { + do { + ret = fi_cq_readfrom(cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + } + + cr_assert_eq(from, user_id, "Invalid user id: expected=%#lx got=%#lx", + user_id, from); + + ret = fi_close(&ep->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed %d", ret); + + ret = fi_close(&av->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed %d", ret); + + ret = fi_close(&cq->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close failed %d", ret); +} + +TestSuite(hybrid_preemptive, .timeout = CXIT_DEFAULT_TIMEOUT); + +#define RX_SIZE 2U + +Test(hybrid_preemptive, posted_recv_preemptive) +{ + int ret; + int i; + + ret = setenv("FI_CXI_HYBRID_POSTED_RECV_PREEMPTIVE", "1", 1); + cr_assert(ret == 0); + + ret = setenv("FI_CXI_RX_MATCH_MODE", "hybrid", 1); + cr_assert(ret == 0); + + cxit_fi_hints = cxit_allocinfo(); + cr_assert(cxit_fi_hints); + + cxit_fi_hints->rx_attr->size = RX_SIZE; + + cxit_setup_msg(); + + /* Posting more receives than RX_SIZE should cause transition to + * SW EP. 
+ */ + for (i = 0; i < RX_SIZE + 1; i++) { + ret = fi_recv(cxit_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + + if (i < RX_SIZE) + cr_assert(ret == FI_SUCCESS); + else + cr_assert(ret == -FI_EAGAIN); + } + + while (ret == -FI_EAGAIN) { + fi_cq_read(cxit_rx_cq, NULL, 0); + ret = fi_recv(cxit_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, NULL); + } + + cr_assert(ret == FI_SUCCESS); + + cxit_teardown_msg(); +} + +Test(hybrid_preemptive, unexpected_msg_preemptive) +{ + int ret; + int i; + struct cxip_ep *cxip_ep; + + ret = setenv("FI_CXI_HYBRID_UNEXPECTED_MSG_PREEMPTIVE", "1", 1); + cr_assert(ret == 0); + + ret = setenv("FI_CXI_RX_MATCH_MODE", "hybrid", 1); + cr_assert(ret == 0); + + cxit_fi_hints = cxit_allocinfo(); + cr_assert(cxit_fi_hints); + + cxit_fi_hints->rx_attr->size = RX_SIZE; + + cxit_setup_msg(); + + cxip_ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + + /* Posting more unexpected messages than RX_SIZE should cause + * transition to SW EP. + */ + for (i = 0; i < RX_SIZE + 1; i++) { + ret = fi_send(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, NULL); + cr_assert(ret == FI_SUCCESS); + } + + while (cxip_ep->ep_obj->rxc.state != RXC_ENABLED_SOFTWARE) + fi_cq_read(cxit_rx_cq, NULL, 0); + + cr_assert(ret == FI_SUCCESS); + + cxit_teardown_msg(); +} diff --git a/prov/cxi/test/multinode/README.md b/prov/cxi/test/multinode/README.md new file mode 100644 index 00000000000..c4b6cc7dae2 --- /dev/null +++ b/prov/cxi/test/multinode/README.md @@ -0,0 +1,126 @@ +*SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP* + +# Multinode Framework + +The multinode_frmwk provides a framework for writing multinode test applications +under a Workload Manager (WLM). + +The framework itself is controlled by a number of environment variables provided +by the WLM, or the user environment: + +- **PMI_SIZE** is supplied by the WLM, and indicates the total number of nodes in the +job. + +- **PMI_RANK** is supplied by the WLM, and indicates the rank of this instance of the +application. + +- **PMI_SHARED_SECRET** is supplied by the WLM, and is a "magic number" (a nsec +timestamp) that is guaranteed to be common to all instances of the application, +and unique to each job. + +- **PMI_NUM_HSNS** is supplied by the user environment, and defaults to 1 if not +specified. It can have a value from 1 to 4, and indicates the number of NICs +(per node) to bring into play. + +- **PMI_HOME** is supplied by the user environment, and defaults to $HOME if not +specified. This indicates the file system directory used for the file system +Allgather operation, and must be readable and writable. + +# APP: test_frmwk + +The **test_frmwk** application is a basic sanity test for the framework itself. + +$ srun -Nn ./test_frmwk [args...] + +# APP: test_zbcoll + +The **test_zbcoll** application is a full regression suite for the zbcoll +implementation, which provides a high-performance zero-buffer implementation of +Barrier, Broadcast, and IOR Reduce used in the process of bootstrapping the +collective join operation. + +$ srun -Nn ./test_zbcoll [args...] + +# APP: test_coll + +The **test_coll** application is a full regression suite for the accelerated +collectives. It requires a multicast configuration service, which presents +itself as a REST API. + +$ srun -Nn ./test_coll [args...] + +## Simulated Multicast ## + +A *simulated* multicast configuration service is provided in the multinode +subdirectory. 
It uses FLASK (Python), and returns a small number of specifically +invalid multicast addresses that are interpreted as a request for a UNICAST +implementation of collectives. This implementation is not performant and should +not be used in production -- it implements the broadcast phase of the +accelerated collective as a series of point-to-point sends from the HWRoot to +each leaf node, and as there is no multicast in-tree reduction, the HWRoot +becomes a target of an incast from all the leaf transmissions. This can be used, +however, to fully test the software paths and behaviors on small collective +groups, without any involvement from the fabric manager software. + +The FLASK simulation is typically started in a window on the WLM job-launch node +as follows: + +$ ./flask_fmgrsrv.py --host *ipaddress* --port *port* + +The *ipaddress* can be obtained on the host where it is run using: + +$ hostname -I | awk '{print $1}' + +The *port* can be any valid, unused port. A value of 5000 typically works. + +A number of environment variables control the libfabric collective behavior: + +- **FI_CXI_COLL_JOB_ID** is an identifier unique to each job. + +- **FI_CXI_COLL_JOB_STEP_ID** is an identifier unique to each job-step. + +- **FI_CXI_COLL_MCAST_TOKEN** is a security token used to authenticate the +application to the fabric manager when using the REST API. + +- **FI_CXI_HWCOLL_ADDRS_PER_JOB** is the maximum number of multicast addresses + available to this job. + +- **FI_CXI_HWCOLL_MIN_NODES** is the minimum number of endpoints required to support accelerated collectives. + +- **FI_CXI_COLL_FABRIC_MGR_URL** is the URL for the fabric manager REST API. + +- **FI_CXI_COLL_RETRY_USEC** is the time spent waiting for reduction + completion before performing a retry. + +- **FI_CXI_COLL_TIMEOUT_USEC** is the length of time hardware reduction engines + will be reserved before timing out and delivering a partial result. + +- **FI_CXI_COLL_USE_DMA_PUT** (experimental) uses Cassini DMA to initiate sends +for reduction packets. + +The framework will set all of the above environment variables to usable +defaults, if they are not already specified in the user environment, with the +exception of **FI_CXI_COLL_FABRIC_MGR_URL**, which must be explicitly defined in +the user environment. + +$ export FI_CXI_COLL_FABRIC_MGR_URL='http://*ipaddress*:*port*' + +The simulated FLASK service can be tested using: + +$ curl $FI_CXI_COLL_FABRIC_MGR_URL + +This should return a JSON object containing help text strings. + +**NOTE**: The simulated service uses http, not https. + +## Production Multicast ## + +Full-scale (performant) test_coll runs can be performed by specifying the real +fabric manager REST API URL. + +This will require that the WLM export a valid **FI_CXI_COLL_MCAST_TOKEN** in the +job environment after acquiring the token for the job from the fabric manager. +This is an opaque session token that persists for the duration of the job. + +**NOTE**: The real service uses https, not http, and is a trusted service. 
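+
+# Writing a new test
+
+The framework API used by the applications above is declared in
+`multinode_frmwk.h`. The sketch below is not a source file in this directory;
+it simply mirrors the structure of `test_barrier.c` with the error handling
+trimmed, to show the calling sequence a framework-based test follows:
+
+```c
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include <rdma/fabric.h>
+
+#include "multinode_frmwk.h"
+
+int main(void)
+{
+        fi_addr_t *fiaddrs = NULL;
+        size_t size = 0;
+        int ret;
+
+        /* Read the PMI_* variables and set collective defaults */
+        frmwk_init(false);
+
+        /* This example requires at least two ranks */
+        if (frmwk_check_env(2))
+                return -1;
+
+        /* Create fabric, domain, endpoint, CQs, EQ and counters */
+        ret = frmwk_init_libfabric();
+
+        /* Allgather HSN addresses and populate the AV (also a barrier) */
+        if (!ret)
+                ret = frmwk_populate_av(&fiaddrs, &size);
+
+        if (!ret) {
+                frmwk_log0("running with %zu endpoints\n", size);
+                /* ... test body: messaging, RMA, or collectives ... */
+                ret = frmwk_barrier();
+        }
+
+        free(fiaddrs);
+        frmwk_free_libfabric();
+        frmwk_term();
+        return frmwk_errmsg(ret, "test failed\n");
+}
+```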
diff --git a/prov/cxi/test/multinode/flask_fmgrsrv.py b/prov/cxi/test/multinode/flask_fmgrsrv.py new file mode 100644 index 00000000000..1916e3dbe84 --- /dev/null +++ b/prov/cxi/test/multinode/flask_fmgrsrv.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +# Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + +help = f''' +Multicast REST server simulation for distributed testing + +http://host:port/ +- GET produces this help as a JSON list + +http://host:port/fabric/collectives/multicast +- POST generates a single multicast address and hwroot node +- GET lists all multicast addresses +- DELETE deletes all multicast addresses + +http://host:port/fabric/collectives/mcastid/ +- DELETE deletes specified multicast address + +Multicast addresses are invalid (>=8192), causing UNICAST behavior +Only addresses 8192-8199 are supported, to test exhaustion +''' +import argparse +import textwrap +import sys +import json + +from argparse import ArgumentParser, HelpFormatter +from flask import Flask, request +from flask_restful import Api, Resource + +# Global storage for addresses/roots +mcastroots = [] +mcastaddrs = [] + +class RawFormatter(HelpFormatter): + def _fill_text(self, text, width, indent): + return "\n".join([textwrap.fill(line, width) for line in textwrap.indent(textwrap.dedent(text), indent).splitlines()]) + +class fabtestInfo(Resource): + def get(self): + return help.splitlines(), 200 + +def delEntry(value): + global mcastroots + global mcastaddrs + + try: + idx = mcastaddrs.index(value) + del mcastroots[idx] + del mcastaddrs[idx] + print("DELETE ", value) + except: + print("multicast", value, "not in use") + pass + +class delete8192(Resource): + def delete(self): + delEntry(8192) + +class delete8193(Resource): + def delete(self): + delEntry(8193) + +class delete8194(Resource): + def delete(self): + delEntry(8194) + +class delete8195(Resource): + def delete(self): + delEntry(8195) + +class delete8196(Resource): + def delete(self): + delEntry(8196) + +class delete8197(Resource): + def delete(self): + delEntry(8197) + +class delete8198(Resource): + def delete(self): + delEntry(8198) + +class delete8199(Resource): + def delete(self): + delEntry(8199) + +class fabtestServer(Resource): + def get(self): + # Lists the existing multicast addresses + global mcastroots + global mcastaddrs + + addrs = [] + for k,v in enumerate(mcastroots): + addrs.append({'root':v, 'mcast':mcastaddrs[k]}) + info = { + 'ADDRLIST': addrs, + } + return info, 200 + + def delete(self): + # Deletes all multicast addresses + global mcastroots + global mcastaddrs + + mcastroots = [] + mcastaddrs = [] + return None, 200 + + def post(self): + # Creates a new multicast address + global mcastroots + global mcastaddrs + + print(request.json) + required = { + 'jobID', 'macs', 'timeout', + } + optional = { + 'jobStepID' + } + info = {} + error = [] + dupmac = [] + + # Test for required fields, append error messages if missing + for key in required: + if key not in request.json: + error.append("no " + key) + else: + info[key] = request.json[key] + # Test macs for empty or duplicate addresses + if not error and not request.json['macs']: + error.append('empty macs') + for mac in request.json['macs']: + if mac not in dupmac: + dupmac.append(mac) + else: + error.append('duplicate mac=' + str(mac)) + + # Test for optional fields, provide defaults if missing + for key in optional: + if key not in request.json: + info[key] = None + else: + info[key] = request.json[key] + 
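+ # On success the reply echoes jobID, jobStepID, macs and timeout, and adds
+ # hwRoot (the first mac not already in use), mcastID (the first unused
+ # simulated multicast address) and a documentSelfLink; any accumulated
+ # error is instead reported as a 400 with an 'error' string.
+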
+ # Find a globally-unused mac address as hwRoot + info['hwRoot'] = None + for mac in request.json['macs']: + if mac not in mcastroots: + info['hwRoot'] = mac + break + if not info['hwRoot']: + error.append('no hwRoot usable') + + # Find a globally unused mcast address + info['mcastID'] = None + for adr in range(8192, 8199): + if adr not in mcastaddrs: + info['mcastID'] = adr + break + if not info['mcastID']: + error.append('no mcast available') + + # Report any accumulated errors + if error: + info = { + 'error' : ', '.join(error) + } + return info, 400 + + # Otherwise, record and return complete record + mcastroots.append(mac) + mcastaddrs.append(adr) + + info['jobID'] = request.json['jobID'] + info['jobStepID'] = request.json['jobStepID'] + info['macs'] = request.json['macs'] + info['timeout'] = request.json['timeout'] + info['documentSelfLink'] = 'fabric/collectives/mcastID/' + adr + + return info, 200 + +def main(argv): + parser = argparse.ArgumentParser( + description=help, formatter_class=RawFormatter) + parser.add_argument('--host', default=None) + parser.add_argument('--port', default=None) + args = parser.parse_args() + + app = Flask(__name__) + api = Api(app); + api.add_resource(fabtestInfo, '/') + api.add_resource(fabtestServer, '/fabric/collectives/multicast') + api.add_resource(delete8192, '/fabric/collectives/mcastid/8192') + api.add_resource(delete8193, '/fabric/collectives/mcastid/8193') + api.add_resource(delete8194, '/fabric/collectives/mcastid/8194') + api.add_resource(delete8195, '/fabric/collectives/mcastid/8195') + api.add_resource(delete8196, '/fabric/collectives/mcastid/8196') + api.add_resource(delete8197, '/fabric/collectives/mcastid/8197') + api.add_resource(delete8198, '/fabric/collectives/mcastid/8198') + api.add_resource(delete8199, '/fabric/collectives/mcastid/8199') + app.run(debug=True, host=args.host, port=args.port) + +if __name__ == "__main__": + main(sys.argv) diff --git a/prov/cxi/test/multinode/multinode_frmwk.c b/prov/cxi/test/multinode/multinode_frmwk.c new file mode 100644 index 00000000000..94a847d3ba4 --- /dev/null +++ b/prov/cxi/test/multinode/multinode_frmwk.c @@ -0,0 +1,890 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + */ + +/* + * libfabric C test framework for multinode testing. + * + * This must be compiled with: + * + * - PLATFORM_CASSINI_HW=1 (or other hardware flag) + * + * Tests are run using srun: $ srun -Nn ./test_frmwk 'n' is the number of nodes + * to use. Some tests may place requirements on 'n'. + * + * frmwk_init_libfabric() sets up + * - generic fabric info for CXI driver + * - one domain (fabric address) + * - one endpoint + * - one of each of the following + * - eq + * - tx cq + * - rx cq + * - send cntr + * - recv cntr + * - read cntr + * - write cntr + * - remote cntr + * + * frmwk_populate_av() uses a sockets-based Allgather operation to collect local + * HSN addresses and distribute them over the entire set of nodes, and then + * creates and binds the fi_av object for the endpoint. This 'populate' function + * has been separated out from initialization, to allow the framework to use + * other means of population (e.g. MPI). 
The following environment variables are + * significant: + * - PMI_SIZE (WLM) number of ranks in job (from WLM) + * - PMI_RANK (WLM) rank of this process (from WLM) + * - PMI_SHARED_SECRET (WLM) unique job identifier (from WLM) + * - PMI_NUM_HSNS (USER) optional, defaults to 1 + * - PMI_HOME (USER) optional, preferred file system directory to use + * - HOME (USER) default file system directory to use + * + * frmwk_enable_libfabric() can be used after the fi_av object has been + * initialized. + * + * frmwk_free_libfabric() terminates the libfabric instance and cleans up. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "multinode_frmwk.h" + +/* If not compiled with DEBUG=1, this is a no-op */ +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) + +#define RETURN_ERROR(ret, txt) \ + if (ret != FI_SUCCESS) { \ + fprintf(stderr, "FAILED %s = %s\n", txt, fi_strerror(-ret)); \ + return ret; \ + } + +#define CLOSE_OBJ(obj) do {if (obj) fi_close(&obj->fid); } while (0) + +/* Taken from SLURM environment variables */ +int frmwk_numranks; /* PMI_SIZE */ +int frmwk_rank; /* PMI_RANK */ +int frmwk_nics_per_rank; /* PMI_NUM_HSNS (defaults to 1) */ +int frmwk_numnics; +const char *frmwk_unique; /* PMI_SHARED_SECRET */ +const char *frmwk_nodename; /* SLURMD_NODENAME */ +const char frmwk_node0[32]; /* SLURMD_NODELIST (first name) */ +union nicaddr *frmwk_nics; /* array of NIC addresses plus rank and hsn */ + +int _frmwk_init; + +char *cxit_node; +char *cxit_service; +uint64_t cxit_flags; +struct fi_info *cxit_fi_hints; +struct fi_info *cxit_fi; + +struct fid_fabric *cxit_fabric; +struct fid_domain *cxit_domain; +struct fi_cxi_dom_ops *cxit_dom_ops; + +struct mem_region { + uint8_t *mem; + struct fid_mr *mr; +}; + +struct fid_ep *cxit_ep; +struct fi_eq_attr cxit_eq_attr = { + .size = 32, + .flags = FI_WRITE, + .wait_obj = FI_WAIT_NONE +}; +uint64_t cxit_eq_bind_flags = 0; + +struct fid_eq *cxit_eq; + +struct fi_cq_attr cxit_rx_cq_attr = { + .format = FI_CQ_FORMAT_TAGGED + +}; +uint64_t cxit_rx_cq_bind_flags = FI_RECV; +struct fid_cq *cxit_rx_cq; + +struct fi_cq_attr cxit_tx_cq_attr = { + .format = FI_CQ_FORMAT_TAGGED, + .size = 16384 +}; +uint64_t cxit_tx_cq_bind_flags = FI_TRANSMIT; +struct fid_cq *cxit_tx_cq; + +fi_addr_t cxit_ep_fi_addr; + +struct fi_cntr_attr cxit_cntr_attr = {}; +struct fid_cntr *cxit_send_cntr; +struct fid_cntr *cxit_recv_cntr; +struct fid_cntr *cxit_read_cntr; +struct fid_cntr *cxit_write_cntr; +struct fid_cntr *cxit_rem_cntr; + +struct fi_av_attr cxit_av_attr = { + .type = FI_AV_TABLE, + .rx_ctx_bits = 0 +}; +struct fid_av *cxit_av; + +int cxit_n_ifs; +struct fid_av_set *cxit_av_set; +struct fid_mc *cxit_mc; +fi_addr_t cxit_mc_addr; + +/* HMEM memory functional overlays */ +int mr_create_ext(size_t len, uint64_t access, uint8_t seed, uint64_t key, + struct fid_cntr *cntr, struct mem_region *mr) +{ + int ret; + + if (len) { + mr->mem = calloc(1, len); + ret = (mr->mem != NULL) ? 
FI_SUCCESS : FI_ENOMEM; + RETURN_ERROR(ret, __func__); + } else { + mr->mem = 0; + } + + for (size_t i = 0; i < len; i++) + mr->mem[i] = i + seed; + + ret = fi_mr_reg(cxit_domain, mr->mem, len, access, 0, key, 0, + &mr->mr, NULL); + RETURN_ERROR(ret, "fi_mr_reg"); + + ret = fi_mr_bind(mr->mr, &cxit_ep->fid, 0); + RETURN_ERROR(ret, "fi_mr_bind ep"); + + if (cntr) { + ret = fi_mr_bind(mr->mr, &cntr->fid, FI_REMOTE_WRITE); + RETURN_ERROR(ret, "fi_mr_bind cntr"); + } + + ret = fi_mr_enable(mr->mr); + RETURN_ERROR(ret, "fi_mr_enable"); + + return 0; +} + +static ssize_t copy_from_hmem_iov(void *dest, size_t size, + enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset) +{ + size_t cpy_size = MIN(size, hmem_iov->iov_len); + + assert(iface == FI_HMEM_SYSTEM); + assert(hmem_iov_count == 1); + assert(hmem_iov_offset == 0); + + memcpy(dest, hmem_iov->iov_base, cpy_size); + + return cpy_size; +} + +static ssize_t copy_to_hmem_iov(enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset, const void *src, + size_t size) +{ + size_t cpy_size = MIN(size, hmem_iov->iov_len); + + assert(iface == FI_HMEM_SYSTEM); + assert(hmem_iov_count == 1); + assert(hmem_iov_offset == 0); + + memcpy(hmem_iov->iov_base, src, cpy_size); + + return cpy_size; +} + +struct fi_hmem_override_ops cxit_hmem_ops = { + .copy_from_hmem_iov = copy_from_hmem_iov, + .copy_to_hmem_iov = copy_to_hmem_iov, +}; + +/* A minimal generic context for use with asynchronous operations */ +struct mycontext { + int rx_err; + int rx_prov_err; + int tx_err; + int tx_prov_err; +}; + +/* display message on stdout from rank 0 */ +int frmwk_log0(const char *fmt, ...) +{ + va_list args; + int len = 0; + + if (_frmwk_init && frmwk_rank != 0) + return 0; + + va_start(args, fmt); + len = vfprintf(stdout, fmt, args); + va_end(args); + fflush(stdout); + return len; +} + +/* display message with rank designation */ +int frmwk_log(const char *fmt, ...) +{ + va_list args; + int len = 0; + + if (_frmwk_init) + len += fprintf(stdout, "[%2d] ", frmwk_rank); + va_start(args, fmt); + len += vfprintf(stdout, fmt, args); + va_end(args); + fflush(stdout); + return len; +} + +/* Implement a simple sockets-based allgather for testing. + * + * This selects one node across all of the nodes to serve as a local root, and + * then uses sockets to transfer information. 
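+ *
+ * The first node listed in SLURM_NODELIST acts as the root: it accepts one
+ * connection per remaining rank, gathers each rank's contribution (its own
+ * placed first, the rest in connection-arrival order), then writes the
+ * concatenated result back to every connection. Callers that need the data
+ * in rank order must sort it afterwards, as frmwk_gather_nics() does.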
+ */ +#define FAIL(cond, msg, label) \ + if (cond) { \ + printf("FAIL socket %s=%d\n", msg, cond); \ + goto label; \ + } + +/* Sockets can chop up large reads */ +static ssize_t _fullread(int fd, char *ptr, ssize_t size) +{ + ssize_t rem = size; + ssize_t len; + + while (rem > 0) { + len = read(fd, ptr, rem); + if (len < 0) + return len; + ptr += len; + rem -= len; + } + return size; +} + +/* Sockets can chop up large writes */ +static ssize_t _fullwrite(int fd, char *ptr, ssize_t size) +{ + ssize_t rem = size; + ssize_t len; + + while (rem > 0) { + len = write(fd, ptr, rem); + if (len < 0) + return len; + ptr += len; + rem -= len; + } + return size; +} + +/* frmwk_node0 (first in list) serves as root */ +int _accept(int portno, size_t size, void *data, void *rslt) +{ + int listenfd = 0; + int *connfd, conncnt, connidx; + struct sockaddr_in serv_addr = { 0 }; + char *rsltp; + size_t siz; + ssize_t len; + int error, ret; + + // any early exit reports failure + error = -1; + + // create the socket + listenfd = socket(AF_INET, SOCK_STREAM, 0); + FAIL(listenfd < 0, "socket", lablisten); + + // release the socket immediately after termination + ret = setsockopt(listenfd, SOL_SOCKET, SO_REUSEADDR, + &(int){1}, sizeof(int)); + FAIL(ret < 0, "reuseaddr", lablisten); + + // bind the socket to accept any incoming connections + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(portno); + ret = bind(listenfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)); + FAIL(ret < 0, "bind", lablisten); + + // limit the number of connections + conncnt = frmwk_numranks - 1; + ret = listen(listenfd, conncnt); + FAIL(ret < 0, "listen", lablisten); + + // create the connection array + connfd = calloc(conncnt, sizeof(*connfd)); + FAIL(!connfd, "connfd", lablisten); + + // initialize to invalid file descriptors + for (connidx = 0; connidx < conncnt; connidx++) + connfd[connidx] = -1; + + // add our contribution to the result + rsltp = rslt; + memcpy(rsltp, data, size); + rsltp += size; + + // accept connections and start the root protocol + for (connidx = 0; connidx < conncnt; connidx++) { + int fd; + + fd = accept(listenfd, (struct sockaddr *)NULL, NULL); + FAIL(fd < 0, "accept", labclose); + + // record this for later send + connfd[connidx] = fd; + + // read from the connection + siz = size; + len = _fullread(fd, rsltp, siz); + FAIL(len < siz, "read", labclose); + + // advance the result pointer + rsltp += siz; + } + + // all contributions complete, send the result + for (connidx = 0; connidx < conncnt; connidx++) + { + int fd; + + fd = connfd[connidx]; + siz = frmwk_numranks * size; + len = _fullwrite(fd, rslt, siz); + FAIL(len < siz, "write", labclose); + } + + // report success + error = 0; + +labclose: + for (connidx = 0; connidx < conncnt; connidx++) + close(connfd[connidx]); + free(connfd); +lablisten: + close(listenfd); + return error; +} + +/* nodes other than frmwk_node0 serve as leaves */ +int _connect(int portno, size_t size, void *data, void *rslt) +{ + int connfd = 0; + struct sockaddr_in serv_addr = { 0 }; + struct hostent *he; + struct in_addr **addr_list; + size_t siz; + ssize_t len; + int error, ret; + + // any early exit returns error + error = -1; + + // create the socket + connfd = socket(AF_INET, SOCK_STREAM, 0); + FAIL(connfd < 0, "socket", labclose); + + // release the socket immediately after termination + ret = setsockopt(connfd, SOL_SOCKET, SO_REUSEADDR, + &(int){1}, sizeof(int)); + FAIL(ret < 0, "reuseaddr", labclose); + + // 
get network address of frmwk_node0 and connect socket + he = gethostbyname(frmwk_node0); + FAIL(!he, "gethostbyname", labclose); + + addr_list = (struct in_addr **)he->h_addr_list; + FAIL(!addr_list, "gethostbyname empty", labclose); + + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(portno); + serv_addr.sin_addr = *addr_list[0]; + do { + usleep(1000); + ret = connect(connfd, (struct sockaddr *)&serv_addr, + sizeof(serv_addr)); + } while (ret < 0); + + // write our data + siz = size; + len = _fullwrite(connfd, data, siz); + FAIL(len < siz, "write", labclose); + + // wait for full data response + siz = frmwk_numranks * size; + len = _fullread(connfd, rslt, siz); + FAIL(len < siz, "read", labclose); + + // report success + error = 0; + +labclose: + close(connfd); + return error; +} + +int frmwk_allgather(size_t size, void *data, void *rslt) +{ + int portno = 5000; + + return (!strcmp(frmwk_node0, frmwk_nodename)) ? + _accept(portno, size, data, rslt) : + _connect(portno, size, data, rslt); +} + +int frmwk_barrier(void) +{ + ssize_t size = sizeof(char); + char data = 0; + char *rslt; + int ret; + + rslt = calloc(frmwk_numranks, sizeof(char)); + ret = frmwk_allgather(size, &data, rslt); + free(rslt); + + return ret; +} + +/** + * @brief Check for minimum number of ranks + * + * @param minranks required minimum number of ranks + * @return int error code, 0 on success + */ +int frmwk_check_env(int minranks) +{ + if (!_frmwk_init) { + fprintf(stderr, "Framework not initialized\n"); + return -1; + } + if (frmwk_numranks < minranks) { + /* only one rank makes noise */ + if (!frmwk_rank) + fprintf(stderr, "Requires >= %d ranks\n", minranks); + return -1; + } + return 0; +} + +/** + * @brief Shut down the libfabric test framework. + */ +void frmwk_free_libfabric(void) +{ + /* must close EP before closing anything bound to it */ + CLOSE_OBJ(cxit_ep); + CLOSE_OBJ(cxit_av); + CLOSE_OBJ(cxit_rem_cntr); + CLOSE_OBJ(cxit_write_cntr); + CLOSE_OBJ(cxit_read_cntr); + CLOSE_OBJ(cxit_recv_cntr); + CLOSE_OBJ(cxit_send_cntr); + CLOSE_OBJ(cxit_eq); + CLOSE_OBJ(cxit_tx_cq); + CLOSE_OBJ(cxit_rx_cq); + CLOSE_OBJ(cxit_domain); + CLOSE_OBJ(cxit_fabric); + fi_freeinfo(cxit_fi); + fi_freeinfo(cxit_fi_hints); +} + +/** + * @brief Initialize the libfabric test framework. + * + * The ep_obj->src_addr has a PID value of 511 (PID_ANY) until the EP is + * enabled, at which point the actual PID is assigned. Nothing works if the PIDs + * are mismatched between ranks. + * + * @return int error code, 0 on success + */ +int frmwk_init_libfabric(void) +{ + int ret; + + if (!_frmwk_init) { + fprintf(stderr, "Framework not initialized\n"); + return -1; + } + + cxit_fi_hints = fi_allocinfo(); + ret = (cxit_fi_hints != NULL) ? 
FI_SUCCESS : FI_ENOMEM; + + cxit_fi_hints->fabric_attr->prov_name = strdup("cxi"); + cxit_fi_hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &cxit_fi); + RETURN_ERROR(ret, "fi_getinfo"); + + ret = fi_fabric(cxit_fi->fabric_attr, &cxit_fabric, NULL); + RETURN_ERROR(ret, "fi_fabric"); + + ret = fi_domain(cxit_fabric, cxit_fi, &cxit_domain, NULL); + RETURN_ERROR(ret, "fi_domain"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_1, 0, + (void **)&cxit_dom_ops, NULL); + RETURN_ERROR(ret, "fi_open_ops 1"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_2, 0, + (void **)&cxit_dom_ops, NULL); + RETURN_ERROR(ret, "fi_open_ops 2"); + + ret = fi_open_ops(&cxit_domain->fid, FI_CXI_DOM_OPS_3, 0, + (void **)&cxit_dom_ops, NULL); + RETURN_ERROR(ret, "fi_open_ops 3"); + + ret = fi_set_ops(&cxit_domain->fid, FI_SET_OPS_HMEM_OVERRIDE, 0, + &cxit_hmem_ops, NULL); + RETURN_ERROR(ret, "fi_set_ops"); + + ret = fi_endpoint(cxit_domain, cxit_fi, &cxit_ep, NULL); + RETURN_ERROR(ret, "fi_endpoint"); + + ret = fi_cq_open(cxit_domain, &cxit_rx_cq_attr, &cxit_rx_cq, NULL); + RETURN_ERROR(ret, "fi_cq_open RX"); + + ret = fi_ep_bind(cxit_ep, &cxit_rx_cq->fid, cxit_rx_cq_bind_flags); + RETURN_ERROR(ret, "fi_ep_bind RX_CQ"); + + ret = fi_cq_open(cxit_domain, &cxit_tx_cq_attr, &cxit_tx_cq, NULL); + RETURN_ERROR(ret, "fi_cq_open TX"); + ret = fi_ep_bind(cxit_ep, &cxit_tx_cq->fid, cxit_tx_cq_bind_flags); + RETURN_ERROR(ret, "fi_ep_bind TX_CQ"); + + ret = fi_eq_open(cxit_fabric, &cxit_eq_attr, &cxit_eq, NULL); + RETURN_ERROR(ret, "fi_eq_open"); + ret = fi_ep_bind(cxit_ep, &cxit_eq->fid, cxit_eq_bind_flags); + RETURN_ERROR(ret, "fi_ep_bind EQ"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_send_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open SEND"); + ret = fi_ep_bind(cxit_ep, &cxit_send_cntr->fid, FI_SEND); + RETURN_ERROR(ret, "fi_ep_bind SEND CNTR"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_recv_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open RECV"); + ret = fi_ep_bind(cxit_ep, &cxit_recv_cntr->fid, FI_RECV); + RETURN_ERROR(ret, "fi_ep_bind RECV CNTR"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_read_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open READ"); + ret = fi_ep_bind(cxit_ep, &cxit_read_cntr->fid, FI_READ); + RETURN_ERROR(ret, "fi_ep_bind READ CNTR"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_write_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open WRITE"); + ret = fi_ep_bind(cxit_ep, &cxit_write_cntr->fid, FI_WRITE); + RETURN_ERROR(ret, "fi_ep_bind WRITE CNTR"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_rem_cntr, NULL); + RETURN_ERROR(ret, "fi_cntr_open REMOTE"); + + cxit_av_attr.count = 1024; + ret = fi_av_open(cxit_domain, &cxit_av_attr, &cxit_av, NULL); + RETURN_ERROR(ret, "fi_av_open"); + + ret = fi_ep_bind(cxit_ep, &cxit_av->fid, 0); + RETURN_ERROR(ret, "fi_ep_bind AV"); + + ret = fi_enable(cxit_ep); + RETURN_ERROR(ret, "fi_enable"); + + return 0; +} + +/** + * @brief One way of populating the address vector. + * + * This uses frmwk_allgather() to perform the allgather of addresses across all + * nodes in the job. To work properly, the libfabric endpoint must already be + * enabled. + * + * This also serves as a barrier that ensures that all ranks have reached this + * call, i.e. all ranks have enabled their respective endpoint. 
If an endpoint + * is not enabled when another endpoint sends a packet, the sender will receive + * an ACK, but the target will drop the packet. + * + * This routine can be replaced with anything that provides an accurate AV + * across all nodes in the job, e.g. MPI, symmetric AVs distributed by some + * other out-of-band means to all nodes, or logical (rank) addressing of the + * Cassini chips. + * + * @param fiaddr : returns array of fi_addr_t in rank order + * @param size : returns size of fiaddr array + * @return int error code, 0 on success. + */ +int frmwk_populate_av(fi_addr_t **fiaddrp, size_t *sizep) +{ + struct cxip_addr *alladdrs = NULL; + fi_addr_t *fiaddrs = NULL; + int i, ret; + + if (!fiaddrp || !sizep) + return -FI_EINVAL; + + ret = -FI_EFAULT; + ret = frmwk_gather_nics(); + if (ret < 0) + goto fail; + + ret = -FI_ENOMEM; + alladdrs = calloc(frmwk_numnics, sizeof(*alladdrs)); + fiaddrs = calloc(frmwk_numnics, sizeof(*fiaddrs)); + if (!fiaddrs || !alladdrs) + goto fail; + + for (i = 0; i < frmwk_numnics; i++) + alladdrs[i].nic = frmwk_nics[i].nic; + ret = fi_av_insert(cxit_av, alladdrs, frmwk_numnics, + fiaddrs, 0, NULL); + if (ret != frmwk_numnics) + goto fail; + + *sizep = frmwk_numnics; + *fiaddrp = fiaddrs; + return FI_SUCCESS; + +fail: + free(fiaddrs); + free(alladdrs); + return ret; +} + +/** + * @brief Display an error message to stderr and return error code. + * + * This prints to stderr only if ret != 0. It includes rank of the failing + * process and the size of the job. These values are meaningful only after + * frmwk_populate_av() has successfully completed. + * + * @param ret : error code + * @param fmt : printf format + * @param ... : printf parameters + * @return int value of ret + */ +int frmwk_errmsg(int ret, const char *fmt, ...) 
+{ + va_list args; + char host[256]; + char *str; + int len; + + if (!ret) + return 0; + + if (gethostname(host, sizeof(host))) + strcpy(host, "unknown"); + + va_start(args, fmt); + len = vasprintf(&str, fmt, args); + va_end(args); + if (len < 0) + str = "(no errmsg)"; + fprintf(stderr, "%s rank %2d of %2d FAILED %d: %s", + host, frmwk_rank, frmwk_numranks, ret, str); + if (len >= 0) + free(str); + + return ret; +} + +/* Read /sys files to get the HSN nic addresses */ +static void get_local_nic(int hsn, union nicaddr *nic) +{ + char fname[256]; + char text[256]; + char *ptr; + FILE *fid; + int i, n; + + /* default */ + strcpy(text, "FF:FF:FF:FF:FF:FF\n"); + /* read from file, if possible */ + snprintf(fname, sizeof(fname), "/sys/class/net/hsn%d/address", hsn); + if ((fid = fopen(fname, "r"))) { + n = fread(text, 1, sizeof(text), fid); + fclose(fid); + text[n] = 0; + } + TRACE("HSN address: %s", text); + + /* parse "XX:XX:XX:XX:XX:XX\n" into 48-bit integer value */ + nic->value = 0L; + ptr = text; + for (i = 0; i < 6; i++) { + nic->value <<= 8; + nic->value |= strtol(ptr, &ptr, 16); + ptr++; + } + nic->hsn = hsn; + nic->rank = frmwk_rank; + TRACE("rank=%2d hsn=%d nic=%05x\n", nic->rank, nic->hsn, nic->nic); +} + +/* Sort comparator */ +static int _compare(const void *v1, const void *v2) +{ + uint64_t *a1 = (uint64_t *)v1; + uint64_t *a2 = (uint64_t *)v2; + + if (*a1 < *a2) + return -1; + if (*a1 > *a2) + return 1; + return 0; +} + +/* Allgather on NIC addresses across collective */ +int frmwk_gather_nics(void) +{ + union nicaddr *mynics = NULL; + int i, ret, localsize; + + if (frmwk_nics) + return 0; + + localsize = frmwk_nics_per_rank * NICSIZE; + mynics = calloc(1, localsize); + frmwk_nics = calloc(frmwk_numranks, localsize); + if (!mynics || !frmwk_nics) + goto fail; + + for (i = 0; i < frmwk_nics_per_rank; i++) + get_local_nic(i, &mynics[i]); + + ret = frmwk_allgather(localsize, mynics, frmwk_nics); + if (ret) + goto fail; + + frmwk_numnics = frmwk_numranks * frmwk_nics_per_rank; + qsort(frmwk_nics, frmwk_numnics, NICSIZE, _compare); + TRACE("---\n"); + for (i = 0; i < frmwk_numnics; i++) + TRACE("rank=%2d hsn=%d nic=%05x\n", + frmwk_nics[i].rank, + frmwk_nics[i].hsn, + frmwk_nics[i].nic); + return 0; + +fail: + frmwk_numnics = 0; + free(frmwk_nics); + free(mynics); + return -1; +} + +/* User call for the address of rank, hsn */ +int frmwk_nic_addr(int rank, int hsn) +{ + if (!frmwk_nics || + rank < 0 || rank >= frmwk_numranks || + hsn < 0 || hsn >= frmwk_nics_per_rank) + return -1; + return (long)frmwk_nics[rank * frmwk_nics_per_rank + hsn].nic; +} + +/* Get environment variable as string representation of int */ +static int getenv_int(const char *name) +{ + char *env; + int value; + + value = -1; + env = getenv(name); + if (env) + sscanf(env, "%d", &value); + return value; +} + +/* Initialize the framework */ +void frmwk_init(bool quiet) +{ + char *s, *d; + int ret = -1; + + /* Values are provided by the WLM */ + s = getenv("SLURM_NODELIST"); + d = (char *)frmwk_node0; + while (s && *s && *s != '-' && *s != ',') { + if (*s == '[') + s++; + else + *d++ = *s++; + } + *d = 0; + frmwk_nodename = getenv("SLURMD_NODENAME"); + frmwk_numranks = getenv_int("PMI_SIZE"); + frmwk_rank = getenv_int("PMI_RANK"); + frmwk_unique = getenv("PMI_SHARED_SECRET"); + if (frmwk_numranks < 1 || frmwk_rank < 0 || !frmwk_unique) { + if (quiet) + goto fail; + fprintf(stderr, "invalid PMI_SIZE=%d\n", frmwk_numranks); + fprintf(stderr, "invalid PMI_RANK=%d\n", frmwk_rank); + fprintf(stderr, "invalid 
PMI_SHARED_SECRET=%s\n", frmwk_unique); + fprintf(stderr, "Must be run under compatible WLM\n"); + goto fail; + } + + /* Optional for multiple HSNs, defaults to hsn0 */ + frmwk_nics_per_rank = getenv_int("PMI_NUM_HSNS"); + if (frmwk_nics_per_rank < 1) + frmwk_nics_per_rank = 1; + + /* Re-export these as libfabric equivalents */ + setenv("FI_CXI_COLL_JOB_ID", frmwk_unique, 1); + setenv("FI_CXI_COLL_JOB_STEP_ID", "0", 1); + setenv("FI_CXI_COLL_MCAST_TOKEN", "aaaaaa", 1); + setenv("FI_CXI_HWCOLL_MIN_NODES", "4", 1); + setenv("FI_CXI_HWCOLL_ADDRS_PER_JOB", "4", 1); + setenv("FI_CXI_COLL_FABRIC_MGR_URL", "what?", 1); + + ret = 0; +fail: + _frmwk_init = (!ret); +} + +void frmwk_term(void) +{ + free(frmwk_nics); + frmwk_nics = NULL; + frmwk_unique = NULL; + frmwk_nics_per_rank = 0; + frmwk_numranks = 0; + frmwk_rank = 0; + _frmwk_init = 0; +} diff --git a/prov/cxi/test/multinode/multinode_frmwk.h b/prov/cxi/test/multinode/multinode_frmwk.h new file mode 100644 index 00000000000..f20c89dd9dd --- /dev/null +++ b/prov/cxi/test/multinode/multinode_frmwk.h @@ -0,0 +1,88 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * (c) Copyright 2021-2023 Hewlett Packard Enterprise Development LP + */ + +#ifndef FRMWK_HEADER +#define FRMWK_HEADER + +union nicaddr { + uint64_t value; + struct { + uint64_t nic:20; // 20-bit CXI NIC address + uint64_t net:28; // 28-bit network route + uint64_t hsn:2; // up to 4 CXI chips per node + uint64_t rank:14; // up to 16k ranks + } __attribute__((__packed__)); +}; +#define NICSIZE (sizeof(union nicaddr)) + +/* These are initialized by frmwk_init() */ +extern int frmwk_nics_per_rank; +extern int frmwk_numranks; +extern int frmwk_numnics; +extern int frmwk_rank; + +/* This is initialized by frmwk_populate_av() */ +extern union nicaddr *frmwk_nics; + +extern char *cxit_node; +extern char *cxit_service; +extern uint64_t cxit_flags; +extern struct fi_info *cxit_fi_hints; +extern struct fi_info *cxit_fi; + +extern struct fid_fabric *cxit_fabric; +extern struct fid_domain *cxit_domain; +extern struct fi_cxi_dom_ops *dom_ops; + +extern struct fid_ep *cxit_ep; +extern struct fi_eq_attr cxit_eq_attr; +extern uint64_t cxit_eq_bind_flags; +extern struct fid_eq *cxit_eq; + +extern struct fi_cq_attr cxit_rx_cq_attr; +extern uint64_t cxit_rx_cq_bind_flags; +extern struct fid_cq *cxit_rx_cq; + +extern struct fi_cq_attr cxit_tx_cq_attr; +extern uint64_t cxit_tx_cq_bind_flags; +extern struct fid_cq *cxit_tx_cq; + +extern fi_addr_t cxit_ep_fi_addr; + +extern struct fi_cntr_attr cxit_cntr_attr; +extern struct fid_cntr *cxit_send_cntr; +extern struct fid_cntr *cxit_recv_cntr; +extern struct fid_cntr *cxit_read_cntr; +extern struct fid_cntr *cxit_write_cntr; +extern struct fid_cntr *cxit_rem_cntr; + +extern struct fi_av_attr cxit_av_attr; +extern struct fid_av *cxit_av; + +extern int cxit_n_ifs; +extern struct fid_av_set *cxit_av_set; +extern struct fid_mc *cxit_mc; +extern fi_addr_t cxit_mc_addr; + +int frmwk_allgather(size_t size, void *data, void *rslt); +int frmwk_barrier(void); +int frmwk_gather_nics(void); +int frmwk_nic_addr(int rank, int hsn); + +void frmwk_init(bool quiet); +void frmwk_term(void); +int frmwk_init_libfabric(void); +void frmwk_free_libfabric(void); +int frmwk_check_env(int minranks); +int frmwk_populate_av(fi_addr_t **fiaddr, size_t *size); +int frmwk_errmsg(int ret, const char *fmt, ...) + __attribute__((format(__printf__, 2, 3))); +int frmwk_log0(const char *fmt, ...) + __attribute__((format(__printf__, 1, 2))); +int frmwk_log(const char *fmt, ...) 
+ __attribute__((format(__printf__, 1, 2))); + +#endif /* FRMWK_HEADER */ diff --git a/prov/cxi/test/multinode/perf_align.c b/prov/cxi/test/multinode/perf_align.c new file mode 100644 index 00000000000..4c2093fa7bc --- /dev/null +++ b/prov/cxi/test/multinode/perf_align.c @@ -0,0 +1,58 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP + * + * Generic ad-hoc CPU performance tests. + */ + +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + struct timespec ts1, ts2; + uint8_t arr[16]; + uint64_t *a; + double *d; + uint64_t count; + int i; + + /* Test alignment consequences on integer sum */ + for (i = 0; i < 8; i++) { + count = 1000000000; + a = (uint64_t *)&arr[i]; + clock_gettime(CLOCK_MONOTONIC, &ts1); + while (count--) + (*a) += 1; + clock_gettime(CLOCK_MONOTONIC, &ts2); + if (ts2.tv_nsec < ts1.tv_nsec) { + ts2.tv_nsec += 1000000000; + ts2.tv_sec -= 1; + } + ts2.tv_nsec -= ts1.tv_nsec; + ts2.tv_sec -= ts1.tv_sec; + printf("a[%d] = %3ld.%09ld\n", i, ts2.tv_sec, ts2.tv_nsec); + } + + /* Test alignment consequences on double sum */ + for (i = 0; i < 8; i++) { + count = 1000000000; + d = (double *)&arr[i]; + clock_gettime(CLOCK_MONOTONIC, &ts1); + while (count--) + (*d) += 1.0; + clock_gettime(CLOCK_MONOTONIC, &ts2); + if (ts2.tv_nsec < ts1.tv_nsec) { + ts2.tv_nsec += 1000000000; + ts2.tv_sec -= 1; + } + ts2.tv_nsec -= ts1.tv_nsec; + ts2.tv_sec -= ts1.tv_sec; + printf("d[%d] = %3ld.%09ld\n", i, ts2.tv_sec, ts2.tv_nsec); + } + + return 0; +} diff --git a/prov/cxi/test/multinode/perf_getip.c b/prov/cxi/test/multinode/perf_getip.c new file mode 100644 index 00000000000..6acbe1c157f --- /dev/null +++ b/prov/cxi/test/multinode/perf_getip.c @@ -0,0 +1,139 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP + */ + +/* Compile: cc -o getip getip.c */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int get_mac_ioctls(char **macs, int count) +{ + struct ifreq ifr, *it, *end; + struct ifconf ifc; + char buf[1024]; + int success = 0; + int sock; + int i, idx, ret; + char *mptr; + unsigned char *sptr; + + // acquire socket identifier + sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sock < 0) + return sock; + // prepare the ifc structure + ifc.ifc_len = sizeof(buf); + ifc.ifc_buf = buf; + // populate ifc structure from kernel + ret = ioctl(sock, SIOCGIFCONF, &ifc); + if (ret < 0) + return ret; + // walk through the interfaces + it = ifc.ifc_req; + end = it + (ifc.ifc_len / sizeof(struct ifreq)); + idx = 0; + for (; it != end && idx < count; it++) { + // find only hsn* interfaces + strcpy(ifr.ifr_name, it->ifr_name); + if (strncmp(ifr.ifr_name, "hsn", 3)) + continue; + // acquire flags + if (ioctl(sock, SIOCGIFFLAGS, &ifr)) + continue; + // acquire hardware address + if (ioctl(sock, SIOCGIFHWADDR, &ifr)) + continue; + // copy hardware address into return pointer + mptr = macs[idx++]; + sptr = ifr.ifr_hwaddr.sa_data; + for (i = 0; i < 6; i++) { + if (i) + *mptr++ = ':'; + mptr += sprintf(mptr, "%02x", *sptr++); + } + *mptr = 0; + } + close(sock); + return idx; +} + +int get_mac_sysfile(char **macs, int count) +{ + DIR *dir; + FILE *fid; + struct dirent *dent; + char path[1024]; + int n, idx; + + // open the network sysfs directory + dir = opendir("/sys/class/net"); + if (!dir) + return 1; + // read the directory contents + idx = 
0; + while ((dent = readdir(dir)) && idx < count) { + // find only hsn* interfaces + if (strncmp("hsn", dent->d_name, 3)) + continue; + // open and read the address file + sprintf(path, "/sys/class/net/%s/address", dent->d_name); + fid = fopen(path, "r"); + n = fread(macs[idx++], 32, 1, fid); + fclose(fid); + } + closedir(dir); + return 0; +} + +int main(int argc, char **argv) +{ + struct timespec t0, t1; + long int count; + char **macs; + int i, num; + int secs = 2; + + macs = calloc(4, sizeof(char *)); + for (i = 0; i < 4; i++) + macs[i] = malloc(32); + + get_mac_ioctls(macs, 4); + clock_gettime(CLOCK_MONOTONIC, &t0); + t0.tv_sec += secs; + count = 0; + do { + get_mac_ioctls(macs, 4); + count++; + clock_gettime(CLOCK_MONOTONIC, &t1); + } while (t1.tv_sec < t0.tv_sec || + (t1.tv_sec == t0.tv_sec && + t1.tv_nsec < t0.tv_nsec)); + printf("direct: %9ld\n", count); + + get_mac_sysfile(macs, 4); + clock_gettime(CLOCK_MONOTONIC, &t0); + t0.tv_sec += secs; + count = 0; + do { + get_mac_sysfile(macs, 4); + count++; + clock_gettime(CLOCK_MONOTONIC, &t1); + } while (t1.tv_sec < t0.tv_sec || + (t1.tv_sec == t0.tv_sec && + t1.tv_nsec < t0.tv_nsec)); + printf("sysfs : %9ld\n", count); + + return 0; +} diff --git a/prov/cxi/test/multinode/test_barrier.c b/prov/cxi/test/multinode/test_barrier.c new file mode 100644 index 00000000000..285e32e73ce --- /dev/null +++ b/prov/cxi/test/multinode/test_barrier.c @@ -0,0 +1,454 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + */ + +/** + * Standalone BARRIER test to illustrate how to set up collectives. + * + * This uses the multinode_frmwk.c code to prepare a generic multinode + * environment for libfabric collectives testing, and provides some common + * tools for: + * + * - evaluating SLURM environment variables + * - configuring a vanilla libfabric, including HMEM overlays + * - distributing HSN addresses among the nodes + * + * The distribution of HSN addresses uses a linux socket-based method to + * share the HSN addresses among the nodes, and as such, presumes the + * existence of a standard Ethernet network linking the nodes (which is also + * presumed by SLURM). + * + * This code creates a single av_set consisting of the HSN0 addresses among + * the full set of nodes (i.e. MPI_COMM_WORLD), and then performs + * fi_join_collective() to obtain a "multicast" address to be used in the + * barrier operation. Note that the create_av_set() call checks for the + * environment variable FI_COLL_FABRIC_MGR_URL. If this is set, the join will + * attempt to use the specified fabric manager URL to set up a valid + * multicast address in the fabric. If this environment variable is not set, + * the join will use a "unicast" model in which all leaf nodes communicate + * with the root, and the root communicates with the leaves. + * + * The unicast model is not intended to be performant; it is intended to be + * simple, since the primary purpose of the unicast model is debugging and + * instruction. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "multinode_frmwk.h" + +/** + * Create av_set. + * + * fiaddrs must be in the same order across all nodes. + * + * rootidx is the index of the collective root for this group. 
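+ *
+ * The comm_key built below maps rootidx to ucast.hwroot_idx and selects
+ * COMM_KEY_NONE (fabric-manager multicast) when FI_CXI_COLL_FABRIC_MGR_URL
+ * is set, falling back to COMM_KEY_UNICAST otherwise.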
+ */ +int create_av_set(fi_addr_t *fiaddrs, size_t size, int rootidx, + struct fid_av_set **avsetp) +{ + struct cxip_comm_key comm_key = { + .keytype = (cxip_env.coll_fabric_mgr_url) ? + COMM_KEY_NONE : COMM_KEY_UNICAST, + .ucast.hwroot_idx = rootidx + }; + struct fi_av_set_attr attr = { + .count = 0, + .start_addr = FI_ADDR_NOTAVAIL, + .end_addr = FI_ADDR_NOTAVAIL, + .stride = 1, + .comm_key_size = sizeof(comm_key), + .comm_key = (void *)&comm_key, + .flags = 0, + }; + struct fid_av_set *avset; + int i, ret; + + *avsetp = NULL; + + if (rootidx < 0 || rootidx >= size) { + printf("%s invalid rootidx value=%d\n", __func__, rootidx); + return -1; + } + + // create empty av_set (alloc and initialize to empty) + ret = fi_av_set(cxit_av, &attr, &avset, NULL); + if (ret) { + printf("%s fi_av_set failed %d\n", __func__, ret); + goto quit; + } + // append all addresses (in rank order) to av_set + for (i = 0; i < size; i++) { + ret = fi_av_set_insert(avset, fiaddrs[i]); + if (ret) { + printf("%s fi_av_set_insert failed %d\n", __func__, ret); + goto quit; + } + } + *avsetp = avset; + return 0; + +quit: + printf("%s FAILED %d\n", __func__, ret); + if (avset) + fi_close(&avset->fid); + return ret; +} + +/** + * Poll the cqs once, and fill out a cqd structure. + * + * Note that the cqd is the largest supported cqd structure, so it can serve + * as both the success (smaller) or failure (larger) return structure. + * + * The rx_cq has the discard flag set for all operational modes other than + * the COMM_KEY_RANK simulation, so rx_cq events are not generated in this + * test code. Should this be expanded to use the COMM_KEY_RANK simulation + * (e.g. for automated single-node regression testing), the rx_cq should be + * read and the data discarded. + */ +static ssize_t _poll_cqs(struct fi_cq_err_entry *pcqd) +{ + ssize_t size; + +#if 1 + /* read/discard rx_cq -- needed in COMM_KEY_RANK simulation only */ + size = fi_cq_read(cxit_rx_cq, pcqd, 1); + if (size == -FI_EAVAIL) + size = fi_cq_readerr(cxit_rx_cq, pcqd, 1); +#endif + + /* tx_cq indicates barrier status */ + size = fi_cq_read(cxit_tx_cq, pcqd, 1); + if (size == -FI_EAVAIL) { + size = fi_cq_readerr(cxit_tx_cq, pcqd, 1); + if (size >= 0) + size = -FI_EAVAIL; + } + + return size; +} + +static ssize_t _wait_cqs(struct fi_cq_err_entry *pcqd) +{ + ssize_t size; + + do { + size = _poll_cqs(pcqd); + } while (size == -FI_EAGAIN); + return size; +} + +/** + * Poll the endpoint EQ once, and fill out an eqd data structure. + */ +static ssize_t _poll_eq(uint32_t *pevent, struct fi_eq_err_entry *peqd) { + struct cxip_ep *ep; + struct fid_eq *eq; + ssize_t size; + + ep = container_of(cxit_ep, struct cxip_ep, ep); + eq = &ep->ep_obj->coll.eq->util_eq.eq_fid; + size = fi_eq_read(eq, pevent, peqd, sizeof(*peqd), 0); + if (size == -FI_EAVAIL) { + size = fi_eq_readerr(eq, peqd, 0); + if (size >= 0) + size = -FI_EAVAIL; + } + return size; +} + +static ssize_t _wait_eq(uint32_t *pevent, struct fi_eq_err_entry *peqd) { + ssize_t size; + + do { + size = _poll_eq(pevent, peqd); + } while (size == -FI_EAGAIN); + return size; +} + +/** + * Join the specified avset to create a multicast reference pointer. + * + * This is implemented as a blocking call, for simplicity. In practice, + * multiple join operations can be initiated, and then the wait can be called + * until all joins have completed. Note that completion can occur in any + * order, and the resulting eqd.context value must be checked to see which of + * the joins completed. 
We are simply using the avset pointer itself as the + * context, but in production, this could be any kind of unique pointer or + * index. + */ +int join_collective(struct fid_av_set *avset, struct fid_mc **mcp) +{ + struct fi_cq_err_entry cqd; + struct fi_eq_err_entry eqd; + uint32_t event; + ssize_t ret; + + do { + _poll_cqs(&cqd); + ret = fi_join_collective(cxit_ep, FI_ADDR_NOTAVAIL, + avset, 0L, mcp, avset); + } while (ret == -FI_EAGAIN); + if (ret) { + frmwk_log("join initiation error = %ld\n", ret); + return ret; + } + + ret = _wait_eq(&event, &eqd); + if (ret < 0) { + frmwk_log("join wait error = %ld\n", ret); + return ret; + } + if (event != FI_JOIN_COMPLETE) { + frmwk_log("join event = %d != %d\n", event, FI_JOIN_COMPLETE); + return -FI_EADDRNOTAVAIL; + } + + return FI_SUCCESS; +} + +/** + * Perform N barriers in sequence. + * + * delay is a limit for a random delay inserted before each barrier is + * initiated. The srand() function has initialized rand() to different seeds + * based on the rank, so this serves to ensure that the ranks initiate + * barriers out-of-sync in different orders, i.e. sometimes the root will go + * first, sometimes a leaf will go first. + */ +int barrier(struct fid_mc *mc, int N, int delay) +{ + struct timespec t0, t1; + uint64_t icontext; + uint64_t wcontext; + struct fi_cq_err_entry cqd; + int i, ret; + + srand(100*frmwk_rank); + icontext = 0x1000; + clock_gettime(CLOCK_MONOTONIC, &t0); + for (i = 0; i < N; i++) { + do { + if (delay) + usleep(rand() % delay); + ret = fi_barrier(cxit_ep, (fi_addr_t)mc, + (void *)icontext); + } while (ret == -FI_EAGAIN); + frmwk_log("fi_barrier(%08lx) = %d\n", icontext, ret); + if (ret != FI_SUCCESS) + break; + + ret = _wait_cqs(&cqd); + wcontext = (ret > 0) ? (uint64_t)cqd.op_context : -1L; + frmwk_log("wait_cqs(%08lx) = %d\n", wcontext, ret); + icontext++; + } + clock_gettime(CLOCK_MONOTONIC, &t1); + if (t1.tv_nsec < t0.tv_nsec) { + t1.tv_nsec += 1000000000; + t1.tv_sec--; + } + t1.tv_nsec -= t0.tv_nsec; + t1.tv_sec -= t0.tv_sec; + if (i < N) { + frmwk_log0("failed after %d barriers\n", i); + return -1; + } + frmwk_log0("%d barriers completed in %ld.%09ld sec\n", N, + t1.tv_sec, t1.tv_nsec); + return FI_SUCCESS; +} + +int barrier2(struct fid_mc *mc, int N) +{ + struct timespec t0, t1; + uint64_t icontext; + uint64_t wcontext; + struct fi_cq_err_entry cqd; + int started, pending, blocked; + int ret; + + clock_gettime(CLOCK_MONOTONIC, &t0); + started = 0; + pending = 0; + blocked = 0; + icontext = 0x1000; + while (started < N) { + /* start barriers until blocked by -FI_EAGAIN */ + ret = fi_barrier(cxit_ep, (fi_addr_t)mc, (void *)icontext); + if (ret == FI_SUCCESS) { + started++; + pending++; + blocked = 0; + frmwk_log("fi_barrier[%08lx] started=%d pending=%d\n", + icontext, started, pending); + icontext++; + continue; + } + if (ret != -FI_EAGAIN) { + frmwk_log("fi_barrier[%08lx] = %d (failed)\n", icontext, ret); + break; + } + if (!blocked++) + frmwk_log("fi_barrier[%08lx] blocked\n", icontext); + /* poll for one barrier */ + ret = _poll_cqs(&cqd); + wcontext = (ret > 0) ? 
(uint64_t)cqd.op_context : -1L; + if (ret > 0) { + if (ret > 1) + frmwk_log("poll returned %d unexpected\n", ret); + pending -= ret; + blocked = 0; + frmwk_log("_poll_cqs[%08lx], pending = %d\n", + wcontext, pending); + continue; + } + if (ret != -FI_EAGAIN) { + frmwk_log("_poll_cqs = %d (failed)\n", ret); + break; + } + } + frmwk_log("started %d of %d, pending %d\n", started, N, pending); + if (started < N) { + frmwk_log("failed\n"); + return -1; + } + while (pending > 0) { + ret = _poll_cqs(&cqd); + wcontext = (ret > 0) ? (uint64_t)cqd.op_context : -1L; + if (ret > 0) { + pending -= ret; + frmwk_log("wait_cqs[%08lx], pending = %d\n", + wcontext, pending); + continue; + } + if (ret != -FI_EAGAIN) { + frmwk_log("_poll_cqs = %d\n", ret); + break; + } + } + frmwk_log("completed %d\n", started); + clock_gettime(CLOCK_MONOTONIC, &t1); + if (t1.tv_nsec < t0.tv_nsec) { + t1.tv_nsec += 1000000000; + t1.tv_sec--; + } + t1.tv_nsec -= t0.tv_nsec; + t1.tv_sec -= t0.tv_sec; + if (started < N) { + frmwk_log("_wait_cqs() = %d\n", ret); + frmwk_log("failed after %d barriers\n", started); + return -1; + } + frmwk_log("%d barriers completed in %ld.%09ld sec\n", N, + t1.tv_sec, t1.tv_nsec); + return FI_SUCCESS; +} + +const char *helpstr = + "\n" + "-N specifies the number of barriers to perform, default=1\n" + "-R specifies the rank to be used as the root, default=0\n" + "-D specifies a random max delay in usec, default=0\n" + "-p parallel barriers\n" + "\n"; + +int main(int argc, char **argv) +{ + fi_addr_t *fiaddrs = NULL; + size_t size = 0; + int rootidx = 0; + struct fid_av_set *avset = NULL; + struct fid_mc *mc = NULL; + int N = 1; + bool parallel = false; + int delay = 0; + int help = 0; + int opt, ret; + + while ((opt = getopt(argc, argv, "hpN:R:D:")) != -1) { + switch (opt) { + case 'p': + parallel = true; + break; + case 'N': + N = atoi(optarg); + break; + case 'R': + rootidx = atoi(optarg); + break; + case 'D': + delay = atoi(optarg); + break; + case 'h': + default: + help = 1; + ret = (opt == 'h') ? 0 : 1; + break; + } + } + + /* Read environment variables and initialize frmwk memory */ + frmwk_init(help); + if (help) { + frmwk_log0("Usage: %s [-h] [-N iterations]\n" + " [-R root_rank] [-D usec] [-p]\n", + basename(argv[0])); + frmwk_log0("%s", helpstr); + return ret; + } + + /* Test requires a minimum of two nodes */ + if (frmwk_check_env(2)) + return -1; + + /* Must be done before populting AV */ + ret = frmwk_init_libfabric(); + if (ret) + goto quit; + + /* Acquire HSN0 addresses and distribute across job */ + ret = frmwk_populate_av(&fiaddrs, &size); + if (ret) + goto quit; + + /* Create the MPI_COMM_WORLD group */ + ret = create_av_set(fiaddrs, size, rootidx, &avset); + if (ret) + goto quit; + + /* Create the collective multicast identifier */ + ret = join_collective(avset, &mc); + if (ret) + goto quit; + + /* Perform N barriers */ + ret = (parallel) ? barrier2(mc, N) : barrier(mc, N, delay); + if (ret) + goto quit; + +quit: + if (mc) + fi_close(&mc->fid); + if (avset) + fi_close(&avset->fid); + free(fiaddrs); + frmwk_free_libfabric(); + frmwk_term(); + return ret; +} diff --git a/prov/cxi/test/multinode/test_coll.c b/prov/cxi/test/multinode/test_coll.c new file mode 100644 index 00000000000..974f8c54bb6 --- /dev/null +++ b/prov/cxi/test/multinode/test_coll.c @@ -0,0 +1,1400 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + */ + +/* + * Validation test for the multinode coll implementation. 
+ * + * Launch using: srun -N4 ./test_coll [args] + * Note that -N4 is the minimum. There is no maximum. + */ + +/** + * Test the coll functions in a real environment. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "multinode_frmwk.h" + +/* If not compiled with DEBUG=1, this is a no-op */ +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) + +/* convert delays to nsecs */ +#define nUSEC(n) (n * 1000L) +#define nMSEC(n) (n * 1000000L) +#define nSEC(n) (n * 1000000000L) + +int verbose = 0; + +/* Signaling NaN generation, for testing. + * Linux feature requires GNU_SOURCE. + * This generates a specific sNaN value. + */ +static inline double cxip_snan64(void) +{ + return _bits2dbl(0x7ff4000000000000); +} + +/* initialize nsecs timer structure */ +static inline void _init_nsecs(struct timespec *tsp) +{ + clock_gettime(CLOCK_MONOTONIC, tsp); +} + +/* return elapsed nsecs since initialized tsp */ +static inline long _measure_nsecs(struct timespec *tsp) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + ts.tv_nsec -= tsp->tv_nsec; + ts.tv_sec -= tsp->tv_sec; + if (ts.tv_nsec < 0) { + ts.tv_nsec += 1000000000L; + ts.tv_sec -= 1; + } + return 1000000000L*ts.tv_sec + ts.tv_nsec; +} + +static inline void _nsecs_from_now(struct timespec *tsp, long nsecs) +{ + long secs = (nsecs/1000000000L); + + nsecs %= 1000000000L; + clock_gettime(CLOCK_MONOTONIC, tsp); + tsp->tv_nsec += nsecs; + tsp->tv_sec += secs; + if (tsp->tv_nsec > 1000000000L) { + tsp->tv_nsec -= 1000000000L; + tsp->tv_sec += 1; + } +} + +static inline bool _nsecs_expired(const struct timespec *tsp) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + if (ts.tv_sec < tsp->tv_sec) + return false; + if (ts.tv_sec == tsp->tv_sec && + ts.tv_nsec < tsp->tv_nsec) + return false; + return true; +} + +/* poll rx and tx cqs once to drive I/O and return completion context */ +static void *_poll_cqs(void) +{ + struct fi_cq_err_entry cqd; + ssize_t size; + + size = fi_cq_read(cxit_rx_cq, &cqd, 1); + if (size == -FI_EAVAIL) + size = fi_cq_readerr(cxit_rx_cq, &cqd, 1); + if (size > 0) { + TRACE("rx event seen\n"); + TRACE(" size %ld\n",size); + TRACE(" buf %p\n",cqd.buf); + TRACE(" data %016lx\n",cqd.data); + TRACE(" err %d\n",cqd.err); + TRACE(" err_data %p\n",cqd.err_data); + TRACE(" err_data_size %ld\n",cqd.err_data_size); + TRACE(" flags %016lx\n",cqd.flags); + TRACE(" len %ld\n",cqd.len); + TRACE(" olen %ld\n",cqd.olen); + TRACE(" op_context %p\n",cqd.op_context); + TRACE(" prov_errno %d\n",cqd.prov_errno); + TRACE(" tag %016lx\n",cqd.tag); + } else if (size != -FI_EAGAIN) + TRACE("rx ERROR seen = %ld\n", size); + + size = fi_cq_read(cxit_tx_cq, &cqd, 1); + if (size == -FI_EAVAIL) + size = fi_cq_readerr(cxit_tx_cq, &cqd, 1); + if (size > 0) { + TRACE("tx event seen\n"); + TRACE(" size %ld\n",size); + TRACE(" buf %p\n",cqd.buf); + TRACE(" data %016lx\n",cqd.data); + TRACE(" err %d\n",cqd.err); + TRACE(" err_data %p\n",cqd.err_data); + TRACE(" err_data_size %ld\n",cqd.err_data_size); + TRACE(" flags %016lx\n",cqd.flags); + TRACE(" len %ld\n",cqd.len); + TRACE(" olen %ld\n",cqd.olen); + TRACE(" op_context %p\n",cqd.op_context); + TRACE(" prov_errno %d\n",cqd.prov_errno); + TRACE(" tag %016lx\n",cqd.tag); + return cqd.op_context; + } + if (size != -FI_EAGAIN) + TRACE("tx ERROR seen = %ld\n", size); + TRACE("%s return NULL\n", __func__); + return NULL; +} + +/* blocking wait for single collective op 
completion */ +static void _wait_cqs(void *pcontext) +{ + TRACE("Wait for context %p\n", pcontext); + do { + if (pcontext == _poll_cqs()) + break; + } while (true); +} + +/** + * @brief Manage multiple av_sets. + * + * The avset_ary is an ordered list of different av_set objects, each of which + * represents a specific collective group. + * + * In this test framework, the multi join operation will create an + * independent join (and mc object) for EACH av_set in the avset_ary, to be + * initiated concurrently. Joins will be initiated in the list order, and + * completed in an arbitrary order. + * + * Hint: fid_av_set consists of just a constant self-size value, and a list + * of function pointers. It is contained inside cxip_av_set, which contains + * the addresses, address counts, etc. You need to take container_of() on the + * fid_av_set pointer to get the containing cxip_av_set. Real (non-test) + * users will not need this extra information. + */ +struct avset_ary { + struct fid_av_set **avset; + int avset_cnt; + int avset_siz; +}; + +void avset_ary_init(struct avset_ary *setary) +{ + setary->avset = NULL; + setary->avset_cnt = 0; + setary->avset_siz = 0; +} + +void avset_ary_destroy(struct avset_ary *setary) +{ + int i; + + if (setary->avset) { + for (i = 0; i < setary->avset_cnt; i++) + fi_close(&setary->avset[i]->fid); + free(setary->avset); + } + avset_ary_init(setary); +} + +/* create a single avset using fiaddrs, size, and append it to the setary */ +int avset_ary_append(fi_addr_t *fiaddrs, size_t size, + int mcast_addr, int root_idx, + struct avset_ary *setary) +{ + struct cxip_comm_key comm_key = { + .keytype = (cxip_env.coll_fabric_mgr_url) ? + COMM_KEY_NONE : COMM_KEY_UNICAST, + .ucast.mcast_addr = mcast_addr, + .ucast.hwroot_idx = root_idx + }; + struct fi_av_set_attr attr = { + .count = 0, + .start_addr = FI_ADDR_NOTAVAIL, + .end_addr = FI_ADDR_NOTAVAIL, + .stride = 1, + .comm_key_size = sizeof(comm_key), + .comm_key = (void *)&comm_key, + .flags = 0, + }; + struct fid_av_set *setp; + int i, ret; + + // expand accumulator list as necessary + TRACE("%s cnt=%d siz=%d\n", __func__, setary->avset_cnt, + setary->avset_siz); + if (setary->avset_siz <= setary->avset_cnt) { + void *ptr; + int siz; + + TRACE("%s expand setary\n", __func__); + siz = setary->avset_siz + 4; + ptr = realloc(setary->avset, siz * sizeof(void *)); + if (!ptr) { + TRACE("%s realloc failed\n", __func__); + ret = -FI_ENOMEM; + goto quit; + } + setary->avset_siz = siz; + setary->avset = ptr; + } + // create empty av_set (alloc and initialize to empty) + ret = fi_av_set(cxit_av, &attr, &setp, NULL); + if (ret) { + TRACE("%s fi_av_set failed %d\n", __func__, ret); + goto quit; + } + // append addresses to av_set + for (i = 0; i < size; i++) { + ret = fi_av_set_insert(setp, fiaddrs[i]); + if (ret) { + TRACE("%s fi_av_set_insert failed %d\n", __func__, ret); + goto quit; + } + } + // add to expanded list + setary->avset[setary->avset_cnt++] = setp; + return 0; + +quit: + TRACE("%s: FAILED %d\n", __func__, ret); + if (setp) { + fi_close(&setp->fid); + free(setp); + } + return ret; +} + +/** + * @brief Perform concurrent joins over avset_ary objects. + * + * A single multi-join will initiate concurrent join operations over each of + * the av_set objects in the avset_ary. + * + * Each join is represented by a join_item, which contains a pointer to the + * generating av_set, and the resulting mc object. It also records a + * completion result and a provider error (if any). 
The join_items are linked + * to a dlist called the joinlist. + * + * A multi-join can be called multiple times for the same joinlist, and will + * continue add join_items to the joinlist. + * + * If the av_set objects are all disjoint, joins should proceed in parallel. + * If the av_set objects overlap, the first join will proceed, and subsequent + * joins will return -FI_EAGAIN until the blocking zbcoll getgroup operation + * completes, after which they will proceed in parallel. If the maximum + * zbcoll groupid value is acquired, all join operations will be blocked + * until at least one join operation completes, freeing a zbcoll groupid. + * + * Proper behavior is dependent on initiating all joins in the same relative + * order on every participating endpoint, which is a general MPI requirement + * for all collective operations. + * + * This returns when all joins specified in the setary have been initiated. + * + * Note that fi_join_collective() can be called from an endpoint that is not + * a valid endpoint in the collective group. These tests, in fact, will call + * fi_join_collective() for every endpoint in the WLM job, even if the av_set + * represents some subset of this. The call will return the value + * -FI_ECONNREFUSED for endpoints that do not belong to the collective + * group, and this causes the join structure to be discarded without adding + * it to the result joinlist. This means that when doing a multijoin, + * different endpoints may have different joinlist lengths. + * + * A join failure on an endpoint that is part of the collective group will + * result in an error propagated to all members of that group through zbcoll, + * so all endpoints will fail the join operation with the same error code. + */ +struct join_item { + struct dlist_entry entry; + struct fid_av_set *avset; + struct fid_mc *mc; + int prov_errno; + int retval; + int trace_no; +}; + +/* poll the collective eq once, count of completions (0 or 1) */ +static int _poll_eq(void) +{ + struct cxip_ep *ep; + struct fid_eq *eq; + struct fi_eq_err_entry eqd = {}; + struct join_item *jctx; + uint32_t event; + int ret; + + ep = container_of(cxit_ep, struct cxip_ep, ep); + eq = &ep->ep_obj->coll.eq->util_eq.eq_fid; + + jctx = NULL; + ret = fi_eq_read(eq, &event, &eqd, sizeof(eqd), 0); + if (ret >= 0) { + TRACE("read EQ = %d\n", ret); + if (ret < sizeof(struct fi_eq_entry)) { + TRACE("fi_eq_read()=%d, exp=%ld\n", + ret, sizeof(struct fi_eq_entry)); + return -FI_EINVAL; + } + TRACE("=== EQ SUCCESS\n"); + TRACE(" size = %d\n", ret); + TRACE(" event = %d\n", event); + TRACE(" fid = %p\n", eqd.fid); + TRACE(" context = %p\n", eqd.context); + TRACE(" data = %lx\n", eqd.data); + if (eqd.context && event == FI_JOIN_COMPLETE) { + jctx = eqd.context; + jctx->retval = 0; + jctx->prov_errno = 0; + return 1; + } + } + if (ret == -FI_EAVAIL) { + TRACE("read EQ = %d\n", ret); + ret = fi_eq_readerr(eq, &eqd, 0); + if (ret < sizeof(struct fi_eq_err_entry)) { + TRACE("fi_eq_readerr()=%d, exp=%ld\n", + ret, sizeof(struct fi_eq_err_entry)); + return -FI_EINVAL; + } + TRACE("=== EQ error available\n"); + TRACE(" size = %d\n", ret); + TRACE(" event = %d\n", event); + TRACE(" fid = %p\n", eqd.fid); + TRACE(" context = %p\n", eqd.context); + TRACE(" data = %lx\n", eqd.data); + TRACE(" err = %s (%d)\n", + fi_strerror(-eqd.err), eqd.err); + TRACE(" prov_err= %d\n", eqd.prov_errno); + TRACE(" err_data= %p\n", eqd.err_data); + TRACE(" err_size= %ld\n", eqd.err_data_size); + if (eqd.context) { + jctx = eqd.context; + jctx->retval = eqd.err; + 
jctx->prov_errno = eqd.prov_errno; + return 1; + } + } + if (ret != -FI_EAGAIN) { + TRACE("read EQ = %d\n", ret); + TRACE("=== EQ other\n"); + TRACE(" size = %d\n", ret); + TRACE(" event = %d\n", event); + } + return 0; +} + +/* close a list of collectives */ +void coll_multi_release(struct dlist_entry *joinlist) +{ + struct join_item *jctx; + + TRACE("coll_multi_release\n"); + while (!dlist_empty(joinlist)) { + dlist_pop_front(joinlist, struct join_item, jctx, entry); + TRACE("close mc, empty = %d\n", dlist_empty(joinlist)); + if (jctx->mc) + fi_close(&jctx->mc->fid); + TRACE("free jctx\n"); + free(jctx); + } + TRACE("return\n"); +} + +/* initiate join on all sets in setary, and append to joinlist */ +int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) +{ + struct join_item *jctx; + int i, ret, total, count; + + TRACE("ENTRY %s\n", __func__); + + // perform collective joins from setary + total = setary->avset_cnt; + count = 0; + for (i = 0; i < total; i++) { + jctx = calloc(1, sizeof(*jctx)); + jctx->trace_no = i; + jctx->avset = setary->avset[i]; + dlist_init(&jctx->entry); + TRACE("join %d of %d initiating\n", i, total); + ret = fi_join_collective(cxit_ep, FI_ADDR_NOTAVAIL, + setary->avset[i], 0L, + &jctx->mc, jctx); + /* node is not participating in this join */ + if (ret == -FI_ECONNREFUSED) { + free(jctx); + continue; + } + TRACE("join %d continuing ret=%d\n", i, ret); + if (ret != FI_SUCCESS) { + TRACE("join %d FAILED\n", ret); + goto fail; + } + /* wait for join to complete */ + do { + _poll_cqs(); + ret = _poll_eq(); + } while (ret == 0); + dlist_insert_tail(&jctx->entry, joinlist); + count++; + } + TRACE("DONE %s completed %d joins\n", __func__, count); + return FI_SUCCESS; + +fail: + TRACE("TEST failed\n"); + coll_multi_release(joinlist); + return ret; +} + +/* Perform cleanup on a multijoin */ +void coll_join_cleanup(struct avset_ary *setary, struct dlist_entry *joinlist) +{ + coll_multi_release(joinlist); + avset_ary_destroy(setary); +} + +struct join_item *coll_join_item(struct dlist_entry *joinlist, int index) +{ + struct join_item *jctx; + + dlist_foreach_container(joinlist, struct join_item, jctx, entry) { + if (!index--) + return jctx; + } + return NULL; +} + + +/* Utility function to create a single join with no errors */ +struct join_item *coll_single_join(fi_addr_t *fiaddrs, size_t size, + int mcast_addr, int root_idx, + int exp_retval, int exp_prov_errno, + struct avset_ary *setary, + struct dlist_entry *joinlist, + const char *msg) +{ + struct join_item *jctx = NULL; + int ret; + + avset_ary_init(setary); + ret = avset_ary_append(fiaddrs, size, mcast_addr, root_idx, setary); + if (ret) { + TRACE("%s JOIN avset_ary_append()=%d\n", msg, ret); + goto quit; + } + + dlist_init(joinlist); + ret = coll_multi_join(setary, joinlist); + if (ret) { + TRACE("%s JOIN coll_multi_join()=%d\n", msg, ret); + goto quit; + } + + jctx = dlist_first_entry_or_null(joinlist, struct join_item, entry); + if (!jctx) { + TRACE("%s JOIN produced NULL result\n", msg); + goto quit; + } + + if (jctx->retval != exp_retval || jctx->prov_errno != exp_prov_errno) { + TRACE("%s JOIN ret=%d,exp=%d prov_errno=%d,exp=%d\n", msg, + jctx->retval, exp_retval, + jctx->prov_errno, exp_prov_errno); + goto quit; + } + TRACE("%s JOIN SUCCESS\n", msg); + return jctx; + +quit: + TRACE("%s JOIN FAILED\n", msg); + coll_join_cleanup(setary, joinlist); + return NULL; +} + +#if 0 +int _test_multi_barrier(struct avset_ary *setary, struct dlist_entry *joinlist, + int N, long *nsec_delay, int 
total_secs) +{ + struct timespec *nsec_times, nsec_start; + int i, ret; + + nsec_times = calloc(sizeof(struct timespec), N); + ret = coll_init_multi_join(setary, joinlist); + if (ret) { + TRACE("multicast_join init error = %d\n", ret); + goto quit; + } + ret = coll_wait_multi_join(joinlist); + if (ret) { + TRACE("multicast_join wait error = %d\n", ret); + goto quit; + } + + _nsecs_from_now(&nsec_start, 0L); + nsec_start.tv_sec += total_secs; + + for (i = 0; i < N; i++) + _nsecs_from_now(&nsec_times[i], nsec_delay[i]); + while (!_nsecs_expired(&nsec_start)) { + for (i = 0; i < N; i++) { + if (!_nsecs_expired(&nsec_times[i])) + continue; + for (j = 0; j < ) + } + + } +quit: + free(nsec_times); + coll_multi_releasejoinlist); + avset_ary_destroy(setary); + return ret; +} +#endif + +/** + * @brief Simple test of join, returns a count of errors. + * + * This creates a single avset_ary from the supplied addresses, with hwroot + * of zero, and performs a single join, tests errors, and cleans up. Used to + * probe the basic error conditions. + */ +int _test_join(fi_addr_t *fiaddrs, size_t size, int exp_ret, + int exp_prov_errno) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + struct join_item *jctx; + int ret, errcnt; + + errcnt = 0; + avset_ary_init(&setary); + ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); + errcnt += !!ret; + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + errcnt += !!ret; + + dlist_foreach_container(&joinlist, struct join_item, jctx, entry) { + if (jctx->retval != exp_ret || + jctx->prov_errno != exp_prov_errno) { + TRACE("exp_ret=%d retval=%d\n", + exp_ret, jctx->retval); + TRACE("exp_prov_errno=%d prov_errno=%d\n", + exp_prov_errno, jctx->prov_errno); + errcnt++; + } + } + + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + + return errcnt; +} + +/* Simple test of barrier, returns a count of errors. */ +int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + struct join_item *jctx; + uint64_t context; + int i, ret, total, errcnt; + + errcnt = 0; + total = 0; + avset_ary_init(&setary); + ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); + errcnt += !!ret; + if (ret) { + TRACE("BARRIER avset not created\n"); + goto quit; + } + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + errcnt += !!ret; + if (ret) { + TRACE("BARRIER JOIN not initiated\n"); + goto quit; + } + TRACE("BARRIER JOIN COMPLETE\n"); + + jctx = dlist_first_entry_or_null(&joinlist, struct join_item, entry); + TRACE("Barrier join complete, jctx = %p\n", jctx); + for (i = 0; i < count; i++) { + do { + usleep(rand() % 100); + ret = fi_barrier(cxit_ep, (fi_addr_t )jctx->mc, + &context); + TRACE("barrier = %d\n", ret); + } while (ret == -FI_EAGAIN); + if (ret == FI_SUCCESS) { + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("BARRIER COMPLETE #%d\n", i); + total++; + } else { + TRACE("BARRIER FAILED #%d, ret=%d\n", i, ret); + errcnt++; + } + } + +quit: + frmwk_log0("Barrier errcnt=%d total=%d\n", errcnt, total); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + return errcnt; +} + +/* Simple test of broadcast, returns a count of errors. 
*/ +int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int rootidx) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + struct join_item *jctx; + uint64_t data[4], rslt[4]; + uint64_t context; + int i, ret, errcnt; + + errcnt = 0; + jctx = coll_single_join(fiaddrs, size, 0, rootidx, 0, 0, + &setary, &joinlist, "BROADCAST"); + if (!jctx) { + TRACE("BROADCAST JOIN returned NULL\n"); + goto quit; + } + + data[0] = 0x12345678; + data[1] = 0x2468ace0; + data[2] = 0x13579bdf; + data[3] = 0x10101010; + memset(rslt, 0, sizeof(rslt)); + if (frmwk_rank == rootidx) + memcpy(rslt, data, sizeof(rslt)); + do { + _poll_cqs(); + ret = fi_broadcast(cxit_ep, rslt, 4, NULL, + (fi_addr_t )jctx->mc, fiaddrs[rootidx], + FI_UINT64, 0L, &context); + } while (ret == -FI_EAGAIN); + errcnt += !!ret; + if (ret == FI_SUCCESS) { + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("BROADCAST COMPLETE\n"); + if (memcmp(rslt, data, sizeof(rslt))) { + for (i = 0; i < 4; i++) + TRACE("[%d] %016lx exp %016lx\n", + i, rslt[i], data[i]); + errcnt++; + } + } else { + TRACE("ret = %d\n", ret); + TRACE("BROADCAST FAILED\n"); + errcnt++; + } + +quit: + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + return errcnt; +} + +/* simple test of allreduce, returns a count of errors. */ +int _test_allreduce(fi_addr_t *fiaddrs, size_t size) +{ + struct avset_ary setary; + struct dlist_entry joinlist; + struct join_item *jctx; + int64_t *data, *rslt, *comp; + uint64_t context; + int i, j, ret, errcnt; + + errcnt = 0; + avset_ary_init(&setary); + ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); + errcnt += !!ret; + if (ret) { + TRACE("ALLREDUCE avset not created\n"); + goto quit; + } + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + errcnt += !!ret; + if (ret) { + TRACE("ALLREDUCE JOIN not initiated\n"); + goto quit; + } + + jctx = dlist_first_entry_or_null(&joinlist, struct join_item, entry); + TRACE("jctx = %p\n", jctx); + TRACE("mc = %p\n", jctx->mc); + + data = calloc(frmwk_numranks*4, sizeof(int64_t)); + comp = calloc(4, sizeof(int64_t)); + rslt = calloc(4, sizeof(int64_t)); + for (i = 0; i < frmwk_numranks; i++) { + for (j = 0; j < 4; j++) { + data[4*i + j] = ((int64_t)(rand() - RAND_MAX/2) << 32); + data[4*i + j] |= rand(); + comp[j] += data[4*i + j]; + } + } + do { + _poll_cqs(); + ret = fi_allreduce(cxit_ep, &data[frmwk_rank*4], 4, NULL, + rslt, NULL, (fi_addr_t )jctx->mc, FI_INT64, + FI_SUM, 0L, &context); + } while (ret == -FI_EAGAIN); + errcnt += !!ret; + if (ret == FI_SUCCESS) { + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("ALLREDUCE COMPLETE\n"); + if (memcmp(rslt, comp, 4*sizeof(int64_t))) { + for (i = 0; i < 4; i++) + TRACE("[%d] %016lx exp %016lx\n", + i, rslt[i], comp[i]); + errcnt++; + } + } else { + TRACE("ret = %d\n", ret); + TRACE("ALLREDUCE FAILED\n"); + errcnt++; + } + free(rslt); + free(comp); + free(data); + +quit: + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + return errcnt; +} + +/** + * Main application. + * + * ./test_coll -h or srun -Nx ./test_coll -h displays syntax and a list of + * tests. + * + * Each test is bracketed by do {...} while(0) and will be evaluated against + * the test mask created by the -t argument. If the test isn't in the -t + * selection, then the test is silently skipped. Tests can be easily + * rearranged or modified by adding new do {...} while(0) test cases. Each + * should begin with PREAMBLE(), which manages the argument handling. 
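+ *
+ * A minimal skeleton for one test case (a sketch only; it mirrors the
+ * disabled template near the end of main()):
+ *
+ *   do {
+ *           PREAMBLE(0, tstnum, "title of test");
+ *           ret = 0;                     // body of the test goes here
+ *           errcnt += !!ret;
+ *           tstcnt += 1;
+ *           frmwk_log0("%4s\n", STDMSG(ret));
+ *           frmwk_barrier();
+ *   } while (0);
+ *   tstnum++;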
+ * + * Each test case should end with frmwk_barrier(), which uses the framework() + * sockets-based barrier to ensure separation of the test cases. + */ + +static uint64_t testmask = 0L; + +#define TAG(skip) (skip ? "SKIP " : "----") +#define TEST(n) (1 << n) +#define STDMSG(ret) ((ret > 0) ? "SKIP" : ((ret) ? "FAIL" : "good")) +#define PREAMBLE(skip, num, nam) \ + ret = 1; \ + testname = nam; \ + if (help) { \ + frmwk_log0("%2d: %s\n", num, testname); break; \ + }; \ + if (!(testmask & TEST(num))) break; \ + frmwk_log0("%4s %2d:%s\n", TAG(skip), num, testname); \ + TRACE("%4s %2d:%s\n", TAG(skip), num, testname); \ + if (skip) break; \ + ret = 0 + +int main(int argc, char **argv) +{ + bool trace_enabled = true; + fi_addr_t *fiaddrs = NULL; + fi_addr_t myaddr; + struct cxip_addr mycaddr; + size_t mycaddr_len; + size_t size = 0; + int errcnt = 0; + int tstcnt = 0; + int tstnum = 0; + int ret = 0; + int N = 0; + bool help = false; + struct join_item *jctx; + struct avset_ary setary; + struct dlist_entry joinlist; + + + const char *testname; + char opt; + int i, j; + + /* by default, perform all tests */ + testmask = -1L; + testname = NULL; + + TRACE("enter main\n"); + while ((opt = getopt(argc, argv, "hvVt:N:")) != -1) { + char *str, *s, *p; + + switch (opt) { + case 't': + /* perform only selected tests */ + str = optarg; + i = j = 0; + testmask = 0L; + while (*str) { + while (*str == ' ') + str++; + s = str; + while (*str && *str != ',') + str++; + if (*str) + *str++ = 0; + p = s; + while (*p && *p != '-') + p++; + i = atoi(s); + j = (*p) ? atoi(++p) : i; + if (j > 63) + j = 63; + while (i <= j) + testmask |= (1L << i++); + } + break; + case 'N': + N = atoi(optarg); + break; + case 'V': + trace_enabled = true; + break; + case 'v': + verbose = true; + break; + case 'h': + help = true; + break; + default: + help = true; + frmwk_log0("Syntax error\n"); + break; + } + } + + /* initialize framework, silently if running help */ + frmwk_init(help); + srand(frmwk_rank); + + /* Collect env variable information from WLM */ + do { + if (help) { + frmwk_log0( + "Usage: test_coll [-hvV] -Ncount[-t testno[-testno][,...]]\n"); + frmwk_log0("\nTests:\n"); + break; + } + + /* Test requires a minimum of four nodes */ + if (frmwk_check_env(4)) + return -1; + + /* Initialize libfabric on this node */ + ret = frmwk_init_libfabric(); + errcnt += !!ret; + if (frmwk_errmsg(ret, "frmwk_init_libfabric()\n")) + goto done; + + cxip_trace_enable(trace_enabled); + TRACE("==== tracing enabled offset %d\n", frmwk_rank); + + /* always start with FI_UNIVERSE */ + ret = frmwk_populate_av(&fiaddrs, &size); + errcnt += !!ret; + if (frmwk_errmsg(ret, "frmwk_populate_av()\n")) + goto done; + + myaddr = fiaddrs[frmwk_rank]; + ret = fi_av_lookup(cxit_av, myaddr, &mycaddr, &mycaddr_len); + errcnt += !!ret; + if (frmwk_errmsg(ret, "fi_av_lookup(%d)\n", frmwk_rank)) + goto done; + + TRACE("numranks=%2d rank=%2d fiaddr=%ld caddr=%05x\n", + frmwk_numranks, frmwk_rank, myaddr, mycaddr.nic); + } while (0); + if (errcnt) + goto done; + + /* TEST CASES*/ + + /* Sanity test of framework. + */ + do { + PREAMBLE(0, tstnum, "test framework"); + ret = 0; + tstcnt += 1; + errcnt += !!ret; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Sanity test of the avset_ary_append() utility function. 
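+	 * Appends a single av_set (multicast address 0, hwroot index 0) built
+	 * from all fiaddrs, then destroys the list; any nonzero return counts
+	 * as an error.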
+ */ + do { + PREAMBLE(0, tstnum, "create av_set list 1"); + // Test multijoins over one array list + TRACE("======= %s\n", testname); + avset_ary_init(&setary); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + errcnt += !!ret; + + avset_ary_destroy(&setary); + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Exercise the avset_ary_append() utility function. + */ + do { + PREAMBLE(0, tstnum, "create av_set list 10"); + // Exercise creating av_set lists + avset_ary_init(&setary); + dlist_init(&joinlist); + + ret = 0; + for (i = 0; !ret && i < 10; i++) + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("ret=%d cnt=%d siz=%d\n", ret, + setary.avset_cnt, setary.avset_siz); + errcnt += !!ret; + errcnt += !!(setary.avset_cnt != 10); + errcnt += !!(setary.avset_siz < 10); + + avset_ary_destroy(&setary); + errcnt += !!(setary.avset_cnt != 0); + errcnt += !!(setary.avset_siz != 0); + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Sanity test for _test_join() utility function. + */ + do { + PREAMBLE(0, tstnum, "test join (simple)"); + // Test single join over one array list + TRACE("======= %s\n", testname); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "simple"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Test join operation with a 1-second delay on the root. + */ + do { + PREAMBLE(0, tstnum, "test join (slow root)"); + // cause slow root rank + if (frmwk_rank == 0) + usleep(1000000); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "slow root"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + /* Test join operation with a 1-second delay on a leaf. 
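+	 * The highest-numbered rank sleeps for one second before initiating
+	 * the join, so the collective join must tolerate a late leaf.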
+ */ + do { + PREAMBLE(0, tstnum, "test join (slow leaf)"); + // cause slow leaf rank + if (frmwk_rank == (frmwk_numranks - 1)) + usleep(1000000); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "slow leaf"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "force -FI_EAGAIN on root getgroup"); + // cause zbcoll root (rank 0) to reject getgroup requests once + cxip_trap_set(0, CXIP_TRAP_GETGRP, -FI_EAGAIN); + // cause non-root ranks attempt zbcoll getgroup first + if (frmwk_rank == 0) + usleep(10000); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "FI_EAGAIN root"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "force -FI_EAGAIN on root broadcast"); + // cause zbcoll root (rank 0) to reject broadcast requests once + cxip_trap_set(0, CXIP_TRAP_BCAST, -FI_EAGAIN); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, + "FI_EAGAIN root bcast"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "force -FI_EAGAIN on root reduce"); + // cause zbcoll root (rank 0) to reject join reduce once + cxip_trap_set(0, CXIP_TRAP_REDUCE, -FI_EAGAIN); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, + "FI_EAGAIN root reduce"); + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "force -FI_EFAULT on PTE alloc"); + // cause zbcoll root (rank 0) to simulate PTE alloc failure + cxip_trap_set(0, CXIP_TRAP_INITPTE, -FI_EFAULT); + ret = _test_join(fiaddrs, size, -FI_EAVAIL, + CXIP_PROV_ERRNO_PTE); + tstcnt += 1; + errcnt += !!ret; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "perform barrier"); + ret = _test_barrier(fiaddrs, size, 1); + errcnt += !!ret; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "perform broadcast"); + for (i = 0; i < frmwk_numranks; i++) { + ret = _test_broadcast(fiaddrs, size, i); + errcnt += !!ret; + } + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "perform allreduce sum"); + ret = _test_allreduce(fiaddrs, size); + TRACE("allreduce ret = %d\n", ret); + errcnt += !!ret; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "perform barrier x N"); + ret = _test_barrier(fiaddrs, size, N); + errcnt += !!ret; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "test mcast dup"); + avset_ary_init(&setary); + TRACE("avset initialized\n"); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("avset append 1 = %d\n", ret); + ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); + TRACE("avset append 2 = %d\n", ret); + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("join = %d\n", ret); + + jctx = coll_join_item(&joinlist, 0); + 
TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval || jctx->prov_errno) { + TRACE("unexpected result on coll 0\n"); + errcnt++; + } + jctx = coll_join_item(&joinlist, 1); + TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval != -FI_EAVAIL || + jctx->prov_errno != CXIP_PROV_ERRNO_MCAST_INUSE) { + TRACE("unexpected result on coll 1\n"); + errcnt++; + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "test hwroot dup"); + avset_ary_init(&setary); + TRACE("avset initialized\n"); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("avset append 1 = %d\n", ret); + ret = avset_ary_append(fiaddrs, size, 1, 0, &setary); + TRACE("avset append 2 = %d\n", ret); + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("join = %d\n", ret); + + jctx = coll_join_item(&joinlist, 0); + TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval || jctx->prov_errno) { + TRACE("unexpected result on coll 0\n"); + errcnt++; + } + jctx = coll_join_item(&joinlist, 1); + TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval != -FI_EAVAIL || + jctx->prov_errno != CXIP_PROV_ERRNO_HWROOT_INUSE) { + TRACE("unexpected result on coll 1\n"); + errcnt++; + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "test hwroot and mcast dup"); + avset_ary_init(&setary); + TRACE("avset initialized\n"); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("avset append 1 = %d\n", ret); + ret = avset_ary_append(fiaddrs, size, 0, 0, &setary); + TRACE("avset append 2 = %d\n", ret); + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("join = %d\n", ret); + + jctx = coll_join_item(&joinlist, 0); + TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval || jctx->prov_errno) { + TRACE("unexpected result on coll 0\n"); + errcnt++; + } + jctx = coll_join_item(&joinlist, 1); + TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", + jctx->mc, jctx->retval, jctx->prov_errno); + if (jctx->retval != -FI_EAVAIL || + jctx->prov_errno != CXIP_PROV_ERRNO_HWROOT_INUSE) { + TRACE("unexpected result on coll 1\n"); + errcnt++; + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + + do { + PREAMBLE(0, tstnum, "test multiple join"); + avset_ary_init(&setary); + TRACE("avset initialized\n"); + + for (i = 0; i < N; i++) { + ret = avset_ary_append(fiaddrs, size, i, i, &setary); + TRACE("avset append %d = %d\n", i, ret); + } + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("multijoin = %d\n", ret); + + for (i = 0; i < N; i++) { + int exp_ret = (i < size) ? 0 : -FI_EAVAIL; + int exp_errno = (i < size) ? 0 : CXIP_PROV_ERRNO_HWROOT_INUSE; + int good; + + jctx = coll_join_item(&joinlist, i); + if (!jctx) { + TRACE("no join item\n"); + continue; + } + good = (jctx->retval == exp_ret && + jctx->prov_errno == exp_errno); + errcnt += !good; + TRACE("item %d mc=%p retval=%d prov_errno=%d %s\n", + i, jctx->mc, jctx->retval, jctx->prov_errno, + good ? 
"ok" : "bad"); + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + + + do { + PREAMBLE(0, tstnum, "test multiple broadcast"); + + uint64_t **datary, *ctxary, *ctxptr; + int in_progress, tree, root, i, j; + + /* set up maximum number of trees possible */ + avset_ary_init(&setary); + for (tree = 0; tree < size; tree++) { + ret = avset_ary_append(fiaddrs, size, tree, tree, &setary); + TRACE("avset append group %d = %d\n", tree, ret); + } + TRACE("avset initialized\n"); + + dlist_init(&joinlist); + ret = coll_multi_join(&setary, &joinlist); + TRACE("multijoin = %d\n", ret); + + /* context and data for each collective tree */ + ctxary = calloc(size, sizeof(uint64_t)); + datary = calloc(size, sizeof(void *)); + for (tree = 0; tree < size; tree++) { + datary[tree] = calloc(4, sizeof(uint64_t)); + ctxary[tree] = tree; + } + + /* repeat the collective N times as requested*/ + for (i = 0; i < N; i++) { + in_progress = 0; + + /* rotate root every time */ + root = i%size; + + /* start a broadcast on every tree */ + for (tree = 0; tree < size; tree++) { + uint64_t id = (uint64_t)tree << 32; + + /* prepare the data */ + memset(datary[tree], 0, 4*sizeof(uint64_t)); + if (frmwk_rank == root) { + for (j = 0; j < 4; j++) + datary[tree][j] = id|root; + } + TRACE("strt=%d tree=%d\n", i, tree); + for (j = 0; j < 4; j++) + TRACE(" %016lx\n", datary[tree][j]); + + } + for (tree = 0; tree < size; tree++) { + int tree2 = (tree + frmwk_rank)%size; + + usleep(rand() % 100); + jctx = coll_join_item(&joinlist, tree2); + ret = fi_broadcast(cxit_ep, datary[tree2], 4, NULL, + (fi_addr_t )jctx->mc, + fiaddrs[root], FI_UINT64, + 0L, &ctxary[tree2]); + in_progress++; + TRACE("in_progress=%d\n", in_progress); + if ((ctxptr = _poll_cqs())) { + in_progress--; + TRACE("ctxptr=%ld in_progress=%d\n", + *ctxptr, in_progress); + } + } + while (in_progress > 0) { + if ((ctxptr = _poll_cqs())) { + in_progress--; + TRACE("ctxptr=%ld in_progress=%d\n", + *ctxptr, in_progress); + } + } + for (tree = 0; tree < size; tree++) { + TRACE("rslt=%d tree=%d\n", i, tree); + for (j = 0; j < 4; j++) + TRACE(" %016lx\n", datary[tree][j]); + + } + } + tstcnt += 1; + + frmwk_log0("%4s\n", STDMSG(ret)); + coll_multi_release(&joinlist); + avset_ary_destroy(&setary); + } while (0); + tstnum++; + +#if 0 + do { + PREAMBLE(0, tstnum, "title of test"); + ret = 0; // some test + errcnt += !!ret; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; +#endif + + if (help) + return (errcnt); + +done: + frmwk_log0("%2d tests run, %d failures\n", tstcnt, errcnt); + frmwk_log0(!!errcnt ? "ERRORS SEEN\n" : "SUCCESS\n"); + free(fiaddrs); + frmwk_free_libfabric(); + frmwk_term(); + return !!errcnt; +} diff --git a/prov/cxi/test/multinode/test_frmwk.c b/prov/cxi/test/multinode/test_frmwk.c new file mode 100644 index 00000000000..6f8fdd7a850 --- /dev/null +++ b/prov/cxi/test/multinode/test_frmwk.c @@ -0,0 +1,80 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022-2023 Hewlett Packard Enterprise Development LP + */ + +/* + * Validation test for the pmi_frmwk implementation. + * + * Launch using: srun -N4 ./test_frmwk + * + * This can be used as a prototype for test applications. + * + * This activates libfabric, populates the AV, and then frees the libfabric + * instance. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "multinode_frmwk.h" + +#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) + +int main(int argc, char **argv) +{ + fi_addr_t *fiaddr = NULL; + size_t size = 0; + int i, j, ret; + + frmwk_init(false); + printf("[%d|%d] initialized\n", frmwk_rank, frmwk_numranks); + + ret = frmwk_gather_nics(); + for (i = 0; i < frmwk_numranks; i++) { + printf("[%d|%d] rank %d HSNS [", frmwk_rank, frmwk_numranks, i); + for (j = 0; j < frmwk_nics_per_rank; j++) + printf(" %05x", frmwk_nic_addr(i, j)); + printf("]\n"); + } + + frmwk_barrier(); + + ret = frmwk_init_libfabric(); + if (frmwk_errmsg(ret, "frmwk_init_libfabric()\n")) + return ret; + + ret = frmwk_populate_av(&fiaddr, &size); + if (frmwk_errmsg(ret, "frmwk_populate_av()\n")) + return ret; + + printf("[%d|%d] fiaddrs\n", frmwk_rank, frmwk_numranks); + for (i = 0; i < size; i++) { + printf("[%d|%d] %ld\n", frmwk_rank, frmwk_numranks, + fiaddr[i]); + } + + cxip_trace_enable(true); + TRACE("Trace message test %d\n", 0); + TRACE("Trace message test %d\n", 1); + cxip_trace_enable(false); + TRACE("This message should not appear\n"); + cxip_trace_enable(true); + TRACE("This message should appear\n"); + + frmwk_free_libfabric(); + free(fiaddr); + + frmwk_term(); + return ret; +} diff --git a/prov/cxi/test/multinode/test_zbcoll.c b/prov/cxi/test/multinode/test_zbcoll.c new file mode 100644 index 00000000000..d19ff3aabd8 --- /dev/null +++ b/prov/cxi/test/multinode/test_zbcoll.c @@ -0,0 +1,1414 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP + */ + +/* + * Validation test for the multinode zbcoll implementation. + * + * Launch using: srun -N4 ./test_zbcoll [args] + */ + +/** + * Test the zbcoll functions in a real environment. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "multinode_frmwk.h" + +#define TRACE(fmt, ...) 
CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) + +/* convert delays to nsecs */ +#define nUSEC(n) (n * 1000L) +#define nMSEC(n) (n * 1000000L) +#define nSEC(n) (n * 1000000000L) + +int verbose = false; + +/* initialize nsecs timer structure */ +static inline void _init_nsecs(struct timespec *tsp) +{ + clock_gettime(CLOCK_MONOTONIC, tsp); +} + +/* return elapsed nsecs since initialized tsp */ +static inline long _measure_nsecs(struct timespec *tsp) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + ts.tv_nsec -= tsp->tv_nsec; + ts.tv_sec -= tsp->tv_sec; + if (ts.tv_nsec < 0) { + ts.tv_nsec += 1000000000L; + ts.tv_sec -= 1; + } + return 1000000000L*ts.tv_sec + ts.tv_nsec; +} + +/* introduce random jitter delay into operations per rank */ +void _jitter(int usec) +{ + static unsigned int seed = 0; + if (!seed) + seed = rand() + frmwk_rank + 1; + if (usec) { + usec = rand_r(&seed) % usec; + TRACE("_jitter delay = %d usec\n", usec); + usleep(usec); + } +} + +/* utility to poll and capture trailing errors/completions */ +static void _idle_wait(struct cxip_ep_obj *ep_obj, int msec) +{ + uint32_t dsc0, err0, ack0, rcv0; + uint32_t dsc, err, ack, rcv; + struct timespec ts; + long nsecs = 0L; + + cxip_zbcoll_get_counters(ep_obj, &dsc0, &err0, &ack0, &rcv0); + _init_nsecs(&ts); + do { + cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + nsecs = _measure_nsecs(&ts); + if (dsc==dsc0 && err==err0 && ack==ack0 && rcv==rcv0) + continue; + TRACE("ns=%ld dsc=%d err=%d ack=%d rcv=%d\n", + nsecs, dsc, err, ack, rcv); + cxip_trace_flush(); + dsc0 = dsc; + err0 = err; + ack0 = ack; + rcv0 = rcv; + } while (msec < 0 || nsecs < nMSEC(msec)); +} + +/* utility to do a primitive wait for send completion based on counters */ +static int _send_wait(struct cxip_zbcoll_obj *zb, int sndcnt, int rcvcnt) +{ + struct cxip_ep_obj *ep_obj = zb->ep_obj; + uint32_t dsc, err, ack, rcv; + struct timespec ts; + long nsecs = 0L; + + _init_nsecs(&ts); + do { + cxip_ep_zbcoll_progress(ep_obj); + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + if (err || dsc) + break; + if (ack >= sndcnt && rcv >= rcvcnt) + break; + nsecs = _measure_nsecs(&ts); + } while (nsecs < nMSEC(100)); + TRACE("ns=%ld dsc=%d err=%d ack=%d rcv=%d rc=%d\n", + nsecs, dsc, err, ack, rcv, zb->error); + if (nsecs >= nMSEC(100)) { + TRACE("TIMEOUT\n"); + return 1; + } + if (err || dsc || ack < sndcnt || rcv < rcvcnt) { + TRACE("TRANSPORT FAILURE\n"); + return 1; + } + if (zb->error) { + TRACE("STATE FAILURE\n"); + return 1; + } + cxip_trace_flush(); + return 0; +} + +/* send a single packet from node to node, and wait for completion */ +static struct cxip_addr bad_cxip_addr; +static int bad_cxip_index = -1; + +int _test_send_to_dest(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + int src, int dst, uint64_t payload) +{ + struct cxip_zbcoll_obj *zb; + int grp_rank; + int sndcnt, rcvcnt; + int i, ret; + + ret = cxip_zbcoll_alloc(ep_obj, size, fiaddrs, ZB_NOSIM, &zb); + if (frmwk_errmsg(ret, "%s: cxip_zbcoll_alloc()\n", __func__)) + return ret; + + /* replace an address with a different address */ + if (bad_cxip_index >= 0 && bad_cxip_index < size) { + TRACE("bad id being used\n"); + zb->caddrs[bad_cxip_index] = bad_cxip_addr; + } + + grp_rank = zb->state[0].grp_rank; + + ep_obj->zbcoll.disable = true; + zb->grpid = 0; + cxip_zbcoll_reset_counters(ep_obj); + if (src < 0 && dst < 0) { + /* every source to every destination */ + sndcnt = size; + rcvcnt = size; + for (i = 0; i < 
size; i++) + cxip_zbcoll_send(zb, grp_rank, i, payload); + } else if (src < 0) { + /* every source sends to one destination */ + sndcnt = 1; + rcvcnt = (dst == grp_rank) ? size : 0; + cxip_zbcoll_send(zb, grp_rank, dst, payload); + } else if (dst < 0 && src == grp_rank) { + /* this source sends to every destination */ + sndcnt = size; + rcvcnt = 1; + for (i = 0; i < size; i++) + cxip_zbcoll_send(zb, grp_rank, i, payload); + } else if (dst < 0) { + /* some other src to every destination */ + sndcnt = 0; + rcvcnt = 1; + } else if (grp_rank == src) { + /* this source to a destination */ + sndcnt = 1; + rcvcnt = (grp_rank == dst) ? 1 : 0; + cxip_zbcoll_send(zb, grp_rank, dst, payload); + } else if (grp_rank == dst) { + /* some other source to this destination */ + sndcnt = 0; + rcvcnt = 1; + } else { + /* not participating */ + sndcnt = 0; + rcvcnt = 0; + } + ret = _send_wait(zb, sndcnt, rcvcnt); + ep_obj->zbcoll.disable = false; + cxip_zbcoll_free(zb); + + return ret; +} + +/* normal utility to wait for collective completion, returns coll error */ +static int _coll_wait(struct cxip_zbcoll_obj *zb, long nsec_wait) +{ + uint32_t dsc, err, ack, rcv; + struct timespec ts; + long nsecs = 0L; + + if (!zb) { + TRACE("%s: NULL zb passed\n", __func__); + return -FI_EINVAL; + } + _init_nsecs(&ts); + do { + cxip_ep_zbcoll_progress(zb->ep_obj); + cxip_zbcoll_get_counters(zb->ep_obj, &dsc, &err, &ack, &rcv); + /* this waits for a software completion */ + if (zb->error || !zb->busy) + break; + nsecs = _measure_nsecs(&ts); + } while (nsecs < nsec_wait); + TRACE("ns=%ld dsc=%d err=%d ack=%d rcv=%d\n", + nsecs, dsc, err, ack, rcv); + if (nsecs >= nsec_wait) { + TRACE("TIMEOUT\n"); + return -FI_ETIMEDOUT; + } + /* return the software error code -- may be -FI_EAGAIN */ + TRACE("return code = %d\n", zb->error); + return zb->error; +} + +/** + * @brief Internal workhorse to create zb object and get group id. + * + * If the endpoint is not in the group, this will return FI_SUCCESS, delete the + * zb object (if any), and do nothing. + * + * This creates a zb object as necessary. + * + * This destroys the zb object on any error. + * + * This call blocks for up to 100 msec waiting for completion. 
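+ *
+ * Typical usage, as a sketch (fiaddrs and size come from
+ * frmwk_populate_av(), and error handling is abbreviated):
+ *
+ *   struct cxip_zbcoll_obj *zb = NULL;
+ *
+ *   ret = _getgroup(ep_obj, size, fiaddrs, &zb);
+ *   if (!ret && zb)
+ *           ... use zb->grpid ...
+ *   cxip_zbcoll_free(zb);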
+ * + * @param ep_obj : endpoint + * @param size : number of NIDs in group + * @param fiaddrs: fiaddrs in group + * @param zbp : return pointer to zb object (may be non-NULL) + * @return int : libfabric error code + */ +int _getgroup(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + struct cxip_zbcoll_obj **zbp) +{ + int ret; + + /* need a zbcoll object for this */ + if (!zbp) { + TRACE("%s: NULL zbp passed\n", __func__); + return -FI_EINVAL; + } + if (!*zbp) { + ret = cxip_zbcoll_alloc(ep_obj, size, fiaddrs, ZB_NOSIM, zbp); + if (ret == -FI_ECONNREFUSED) { + TRACE("=== COMPLETED SKIP\n"); + return FI_SUCCESS; + } + if (frmwk_errmsg(ret, "%s: cxip_zbcoll_alloc()\n", __func__)) + goto out; + } + + /* getgroup collective */ + do { + TRACE("microsleep\n"); + usleep(10); + ret = cxip_zbcoll_getgroup(*zbp); + if (ret == -FI_EAGAIN) + continue; + if (frmwk_errmsg(ret, "%s: cxip_zbcoll_getgroup()\n", __func__)) + break; + /* Returns a collective completion error */ + ret = _coll_wait(*zbp, nMSEC(100)); + if (ret == -FI_EAGAIN) + continue; + break; + } while (true); + + /* clean up after error */ + if (ret) + goto out; + + TRACE("=== COMPLETED GETGROUP grpid=%d ret=%s\n", (*zbp)->grpid, + fi_strerror(-ret)); + return FI_SUCCESS; + +out: + TRACE("%s: failed\n", __func__); + cxip_zbcoll_free(*zbp); + *zbp = NULL; + return ret; +} + +/* detect overt getgroup errors */ +int _check_getgroup_errs(struct cxip_zbcoll_obj *zb, int exp_grpid) +{ + return (frmwk_errmsg(!zb, "zb == NULL") || + frmwk_errmsg(zb->error, "zb->error == %d\n", zb->error) || + frmwk_errmsg(zb->grpid != exp_grpid, "zb->grpid=%d exp=%d\n", + zb->grpid, exp_grpid)); +} + +/* rotate array[size] by rot positions */ +void _rotate_array32(uint32_t *array, size_t size, int rot) +{ + uint32_t *copy; + uint32_t i, j; + + copy = calloc(size, sizeof(uint32_t)); + memcpy(copy, array, size*sizeof(uint32_t)); + for (i = 0; i < size; i++) { + j = (i + rot) % size; + array[i] = copy[j]; + } + free(copy); +} + +/* shuffle array[size] randomly */ +void _shuffle_array32(uint32_t *array, size_t size) +{ + uint32_t i, j, t; + + for (i = 0; i < size-1; i++) { + j = i + (rand() / ((RAND_MAX / (size - i)) + 1)); + t = array[j]; + array[j] = array[i]; + array[i] = t; + } +} + +/** + * @brief Perform multiple concurrent getgroup operations. + * + * Parametrized test to thoroughly exercise getgroup edge conditions. + * + * This sets up to acquire 'nruns' group IDs. + * + * On each run it will only use 'naddrs' of the 'size' endpoints. If the default + * value of -1 is used, each run will use a random number between 1 and 'size'. + * + * Prior to each run, the list of addresses is rotated. If 'rot' is -1, the list + * is randomly shuffled. The purpose of rotation is to guarantee disjoint sets + * of NIDs can be created. For instance, if you have 16 addresses (size=16), and + * you set nruns=naddrs=rot=4, then all of the groups will be disjoint. + * + * This imposes a random jitter of up to 'usec' microseconds on each node, to + * break up synchronous behavior among the nodes, and exaggerate race + * conditions. + * + * This presumes a shared file system across all of the nodes under srun, and + * writes results to files named using the rank number, overwriting old files + * from prior runs. The rank 0 node will complete the test by reading back all + * of the files and processing them to ensure correct behavior. 
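+ *
+ * Each rank writes its results to a file named "grpid<rank>" in the
+ * working directory. The format (matching the fprintf()/fscanf() calls
+ * below) is one line of counters followed by one line per run:
+ *
+ *   dsc err ack rcv
+ *   grpid idx idx ... idx     (-1 marks unused address slots)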
+ * + * @param ep_obj : endpoint object + * @param size : total number of NID addresses + * @param fiaddrs: all NID addresses + * @param nruns : nruns of concurrency + * @param naddrs : number of NIDs to use (-1 implies random) + * @param rot : nid rotations per run (-1 implies shuffle) + * @param usec : usec jitter to impose randomly + * @return int : 0 on success, or error code + */ +int _multigroup(struct cxip_ep_obj *ep_obj, size_t size, fi_addr_t *fiaddrs, + int nruns, int naddrs, int rot, int usec) +{ + char fnam[256]; + FILE *fd; + struct cxip_zbcoll_obj **zb; + fi_addr_t *addrs; + uint32_t *index; + uint32_t **rows; + uint32_t *length; + int *grps; + bool shuffle = false; + uint32_t dsc, err, ack, rcv; + int i, j, ret; + + cxip_zbcoll_reset_counters(ep_obj); + + ret = 0; + if (nruns < 0) + nruns = size; + if (nruns > cxip_zbcoll_max_grps(false)) + nruns = cxip_zbcoll_max_grps(false); + if (naddrs > size) + naddrs = size; + + addrs = calloc(size, sizeof(fi_addr_t));// indices converted to addrs + index = calloc(size, sizeof(uint32_t)); // nid indices (easier to read) + for (j = 0; j < size; j++) + index[j] = j; + + /* rows : getgroup requests, list of nids involved + * length : number of addrs in each getgroup request, is <= size + * grps : resulting group ID for each getgroup request + * zb : zb_coll object for each getgroup request + */ + rows = calloc(nruns, sizeof(void *)); + length = calloc(nruns, sizeof(uint32_t)); + grps = calloc(nruns, sizeof(int)); + zb = calloc(nruns, sizeof(void *)); + for (i = 0; i < nruns; i++) { + /* -1 means random sizes */ + if (naddrs < 0) { + length[i] = 1 + (rand() % (size - 1)); + } else { + length[i] = naddrs; + } + /* -1 means shuffle targets */ + if (rot < 0) { + rot = 1; + shuffle = true; + } + /* copy shuffled indices into row */ + rows[i] = calloc(length[i], sizeof(uint32_t)); + _rotate_array32(index, size, rot); + if (shuffle) + _shuffle_array32(index, size); + memcpy(rows[i], index, length[i]*sizeof(uint32_t)); + } + + /* create zb with grpid, in same group order across nodes */ + for (i = 0; i < nruns; i++) { + for (j = 0; j < length[i]; j++) + addrs[j] = fiaddrs[rows[i][j]]; + _jitter(usec); + ret = _getgroup(ep_obj, length[i], addrs, &zb[i]); + if (frmwk_errmsg(ret, "FAILURE getgroup %d\n", i)) { + TRACE("FAILURE getgroup %d\n", i); + goto done; + } + grps[i] = (zb[i]) ? zb[i]->grpid : -1; + } + + /* need to compare each node result with other, write to file */ + sprintf(fnam, "grpid%d", frmwk_rank); + fd = fopen(fnam, "w"); + + cxip_zbcoll_get_counters(ep_obj, &dsc, &err, &ack, &rcv); + fprintf(fd, "%d %d %d %d\n", dsc, err, ack, rcv); + for (i = 0; i < nruns; i++) { + fprintf(fd, " %2d", grps[i]); + for (j = 0; j < size; j++) + fprintf(fd, " %2d", (j < length[i]) ? 
rows[i][j] : -1); + fprintf(fd, "\n"); + } + fclose(fd); + + + /* clean up */ +done: + for (i = 0; i < nruns; i++) { + cxip_zbcoll_free(zb[i]); + free(rows[i]); + } + free(grps); + free(length); + free(rows); + free(index); + free(addrs); + return ret; +} + +/* display the accumulated data for the full test run */ +void _printrun(size_t size, int irun, int ***data) +{ + int irank, inid; + + printf("Test run #%d\n", irun); + for (irank = 0; irank < frmwk_numranks; irank++) { + printf("rank %2d: ", irank); + if (data[irank][irun][0] < 0) { + printf("SKIP\n"); + continue; + } + printf("GRP %2d:", data[irank][irun][0]); + for (inid = 1; inid < size+1; inid++) + printf(" %2d", data[irank][irun][inid]); + printf("\n"); + } +} + +/** + * @brief Check _multigroup results across all nodes. + * + * This is run only on the rank 0 process, and verifies the prior test run. + * + * @param size : total number of NID addresses + * @param nruns : nruns of concurrency in test + * @return int : 0 on success, non-zero on failure + */ +int _multicheck(size_t size, int nruns) +{ + char fnam[256]; + FILE *fd; + uint32_t *dsc, *err, *ack, *rcv; + int ***data; + uint64_t bitv, *mask; + int grp, nid; + int irank, irank2, irun, inid, ret; + + ret = 0; + /* data[irank][irun][inid], inid==0 is grpid */ + data = calloc(frmwk_numranks, sizeof(void *)); + for (irank = 0; irank < frmwk_numranks; irank++) { + data[irank] = calloc(nruns, sizeof(void *)); + for (irun = 0; irun < nruns; irun++) { + data[irank][irun] = calloc(size + 1, sizeof(int)); + } + } + /* one bit for each nid, max is 64 */ + mask = calloc(size, sizeof(uint64_t)); + dsc = calloc(frmwk_numranks, sizeof(uint32_t)); + err = calloc(frmwk_numranks, sizeof(uint32_t)); + ack = calloc(frmwk_numranks, sizeof(uint32_t)); + rcv = calloc(frmwk_numranks, sizeof(uint32_t)); + + /* read in the per-rank file data from the last test run */ + for (irank = 0; irank < frmwk_numranks; irank++) { + /* read file contents into data array */ + sprintf(fnam, "grpid%d", irank); + fd = fopen(fnam, "r"); + if (! 
fd) { + printf("Could not open %s\n", fnam); + ret = 1; + goto cleanup; + } + if (fscanf(fd, " %d %d %d %d", + &dsc[irank], + &err[irank], + &ack[irank], + &rcv[irank]) < 4) { + printf("bad read (errs)\n"); + ret = 1; + goto cleanup; + } + for (irun = 0; irun < nruns; irun++) { + for (inid = 0; inid < size + 1; inid++) { + int *ptr = &data[irank][irun][inid]; + if (fscanf(fd, " %d", ptr) < 1) { + printf("bad read[%d,%d]\n", irun, inid); + ret = 1; + goto cleanup; + } + } + } + fclose(fd); + } + + /* All ranks in any test run must use the same grpid, ranks */ + for (irun = 0; irun < nruns; irun++) { + irank2 = -1; + for (irank = 1; irank < frmwk_numranks; irank++) { + /* grpid < 0: rank not involved */ + if (data[irank][irun][0] < 0) + continue; + /* remember first involved rank */ + if (irank2 < 0) + irank2 = irank; + /* compare entire row with first involved */ + for (inid = 0; inid < size+1; inid++) + if (data[irank][irun][inid] != + data[irank2][irun][inid]) + break; + /* miscompare is a failure */ + if (inid < size+1) { + printf("ERROR in run #%d @ %d\n", irun, inid); + printf("reductions do not match\n"); + _printrun(size, irun, data); + ret = 1; + goto cleanup; + } + } + } + /* validated that all ranks in each run are identical */ + + /* No nid should reuse the same grpid, only check rank 0 */ + irank = 0; + for (irun = 0; irun < nruns; irun++) { + /* grpid < 0: rank not involved */ + if (data[irank][irun][0] < 0) + continue; + grp = data[irank][irun][0]; + for (inid = 1; inid < size+1; inid++) { + /* ignore unused fiaddrs */ + if (data[irank][irun][inid] < 0) + continue; + nid = data[irank][irun][inid]; + bitv = 1L << grp; + /* failure if grpid already used */ + if (mask[nid] & bitv) { + printf("ERROR in run #%d @ %d\n", + irun, inid); + printf("reuse of grpid %d by %d\n", + grp, nid); + _printrun(size, irun, data); + goto cleanup; + } + mask[nid] |= bitv; + } + } + + /* We don't expect discard or ack errors */ + for (irank = 0; irank < frmwk_numranks; irank++) + if (dsc[irank] || err[irank]) + break; + if (irank < frmwk_numranks) { + printf("ERROR transmission errors\n"); + for (irank = 0; irank < frmwk_numranks; irank++) { + printf("rank %2d: dsc=%d err=%d ack=%d rcv=%d\n", + irank, dsc[irank], err[irank], + ack[irank], rcv[irank]); + } + goto cleanup; + } + +cleanup: + if (verbose) { + printf("==================\n"); + printf("Dump all test runs\n"); + for (irun = 0; irun < nruns; irun++) + _printrun(size, irun, data); + printf("getgroup test %s\n", !ret ? 
"passed" : "FAILED"); + } + fflush(stdout); + + free(dsc); + free(err); + free(ack); + free(rcv); + free(mask); + for (irank = 0; irank < frmwk_numranks; irank++) { + for (irun = 0; irun < nruns; irun++) + free(data[irank][irun]); + free(data[irank]); + } + free(data); + return ret; +} + +/* use up all group IDs, then free zb objects and add more */ +int _exhaustgroup(struct cxip_ep_obj *ep_obj, size_t size, fi_addr_t *fiaddrs, + int nruns, int usec) +{ + struct cxip_zbcoll_obj **zb; + int maxgrps; + int i, n, ret = 0; + + maxgrps = cxip_zbcoll_max_grps(false); + if (nruns < 0) + nruns = maxgrps + 10; + zb = calloc(nruns, sizeof(void *)); + n = 1; + for (i = 0; i < nruns; i++) { + _jitter(usec); + ret = _getgroup(ep_obj, size, fiaddrs, &zb[i]); + if (ret == -FI_EBUSY) { + /* free an old zb, and try again */ + cxip_zbcoll_free(zb[n]); + zb[n] = NULL; + ret = _getgroup(ep_obj, size, fiaddrs, &zb[i]); + if (frmwk_errmsg(ret, "FAILURE\n")) { + TRACE("FAILURE\n"); + break; + } + if (zb[i]->grpid != n) { + TRACE("FAILURE\n"); + break; + } + n = (n + 3) % maxgrps; + } + } + for (i = 0; i < nruns; i++) + cxip_zbcoll_free(zb[i]); + + return 0; +} + +/* Wait for completion, log errors, free zb object */ +int _test_wait_free(struct cxip_zbcoll_obj *zb, + uint64_t *result, uint64_t expect) +{ + int ret; + + /* wait for completion */ + ret = _coll_wait(zb, nMSEC(100)); + if (frmwk_errmsg(ret, "reduce wait failed\n")) + goto done; + + if (!result) + goto done; + + TRACE("expect=%08lx result=%08lx, ret=%s\n", + expect, *result, fi_strerror(-ret)); + if (*result != expect) { + ret = 1; + frmwk_errmsg(ret, "expect=%08lx result=%08lx\n", + expect, *result); + } +done: + cxip_zbcoll_free(zb); + return ret; +} + +/* barrier across all NIDs, return zb object */ +int _test_barr(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + struct cxip_zbcoll_obj **zbp) +{ + struct cxip_zbcoll_obj *zb = NULL; + int ret; + + /* need a zbcoll context for this */ + ret = _getgroup(ep_obj, size, fiaddrs, &zb); + if (ret) + goto out; + + /* reset counters */ + cxip_zbcoll_reset_counters(ep_obj); + + /* if this fails, do not continue */ + ret = cxip_zbcoll_barrier(zb); + if (frmwk_errmsg(ret, "barr0 return=%s, exp=%d\n", fi_strerror(-ret), 0)) + goto out; + + /* try this again, should fail with -FI_EAGAIN */ + ret = cxip_zbcoll_barrier(zb); + if (frmwk_errmsg((ret != -FI_EAGAIN), "barr1 return=%d, exp=%d\n", + ret, -FI_EAGAIN)) + goto out; + + *zbp = zb; + return 0; +out: + cxip_zbcoll_free(zb); + return 1; +} + +/* broadcast the payload from rank 0 to all other ranks, return zb object */ +int _test_bcast(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + uint64_t *result, struct cxip_zbcoll_obj *zb) +{ + int ret; + + TRACE("%s: entry\n", __func__); + /* reset counters */ + cxip_zbcoll_reset_counters(ep_obj); + + /* if this fails, do not continue */ + TRACE("%s: initiate broadcast\n", __func__); + ret = cxip_zbcoll_broadcast(zb, result); + TRACE("bcast payload=%08lx, ret=%s\n", *result, fi_strerror(-ret)); + if (frmwk_errmsg(ret, "bcast0 return=%s, exp=%d\n", fi_strerror(-ret), 0)) + goto out; + + /* try this again, should fail with -FI_EAGAIN */ + ret = cxip_zbcoll_broadcast(zb, result); + TRACE("bcast payload=%08lx, ret=%s\n", *result, fi_strerror(-ret)); + if (frmwk_errmsg((ret != -FI_EAGAIN), "bcast1 return=%d, exp=%d\n", + ret, -FI_EAGAIN)) + goto out; + return 0; +out: + TRACE("%s: failed\n", __func__); + return 1; +} + +/* Generate a random number with some constant bits, limited to 53 bits. 
+ * rand() sequence is deterministic. + */ +static inline uint64_t _reduce_val(void) +{ + uint64_t val = rand(); + val = (val << 32) | rand(); + return (val | 0x10010002) % (1L << 54); +} + +int _test_reduce(struct cxip_ep_obj *ep_obj, + size_t size, fi_addr_t *fiaddrs, + uint64_t *payload, struct cxip_zbcoll_obj *zb) +{ + int ret; + + /* reset counters */ + cxip_zbcoll_reset_counters(ep_obj); + + /* if this fails, do not continue */ + ret = cxip_zbcoll_reduce(zb, payload); + TRACE("reduce payload=%08lx, ret=%s\n", *payload, fi_strerror(-ret)); + if (frmwk_errmsg(ret, "reduce0 return=%s, exp=%d\n", + fi_strerror(-ret), 0)) + goto out; + + /* try this again, should fail with -FI_EAGAIN */ + ret = cxip_zbcoll_reduce(zb, payload); + TRACE("reduce payload=%08lx, ret=%s\n", *payload, fi_strerror(-ret)); + if (frmwk_errmsg((ret != -FI_EAGAIN), "reduce1 return=%d, exp=%d\n", + ret, -FI_EAGAIN)) + goto out; + + return 0; +out: + TRACE("%s: failed\n", __func__); + return 1; +} + +const char *testnames[] = { + "test 0: send one packet 0 -> 0", + "test 1: send one packet 0 -> 1", + "test 2: send one packet 1 -> 0", + "test 3: send one packet 0 -> N", + "test 4: send one packet N -> 0", + "test 5: send one packet N -> N", + "test 6: single getgroup", + "test 7: double getgroup full overlap", + "test 8: double getgroup partial overlap", + "test 9: getgroup regression [-NMRD]", + "test 10: getgroup exahustion [-ND]", + "test 11: barrier", + "test 12: broadcast (single)", + "test 13: broadcast (concurrent)", + "test 14: reduce (single)", + "test 15: reduce (concurrent)", + "test 16: getgroup perf", + "test 17: barrier perf", + "test 18: broadcast perf", + "test 19: reduce perf", + "test 20: send bad dest [-B required]", + "test 21: recv bad dest [-B required]", + NULL +}; +const char *testname; + +int usage(int ret) +{ + int i; + + frmwk_log0("Usage: test_zbcoll [-hvV] [-s seed]\n" + " [-N nruns] [-M sublen] [-R rotate]\n" + " [-D usec_delay] [-B bad_NIC]\n" + " [-t testno[,testno...]]\n" + "\n" + " -h displays this help\n" + " -v provides verbose output\n" + " -V provides per-node tracing\n" + " -s specifies a random seed for randomized tests\n" + " -t specifies tests e.g. 
(1,2,3) or (1-3) or (1-3,11-12)" + "\n"); + for (i = 0; testnames[i]; i++) + frmwk_log0("%s\n", testnames[i]); + + return ret; +} + +/* scan for integers in -t option */ +static inline char *scanint(char *ptr, int *val) +{ + char *p = ptr; + while (*ptr >= '0' && *ptr <= '9') + ptr++; + *val = atoi(p); + return ptr; +} + +#define TEST(n) (1 << n) +static inline bool _istest(uint64_t mask, int test) +{ + return (mask & (1 << test)) && (testname = testnames[test]); +} + +int main(int argc, char **argv) +{ + bool trace_enabled = false; + char hostname[256]; + fi_addr_t *fiaddrs = NULL; + struct cxip_ep *cxip_ep; + struct cxip_ep_obj *ep_obj; + struct cxip_zbcoll_obj *zb1 = NULL; + struct cxip_zbcoll_obj *zb2 = NULL; + size_t size = 0; + unsigned int seed; + uint64_t testmask; + uint64_t result1, result2; + uint64_t payload1, payload2; + uint64_t expect1, expect2; + int opt, nruns, naddrs, rot, usec, badnic, ret; + + int errcnt = 0; + int i; + + seed = 123; + usec = 0; // as fast as possible + nruns = -1; // run maximum number groups + naddrs = -1; // random selection of fiaddrs + rot = -1; // random shuffle of fiaddrs + testmask = -1; // run all tests + badnic = -1; // do not use an address override + + while ((opt = getopt(argc, argv, "hvVt:s:N:M:R:D:B:")) != -1) { + char *str, *s, *p; + int i, j; + + switch (opt) { + case 't': + testmask = 0; + str = optarg; + i = j = 0; + while (*str) { + s = str; + while (*str && *str != ',') + str++; + if (*str) + *str++ = 0; + p = s; + while (*p && *p != '-') + p++; + i = atoi(s); + j = (*p) ? atoi(++p) : i; + while (i <= j) + testmask |= 1 << i++; + } + break; + case 's': + seed = atoi(optarg); + break; + case 'N': + nruns = atoi(optarg); + break; + case 'M': + naddrs = atoi(optarg); + break; + case 'R': + rot = atoi(optarg); + break; + case 'D': + usec = atoi(optarg); + break; + case 'T': + frmwk_rank = atoi(optarg); + break; + case 'B': + badnic = strtol(optarg, NULL, 16); + break; + case 'V': + trace_enabled = true; + break; + case 'v': + verbose = true; + break; + case 'h': + return usage(0); + default: + return usage(1); + } + } + + frmwk_init(false); + if (frmwk_check_env(4)) + return -1; + + ret = frmwk_init_libfabric(); + if (frmwk_errmsg(ret, "frmwk_init_libfabric()\n")) + return ret; + + cxip_trace_rank = frmwk_rank; + cxip_trace_numranks = frmwk_numranks; + cxip_trace_enable(trace_enabled); + TRACE("==== tracing enabled offset %d\n", frmwk_rank); + + srand(seed); + if (naddrs < 0) + naddrs = frmwk_numranks; + if (nruns < 0) + nruns = frmwk_numranks; + if (nruns > cxip_zbcoll_max_grps(false)) + nruns = cxip_zbcoll_max_grps(false); + + frmwk_log0("Using random seed = %d\n", seed); + if (verbose) { + frmwk_log0("verbose = true\n"); + frmwk_log0("nruns = %d\n", nruns); + frmwk_log0("naddrs = %d\n", naddrs); + frmwk_log0("rotate = %d\n", rot); + frmwk_log0("delay = %d usec\n", usec); + } + + cxip_ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + ep_obj = cxip_ep->ep_obj; + + /* always start with FI_UNIVERSE */ + ret = frmwk_populate_av(&fiaddrs, &size); + if (frmwk_errmsg(ret, "frmwk_populate_av()\n")) + return 1; + frmwk_log0("libfabric populated\n"); + + gethostname(hostname, sizeof(hostname)); + TRACE("%s NIC=%04x PID=%d\n", hostname, ep_obj->src_addr.nic, + ep_obj->ptable->pid); + + if (_istest(testmask, 0)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, 0, 0, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + 
frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 1)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, 0, 1, frmwk_rank); + errcnt += !!ret; + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 2)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, 1, 0, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 3)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, 0, -1, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 4)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, -1, 0, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 5)) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, -1, -1, frmwk_rank); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %s\n", frmwk_rank, fi_strerror(-ret)); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 6)) { + TRACE("======= %s\n", testname); + zb1 = NULL; + ret = 0; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_check_getgroup_errs(zb1, 0); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + cxip_zbcoll_free(zb1); + frmwk_barrier(); + } + + if (_istest(testmask, 7)) { + TRACE("======= %s\n", testname); + zb1 = NULL; + ret = 0; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb2); + ret += !!_check_getgroup_errs(zb1, 0); + ret += !!_check_getgroup_errs(zb2, 1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + cxip_zbcoll_free(zb2); + cxip_zbcoll_free(zb1); + frmwk_barrier(); + } + + if (_istest(testmask, 8)) { + TRACE("======= %s\n", testname); + zb1 = zb2 = NULL; + ret = 0; + TRACE("test one\n"); + if (frmwk_rank != frmwk_numranks-1) { + ret += !!_getgroup(ep_obj, size-1, &fiaddrs[0], &zb2); + ret += !!_check_getgroup_errs(zb2, 0); + } else { + TRACE("SKIP\n"); + } + TRACE("test two\n"); + if (frmwk_rank != 0) { + ret += !!_getgroup(ep_obj, size-1, &fiaddrs[1], &zb1); + ret += !!_check_getgroup_errs(zb1, 1); + } else { + TRACE("SKIP\n"); + } + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + cxip_zbcoll_free(zb2); + cxip_zbcoll_free(zb1); + frmwk_barrier(); + } + + if (_istest(testmask, 9)) { + TRACE("======= %s\n", testname); + ret = 0; + ret += !!_multigroup(ep_obj, size, fiaddrs, nruns, naddrs, + rot, usec); + frmwk_barrier(); + + if (!ret && frmwk_rank == 0) + ret += !!_multicheck(size, nruns); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? 
"FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 10)) { + TRACE("======= %s\n", testname); + ret = 0; + ret += !!_exhaustgroup(ep_obj, size, fiaddrs, nruns, usec); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + ret += !!_test_barr(ep_obj, size, fiaddrs, &zb1); + ret += !!_test_wait_free(zb1, NULL, 0); + frmwk_barrier(); + } + + if (_istest(testmask, 11)) { + TRACE("======= %s\n", testname); + zb1 = NULL; + ret = 0; + ret += !!_test_barr(ep_obj, size, fiaddrs, &zb1); + ret += !!_test_wait_free(zb1, NULL, 0); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 12)) { + TRACE("======= %s\n", testname); + zb1 = NULL; + ret = 0; + result1 = (frmwk_rank) ? frmwk_rank : 0x123; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_test_bcast(ep_obj, size, fiaddrs, &result1, zb1); + ret += !!_test_wait_free(zb1, &result1, 0x123); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 13)) { + TRACE("======= %s\n", testname); + zb1 = zb2 = NULL; + ret = 0; + result1 = (frmwk_rank) ? frmwk_rank : 0x123; + result2 = (frmwk_rank) ? frmwk_rank : 0x456; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb2); + ret += !!_test_bcast(ep_obj, size, fiaddrs, &result1, zb1); + ret += !!_test_bcast(ep_obj, size, fiaddrs, &result2, zb2); + TRACE("waiting for bcast 1\n"); + ret += !!_test_wait_free(zb1, &result1, 0x123); + TRACE("waiting for bcast 2\n"); + ret += !!_test_wait_free(zb2, &result2, 0x456); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 14)) { + + TRACE("======= %s\n", testname); + expect1 = -1L % (1L << 54); + for (i = 0; i < size; i++) { + uint64_t val = _reduce_val(); + if (i == frmwk_rank) + payload1 = val; + expect1 &= val; + } + zb1 = NULL; + ret = 0; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_test_reduce(ep_obj, size, fiaddrs, + &payload1, zb1); + ret += !!_test_wait_free(zb1, &payload1, expect1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 15)) { + TRACE("======= %s\n", testname); + expect1 = -1L % (1L << 54); + expect2 = -1L % (1L << 54); + for (i = 0; i < size; i++) { + uint64_t val = _reduce_val(); + if (i == frmwk_rank) + payload1 = val; + expect1 &= val; + } + for (i = 0; i < size; i++) { + uint64_t val = _reduce_val(); + if (i == frmwk_rank) + payload2 = val; + expect2 &= val; + } + zb1 = zb2 = NULL; + ret = 0; + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb1); + ret += !!_getgroup(ep_obj, size, fiaddrs, &zb2); + ret += !!_test_reduce(ep_obj, size, fiaddrs, + &payload1, zb1); + ret += !!_test_reduce(ep_obj, size, fiaddrs, + &payload2, zb2); + ret += !!_test_wait_free(zb1, &payload1, expect1); + ret += !!_test_wait_free(zb2, &payload2, expect2); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s\n", ret ? 
"FAIL" : "ok", testname); + frmwk_barrier(); + } + + if (_istest(testmask, 16)) { + struct timespec t0; + long count = 0; + double time; + + TRACE("======= %s\n", testname); + trace_enabled = cxip_trace_enable(false); + zb1 = NULL; + ret = cxip_zbcoll_alloc(ep_obj, size, fiaddrs, ZB_NOSIM, &zb1); + clock_gettime(CLOCK_MONOTONIC, &t0); + while (!ret && count < 100000) { + int ret2; + do { + ret += !!cxip_zbcoll_getgroup(zb1); + ret2 = _coll_wait(zb1, nMSEC(100)); + } while (!ret && ret2 == -FI_EAGAIN); + ret += !!ret2; + cxip_zbcoll_rlsgroup(zb1); + count++; + } + time = _measure_nsecs(&t0); + time /= 1.0*count; + time /= 1000.0; + cxip_trace_enable(trace_enabled); + cxip_zbcoll_free(zb1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + ret ? "FAIL" : "ok", testname, count, time); + frmwk_barrier(); + } + + if (_istest(testmask, 17)) { + struct timespec t0; + long count = 0; + double time; + + TRACE("======= %s\n", testname); + trace_enabled = cxip_trace_enable(false); + zb1 = NULL; + ret = _getgroup(ep_obj, size, fiaddrs, &zb1); + clock_gettime(CLOCK_MONOTONIC, &t0); + while (!ret && count < 100000) { + ret += !!cxip_zbcoll_barrier(zb1); + ret += !!_coll_wait(zb1, nMSEC(100)); + count++; + } + time = _measure_nsecs(&t0); + time /= 1.0*count; + time /= 1000.0; + cxip_trace_enable(trace_enabled); + cxip_zbcoll_free(zb1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + ret ? "FAIL" : "ok", testname, count, time); + frmwk_barrier(); + } + + if (_istest(testmask, 18)) { + struct timespec t0; + uint64_t result = 0x1234; + long count = 0; + double time; + + TRACE("======= %s\n", testname); + trace_enabled = cxip_trace_enable(false); + zb1 = NULL; + ret = _getgroup(ep_obj, size, fiaddrs, &zb1); + clock_gettime(CLOCK_MONOTONIC, &t0); + while (!ret && count < 100000) { + ret += !!cxip_zbcoll_broadcast(zb1, &result); + ret += !!_coll_wait(zb1, nMSEC(100)); + count++; + } + time = _measure_nsecs(&t0); + time /= 1.0*count; + time /= 1000.0; + cxip_trace_enable(trace_enabled); + cxip_zbcoll_free(zb1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + ret ? "FAIL" : "ok", testname, count, time); + frmwk_barrier(); + } + + if (_istest(testmask, 19)) { + struct timespec t0; + uint64_t result = 0x1234; + long count = 0; + double time; + + TRACE("======= %s\n", testname); + trace_enabled = cxip_trace_enable(false); + zb1 = NULL; + ret = _getgroup(ep_obj, size, fiaddrs, &zb1); + clock_gettime(CLOCK_MONOTONIC, &t0); + while (!ret && count < 100000) { + ret += !!cxip_zbcoll_reduce(zb1, &result); + ret += !!_coll_wait(zb1, nMSEC(100)); + count++; + } + time = _measure_nsecs(&t0); + time /= 1.0*count; + time /= 1000.0; + cxip_trace_enable(trace_enabled); + cxip_zbcoll_free(zb1); + errcnt += !!ret; + _idle_wait(ep_obj, 100); + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + ret ? "FAIL" : "ok", testname, count, time); + frmwk_barrier(); + } + + if (_istest(testmask, 20)) { + if (badnic >= 0) { + TRACE("======= %s\n", testname); + bad_cxip_addr.nic = badnic; + bad_cxip_addr.pid = 0; + bad_cxip_index = 1; + ret = _test_send_to_dest(ep_obj, size, fiaddrs, + 0, 1, frmwk_rank); + bad_cxip_index = -1; + errcnt += !!ret; + _idle_wait(ep_obj, 100); + TRACE("rank %2d result = %d\n", frmwk_rank, ret); + frmwk_log0("%4s %s\n", ret ? 
"FAIL" : "ok", testname); + } else { + frmwk_log0("%4s %s\n", "SKIP", testname); + } + frmwk_barrier(); + } + + if (_istest(testmask, 21)) { + if (badnic >= 0) { + TRACE("======= %s\n", testname); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, + 0, 1, frmwk_rank); + ret = _test_send_to_dest(ep_obj, size, fiaddrs, + 1, 0, frmwk_rank); + //ret = _getgroup(ep_obj, size, fiaddrs, &zb1); + TRACE("listening forever....\n"); + cxip_trace_flush(); + _idle_wait(ep_obj, -1); + frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); + } else { + frmwk_log0("%4s %s\n", "SKIP", testname); + } + frmwk_barrier(); + } + + TRACE("Finished test run, cleaning up\n"); + free(fiaddrs); + frmwk_free_libfabric(); + frmwk_log0(!!errcnt ? "ERRORS SEEN\n" : "SUCCESS\n"); + frmwk_term(); + return !!errcnt; +} diff --git a/prov/cxi/test/nic.c b/prov/cxi/test/nic.c new file mode 100644 index 00000000000..583d3950ea1 --- /dev/null +++ b/prov/cxi/test/nic.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +#include +#include +#include +#include + +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(fid_nic, .timeout = 5); + +Test(fid_nic, validate_nic_attr) +{ + int ret; + struct cxil_dev *dev; + struct cxi_svc_fail_info fail_info = {}; + struct cxi_svc_desc svc_desc = {}; + uint16_t valid_vni = 0x120; + struct fi_info *info; + struct cxip_nic_attr *nic_attr; + + /* Need to allocate a service to be used by libfabric. */ + ret = cxil_open_device(0, &dev); + cr_assert_eq(ret, 0, "cxil_open_device failed: %d", ret); + + svc_desc.restricted_vnis = 1; + svc_desc.enable = 1; + svc_desc.num_vld_vnis = 1; + svc_desc.vnis[0] = valid_vni; + + ret = cxil_alloc_svc(dev, &svc_desc, &fail_info); + cr_assert_gt(ret, 0, "cxil_alloc_svc failed: %d", ret); + svc_desc.svc_id = ret; + + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", + NULL, FI_SOURCE, NULL, &info); + cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); + + nic_attr = (struct cxip_nic_attr *)info->nic->prov_attr; + cr_assert_eq(nic_attr->version, 1); + cr_assert_eq(nic_attr->addr, dev->info.nid); + cr_assert_eq(nic_attr->default_rgroup_id, svc_desc.svc_id); + cr_assert_eq(nic_attr->default_vni, valid_vni); + + fi_freeinfo(info); + ret = cxil_destroy_svc(dev, svc_desc.svc_id); + cr_assert_eq(ret, 0, "cxil_destroy_svc failed: %d", ret); + cxil_close_device(dev); +} diff --git a/prov/cxi/test/repsum.c b/prov/cxi/test/repsum.c new file mode 100644 index 00000000000..c544a97e9ab --- /dev/null +++ b/prov/cxi/test/repsum.c @@ -0,0 +1,587 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP + */ + +/* Notes: + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +bool verbose = false; + +typedef void (*GenFunc)(void); +typedef double (*SumFunc)(size_t, double*); + +struct sum_dist { + const char *name; // distribution name + GenFunc func; // distribution generator +}; + +struct sum_test { + const char* name; // test name + SumFunc func; // test function + double min, max; // cumulative results + bool contrib; // contribute to global min/max +}; + +struct sum_test_suite { + double gmin, gmax; // cumulative global bounds +}; + +/** + * @brief Data generation models. 
+ * + * These functions generate arrays of doubles using different models that create + * different distributions of numbers. + */ + +unsigned int seed = 3; +size_t numvals = 0; +double *values = NULL; + +/* Data generators for the dataset */ +void init_dataset(size_t size) +{ + free(values); + numvals = size; + values = calloc(size, sizeof(double)); +} + +void gen_const_data(void) +{ + /* constant data */ + int i; + + for (i = 0; i < numvals; i++) + values[i] = 1.0; +} + +void gen_random_data(void) +{ + /* randomized data */ + int i; + + if (seed) { + srand(seed); + seed = 0; + } + for (i = 0; i < numvals; i++) { + int rnd, e; + + rnd = random(); + e = -32*(rnd & 0x7); + rnd >>= 3; + values[i] = scalbn(((rnd * 2.0)/RAND_MAX) - 1.0, e); + } +} + +void gen_series_data(void) +{ + /* converging series */ + double s = 1.0; + int i; + + for (i = 0; i < numvals; i++) { + values[i] = s / (i+1); + s = -s; + } +} + +void gen_sine_data(void) +{ + /* sine wave, particularly hard on reproducibility */ + double s = 2.0*M_PI/numvals; + int i; + + for (i = 0; i < numvals; i++) { + values[i] = sin(s*i); + } +} + +void gen_range_data(void) +{ + int i, e, s, v; + + /* oscillating between -inf and +inf */ + v = 0; + s = 1; + for (i = 0; i < numvals; i++) { + if (!(i % 2048)) { + v += 1; + s = -s; + } + e = (i % 2048) - 1023; + values[i] = s*scalbn(1.0*v, e - 1023); + } +} + +/** + * @brief Data ordering models. + * + * These functions reorder generated data to test associativity. + * + */ + +void nosort_data(void) +{ +} + +int _sortfunc(const void *p1, const void *p2) +{ + double *v1 = (double *)p1; + double *v2 = (double *)p2; + + if (*v1 == *v2) + return 0; + return (*v1 < *v2) ? -1 : 1; +} + +void sort_data(void) +{ + qsort(values, numvals, sizeof(double), _sortfunc); +} + +void scramble_data(void) +{ + int i, j; + double t; + + for (i = numvals-1; i > 0; i--) { + j = random() %(i+1); + t = values[i]; + values[i] = values[j]; + values[j] = t; + } +} + +void reverse_data(void) +{ + int i, j, half; + double t; + + half = numvals/2; + for (i = 0; i < half; i++) { + j = numvals-1-i; + t = values[i]; + values[i] = values[j]; + values[j] = t; + } +} + +/** + * @brief Summation algoritihms. + * + * These function perform the double summation using different algorithms. + */ + +double simple_sum(size_t n, double *v) +{ + double s = 0.0; + int i; + + for (i = 0; i < n; i++) + s += v[i]; + + return s; +} + +#define RADIX 32 +double tree_sum(size_t n, double *v) +{ + double s = 0.0; + int i, k; + + if (n > RADIX) { + k = n/RADIX; + for (i = 0; i < RADIX - 1; i++, n -= k) + s += tree_sum(k, &v[k*i]); + s += tree_sum(n, &v[k*i]); + } else { + for (i = 0; i < n; i++) + s += v[i]; + } + + return s; +} + +double Kahans_sum(size_t n, double *v) +{ + double s = 0.0; + double c = 0.0; + int i; + + for (i = 0; i < n; i++) { + double y = v[i] - c; + double t = s + y; + + c = (t - s) - y; + s = t; + } + + return s; +} + +void print_repsum(struct cxip_repsum *x) +{ + printf("M=%3d T=[%016lx, %016lx, %016lx, %016lx] oflow=%d inexact=%d\n", + x->M, x->T[0], x->T[1], x->T[2], x->T[3], + x->overflow, x->inexact); +} + +/** + * @brief Static structures to make the above models accessible to the test + * code. 
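Note that the rep_sum entry is marked .contrib=false, so it is excluded from the global min/max bounds accumulated for the other summation methods.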
+ * + */ + +struct sum_dist test_dists[] = { + {.name="const", .func=&gen_const_data}, + {.name="random", .func=&gen_random_data}, + {.name="series", .func=&gen_series_data}, + {.name="sin", .func=&gen_sine_data}, + {.name="range", .func=&gen_range_data} +}; +#define NUM_DISTS (sizeof(test_dists)/sizeof(struct sum_dist)) + +struct sum_dist test_perms[] = { + {.name="nosort", .func=&nosort_data}, + {.name="sort", .func=&sort_data}, + {.name="scramble", .func=&scramble_data}, + {.name="reverse", .func=&reverse_data}, +}; +#define NUM_PERMS (sizeof(test_perms)/sizeof(struct sum_dist)) +#define PERM_NOSORT 0 +#define PERM_SORT 1 +#define PERM_SCRAMBLE 2 +#define PERM_REVERSE 3 + +struct sum_test test_cases[] = { + {.name="simple_sum", .func=&simple_sum, .contrib=true}, + {.name="tree_sum", .func=&tree_sum, .contrib=true}, + {.name="Kahans_sum", .func=&Kahans_sum, .contrib=true}, + {.name="rep_sum", .func=&cxip_rep_sum, .contrib=false}, +}; +#define NUM_CASES (sizeof(test_cases)/sizeof(struct sum_test)) +#define TEST_SIMPLE 0 +#define TEST_TREE 1 +#define TEST_KAHAN 2 +#define TEST_REPSUM 3 + +struct sum_test_suite test_suite; + +/** + * @brief Main test code. + * + * The basic model is to take a particular distribution of doubles, then perform + * multiple summations of that distribution with different orderings of the + * values, retaining the result as a (min, max) pair. + * + * For a perfectly-reproducible summation method, the final result for each + * distribution will show min == max. + */ + +void _show_results(void) +{ + struct sum_test *test; + double dif, mid, err; + int n; + + for (n = 0; n < NUM_CASES; n++) { + test = &test_cases[n]; + dif = (test->max - test->min); + mid = (test->max + test->min)/2.0; + err = fabs(mid ? dif/mid : dif); + + if (verbose) + printf("%12s %29.20g %29.20g %g\n", + test->name, test->min, test->max, err); + } +} + +void _reset_results(void) +{ + int n; + + test_suite.gmax = -HUGE_VAL; + test_suite.gmin = HUGE_VAL; + for (n = 0; n < NUM_CASES; n++) { + test_cases[n].max = -HUGE_VAL; + test_cases[n].min = HUGE_VAL; + } +} + +/* Perform a single summation and record min/max */ +void _runtest(struct sum_test *test) +{ + double sum; + + sum = test->func(numvals, values); + if (test->min > sum) + test->min = sum; + if (test->max < sum) + test->max = sum; + if (test->contrib) { + if (test_suite.gmin > sum) + test_suite.gmin = sum; + if (test_suite.gmax < sum) + test_suite.gmax = sum; + } +} + +/* Perform a summations */ +void _run_tests(uint64_t tstmask) +{ + int n; + + for (n = 0; n < NUM_CASES; n++) { + if (!(tstmask & (1 << n))) + continue; + if (verbose) + printf(" ... 
%s\n", test_cases[n].name); + _runtest(&test_cases[n]); + } +} + +/* reorder the data, and perform summations using different methods */ +void run_permutations(uint64_t tstmask) +{ + int sequence[] = { + PERM_NOSORT, + PERM_REVERSE, + PERM_SORT, + PERM_REVERSE, + PERM_SCRAMBLE, + PERM_REVERSE, + }; + int seqcnt = sizeof(sequence)/sizeof(int); + int n, p; + + _reset_results(); + for (n = 0; n < seqcnt; n++) { + p = sequence[n]; + if (verbose) + printf(" ----- %s\n", test_perms[p].name); + test_perms[p].func(); + _run_tests(tstmask); + } + _show_results(); +} + +/* generate a distribution of values, and run permutations */ +void run_dists(uint64_t dstmask, uint64_t tstmask) +{ + int n; + + for (n = 0; n < NUM_DISTS; n++) { + if (!(dstmask & (1 << n))) + continue; + if (verbose) + printf("======= %s\n", test_dists[n].name); + test_dists[n].func(); + run_permutations(tstmask); + } +} + +static inline bool _equal(double a, double b) +{ + return (isnan(a) && isnan(b)) || a == b; +} + +TestSuite(repsum, .init = cxit_setup_ep, .fini =cxit_teardown_ep, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* + * Convert double->repsum and repsum->double, and compare for equality. + */ +Test(repsum, convert) +{ + struct cxip_repsum x; + double s[] = {1.0, -1.0}; + double d1, d2; + int i, j, k; + + /* note that this exponent spans subnormals and +inf/-inf */ + for (i = -1100; i < 1100; i++) { + for (j = 0; j < 53; j++) { + for (k = 0; k < 2; k++) { + d1 = scalbn(s[k]*((1 << j) - 1), i); + cxip_dbl_to_rep(&x, d1); + cxip_rep_to_dbl(&d2, &x); + cr_assert(_equal(d1, d2), + "%d, %d: %.13e != %.13e\n", + i, j, d1, d2); + } + } + } + /* explicit -inf */ + d1 = -INFINITY; + cxip_dbl_to_rep(&x, d1); + cxip_rep_to_dbl(&d2, &x); + cr_assert(d1 == d2, "%d, %d, %.13e != %.13e\n", i, j, d1, d2); + /* explicit +inf */ + d1 = +INFINITY; + cxip_dbl_to_rep(&x, d1); + cxip_rep_to_dbl(&d2, &x); + cr_assert(d1 == d2, "%d, %d: %.13e != %.13e\n", i, j, d1, d2); + /* explicit NaN */ + d1 = NAN; + cxip_dbl_to_rep(&x, d1); + cxip_rep_to_dbl(&d2, &x); + cr_assert(isnan(d2), "%d, %d: %.13e != %.13e %016lx != %016lx\n", + i, j, d1, d2, _dbl2bits(d1), _dbl2bits(d2)); +} + +/* + * Add two values using double and using repsum, and compare for equality. + */ +Test(repsum, add) +{ + double s1[] = {1.0, 1.0, -1.0, -1.0}; + double s2[] = {1.0, -1.0, 1.0, -1.0}; + double d1, d2, d3, d4; + int i, j, k; + + /* note that this exponent spans subnormals and +inf/-inf */ + for (i = -1100; i < 1100; i++) { + for (j = 0; j < 53; j++) { + for (k = 0; k < 4; k++) { + d1 = scalbn(s1[k]*((1 << j) - 1), i); + d2 = scalbn(s2[k]*((1 << j) - 1), i+1); + d3 = d1 + d2; + d4 = cxip_rep_add_dbl(d1, d2); + cr_assert(_equal(d3, d4), + "%d, %d, %d: %.13e != %.13e" + " %016lx %016lx %016lx %016lx\n", + i, j, k, d3, d4, + _dbl2bits(d1), _dbl2bits(d2), + _dbl2bits(d3), _dbl2bits(d4)); + } + } + } +} + +/* + * Add combinations of NAN and INFINITY, compare for correct result. + */ +Test(repsum, inf) +{ + double a[] = {1.0, +INFINITY, -INFINITY, NAN}; + double d1, d2, d3, d4, exp; + int i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + d1 = a[i]; + d2 = a[j]; + d3 = d1 + d2; + d4 = cxip_rep_add_dbl(d1, d2); + if (isnan(d1) || isnan(d2)) + exp = NAN; + else if (isinf(d1) && isinf(d2)) + exp = (d1 == d2) ? 
d1 : NAN; + else if (isinf(d1)) + exp = d1; + else if (isinf(d2)) + exp = d2; + else + exp = d3; + cr_assert(_equal(d3, exp), + "dbl %d, %d: (%e + %e) = %e, expected %e\n", + i, j, d1, d2, d3, exp); + cr_assert(_equal(d4, exp), + "rep %d, %d: (%e + %e) = %e, expected %e\n", + i, j, d1, d2, d4, exp); + } + } +} + +/* + * Test for overflow by performing too many sums. + * 0.5 places MSBit in bit 39 of a bin. + * 1LL << 24 additions of 0.5 will fill overflow area. + * One more addition should trigger overflow. + */ +Test(repsum, overflow) +{ + struct cxip_repsum x, y; + long int i, n; + + cxip_dbl_to_rep(&x, 0.0); + cxip_dbl_to_rep(&y, 0.5); + n = 1LL << 24; + for (i = 0L; i < n-1; i++) { + cxip_rep_add(&x, &y); + if (x.overflow) + break; + } + cr_assert(!x.overflow, "overflow at %lx not expected\n", i++); + cxip_rep_add(&x, &y); + cr_assert(x.overflow, "overflow at %ld expected\n", i); + cxip_dbl_to_rep(&y, 0.0); + cxip_rep_add(&y, &x); + cr_assert(y.overflow, "overflow not propagated\n"); +} + +/* + * Test for expected loss of precision. + * Adding 1.0*2^i for i=(0,39) will fill a bin. + * Doing this four times will fill the T[] array. + * Doing this one more time will drop the LSBin. + */ +Test(repsum, inexact) +{ + struct cxip_repsum x, y; + int i, n; + + cxip_dbl_to_rep(&x, 0.0); + n = 4*40; + for (i = 0; i < n; i++) { + cxip_dbl_to_rep(&y, scalbn(1.0, i)); + cxip_rep_add(&x, &y); + if (x.inexact) + break; + } + cr_assert(!x.inexact, "inexact at %x not expected\n", i++); + cxip_dbl_to_rep(&y, scalbn(1.0, i)); + cxip_rep_add(&x, &y); + cr_assert(x.inexact, "inexact at %x expected\n", i); + cxip_dbl_to_rep(&y, 0.0); + cxip_rep_add(&y, &x); + cr_assert(y.inexact, "inexact not propagated\n"); +} + +/* + * Test comparison of different methods over datasets + * In all cases, repsum should be reproducible, err = 0. + */ +Test(repsum, comparison) +{ + struct sum_test *test; + double dif, mid, err; + + init_dataset(100000); + run_dists(-1L, -1L); + + test = &test_cases[TEST_REPSUM]; + dif = (test->max - test->min); + mid = (test->max + test->min)/2.0; + err = fabs(mid ? 
dif/mid : dif); + if (err) + printf("%12s %29.20g %29.20g %g\n", + test->name, test->min, test->max, err); + cr_assert(!err, "repsum is not reproducible\n"); +} diff --git a/prov/cxi/test/rma.c b/prov/cxi/test/rma.c new file mode 100644 index 00000000000..6a53fee3b07 --- /dev/null +++ b/prov/cxi/test/rma.c @@ -0,0 +1,2236 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +#define RMA_WIN_KEY 0x1f + +TestSuite(rma, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rma, zero_byte_writev) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + mr_create(0, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key_val, + &mem_window); + + ret = fi_writev(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, 0, key_val, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_writev failed: %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + mr_destroy(&mem_window); +} + +Test(rma, zero_byte_writemsg) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct fi_rma_iov rma[1] = {}; + + mr_create(0, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key_val, + &mem_window); + + rma[0].key = key_val; + + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed: %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + mr_destroy(&mem_window); +} + +Test(rma, zero_byte_readv) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + mr_create(0, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key_val, + &mem_window); + + ret = fi_readv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, 0, key_val, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_readv failed: %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + mr_destroy(&mem_window); +} + +Test(rma, zero_byte_readmsg) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct fi_rma_iov rma[1] = {}; + + mr_create(0, FI_REMOTE_WRITE | FI_REMOTE_READ, 0, &key_val, + &mem_window); + + rma[0].key = key_val; + + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + ret = fi_readmsg(cxit_ep, &msg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_readmsg failed: %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + mr_destroy(&mem_window); +} + +/* Test fi_write simple case. Test IDC sizes to multi-packe sizes. 
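The transfer length is doubled from a single byte up to the full 16 KiB window, so both small IDC payloads and larger multi-packet transfers are exercised.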
*/ +Test(rma, simple_write) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test compatibility of client/provider keys */ +Test(rma, key_compatibility) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fid_domain *domain2; + struct fid_ep *ep2; + struct fid_cq *tx_cq2; + struct fid_cq *rx_cq2; + struct fid_av *av2; + struct cxip_addr ep2_addr; + size_t addrlen = sizeof(ep2_addr); + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + struct cxip_domain *dom; + struct cxip_mr_key cxip_key; + bool first_domain_prov_key; + + /* Create second RMA endpoint in the opposite client/provider + * mr_mode as the test default EP. When CXIP_TEST_PROV_KEY=true is + * set, the second EP is started in client key mode; if it is not + * set, the second EP is started in provider key mode. 
+ */ + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) { + first_domain_prov_key = true; + cxit_fi->domain_attr->mr_mode &= ~FI_MR_PROV_KEY; + cxit_fi->domain_attr->mr_key_size = sizeof(uint32_t); + } else { + first_domain_prov_key = false; + cxit_fi->domain_attr->mr_mode |= FI_MR_PROV_KEY; + cxit_fi->domain_attr->mr_key_size = sizeof(uint64_t); + } + ret = fi_domain(cxit_fabric, cxit_fi, &domain2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_domain 2nd domain"); + dom = container_of(domain2, struct cxip_domain, + util_domain.domain_fid); + if (first_domain_prov_key) + cr_assert(!dom->is_prov_key, "2nd domain not client key"); + else + cr_assert(dom->is_prov_key, "2nd domain not provider key"); + + ret = fi_endpoint(domain2, cxit_fi, &ep2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_endpoint 2nd endpoint"); + + ret = fi_av_open(domain2, &cxit_av_attr, &av2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_av_open 2nd AV"); + ret = fi_ep_bind(ep2, &av2->fid, 0); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind 2nd AV"); + + ret = fi_cq_open(domain2, &cxit_tx_cq_attr, &tx_cq2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open 2nd TX CQ"); + ret = fi_ep_bind(ep2, &tx_cq2->fid, FI_TRANSMIT); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind 2nd TX CQ"); + + ret = fi_cq_open(domain2, &cxit_rx_cq_attr, &rx_cq2, NULL); + cr_assert(ret == FI_SUCCESS, "fi_cq_open 2nd RX CQ"); + ret = fi_ep_bind(ep2, &rx_cq2->fid, FI_RECV); + cr_assert(ret == FI_SUCCESS, "fi_ep_bind 2nd RX CQ"); + + ret = fi_enable(ep2); + cr_assert(ret == FI_SUCCESS, "fi_enable 2nd EP"); + + ret = fi_getname(&ep2->fid, &ep2_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "fi_getname 2nd EP"); + + /* Set up the AV, adding the fake address, then the first EP, then the second EP */ + ret = fi_av_insert(av2, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + ret = fi_av_insert(av2, (void *)&cxit_ep_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1, "fi_av_insert 1st EP into AV2"); + ret = fi_av_insert(av2, (void *)&ep2_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1, "fi_av_insert 2nd EP into AV2"); + + /* Add second EP to default EP's AV */ + ret = fi_av_insert(cxit_av, (void *)&ep2_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1, "fi_av_insert 2nd EP into cxit_av"); + + /* First EP creates an MR with a key of the type specified in + * the associated domain. The second EP will use this key + * to initiate a transfer. 
+ */ + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cxip_key.raw = key_val; + if (first_domain_prov_key) + cr_assert(cxip_key.is_prov, "Key is not provider key"); + else + cr_assert(!cxip_key.is_prov, "Key is not client key"); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(ep2, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(tx_cq2, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + ret = fi_close(&ep2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close EP2"); + ret = fi_close(&tx_cq2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close TX CQ2"); + ret = fi_close(&rx_cq2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close RX CQ2"); + ret = fi_close(&av2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close AV2"); + ret = fi_close(&domain2->fid); + cr_assert(ret == FI_SUCCESS, "fi_close domain2"); + + mr_destroy(&mem_window); + free(send_buf); +} + +void cxit_setup_rma_opt(void) +{ + cxit_setup_getinfo(); + + /* Explicitly request unordered RMA */ + cxit_fi_hints->caps = FI_RMA; + cxit_fi_hints->tx_attr->msg_order = 0; + + cxit_setup_rma(); +} + +TestSuite(rma_opt, .init = cxit_setup_rma_opt, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test an optimal fi_write. */ +Test(rma_opt, opt_write) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + uint64_t res_start; + uint64_t res_end; + uint64_t hits_start; + uint64_t hits_end; + struct cxip_ep *cxi_ep; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &res_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + ret = cxit_dom_read_cntr(C_CNTR_LPE_PLEC_HITS, + &hits_start, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create_ext(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, NULL, + &mem_window); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + mr_destroy(&mem_window); + free(send_buf); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &res_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_expect(res_end > res_start); + + ret = cxit_dom_read_cntr(C_CNTR_LPE_PLEC_HITS, + &hits_end, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + cxi_ep = 
container_of(cxit_ep, struct cxip_ep, ep); + if (!is_netsim(cxi_ep->ep_obj)) { + cr_assert(hits_end > hits_start); + } else { + if (hits_end == hits_start) + printf("PLEC Hits not registered (unsupported on netsim)\n"); + } +} + +/* Test simple writes to a standard MR. */ +Test(rma, simple_write_std_mr) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = 0xdef; + struct fi_cq_tagged_entry cqe; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test fi_writev simple case */ +Test(rma, simple_writev) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct iovec iov[1]; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + /* Send 8 bytes from send buffer data to RMA window 0 */ + ret = fi_writev(cxit_ep, iov, NULL, 1, cxit_ep_fi_addr, 0, key_val, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_writev failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +void do_writemsg(uint64_t flags) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | 
FI_WRITE, NULL); + + if (flags & FI_CXI_HRP) + usleep(1000); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test fi_writemsg with flags */ +Test(rma, writemsg) +{ + do_writemsg(0); + do_writemsg(FI_FENCE); +} + +void cxit_rma_setup_nofence(void) +{ + cxit_setup_getinfo(); + cxit_fi_hints->caps = CXIP_EP_PRI_CAPS; + cxit_setup_rma(); +} + +/* Test RMA without FI_FENCE */ +Test(rma_nofence, nofence, + .init = cxit_rma_setup_nofence, + .fini = cxit_teardown_rma) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + ret = fi_writemsg(cxit_ep, &msg, FI_FENCE); + cr_assert(ret == -FI_EINVAL); + + ret = fi_readmsg(cxit_ep, &msg, FI_FENCE); + cr_assert(ret == -FI_EINVAL); + + mr_destroy(&mem_window); + free(send_buf); +} + +void cxit_rma_setup_no_rma_events(void) +{ + cxit_setup_getinfo(); + + cxit_fi_hints->caps = FI_RMA | FI_ATOMIC; + cxit_setup_rma(); +} + +/* Test HRP Put */ +Test(rma_opt, hrp, + .init = cxit_rma_setup_no_rma_events, + .fini = cxit_teardown_rma) +{ + int ret; + uint64_t hrp_acks_start; + uint64_t hrp_acks_end; + struct cxip_ep *cxi_ep; + + /* HRP not supported in netsim */ + cxi_ep = container_of(cxit_ep, struct cxip_ep, ep); + if (is_netsim(cxi_ep->ep_obj)) + return; + + ret = cxit_dom_read_cntr(C_CNTR_HNI_HRP_ACK, + &hrp_acks_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + do_writemsg(0); + do_writemsg(FI_CXI_HRP); + do_writemsg(0); + + for (int i = 0; i < 10; i++) + do_writemsg(FI_CXI_HRP); + + ret = cxit_dom_read_cntr(C_CNTR_HNI_HRP_ACK, + &hrp_acks_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + cr_assert_eq(hrp_acks_end - hrp_acks_start, 11, + "unexpected hrp_acks count: %lu\n", + hrp_acks_end - hrp_acks_start); +} + +/* Perform a write that uses a flushing ZBR at the target. 
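The write is posted with FI_DELIVERY_COMPLETE, and the target-side flush is verified indirectly by confirming that the C_CNTR_IXE_DMAWR_FLUSH_REQS counter advances across the transfer.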
*/ +Test(rma, flush) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = FI_DELIVERY_COMPLETE; + uint64_t flushes_start; + uint64_t flushes_end; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_assert(flushes_end > flushes_start); +} + +/* Test fi_writemsg with FI_INJECT flag */ +Test(rma, simple_writemsg_inject) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = FI_INJECT; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + + /* Try using standard MR */ + + key_val = 1000; + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + rma[0].key = key_val; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", 
ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + + free(send_buf); +} + +/* Test fi_inject_write simple case */ +Test(rma, simple_inject_write) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + /* Test invalid inject length */ + ret = fi_inject_write(cxit_ep, send_buf, + cxit_fi->tx_attr->inject_size + 100, + cxit_ep_fi_addr, 0, key_val); + cr_assert(ret == -FI_EMSGSIZE); + + /* Send 8 bytes from send buffer data to RMA window 0 */ + ret = fi_inject_write(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0, + key_val); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_read(cxit_write_cntr) != 1) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test fi_read simple case */ +Test(rma, simple_read) +{ + int ret; + uint8_t *local; + int remote_len = 0x1000; + int local_len = 8; + uint64_t key_val = 0xa; + struct fi_cq_tagged_entry cqe; + struct mem_region remote; + + local = calloc(1, local_len); + cr_assert_not_null(local, "local alloc failed"); + + mr_create(remote_len, FI_REMOTE_READ, 0xc0, &key_val, &remote); + + cr_assert(!fi_cntr_read(cxit_read_cntr)); + + /* Get 8 bytes from the source buffer to the receive buffer */ + ret = fi_read(cxit_ep, local, local_len, NULL, cxit_ep_fi_addr, 0, + key_val, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_read() failed (%d)", ret); + + while (fi_cntr_read(cxit_read_cntr) != 1) + ; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < local_len; i++) + cr_expect_eq(local[i], remote.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + local[i], remote.mem[i]); + + mr_destroy(&remote); + free(local); +} + +/* Test fi_readv simple case */ +Test(rma, simple_readv) +{ + int ret; + uint8_t *local; + int remote_len = 0x1000; + int local_len = 8; + uint64_t key_val = 0x2a; + struct fi_cq_tagged_entry cqe; + struct mem_region remote; + struct iovec iov[1]; + + local = calloc(1, local_len); + cr_assert_not_null(local, "local alloc failed"); + + mr_create(remote_len, FI_REMOTE_READ, 0x3c, &key_val, &remote); + + iov[0].iov_base = local; + iov[0].iov_len = local_len; + + /* Get 8 bytes from the source buffer to the receive buffer */ + ret = fi_readv(cxit_ep, iov, NULL, 1, cxit_ep_fi_addr, 0, key_val, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_readv() failed (%d)", ret); + + /* Wait for async 
event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < local_len; i++) + cr_expect_eq(local[i], remote.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + local[i], remote.mem[i]); + + mr_destroy(&remote); + free(local); +} + +/* Test fi_readmsg simple case */ +Test(rma, simple_readmsg) +{ + int ret; + uint8_t *local; + int remote_len = 0x1000; + int local_len = 8; + uint64_t key_val = 0x2a; + struct fi_cq_tagged_entry cqe; + struct mem_region remote; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = 0; + + local = calloc(1, local_len); + cr_assert_not_null(local, "local alloc failed"); + + mr_create(remote_len, FI_REMOTE_READ, 0xd9, &key_val, &remote); + + iov[0].iov_base = local; + iov[0].iov_len = local_len; + + rma[0].addr = 0; + rma[0].len = local_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Get 8 bytes from the source buffer to the receive buffer */ + ret = fi_readmsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_readv() failed (%d)", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read() failed (%d)", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < local_len; i++) + cr_expect_eq(local[i], remote.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + local[i], remote.mem[i]); + + mr_destroy(&remote); + free(local); +} + +/* Test fi_readmsg failure cases */ +Test(rma, readmsg_failures) +{ + int ret; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + struct fi_msg_rma msg = { + .msg_iov = iov, + .rma_iov = rma, + .iov_count = 1, + .rma_iov_count = 1, + }; + uint64_t flags = 0; + + /* Invalid msg value */ + ret = fi_readmsg(cxit_ep, NULL, flags); + cr_assert_eq(ret, -FI_EINVAL, "NULL msg return %d", ret); + + msg.iov_count = cxit_fi->tx_attr->rma_iov_limit + 1; + ret = fi_readmsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, -FI_EINVAL, "Invalid iov_count return %d", ret); + + msg.iov_count = cxit_fi->tx_attr->rma_iov_limit; + flags = FI_DIRECTED_RECV; /* Invalid flag value */ + ret = fi_readmsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, -FI_EBADFLAGS, "Invalid flag unexpected return %d", + ret); +} + +/* Test fi_writemsg failure cases */ +Test(rma, writemsg_failures) +{ + int ret; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + struct fi_msg_rma msg = { + .msg_iov = iov, + .rma_iov = rma, + .iov_count = 1, + .rma_iov_count = 1, + }; + uint64_t flags = 0; + size_t send_len = 10; + char send_buf[send_len]; + + /* Invalid msg value */ + ret = fi_writemsg(cxit_ep, NULL, flags); + cr_assert_eq(ret, -FI_EINVAL, "NULL msg return %d", ret); + + msg.iov_count = cxit_fi->tx_attr->rma_iov_limit + 1; + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, -FI_EINVAL, "Invalid iov_count return %d", ret); + + msg.iov_count = cxit_fi->tx_attr->rma_iov_limit; + flags = FI_DIRECTED_RECV; /* Invalid flag value */ + ret = fi_writemsg(cxit_ep, &msg, flags); + cr_assert_eq(ret, -FI_EBADFLAGS, "Invalid flag return %d", ret); + + /* Invalid length */ + iov[0].iov_base = send_buf; + iov[0].iov_len = cxit_fi->ep_attr->max_msg_size + 1; + + rma[0].addr = 0; + 
rma[0].len = send_len; + rma[0].key = 0xa; + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert_eq(ret, -FI_EMSGSIZE, "Invalid flag return %d", ret); + + /* Invalid inject length */ + iov[0].iov_len = C_MAX_IDC_PAYLOAD_RES+1; + + ret = fi_writemsg(cxit_ep, &msg, FI_INJECT); + cr_assert_eq(ret, -FI_EMSGSIZE, "Invalid flag return %d", ret); +} + +void rmamsg_bounds(bool write, bool opt_mr) +{ + int ret; + struct mem_region mem_window; + uint64_t key_val = opt_mr ? RMA_WIN_KEY : 200; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry err; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + size_t good_len = 4096; + char *src_buf; + + /* Create over-sized send buffer for bounds checking */ + src_buf = calloc(1, good_len * 2); + cr_assert_not_null(src_buf, "send_buf alloc failed"); + mr_create(good_len, + write ? FI_REMOTE_WRITE : FI_REMOTE_READ, 0xa0, + &key_val, &mem_window); + memset(mem_window.mem, 0x33, good_len); + + /* Good length to verify operation */ + iov[0].iov_base = src_buf; + iov[0].iov_len = good_len; + + rma[0].addr = 0; + rma[0].len = good_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + if (write) + ret = fi_writemsg(cxit_ep, &msg, FI_COMPLETION); + else + ret = fi_readmsg(cxit_ep, &msg, FI_COMPLETION); + + cr_assert_eq(ret, FI_SUCCESS, "Bad RMA API status %d", ret); + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "Unexpected RMA failure"); + + /* Use a bad length to cause a bounds violation and + * verify failure is detected. + */ + iov[0].iov_len = good_len * 2; + rma[0].len = good_len * 2; + + if (write) + ret = fi_writemsg(cxit_ep, &msg, FI_COMPLETION); + else + ret = fi_readmsg(cxit_ep, &msg, FI_COMPLETION); + + cr_assert_eq(ret, FI_SUCCESS, "Bad RMA return status %d", ret); + + /* There should be a source error entry. 
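The oversized transfer must not complete successfully: fi_cq_read returns -FI_EAVAIL, and the failure is retrieved with fi_cq_readerr, which is expected to report FI_EIO.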
*/ + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success"); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + cr_assert_eq(err.err, FI_EIO, "Error return %d", err.err); + + mr_destroy(&mem_window); + free(src_buf); +} + +Test(rma, writemsg_bounds_opt) +{ + rmamsg_bounds(true, true); +} + +Test(rma, writemsg_bounds_std) +{ + rmamsg_bounds(true, false); +} + +Test(rma, readmsg_bounds_opt) +{ + rmamsg_bounds(false, true); +} + +Test(rma, readmsg_bounds_std) +{ + rmamsg_bounds(false, false); +} + +/* Test fi_readv failure cases */ +Test(rma, readv_failures) +{ + int ret; + struct iovec iov = {}; + + /* Invalid count value */ + ret = fi_readv(cxit_ep, &iov, NULL, + cxit_fi->tx_attr->rma_iov_limit + 1, + cxit_ep_fi_addr, 0, 0, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Invalid count return %d", ret); +} + +/* Test fi_writev failure cases */ +Test(rma, writev_failures) +{ + int ret; + struct iovec iov = {}; + + /* Invalid count value */ + ret = fi_writev(cxit_ep, &iov, NULL, + cxit_fi->tx_attr->rma_iov_limit + 1, + cxit_ep_fi_addr, 0, 0, NULL); + cr_assert_eq(ret, -FI_EINVAL, "Invalid count return %d", ret); +} + +/* Perform an RMA write spanning a page */ +Test(rma, write_spanning_page) +{ + int ret; + uint8_t *send_buf; + uint8_t *send_addr; + int win_len = s_page_size * 2; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + send_addr = (uint8_t *)FLOOR(send_buf + s_page_size, s_page_size) - 4; + memset(send_addr, 0xcc, send_len); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + memset(mem_window.mem, 0x33, win_len); + + /* Send 8 bytes from send buffer data to RMA window 0 */ + ret = fi_write(cxit_ep, send_addr, send_len, NULL, cxit_ep_fi_addr, 0, + key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_addr[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_addr[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +Test(rma, rma_cleanup) +{ + int ret; + long i; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + int writes = 50; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + for (i = 0; i < win_len; i++) + send_buf[i] = 0xb1 * i; + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + /* Send 8 bytes from send buffer data to RMA window 0 */ + for (i = 0; i < writes; i++) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, (void *)i); + cr_assert(ret == FI_SUCCESS); + } + + mr_destroy(&mem_window); + + /* Exit without gathering events. */ +} + +void cxit_setup_rma_selective_completion(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = FI_COMPLETION; + cxit_setup_rma(); +} + +/* Test selective completion behavior with RMA. 
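+ * The TX CQ is bound with FI_SELECTIVE_COMPLETION while the default
+ * op_flags include FI_COMPLETION, so operations report completions unless
+ * they are explicitly suppressed per call.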
*/ +Test(rma_sel, selective_completion, + .init = cxit_setup_rma_selective_completion, + .fini = cxit_teardown_rma) +{ + int ret; + uint8_t *loc_buf; + int win_len = 0x1000; + int loc_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov; + struct fi_rma_iov rma; + int count = 0; + + loc_buf = calloc(1, win_len); + cr_assert_not_null(loc_buf, "loc_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE | FI_REMOTE_READ, 0xa0, &key_val, + &mem_window); + + iov.iov_base = loc_buf; + iov.iov_len = loc_len; + + rma.addr = 0; + rma.key = key_val; + + msg.msg_iov = &iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Puts */ + + /* Completion requested by default. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + ret = fi_write(cxit_ep, loc_buf, loc_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + } + + /* Completion explicitly requested. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + iov.iov_len = loc_len; + ret = fi_writemsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS); + count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + } + + /* Suppress completion. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + iov.iov_len = loc_len; + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + while (fi_cntr_read(cxit_write_cntr) != count) + ; + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + while (mem_window.mem[i] != loc_buf[i]) + sched_yield(); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Inject never generates an event */ + loc_len = 8; + ret = fi_inject_write(cxit_ep, loc_buf, loc_len, cxit_ep_fi_addr, 0, + key_val); + cr_assert(ret == FI_SUCCESS); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + while (mem_window.mem[i] != loc_buf[i]) + sched_yield(); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Gets */ + memset(loc_buf, 0, win_len); + count = 0; + + /* Completion requested by default. 
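+	 * (op_flags include FI_COMPLETION, so fi_read() generates a CQ event
+	 * without an explicit per-call flag.)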
*/ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + memset(loc_buf, 0, loc_len); + ret = fi_read(cxit_ep, loc_buf, loc_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + } + + /* Completion explicitly requested. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + memset(loc_buf, 0, loc_len); + iov.iov_len = loc_len; + ret = fi_readmsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS); + count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_READ, NULL); + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + } + + /* Suppress completion. */ + for (loc_len = 1; loc_len <= win_len; loc_len <<= 1) { + memset(loc_buf, 0, loc_len); + iov.iov_len = loc_len; + ret = fi_readmsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + count++; + + while (fi_cntr_read(cxit_read_cntr) != count) + ; + + /* Validate sent data */ + for (int i = 0; i < loc_len; i++) + cr_assert_eq(mem_window.mem[i], loc_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], loc_buf[i]); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + mr_destroy(&mem_window); + free(loc_buf); +} + +void cxit_setup_rma_selective_completion_suppress(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = 0; + cxit_setup_rma(); +} + +/* Test selective completion behavior with RMA. 
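+ * Here op_flags are cleared, so completions are suppressed by default and
+ * only generated when FI_COMPLETION is passed explicitly; write counters
+ * are used to confirm delivery instead.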
*/ +Test(rma_sel, selective_completion_suppress, + .init = cxit_setup_rma_selective_completion_suppress, + .fini = cxit_teardown_rma) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov; + struct fi_rma_iov rma; + int write_count = 0; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + iov.iov_base = send_buf; + iov.iov_len = send_len; + + rma.addr = 0; + rma.key = key_val; + + msg.msg_iov = &iov; + msg.iov_count = 1; + msg.rma_iov = &rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Normal writes do not generate completions */ + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + memset(mem_window.mem, 0, send_len); + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + write_count++; + + while (fi_cntr_read(cxit_write_cntr) != write_count) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + while (mem_window.mem[i] != send_buf[i]) + sched_yield(); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Request completions from fi_writemsg */ + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + memset(mem_window.mem, 0, send_len); + iov.iov_len = send_len; + ret = fi_writemsg(cxit_ep, &msg, FI_COMPLETION); + cr_assert(ret == FI_SUCCESS); + write_count++; + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + } + + /* Suppress completions using fi_writemsg */ + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + memset(mem_window.mem, 0, send_len); + iov.iov_len = send_len; + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert(ret == FI_SUCCESS); + write_count++; + + while (fi_cntr_read(cxit_write_cntr) != write_count) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + while (mem_window.mem[i] != send_buf[i]) + sched_yield(); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Inject never generates an event */ + send_len = 8; + memset(mem_window.mem, 0, send_len); + ret = fi_inject_write(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0, + key_val); + cr_assert(ret == FI_SUCCESS); + write_count++; + + while (fi_cntr_read(cxit_write_cntr) != write_count) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + while (mem_window.mem[i] != send_buf[i]) + sched_yield(); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test remote counter events with RMA */ +Test(rma, rem_cntr) +{ + int ret; + uint8_t *send_buf; + int win_len = 16 * 1024; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + int count = 0; + + send_buf = calloc(1, win_len); + 
cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + for (send_len = 1; send_len <= win_len; send_len <<= 1) { + ret = fi_write(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for remote counter event, then check data */ + count++; + + while (fi_cntr_read(cxit_rem_cntr) != count) + ; + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + /* Gather source completion after data */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + } + + mr_destroy(&mem_window); + free(send_buf); +} + +/* Test RMA FI_MORE */ +Test(rma, more) +{ + int ret; + uint8_t *send_buf; + int win_len = 16; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + int i; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + for (i = 0; i < win_len; i++) + send_buf[i] = 0xa + i; + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + ret = fi_writemsg(cxit_ep, &msg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + iov[0].iov_base = send_buf + send_len; + rma[0].addr += send_len; + ret = fi_writemsg(cxit_ep, &msg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for two events. */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +Test(rma, std_mr_inject) +{ + int ret; + uint8_t *send_buf; + int iters = 10; + int send_len = 8; + int win_len = send_len * iters; + struct mem_region mem_window; + uint64_t key_val = CXIP_PTL_IDX_MR_OPT_CNT; + struct fi_cq_tagged_entry cqe; + int i; + + send_buf = calloc(1, send_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0xa0, &key_val, &mem_window); + + cr_assert(!fi_cntr_read(cxit_write_cntr)); + + for (i = 0; i < iters; i++) { + /* Send 8 bytes from send buffer data to RMA window 0 */ + ret = fi_inject_write(cxit_ep, send_buf, send_len, + cxit_ep_fi_addr, i * send_len, key_val); + cr_assert(ret == FI_SUCCESS); + } + + /* Corrupt the user buffer to make sure the NIC is not using it for an + * inject. 
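+	 * Inject semantics allow the source buffer to be reused as soon as
+	 * the call returns, so the target window should still receive the
+	 * original zeroed data.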
+ */ + memset(send_buf, 0xff, send_len); + + while (fi_cntr_read(cxit_write_cntr) != iters) + ; + + /* Validate sent data */ + for (int i = 0; i < win_len; i++) + cr_assert_eq(mem_window.mem[i], 0, + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + /* Make sure an event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&mem_window); + free(send_buf); +} + +static void rma_invalid_target_mr_key(uint64_t rkey) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry err; + + /* Zero byte write to invalid MR key. */ + ret = fi_inject_write(cxit_ep, NULL, 0, cxit_ep_fi_addr, 0, rkey); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cxit_write_cntr) != 1) + ; + + /* No target event should be generated. */ + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* There should be an source error entry. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAVAIL); + + /* Expect a source error. */ + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + + /* Expect no other events. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); +} + +Test(rma, invalid_target_std_mr_key) +{ + rma_invalid_target_mr_key(0x1234); +} + +Test(rma, invalid_target_opt_mr_key) +{ + rma_invalid_target_mr_key(0x10); +} + +Test(rma, invalid_source_mr_key) +{ + int ret; + + ret = fi_inject_write(cxit_ep, NULL, 0, cxit_ep_fi_addr, 0, + 0x100000001); + cr_assert(ret == -FI_EKEYREJECTED); +} + +static void rma_invalid_read_target_mr_key(uint64_t rkey) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry err; + + /* Zero byte read to invalid MR key. */ + ret = fi_read(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, 0, rkey, NULL); + cr_assert(ret == FI_SUCCESS); + + while (fi_cntr_readerr(cxit_read_cntr) != 1) + ; + + /* No target event should be generated. */ + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* There should be an source error entry. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAVAIL); + + /* Expect a source error. */ + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + + /* Expect no other events. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); +} + +Test(rma, invalid_read_target_std_mr_key) +{ + rma_invalid_read_target_mr_key(0x1234); +} + +Test(rma, invalid_read_target_opt_mr_key) +{ + rma_invalid_read_target_mr_key(0x10); +} + +static void rma_hybrid_mr_desc_test_runner(bool write, bool cq_events) +{ + struct mem_region source_window; + struct mem_region remote_window; + int iters = 10; + int send_len = 1024; + int win_len = send_len * iters; + uint64_t source_key = 0x2; + uint64_t remote_key = 0x1; + int ret; + int i; + struct iovec msg_iov = {}; + struct fi_rma_iov rma_iov = {}; + struct fi_msg_rma msg_rma = {}; + void *desc[1]; + struct fi_cq_tagged_entry cqe; + uint64_t rma_flags = cq_events ? FI_TRANSMIT_COMPLETE | FI_COMPLETION : + FI_TRANSMIT_COMPLETE; + uint64_t cqe_flags = write ? FI_RMA | FI_WRITE : FI_RMA | FI_READ; + struct fid_cntr *cntr = write ? 
cxit_write_cntr : cxit_read_cntr; + + ret = mr_create(win_len, FI_READ | FI_WRITE, 0xa, &source_key, + &source_window); + cr_assert(ret == FI_SUCCESS); + + desc[0] = fi_mr_desc(source_window.mr); + cr_assert(desc[0] != NULL); + + ret = mr_create(win_len, FI_REMOTE_READ | FI_REMOTE_WRITE, 0x3, + &remote_key, &remote_window); + cr_assert(ret == FI_SUCCESS); + + msg_rma.msg_iov = &msg_iov; + msg_rma.desc = desc; + msg_rma.iov_count = 1; + msg_rma.addr = cxit_ep_fi_addr; + msg_rma.rma_iov = &rma_iov; + msg_rma.rma_iov_count = 1; + + for (i = 0; i < iters; i++) { + msg_iov.iov_base = source_window.mem + (i * send_len); + msg_iov.iov_len = send_len; + + rma_iov.addr = i * send_len; + rma_iov.key = remote_key; + rma_iov.len = send_len; + + if (write) + ret = fi_writemsg(cxit_ep, &msg_rma, rma_flags); + else + ret = fi_readmsg(cxit_ep, &msg_rma, rma_flags); + cr_assert_eq(ret, FI_SUCCESS, "Bad rc=%d\n", ret); + } + + ret = fi_cntr_wait(cntr, iters, 1000); + cr_assert(ret == FI_SUCCESS); + + if (cq_events) { + for (i = 0; i < iters; i++) { + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, cqe_flags, NULL); + } + } + + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + for (i = 0; i < win_len; i++) + cr_assert_eq(source_window.mem[i], remote_window.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + source_window.mem[i], remote_window.mem[i]); + + mr_destroy(&source_window); + mr_destroy(&remote_window); +} + +TestSuite(rma_hybrid_mr_desc, .init = cxit_setup_rma_hybrid_mr_desc, + .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rma_hybrid_mr_desc, non_inject_selective_completion_write) +{ + rma_hybrid_mr_desc_test_runner(true, false); +} + +Test(rma_hybrid_mr_desc, selective_completion_read) +{ + rma_hybrid_mr_desc_test_runner(false, false); +} + +Test(rma_hybrid_mr_desc, non_inject_completion_write) +{ + rma_hybrid_mr_desc_test_runner(true, true); +} + +Test(rma_hybrid_mr_desc, completion_read) +{ + rma_hybrid_mr_desc_test_runner(false, true); +} + +static void rma_hybrid_invalid_addr_mr_desc_test_runner(bool write, + bool cq_events) +{ + struct mem_region source_window; + struct mem_region remote_window; + int send_len = 1024; + uint64_t source_key = 0x2; + uint64_t remote_key = 0x1; + int ret; + struct iovec msg_iov = {}; + struct fi_rma_iov rma_iov = {}; + struct fi_msg_rma msg_rma = {}; + void *desc[1]; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry err; + uint64_t rma_flags = cq_events ? FI_TRANSMIT_COMPLETE | FI_COMPLETION : + FI_TRANSMIT_COMPLETE; + struct fid_cntr *cntr = write ? cxit_write_cntr : cxit_read_cntr; + + ret = mr_create(send_len, FI_READ | FI_WRITE, 0xa, &source_key, + &source_window); + cr_assert(ret == FI_SUCCESS); + + desc[0] = fi_mr_desc(source_window.mr); + cr_assert(desc[0] != NULL); + + ret = mr_create(send_len, FI_REMOTE_READ | FI_REMOTE_WRITE, 0x3, + &remote_key, &remote_window); + cr_assert(ret == FI_SUCCESS); + + msg_rma.msg_iov = &msg_iov; + msg_rma.desc = desc; + msg_rma.iov_count = 1; + msg_rma.addr = cxit_ep_fi_addr; + msg_rma.rma_iov = &rma_iov; + msg_rma.rma_iov_count = 1; + + /* Generate invalid memory address. 
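+	 * The offset pushes iov_base well outside the registered source
+	 * window, so the operation is expected to complete with a
+	 * source-side error rather than a target event.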
*/ + msg_iov.iov_base = source_window.mem + 0xfffffffff; + msg_iov.iov_len = send_len; + + rma_iov.key = remote_key; + rma_iov.len = send_len; + + if (write) + ret = fi_writemsg(cxit_ep, &msg_rma, rma_flags); + else + ret = fi_readmsg(cxit_ep, &msg_rma, rma_flags); + cr_assert_eq(ret, FI_SUCCESS, "Bad rc=%d\n", ret); + + while (fi_cntr_readerr(cntr) != 1) + ; + + /* No target event should be generated. */ + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* There should be an source error entry. */ + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == -FI_EAVAIL); + + /* Expect a source error. */ + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert(ret == 1); + + /* Expect no other events. */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + mr_destroy(&source_window); + mr_destroy(&remote_window); +} + +Test(rma_hybrid_mr_desc, invalid_addr_non_inject_selective_completion_write) +{ + rma_hybrid_invalid_addr_mr_desc_test_runner(true, false); +} + +Test(rma_hybrid_mr_desc, invalid_addr_selective_completion_read) +{ + rma_hybrid_invalid_addr_mr_desc_test_runner(false, false); +} + +Test(rma_hybrid_mr_desc, invalid_addr_non_inject_completion_write) +{ + rma_hybrid_invalid_addr_mr_desc_test_runner(true, true); +} + +Test(rma_hybrid_mr_desc, invalid_addr_completion_read) +{ + rma_hybrid_invalid_addr_mr_desc_test_runner(false, true); +} + +void cxit_rma_setup_tx_alias_no_fence(void) +{ + int ret; + uint64_t order = FI_ORDER_RMA_WAW; + + cxit_setup_getinfo(); + cxit_fi_hints->caps = CXIP_EP_PRI_CAPS; + cxit_setup_tx_alias_rma_dc(); + + /* Set WAW ordering */ + ret = fi_set_val(&cxit_tx_alias_ep->fid, FI_OPT_CXI_SET_MSG_ORDER, + (void *)&order); + cr_assert_eq(ret, FI_SUCCESS, "fi_set_val(FI_OPT_SET_MSG_ORDER)"); +} + +/* RMA TX Alias capability */ +TestSuite(rma_tx_alias, .init = cxit_rma_setup_tx_alias_no_fence, + .fini = cxit_teardown_tx_alias_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rma_tx_alias, flush) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = FI_DELIVERY_COMPLETE; + uint64_t flushes_start; + uint64_t flushes_end; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_start, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Send 8 bytes from send buffer data to RMA window 0 at FI address 0 + * (self) + */ + ret = fi_writemsg(cxit_tx_alias_ep, &msg, flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + 
mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_DMAWR_FLUSH_REQS, + &flushes_end, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + cr_assert(flushes_end > flushes_start); +} + +Test(rma_tx_alias, weak_fence) +{ + int ret; + uint8_t *send_buf; + int win_len = 0x1000; + int send_len = 8; + int i; + struct mem_region mem_window; + uint64_t key_val = RMA_WIN_KEY; + struct fi_cq_tagged_entry cqe; + struct fi_msg_rma msg = {}; + struct iovec iov[1]; + struct fi_rma_iov rma[1]; + uint64_t flags = FI_DELIVERY_COMPLETE; + + send_buf = calloc(1, win_len); + cr_assert_not_null(send_buf, "send_buf alloc failed"); + + for (i = 0; i < send_len*2; i++) + send_buf[i] = i; + + mr_create(win_len, FI_REMOTE_WRITE, 0x44, &key_val, &mem_window); + + iov[0].iov_base = send_buf; + iov[0].iov_len = send_len; + + rma[0].addr = 0; + rma[0].len = send_len; + rma[0].key = key_val; + + msg.msg_iov = iov; + msg.iov_count = 1; + msg.rma_iov = rma; + msg.rma_iov_count = 1; + msg.addr = cxit_ep_fi_addr; + + /* Verify FI_FENCE can not be done with original EP */ + ret = fi_writemsg(cxit_ep, &msg, flags | FI_FENCE); + cr_assert_eq(ret, -FI_EINVAL, "fi_writemsg FI_FENCE ret %d", ret); + + /* Verify FI_CXI_WEAK_FENCE can be done with original EP */ + ret = fi_writemsg(cxit_ep, &msg, flags | FI_CXI_WEAK_FENCE); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg FI_WEAK_FENCE ret %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + /* Verifiy FI_CXI_WEAK_FENCE can be done with alias EP */ + rma[0].addr = send_len; + iov[0].iov_base = send_buf + send_len; + ret = fi_writemsg(cxit_tx_alias_ep, &msg, flags | FI_CXI_WEAK_FENCE); + cr_assert_eq(ret, FI_SUCCESS, "fi_writemsg FI_WEAK_FENCE ret %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < send_len * 2; i++) + cr_assert_eq(mem_window.mem[i], send_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + mem_window.mem[i], send_buf[i]); + + mr_destroy(&mem_window); + free(send_buf); +} + +TestSuite(rma_mr_event, .init = cxit_setup_rma, .fini = cxit_teardown_rma, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test that use of stale MR keys cannot access cached memory */ +Test(rma_mr_event, stale_key) +{ + int ret; + long i; + struct fi_cq_err_entry err; + struct fi_cq_tagged_entry cqe; + struct fid_mr *mr; + struct cxip_mr *cxip_mr; + uint8_t *src_buf; + uint8_t *src_buf2; + uint8_t *tgt_buf; + int src_len = 8; + int tgt_len = 4096; + uint64_t key_val = 200; + + src_buf = malloc(src_len); + cr_assert_not_null(src_buf, "src_buf alloc failed"); + src_buf2 = malloc(src_len); + cr_assert_not_null(src_buf2, "src_buf2 alloc failed"); + tgt_buf = calloc(1, tgt_len); + cr_assert_not_null(tgt_buf, "tgt_buf alloc failed"); + + for (i = 0; i < src_len; i++) { + src_buf[i] = 0xb1 * i; + src_buf2[i] = 0xa1 * i; + } + + /* Create MR */ + ret = fi_mr_reg(cxit_domain, tgt_buf, tgt_len, FI_REMOTE_WRITE, 0, + key_val, 0, &mr, NULL); + cr_assert(ret == FI_SUCCESS); + + /* We known cached FI_MR_PROV_KEY cannot support this + * level of robustness, so just skip FI_MR_PROV_KEY + * unless FI_CXI_MR_MATCH_EVENTS is enabled. 
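+	 * The check below uses cxip_mr->count_events to detect whether MR
+	 * match events are active for this MR.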
+ */ + cxip_mr = container_of(mr, struct cxip_mr, mr_fid); + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY && + !cxip_mr->count_events) { + fi_close(&mr->fid); + goto done; + } + + ret = fi_mr_bind(mr, &cxit_ep->fid, 0); + cr_assert(ret == FI_SUCCESS); + + ret = fi_mr_enable(mr); + cr_assert(ret == FI_SUCCESS); + + if (cxit_fi->domain_attr->mr_mode & FI_MR_PROV_KEY) + key_val = fi_mr_key(mr); + + ret = fi_write(cxit_ep, src_buf, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + validate_tx_event(&cqe, FI_RMA | FI_WRITE, NULL); + + /* Validate sent data */ + for (int i = 0; i < src_len; i++) + cr_assert_eq(tgt_buf[i], src_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + tgt_buf[i], src_buf[i]); + + /* Close MR but leave memory backing it allocated/cached */ + fi_close(&mr->fid); + + /* Try to access using stale key */ + ret = fi_write(cxit_ep, src_buf2, src_len, NULL, + cxit_ep_fi_addr, 0, key_val, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, -FI_EAVAIL, "Unexpected RMA success %d", ret); + + ret = fi_cq_readerr(cxit_tx_cq, &err, 1); + cr_assert_eq(ret, 1); + cr_assert_eq(err.err, FI_EIO, "Error return %d", err.err); + + /* Verfiy data was not modified with src_buf2 data */ + for (int i = 0; i < src_len; i++) + cr_assert_eq(tgt_buf[i], src_buf[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + tgt_buf[i], src_buf[i]); + +done: + free(tgt_buf); + free(src_buf); + free(src_buf2); +} diff --git a/prov/cxi/test/rocr.c b/prov/cxi/test/rocr.c new file mode 100644 index 00000000000..3d9567e133e --- /dev/null +++ b/prov/cxi/test/rocr.c @@ -0,0 +1,763 @@ +/* + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "libcxi/libcxi.h" +#include "cxip.h" +#include "cxip_test_common.h" + +#define MAX_MSG_SIZE 1048576U +#define MAX_BUF_OFFSET 65536U +#define REGION_MAX 255 + +static unsigned int seed; +static hsa_agent_t agent; +static hsa_region_t regions[REGION_MAX]; +static int num_regions; +static hsa_region_t coarse_grain; +bool coarse_grain_valid; +static hsa_region_t fine_grain; +bool fine_grain_valid; + +static hsa_status_t get_gpu_agent(hsa_agent_t agent, void *data) { + hsa_status_t status; + hsa_device_type_t device_type; + + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + if (HSA_STATUS_SUCCESS == status && HSA_DEVICE_TYPE_GPU == device_type) { + hsa_agent_t* ret = (hsa_agent_t*)data; + *ret = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t callback_get_num_regions(hsa_region_t region, void* data) { + int *num_regions = (int *)data; + (*num_regions)++; + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t callback_get_regions(hsa_region_t region, void* data) { + hsa_region_t **region_list = (hsa_region_t **)data; + **region_list = region; + (*region_list)++; + return HSA_STATUS_SUCCESS; +} + +static void hsa_test_init(void) +{ + hsa_status_t hsa_ret; + hsa_region_t *ptr_reg = regions; + int i; + size_t size_r; + + enable_cxi_hmem_ops = 0; + seed = time(NULL); + srand(seed); + + hsa_ret = hsa_init(); + 
cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+
+	hsa_ret = hsa_iterate_agents(get_gpu_agent, &agent);
+	cr_assert_eq(hsa_ret, HSA_STATUS_INFO_BREAK);
+
+	hsa_ret = hsa_agent_iterate_regions(agent, callback_get_num_regions,
+					    &num_regions);
+	cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+	cr_assert(num_regions <= REGION_MAX);
+
+	hsa_ret = hsa_agent_iterate_regions(agent, callback_get_regions,
+					    &ptr_reg);
+	cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+
+	for (i = 0; i < num_regions; i++) {
+		hsa_ret = hsa_region_get_info(regions[i],
+					      HSA_REGION_INFO_GLOBAL_FLAGS,
+					      &size_r);
+		cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+
+		if (size_r & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED &&
+		    !fine_grain_valid) {
+			fine_grain = regions[i];
+			fine_grain_valid = true;
+		}
+
+		if (size_r & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED &&
+		    !coarse_grain_valid) {
+			coarse_grain = regions[i];
+			coarse_grain_valid = true;
+		}
+
+		if (fine_grain_valid && coarse_grain_valid)
+			break;
+	}
+
+	cr_assert_eq(coarse_grain_valid, true,
+		     "Failed to find coarse grain memory");
+	cr_assert_eq(fine_grain_valid, true,
+		     "Failed to find fine grain memory");
+}
+
+static void hsa_test_fini(void)
+{
+	hsa_status_t hsa_ret;
+
+	hsa_ret = hsa_shut_down();
+	cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS);
+}
+
+TestSuite(hsa, .timeout = CXIT_DEFAULT_TIMEOUT, .init = hsa_test_init,
+	  .fini = hsa_test_fini);
+
+static void hsa_message_runner(void *hsa_send_buf, void *hsa_recv_buf,
+			       size_t buf_size, bool device_only_mem,
+			       bool unexpected)
+{
+	int ret;
+	char *send_buf;
+	char *recv_buf;
+	struct fi_cq_tagged_entry cqe;
+	int i;
+	hsa_status_t hsa_ret;
+	int j;
+
+	cxit_setup_msg();
+
+	/* For device-only memory, separate host send and recv buffers are
+	 * used for data validation.
+	 */
+	if (device_only_mem) {
+		send_buf = malloc(buf_size);
+		cr_assert_neq(send_buf, NULL, "Failed to allocate memory");
+
+		recv_buf = calloc(1, buf_size);
+		cr_assert_neq(recv_buf, NULL, "Failed to allocate memory");
+	} else {
+		send_buf = hsa_send_buf;
+		recv_buf = hsa_recv_buf;
+	}
+
+	for (j = 0; j < 2; j++) {
+
+		ret = open("/dev/urandom", O_RDONLY);
+		cr_assert_neq(ret, -1, "open failed: %d", -errno);
+		read(ret, send_buf, buf_size);
+		close(ret);
+
+		if (device_only_mem) {
+			hsa_ret = hsa_memory_copy(hsa_send_buf, send_buf,
+						  buf_size);
+			cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS,
+				     "hsaMemcpy failed: %d", hsa_ret);
+		}
+
+		if (unexpected) {
+			ret = fi_send(cxit_ep, hsa_send_buf, buf_size, NULL, cxit_ep_fi_addr,
+				      NULL);
+			cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret);
+
+			ret = fi_recv(cxit_ep, hsa_recv_buf, buf_size, NULL, cxit_ep_fi_addr,
+				      NULL);
+			cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret);
+		} else {
+			ret = fi_recv(cxit_ep, hsa_recv_buf, buf_size, NULL, cxit_ep_fi_addr,
+				      NULL);
+			cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret);
+
+			ret = fi_send(cxit_ep, hsa_send_buf, buf_size, NULL, cxit_ep_fi_addr,
+				      NULL);
+			cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret);
+		}
+
+		do {
+			ret = fi_cq_read(cxit_rx_cq, &cqe, 1);
+		} while (ret == -FI_EAGAIN);
+		cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret);
+
+		do {
+			ret = fi_cq_read(cxit_tx_cq, &cqe, 1);
+		} while (ret == -FI_EAGAIN);
+		cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret);
+
+		if (device_only_mem) {
+			hsa_ret = hsa_memory_copy(recv_buf, hsa_recv_buf,
+						  buf_size);
+			cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS,
+				     "hsaMemcpy failed: %d", hsa_ret);
+		}
+
+		for (i = 0; i < buf_size; i++)
+			cr_assert_eq(send_buf[i], recv_buf[i],
+				     "Data corruption at byte %d seed 
%u iter %d", i, seed, j); + } + + if (device_only_mem) { + free(recv_buf); + free(send_buf); + } + + cxit_teardown_msg(); +} + +enum mem_type { + COARSE, + FINE, +}; + +static void hsa_dev_memory_test(size_t buf_size, size_t buf_offset, + bool unexpected, bool hmem_dev_reg, + enum mem_type type) +{ + hsa_status_t hsa_ret; + void *hsa_send_buf; + void *hsa_recv_buf; + int ret; + hsa_region_t region; + + if (hmem_dev_reg) + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + else + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + if (type == COARSE) + region = coarse_grain; + else + region = fine_grain; + + /* hsa buffers will be used for RDMA. */ + hsa_ret = hsa_memory_allocate(region, buf_size + buf_offset, + &hsa_send_buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", hsa_ret); + + hsa_ret = hsa_memory_allocate(region, buf_size + buf_offset, + &hsa_recv_buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", hsa_ret); + + hsa_message_runner((void *)((char *)hsa_send_buf + buf_offset), + (void *)((char *)hsa_recv_buf + buf_offset), + buf_size, true, unexpected); + + hsa_ret = hsa_memory_free(hsa_recv_buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); + + hsa_ret = hsa_memory_free(hsa_send_buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); + +} + +/* Test messaging using rendezvous, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, COARSE); +} + +/* Test messaging using eager, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, COARSE); +} + +/* Test messaging using IDC, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, COARSE); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_unexpected_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, COARSE); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. 
+ */ +Test(hsa, messaging_devMemory_eager_unexpected_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, COARSE); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_unexpected_hmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, COARSE); +} + +/* Test messaging using rendezvous, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, COARSE); +} + +/* Test messaging using eager, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, COARSE); +} + +/* Test messaging using IDC, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, COARSE); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_unexpected_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, COARSE); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_unexpected_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, COARSE); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_unexpected_noHmemDevReg_coarse) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, COARSE); +} + +/* Test messaging using rendezvous, device memory, and HMEM device memory + * registration for load/store access. 
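+ * The while loop below retries rand() until the size exceeds 64 KiB so
+ * that the large-message (rendezvous) path is exercised.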
+ */ +Test(hsa, messaging_devMemory_rdvz_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, FINE); +} + +/* Test messaging using eager, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, FINE); +} + +/* Test messaging using IDC, device memory, and HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, true, FINE); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_unexpected_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, FINE); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_unexpected_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, FINE); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_unexpected_hmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, true, FINE); +} + +/* Test messaging using rendezvous, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, FINE); +} + +/* Test messaging using eager, device memory, and without HMEM device memory + * registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, FINE); +} + +/* Test messaging using IDC, device memory, and without HMEM device memory + * registration for load/store access. 
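+ * (hsa_dev_memory_test sets FI_CXI_DISABLE_HMEM_DEV_REGISTER=1 for the
+ * noHmemDevReg variants.)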
+ */ +Test(hsa, messaging_devMemory_idc_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, false, false, FINE); +} + +/* Test messaging using rendezvous, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_rdvz_unexpected_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % MAX_MSG_SIZE; + if (buf_size > 65536) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, FINE); +} + +/* Test messaging using eager, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_eager_unexpected_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + while (true) { + buf_size = rand() % 1024; + if (buf_size > 256) + break; + } + + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, FINE); +} + +/* Test messaging using IDC, device memory, unexpected messaging, and + * without HMEM device memory registration for load/store access. + */ +Test(hsa, messaging_devMemory_idc_unexpected_noHmemDevReg_fine) +{ + size_t buf_size; + size_t buf_offset; + + buf_size = rand() % 128; + buf_offset = rand() % MAX_BUF_OFFSET; + + hsa_dev_memory_test(buf_size, buf_offset, true, false, FINE); +} + +static void verify_dev_reg_handle(bool hmem_dev_reg, enum mem_type type) +{ + int ret; + void *buf; + hsa_status_t hsa_ret; + struct fid_mr *fid_mr; + size_t buf_size = 1024; + struct cxip_mr *mr; + hsa_region_t region; + + cxit_setup_msg(); + + if (type == COARSE) + region = coarse_grain; + else + region = fine_grain; + + hsa_ret = hsa_memory_allocate(region, buf_size, &buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaMalloc failed: %d", + hsa_ret); + + ret = fi_mr_reg(cxit_domain, buf, buf_size, FI_READ, 0, 0x123, 0, + &fid_mr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_mr_reg failed: %d", ret); + + mr = container_of(fid_mr, struct cxip_mr, mr_fid); + + cr_assert_eq(mr->md->handle_valid, hmem_dev_reg, + "Bad cxip_md handle_valid"); + cr_assert_eq(mr->md->info.iface, FI_HMEM_ROCR, + "Invalid CXIP MD iface: %d", mr->md->info.iface); + + ret = fi_close(&fid_mr->fid); + cr_assert_eq(ret, FI_SUCCESS, "fi_close MR failed: %d", ret); + + hsa_ret = hsa_memory_free(buf); + cr_assert_eq(hsa_ret, HSA_STATUS_SUCCESS, "hsaFree failed: %d", + hsa_ret); + + cxit_teardown_msg(); +} + +/* Verify MD handle is false. */ +Test(hsa, verify_noHmemDevReg_coarse) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(false, COARSE); +} + +/* Verify MD handle is true. */ +Test(hsa, verify_hmemDevReg_coarse) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(true, COARSE); +} + +/* Verify MD handle is false. */ +Test(hsa, verify_noHmemDevReg_fine) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "1", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(false, FINE); +} + +/* Verify MD handle is true. 
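+ * (FI_CXI_DISABLE_HMEM_DEV_REGISTER=0 keeps device registration enabled,
+ * so md->handle_valid is expected to be true.)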
*/ +Test(hsa, verify_hmemDevReg_fine) +{ + int ret; + + ret = setenv("FI_CXI_DISABLE_HMEM_DEV_REGISTER", "0", 1); + cr_assert_eq(ret, 0, "setenv failed: %d", -errno); + + verify_dev_reg_handle(true, FINE); +} diff --git a/prov/cxi/test/run.sh b/prov/cxi/test/run.sh new file mode 100644 index 00000000000..053df6ee304 --- /dev/null +++ b/prov/cxi/test/run.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# +# Run a command in a VM. Start a new VM if necessary. + +RUNCMD=$@ +DIR=`dirname $0` + +if ! [ -c /dev/cxi0 ]; then + echo "Cassini device not present; attempting to launch netsim VM" + RUNCMD="$RUNCMD" $DIR/startvm.sh +else + if [ -z "$RUNCMD" ]; then + RUNCMD=${SHELL} + fi + ${RUNCMD} +fi diff --git a/prov/cxi/test/run_criterion_tests.py b/prov/cxi/test/run_criterion_tests.py new file mode 100644 index 00000000000..093f0fee2c2 --- /dev/null +++ b/prov/cxi/test/run_criterion_tests.py @@ -0,0 +1,551 @@ +#!/usr/bin/env python3 + +""" +Executes criterion tests using parameters defined in an +input yaml file and outputs results to a file in TAP format +""" + +import pexpect +import sys +import yaml +import re + +from enum import Enum +from argparse import ArgumentParser +from contextlib import ExitStack + + +class Result(Enum): + """ + Enum for test results + """ + PASSED = 1 + FAILED = 2 + SKIPPED = 3 + + +class Node: + """ + Class for managing a node's SSH connection + """ + def __init__(self, node_name): + """ + Args: + node_name: the name of the node + """ + self.name = node_name + self.ssh = open_ssh(self.name, prompt=PROMPT) + self.ssh.logfile = sys.stdout + + +class TestSet: + """ + Set of tests that use common runtime parameters and CSR settings + """ + def __init__(self, desc, test_filter=None, runtime_params=None, csr_list=None): + """ + Args: + desc: description of the test set + test_filter: tests to run + runtime_params: runtime parameters to use with this test set + csr_list: csrs to set prior to running the tests + """ + self.description = desc + self.filter = test_filter + self.runtime_params = runtime_params + self.csr_list = csr_list + self.csr_list_initial_vals = [] + self.test_list = [] + + # generate the list of tests + self.generate_testlist() + + def set_csrs_for_test_set(self): + """ + capture original csr values and set csrs to new values + + """ + if self.csr_list is not None: + send(node, "cd {}".format(PYCXI_DIR)) + + for c in self.csr_list: + csr = c[0] + field = c[1] + new_value = c[2] + + # capture initial values + orig_val = get_csr_value(csr, field) + self.csr_list_initial_vals.append([csr, field, orig_val]) + + # set new values + set_csr_value(csr, field, new_value) + + def restore_csrs(self): + """ + Restore csrs to their original values + + """ + if self.csr_list is not None: + print("Restoring CSRs...") + send(node, "cd {}".format(PYCXI_DIR)) + + for c in self.csr_list_initial_vals: + field = c[1] + if isinstance(c[2], dict): + # csr is an array, so set each member of the array + for csr, orig_value in c[2].items(): + set_csr_value(csr, field, orig_value) + else: + csr = c[0] + orig_value = c[2] + set_csr_value(csr, field, orig_value) + + def generate_testlist(self): + """ + generate a list of tests to run based on the provided filter + + """ + send(node, "cd {}".format(TEST_DIR)) + + # create criterion test list + send(node, './cxitest -l > testlist 2>&1 && sleep 1 && echo "DONE"', resp_1="DONE") + f_name = "{}/testlist".format(TEST_DIR) + with open(f_name) as file: + all_lines = [line.rstrip() for line in file.readlines()] + + # create regex instance for filter 
(if needed) + regex_filter = None + if self.filter is not None: + regex_filter = re.compile(self.filter, re.IGNORECASE) + + # parse testlist and create test objects for this test set + area = None + for line in all_lines: + if ':' in line: + area = line.split()[0].replace(":", "") + else: + tst_name = line.split()[1] + + # indicates "disabled" flag was set in Criterion test + skip_test = "skipped" in line + + # create test objects for this test set based on the filter (if provided) + if self.filter is None or ( + regex_filter is not None and regex_filter.match("{}/{}".format(area, tst_name))): + tst = Test(area, tst_name, self.description, self.runtime_params, skip=skip_test) + self.test_list.append(tst) + + def execute_tests(self): + """ + Executes the tests in the test set and capture the output + """ + with ExitStack() as cleanup: + + # restore CSRs on exit + cleanup.callback(self.restore_csrs) + + # set CSRs for test set + self.set_csrs_for_test_set() + + send(node, "cd {}".format(TEST_DIR)) + + # execute tests in test list + for te in self.test_list: + sys.stdout.flush() + cmd = '{} > tmp_result 2>&1 && echo "DONE"'.format(te.test_cmd) + + # execute test + send(node, cmd, resp_1="DONE", timeout=60) + + results_index = 0 + enable_logging = False + + # process raw results file + with open("{}/tmp_result".format(TEST_DIR)) as file: + all_lines = [line.strip() for line in file.readlines()] + + # capture all output related to this test + for ln in all_lines: + line = ansi_escape.sub('', ln).rstrip() + test_str = " {}::{}".format(te.test_area, te.test_name) + if test_str in line: + if "RUN" in line and line.endswith(test_str): + # start capturing output for this test + enable_logging = True + + # create a TestResult instance for this test + te.results.append(TestResult(results_index)) + + # if CSRs were modified, include that in the log: + if self.csr_list is not None: + te.results[results_index].log.append("Modified CSRs: {}".format(self.csr_list)) + + # log the actual Criterion test command + te.results[results_index].log.append("Test cmd: {}".format(te.test_cmd)) + + # log the "RUN" output + te.results[results_index].log.append(line) + elif "{}:".format(test_str) in line: + + # set the test result + te.results[results_index].result = get_result(line) + + # capture the entire result line + te.results[results_index].log.append(line) + + # the test is finished, so stop capturing output for this test + enable_logging = False + + # increment index (multiple results for Test instance indicates a parameterized tests) + results_index += 1 + + elif enable_logging: + # test is in process, so capture all console output that occurs + te.results[results_index].log.append(line) + + # display all logged output belonging to this particular test + for res in te.results: + print("\n-------------------------------------------------------") + for s in res.log: + print(s) + print("-------------------------------------------------------\n") + + +class Test: + """ + An individual test, which may contain multiple TestResult objects if the test is parameterized + """ + def __init__(self, test_area, test_name, desc, t_params=None, skip=False): + """ + Args: + test_area: the test area + test_name: the test name + desc: description of the test + t_params: runtime parameters for this test + skip: flag to indicate if the test should be skipped + """ + self.test_area = test_area + self.test_name = test_name + self.desc = desc + self.skip = skip + self.results = [] + + # create the runtime parameters string for this 
test + param_str = "" + if t_params is not None: + for pa, v in t_params.items(): + param_str += "{}={} ".format(pa, v) + + self.test_params = param_str + + # create the test cmd + self.test_cmd = \ + '{} ./cxitest --filter="{}/{}" --verbose=1 -j1 --ascii'.format(param_str, test_area, test_name) + + # create TestResult for skipped test + if self.skip: + st = TestResult() + st.result = Result.SKIPPED + self.results.append(st) + + def create_tap_results(self): + """ + Parse results log and create TAP results for this test + """ + for res in self.results: + # get test number for this test + test_num = get_current_test_count_and_inc() + + # determine TAP result based on test result + tap_result = "ok {}".format(test_num) if res.result != Result.FAILED else "not ok {}".format(test_num) + + # construct the TAP test name + t_name = "{}::{}".format(self.test_area, self.test_name) + + # if we have a parameterized test, append index to the test name + if len(self.results) > 1: + t_name = "{}::{}".format(t_name, res.index) + + # append the description + t_name = "{} - {}".format(t_name, self.desc) + + # if test was skipped, include skip comment + if res.result == Result.SKIPPED: + t_name = "{} # skip".format(t_name) + + # include additional comment for disabled tests + if self.skip: + t_name += " Disabled flag set in criterion test " + + # append the tap result and test name to the tap report + tap_report.append("{} {}".format(tap_result, t_name)) + + # include all logged output during this test in the tap report + for m in res.log: + tap_report.append("# {}".format(m)) + + +class TestResult: + """ + Result and log for a particular test + """ + def __init__(self, index=0): + """ + + Args: + index: test index - used with parameterized tests + """ + self.index = index + self.result = Result.FAILED + self.log = [] + + +def get_result(the_line): + """ + Determine the test result from the given line + + Args: + the_line: the line to check + + Returns: the result + + """ + if "PASS" in the_line: + return Result.PASSED + elif "SKIP" in the_line: + return Result.SKIPPED + else: + return Result.FAILED + + +def set_csr_value(csr, field, value): + """ + Sets a CSR field to the given value + + Args: + csr: the CSR + field: the field + value: the value + + """ + # use cxiutil to set the value + send(node, "cd {}".format(PYCXI_DIR)) + cmd = 'cxiutil store csr {} {}={} && sleep 1 && echo "DONE"'.format(csr, field, value) + send(node, cmd, resp_1="DONE") + sys.stdout.flush() + + # verify the new value is set as expected + new_val = get_csr_value(csr, field) + if isinstance(new_val, dict): + # we have a CSR array, so verify each member of the array + for v in new_val.values(): + if int(v) != int(value): + raise RuntimeError("Unable to set CSR with cmd: {}. " + "Actual value of {} = {}".format(cmd, field, v)) + else: + if int(new_val) != int(value): + raise RuntimeError("Unable to set CSR with cmd: {}. " + "Actual value of {} = {}".format(cmd, field, new_val)) + + +def get_csr_value(csr, field): + """ + Returns the value of the CSR field. 
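+    The value is read by dumping the CSR with cxiutil and parsing its output.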
If the CSR is an array, returns a dict containing each CSR index and value + + Args: + csr: the CSR + field: the field + + Returns: the value, or a dict containing each CSR index and value + + """ + + # use cxiutil to get the value + send(node, "cd {}".format(PYCXI_DIR)) + sys.stdout.flush() + send(node, 'cxiutil dump csr {} > tmp && sleep 1 && echo "DONE"'.format(csr, field), resp_1="DONE") + + with open("{}/tmp".format(PYCXI_DIR)) as file: + all_lines = [line.rstrip() for line in file.readlines()] + + # parse the cxiutil output + response = {} + for line in all_lines: + if "hex" in line: + csr = line.split()[0] + + if field in line and "0x" in line: + response[csr] = line.split()[2] + + # csr array, so return a dict containing each value in the csr array + if len(response) > 1: + return response + # not a csr array, so just return the value + elif len(response) == 1: + return response[csr] + else: + raise RuntimeError("Unable to read CSR {} {}".format(csr, field)) + + +def generate_tap_file(): + """ + generate the TAP results file + """ + total_test_count = 0 + + # capture the total number of tests + for ts in test_set_list: + for tst in ts.test_list: + total_test_count += len(tst.results) + + # add TAP header line + tap_header = "1..{}".format(total_test_count) + tap_report.append(tap_header) + + # capture TAP results of each test + for ts in test_set_list: + for element in ts.test_list: + element.create_tap_results() + + # create TAP file + with open(RESULTS_FILE, 'w') as file_handler: + for tap_line in tap_report: + file_handler.write("{}\n".format(tap_line)) + print(tap_line) + + +def get_current_test_count_and_inc(): + """ + returns the current test count prior to incrementing it + + Returns: the current test count + + """ + global current_test_count + tmp_count = current_test_count + current_test_count += 1 + return tmp_count + + +def open_ssh(node_addr, prompt): + """ + Create ssh connection to the given ip address + + Args: + node_addr: the node name / ip address + prompt: the prompt to expect + + Returns: SSH connection / process + + """ + s = pexpect.spawn("ssh {}".format(node_addr), encoding='utf-8') + try: + rc = s.expect([prompt, "Password:"], timeout=30) + if rc == 1: + s.sendline(PASSWORD) + s.expect(prompt, timeout=10) + except pexpect.TIMEOUT: + print("Unable to ssh to {}".format(node_addr)) + raise pexpect.TIMEOUT + return s + + +def send(the_node, cmd, resp_1=None, resp_2=None, expect_prompt=True, timeout=30): + """ + send a command to the given node and verify expected response(s) + + Args: + the_node: the node + cmd: the command to send + resp_1: the first expected response (if not None) + resp_2: the second expected response (if not None) + expect_prompt: flag to indicate if a prompt is expected + timeout: the maximum time to wait for a response before throwing an exception + """ + ssh_sesh = the_node.ssh + ssh_sesh.sendline(cmd) + + if resp_1: + ssh_sesh.expect(resp_1, timeout=timeout) + + if resp_2: + ssh_sesh.expect(resp_2, timeout=timeout) + + if expect_prompt: + ssh_sesh.expect(PROMPT, timeout=timeout) + + +if __name__ == "__main__": + + # used to filter ansi escape chars + ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]') + + p = ArgumentParser("run_criterion_tests") + p.add_argument('-n', + dest="node", + nargs='?', + type=str, + required=True, + help="Name of node where test is to be run") + + p.add_argument('-y', + dest="yaml_file", + nargs='?', + type=str, + required=True, + help="Path to the test YAML file") + + args = p.parse_args() 
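+
+    # Illustrative sketch of the YAML this script expects. Values are
+    # hypothetical; the key names are simply the ones read below
+    # (env, global_runtime_parameters, tests[].description/filter/
+    # runtime_parameters/csrs):
+    #
+    #   env:
+    #     libfabric_dir_on_node: /root/libfabric
+    #     pycxi_dir_on_node: /root/pycxi
+    #     node_prompt: "# "
+    #     node_password: changeme
+    #   global_runtime_parameters:
+    #     - FI_LOG_LEVEL: warn
+    #   tests:
+    #     - description: "tagged message tests"
+    #       filter: "tagged/.*"
+    #       runtime_parameters:
+    #         FI_CXI_RDZV_PROTO: alt_read
+    #       csrs: null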
+ + # parse the input yaml file + try: + with open(args.yaml_file, 'r') as stream: + f = yaml.safe_load(stream) + + except FileNotFoundError: + print("YAML file not found: {}".format(args.yaml_file)) + + LIBFABRIC_DIR = f["env"]["libfabric_dir_on_node"] + TEST_DIR = "{}/prov/cxi/test".format(LIBFABRIC_DIR) + PYCXI_DIR = f["env"]["pycxi_dir_on_node"] + PROMPT = f["env"]["node_prompt"] + PASSWORD = f["env"]["node_password"] + RESULTS_FILE = "{}/results.tap".format(TEST_DIR) + + # holds all TAP results + tap_report = [] + + # instantiate node object + node = Node(args.node) + + # activate pycxi venv for cxiutil and remove old tap files + send(node, "cd {}".format(PYCXI_DIR)) + send(node, ". .venv/bin/activate") + send(node, "cd {}".format(TEST_DIR)) + send(node, "rm *.tap") + + # set global runtime parameters prior to running tests + default_runtime_parameters = f["global_runtime_parameters"] + for params in default_runtime_parameters: + for param, val in params.items(): + send(node, "export {}={}".format(param, val)) + + current_test_count = 1 + + # create test sets + test_set_list = [] + for test in f["tests"]: + test_set_list.append(TestSet( + desc=test["description"], + test_filter=test["filter"], + runtime_params=test["runtime_parameters"], + csr_list=test["csrs"]) + ) + + # execute the tests in each test set + for test_set in test_set_list: + test_set.execute_tests() + + # generate the tap file + generate_tap_file() + diff --git a/prov/cxi/test/run_tests_vm.sh b/prov/cxi/test/run_tests_vm.sh new file mode 100644 index 00000000000..760f5162ecb --- /dev/null +++ b/prov/cxi/test/run_tests_vm.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# +# Run unit tests in a VM. + +DIR=`dirname $0` +cd $DIR + +./run.sh ./test.sh $1 diff --git a/prov/cxi/test/startvm-setup.sh b/prov/cxi/test/startvm-setup.sh new file mode 100644 index 00000000000..06f19c8c958 --- /dev/null +++ b/prov/cxi/test/startvm-setup.sh @@ -0,0 +1,54 @@ +#!/bin/sh +# +# Initialize a VM for CXI testing and run a command. + +DBS_DIR=$(realpath "../../../..") + +if [[ -z $RUNCMD ]]; then + RUNCMD="$@" +fi + +export LC_ALL=en_US.UTF-8 + +ulimit -s unlimited +ulimit -l unlimited + +modprobe ptp +modprobe iommu_v2 || modprobe amd_iommu_v2 +insmod $DBS_DIR/slingshot_base_link/sbl.ko +insmod $DBS_DIR/sl-driver/knl/cxi-sl.ko +insmod $DBS_DIR/cxi-driver/cxi/cxi-core.ko disable_default_svc=0 +insmod $DBS_DIR/cxi-driver/cxi/cxi-user.ko +insmod $DBS_DIR/cxi-driver/cxi/cxi-eth.ko +insmod $DBS_DIR/kdreg2/kdreg2.ko + +# Sleep to wait for Ethernet interface to come up +sleep 3 + +# Locate the first down Ethernet interface and configure it. +regex="eth([0-9]{1}).+DOWN" +eth_id=-1 +interfaces="$(ip addr)" +if [[ $interfaces =~ $regex ]]; then + eth_id=${BASH_REMATCH[1]} +fi + +if [ $eth_id -eq -1 ]; then + echo "Failed to find Ethernet interface" + exit 1 +fi + +AMA=`cat /sys/class/net/eth$eth_id/address | awk -F':' '{print "02:00:" $3 ":" $4 ":" $5 ":" $6}'` + +ip link set eth$eth_id addr $AMA +ip link set dev eth$eth_id up + +# Add pycxi utilities to path +export PATH=$DBS_DIR/pycxi/utils:$PATH + +# Initialize pycxi environment +. $DBS_DIR/pycxi/.venv/bin/activate + +if [[ ! -z $RUNCMD ]]; then + $RUNCMD +fi diff --git a/prov/cxi/test/startvm.sh b/prov/cxi/test/startvm.sh new file mode 100644 index 00000000000..933bd082fed --- /dev/null +++ b/prov/cxi/test/startvm.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +# Start a VM, optionally load the test driver, and exit. + +# The parameters are given to netsim. See netsim -h. 
+# ./startvm.sh -> run one instance with 1 NIC +# ./startvm.sh -N 3 -> run one instance with 3 NICs +# ./startvm.sh -n 2 -> launch 2 VMs each with 1 NIC +# +# +# Note: When using multiple VMs, it is recommended to set the USE_XTERM +# variable. Each VM will be opened in a new xterm window. +# +# USE_XTERM=1 ./startvm.sh -n 2 + +cd `dirname $0` + +DBS_DIR=$(pwd)/../../../.. +VIRTME_DIR=$DBS_DIR/virtme +QEMU_DIR=$DBS_DIR/cassini-qemu/x86_64-softmmu/ + +# If the emulator is not running, start it. This script must run under +# its control, so qemu can connect to it. Simply relaunch self under +# netsim's control. +if [[ ! -v NETSIM_ID ]]; then + exec $DBS_DIR/nic-emu/netsim $@ $(basename $0) +fi + +# Check whether this script is already in a VM or not (ie. running +# under a hypervisor.) If not, we'll need a different setup for nested +# VMs. +HYP=$(grep -c "^flags.*\ hypervisor" /proc/cpuinfo) + +if [[ $NETSIM_NICS -eq 1 ]]; then + CCN_OPTS="-device ccn,addr=8" +elif [[ $NETSIM_NICS -eq 2 ]]; then + CCN_OPTS="-device ccn,addr=8 -device ccn,addr=13" +elif [[ $NETSIM_NICS -eq 4 ]]; then + CCN_OPTS="-device ccn,addr=8 -device ccn,addr=0xd -device ccn,addr=0x12 -device ccn,addr=0x17" +fi + +# -M q35 = Standard PC (Q35 + ICH9, 2009) (alias of pc-q35-2.10) +# MSI-X needs interrupt remapping enabled to fully work. +# w/ Intel IOMMU. Intremap on requires kernel-irqchip=off OR kernel-irqchip=split +QEMU_OPTS="--qemu-opts -machine q35,kernel-irqchip=split -machine q35 -global q35-pcihost.pci-hole64-size=64G -device intel-iommu,intremap=on,caching-mode=on -smp 4 $CCN_OPTS" +KERN_OPTS="--kopt iommu=pt --kopt intel_iommu=on --kopt iomem=relaxed" +KERN_OPTS="$KERN_OPTS --kopt transparent_hugepage=never --kopt hugepagesz=1g --kopt default_hugepagesz=1g --kopt hugepages=1 --kopt pci=realloc" +KERN_OPTS="$KERN_OPTS --kopt hugepagesz=2M --kopt hugepages=256" + +if [[ $HYP -eq 0 ]]; then + # First VM needs more memory to launch nested VMs + # Only the first VM will have the CCN qemu device. Nested VMs will + # have VFs exported to them + QEMU_OPTS="$QEMU_OPTS -m 8192" + + if [[ -n $QEMU_MOPTS ]]; then + QEMU_OPTS="$QEMU_OPTS $QEMU_MOPTS" + fi +else + # Nested VM. Use the first PCI VF + # PCIFN = 0000:00:14.0 or similar + + # Bind cxi1 to get its info + echo 1 > /sys/class/cxi/cxi0/device/sriov_numvfs + PCIFN=$(basename $(readlink /sys/class/cxi/cxi0/device/virtfn0)) + VENDOR=$(cat /sys/class/cxi/cxi0/device/virtfn0/vendor) + DEVICE=$(cat /sys/class/cxi/cxi0/device/virtfn0/device) + + # Unbind VF from cxi core driver. cxi1 no longer exists + echo $PCIFN > /sys/bus/pci/drivers/cxi_core/unbind + + # Bind the VF to vfio driver + modprobe vfio_pci + echo ${VENDOR##*x} ${DEVICE##*x} > /sys/bus/pci/drivers/vfio-pci/new_id + + # Tell qemu to bind the VF + QEMU_OPTS="$QEMU_OPTS -device vfio-pci,host=$PCIFN" +fi + +PATH=$QEMU_DIR:$VIRTME_DIR:/sbin:$PATH + +VIRTME_OPTS="--rwdir=$(pwd) --pwd" + +if [[ $KDIR ]]; then + VIRTME_OPTS="--kdir $KDIR --mods=auto $VIRTME_OPTS" +else + VIRTME_OPTS="--installed-kernel $VIRTME_OPTS" +fi + +if [[ $MOS ]]; then + QEMU_OPTS="$QEMU_OPTS -m 2048" + KERN_OPTS="$KERN_OPTS --kopt kernelcore=1024M --kopt lwkcpus=0.1-3 --kopt lwkmem=1G" +fi + +SETUP_SCRIPT="`dirname $0`/startvm-setup.sh" + +# Start the VM, execute the script inside, and exit ... +if [[ $RUNCMD ]]; then + virtme-run --script-sh "$SETUP_SCRIPT $RUNCMD" $VIRTME_OPTS $KERN_OPTS $QEMU_OPTS + +# ... 
or start a VM and execute the script but don't exit +elif [[ $USE_XTERM -eq 1 ]]; then + xterm -e "virtme-run --init-sh '$SETUP_SCRIPT' $VIRTME_OPTS $KERN_OPTS $QEMU_OPTS" +else + virtme-run --init-sh "$SETUP_SCRIPT" $VIRTME_OPTS $KERN_OPTS $QEMU_OPTS +fi + +# ... or just start a clean VM +#virtme-run --installed-kernel --pwd $KERN_OPTS $QEMU_OPTS diff --git a/prov/cxi/test/tagged.c b/prov/cxi/test/tagged.c new file mode 100644 index 00000000000..f730f5228e9 --- /dev/null +++ b/prov/cxi/test/tagged.c @@ -0,0 +1,5777 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +TestSuite(tagged, .init = cxit_setup_tagged, .fini = cxit_teardown_tagged, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic send/recv */ +Test(tagged, ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Try invalid lengths */ + ret = fi_tsend(cxit_ep, send_buf, cxit_fi->ep_attr->max_msg_size+1, + NULL, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tsend failed %d", ret); + + free(send_buf); + free(recv_buf); +} + +/* Test basic zero-byte send/recv */ +Test(tagged, zbr) +{ + int ret; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + + ret = fi_trecv(cxit_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + ret = fi_tsend(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, 0, FI_TAGGED | FI_RECV, NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* 
Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Try an unexpected send */ + ret = fi_tsend(cxit_ep, NULL, 0, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + sleep(1); + + ret = fi_trecv(cxit_ep, NULL, 0, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, 0, FI_TAGGED | FI_RECV, NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +static void simple_rdzv(bool check_invalid_length) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 8192; + int send_len = 8192; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 8192 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + if (check_invalid_length) { + ret = fi_tsend(cxit_ep, send_buf, + cxit_fi->ep_attr->max_msg_size+1, + NULL, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tsend failed %d", ret); + } + + free(send_buf); + free(recv_buf); +} + +/* Test basic rendezvous send */ +Test(tagged, rdzv) +{ + simple_rdzv(true); +} + +/* Verify unrestricted non-eager rendezvous get is used if requested */ +Test(tagged, alt_read_rdzv) +{ + char *rdzv_proto; + uint64_t end_pkt_cnt; + uint64_t start_pkt_cnt; + int ret; + + /* If not testing alt_read protocol skip */ + rdzv_proto = getenv("FI_CXI_RDZV_PROTO"); + if (!rdzv_proto || strcmp(rdzv_proto, "alt_read")) { + cr_assert(1); + return; + } + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &start_pkt_cnt, NULL, true); + 
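+	/* The counter is sampled before and after the transfer; the delta
+	 * checked below verifies that the alt_read protocol moved the
+	 * non-eager data using restricted get packets.
+	 */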
cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + simple_rdzv(false); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &end_pkt_cnt, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + /* Some number of non-eager data restricted get packets need + * have been sent. + */ + cr_assert(end_pkt_cnt > start_pkt_cnt, + "Incorrect number of restricted packets"); +} + +Test(tagged, zero_byte_tsend_trecv_iov) +{ + int ret; + struct fi_cq_tagged_entry cqe; + + ret = fi_trecvv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvv failed: %d", ret); + + ret = fi_tsendv(cxit_ep, NULL, NULL, 0, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendv failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +Test(tagged, zero_byte_tsend_trecv_msg) +{ + int ret; + struct fi_cq_tagged_entry cqe; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + + rmsg.addr = cxit_ep_fi_addr; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed: %d", ret); + + smsg.addr = cxit_ep_fi_addr; + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed: %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); +} + +#if ENABLE_DEBUG +/* Verify fallback to default rendezvous proto on H/W resource failure */ +Test(tagged, fail_alt_read_rdzv) +{ + char *rdzv_proto; + uint64_t end_pkt_cnt; + uint64_t start_pkt_cnt; + int ret; + struct cxip_ep *ep = container_of(&cxit_ep->fid, + struct cxip_ep, ep.fid); + + /* If not testing alt_read protocol skip */ + rdzv_proto = getenv("FI_CXI_RDZV_PROTO"); + if (!rdzv_proto || strcmp(rdzv_proto, "alt_read")) { + cr_assert(1); + return; + } + + /* Force error on allocation of hardware resources required + * by alt_read rendezvous protocol. 
+ */ + ep->ep_obj->txc.force_err |= CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC; + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &start_pkt_cnt, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + simple_rdzv(false); + + ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, + &end_pkt_cnt, NULL, true); + cr_assert_eq(ret, FI_SUCCESS, "cntr_read failed: %d\n", ret); + + /* No restricted packets should have been sent */ + cr_assert(end_pkt_cnt == start_pkt_cnt, + "Incorrect number of restricted packets"); +} +#endif /* ENABLE_DEBUG */ + +/* Test basic send/recv w/data */ +Test(tagged, pingdata) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tsenddata(cxit_ep, send_buf, send_len, NULL, data, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsenddata failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, + FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, + NULL, data, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic inject send */ +Test(tagged, inject_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tinject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tinject failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); 
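+
+	/* fi_tinject() buffers the payload before returning and never
+	 * generates a local completion, so the send buffer is immediately
+	 * reusable; the -FI_EAGAIN check on the TX CQ below confirms that
+	 * no TX event is delivered.
+	 */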
+ + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Try invalid lengths */ + ret = fi_tinject(cxit_ep, send_buf, cxit_fi->tx_attr->inject_size+1, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tinject failed %d", ret); + + ret = fi_tinject(cxit_ep, send_buf, 4*1024*1024, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tinject failed %d", ret); + + ret = fi_tinject(cxit_ep, send_buf, cxit_fi->ep_attr->max_msg_size+1, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tinject failed %d", ret); + + free(send_buf); + free(recv_buf); +} + +/* Test basic injectdata */ +Test(tagged, injectdata_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tinjectdata(cxit_ep, send_buf, send_len, data, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tinject failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, + FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, + NULL, data, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendv/recvv */ +Test(tagged, vping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct iovec siovec; + struct iovec riovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + ret = fi_trecvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + ret = fi_tsendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendv failed %d", ret); + + /* Wait for 
async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendmsg/recvmsg */ +Test(tagged, msgping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test FI_FENCE */ +Test(tagged, fence) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, s_page_size); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, s_page_size); + cr_assert(send_buf); + + for (i 
= 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_FENCE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Test rendezvous fence */ + send_len = recv_len = s_page_size; + siovec.iov_len = send_len; + riovec.iov_len = recv_len; + + for (i = 0; i < send_len; i++) { + recv_buf[i] = 0; + send_buf[i] = i + 0xa0; + } + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_FENCE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + + /* progress */ + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +void cxit_tagged_setup_nofence(void) +{ + cxit_setup_getinfo(); + cxit_fi_hints->caps = CXIP_EP_PRI_CAPS; + cxit_setup_rma(); +} + +/* Test messaging without FI_FENCE */ +Test(tagged_nofence, nofence, + .init = cxit_tagged_setup_nofence, + .fini = cxit_teardown_rma) +{ + int ret; + uint8_t *send_buf; + int send_len = 64; + struct fi_msg_tagged smsg = {}; + struct fi_msg msg = {}; + struct iovec siovec; + + send_buf = aligned_alloc(s_page_size, s_page_size); + cr_assert(send_buf); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + 
smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_FENCE); + cr_assert_eq(ret, -FI_EINVAL); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + msg.msg_iov = &siovec; + msg.iov_count = 1; + msg.addr = cxit_ep_fi_addr; + msg.context = NULL; + + ret = fi_sendmsg(cxit_ep, &msg, FI_FENCE); + cr_assert_eq(ret, -FI_EINVAL); + + free(send_buf); +} + +/* Test basic sendmsg/recvmsg with data */ +Test(tagged, msgping_wdata) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + smsg.data = data; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_REMOTE_CQ_DATA); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, + FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, NULL, + data, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic injectmsg */ +Test(tagged, inject_msgping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + 
cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_INJECT); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test unexpected send/recv */ +Test(tagged, ux_ping) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Send 64 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert(ret == FI_SUCCESS); + + /* Give some time for the message to move */ + sleep(1); + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_assert(recv_buf[i] == send_buf[i], + "data mismatch, element: %d\n", i); + } + + free(send_buf); + free(recv_buf); +} + +/* Issue a fi_trecvmsg with FI_PEEK and validate result */ +ssize_t try_peek(fi_addr_t addr, uint64_t tag, uint64_t ignore, + ssize_t len, void *context, bool claim) +{ + struct fi_msg_tagged tmsg = { + .msg_iov = NULL, + .iov_count = 0, + .addr = addr, + .tag = tag, + .ignore = ignore, + .context = context, + .data = 0 + }; + struct fi_cq_tagged_entry cqe = {}; + struct fi_cq_err_entry err_cqe = {}; + fi_addr_t from; + ssize_t ret; + + do { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + ret = fi_trecvmsg(cxit_ep, &tmsg, + claim ? 
FI_CLAIM | FI_PEEK : FI_PEEK); + } while (ret == -FI_EAGAIN); + if (ret != FI_SUCCESS) + return ret; + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + if (ret == 1) { + validate_rx_event_mask(&cqe, context, len, + FI_TAGGED | FI_RECV, NULL, 0, + tag, ignore); + cr_assert_eq(from, cxit_ep_fi_addr, + "Invalid source address"); + ret = FI_SUCCESS; + break; + } else if (ret == -FI_EAVAIL) { + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + cr_assert(err_cqe.err == ENOMSG, "Bad CQE error %d", + err_cqe.err); + cr_assert(err_cqe.buf == 0, "Invalid buffer"); + cr_assert(err_cqe.olen == 0, "Invalid length"); + cr_assert(err_cqe.tag == tag, "Invalid tag"); + cr_assert(err_cqe.err == FI_ENOMSG, + "Invalid error code %d", err_cqe.err); + ret = err_cqe.err; + break; + } + } while (ret == -FI_EAGAIN); + + return ret; +} + +static int wait_peek(fi_addr_t addr, uint64_t tag, uint64_t ignore, + ssize_t len, void *context, bool claim) +{ + int ret; + + do { + ret = try_peek(addr, tag, ignore, len, context, claim); + } while (ret == FI_ENOMSG); + + return ret; +} + +#define PEEK_TAG_BASE 0x0000a000 +#define PEEK_MSG_LEN 64 +#define PEEK_NUM_MSG 4 +#define PEEK_NUM_FAKE_ADDRS 3 + +/* Test fi_trecvmsg using FI_PEEK flag to search unexpected message list. + * Additional message sizes will be tested within the multitudes tests. + */ +Test(tagged, ux_peek) +{ + ssize_t ret; + uint8_t *rx_buf; + uint8_t *tx_buf; + ssize_t rx_len = PEEK_MSG_LEN; + ssize_t tx_len = PEEK_MSG_LEN; + struct fi_cq_tagged_entry cqe; + struct fi_context rx_context[PEEK_NUM_MSG]; + struct fi_context tx_context[PEEK_NUM_MSG]; + struct fi_msg_tagged tmsg = {}; + struct iovec iovec; + fi_addr_t from; + int i, tx_comp; + struct cxip_addr fake_ep_addrs[PEEK_NUM_FAKE_ADDRS]; + + /* Add fake AV entries to test peek for non-matching valid address */ + for (i = 0; i < PEEK_NUM_FAKE_ADDRS; i++) { + fake_ep_addrs[i].nic = i + 0x41c; + fake_ep_addrs[i].pid = i + 0x21; + } + ret = fi_av_insert(cxit_av, (void *)fake_ep_addrs, + PEEK_NUM_FAKE_ADDRS, NULL, 0, NULL); + cr_assert(ret == PEEK_NUM_FAKE_ADDRS); + + rx_buf = aligned_alloc(s_page_size, rx_len * PEEK_NUM_MSG); + cr_assert(rx_buf); + memset(rx_buf, 0, rx_len * PEEK_NUM_MSG); + + tx_buf = aligned_alloc(s_page_size, tx_len * PEEK_NUM_MSG); + cr_assert(tx_buf); + + /* Send messages to build the unexpected list */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + memset(&tx_buf[i * tx_len], 0xa0 + i, tx_len); + iovec.iov_base = &tx_buf[i * tx_len]; + iovec.iov_len = tx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = &tx_context[i]; + + ret = fi_tsendmsg(cxit_ep, &tmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %" PRId64, + ret); + } + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + + /* Any address with bad tag and no context */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + PEEK_NUM_MSG + 1, 0, + tx_len, NULL, false); + cr_assert_eq(ret, FI_ENOMSG, "Peek with invalid tag"); + + /* Any address with bad tag with context */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + PEEK_NUM_MSG + 1, 0, + tx_len, &rx_context[0], false); + cr_assert_eq(ret, FI_ENOMSG, "Peek with invalid tag"); + + /* Non matching valid source address with valid tag */ + ret = try_peek(3, PEEK_TAG_BASE, 0, tx_len, NULL, false); + cr_assert_eq(ret, FI_ENOMSG, "Peek with wrong match address"); + + /* Valid 
with any address and valid tag */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + 1, 0, tx_len, + NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "Peek with invalid tag"); + + /* Valid with expected address and valid tag */ + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + 1, 0, tx_len, + NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "Peek with bad address"); + + /* Valid with any address and good tag when masked correctly */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + 0x20002, + 0x0FFF0000UL, tx_len, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "Peek tag ignore bits failed"); + + /* Valid with expected address and good tag when masked correctly */ + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + 0x20002, + 0x0FFF0000UL, tx_len, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "Peek tag ignore bits failed"); + + /* Verify peek of all sends */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_SUCCESS, "Peek valid tag not found"); + } + + /* Verify peek of all sends in reverse order */ + for (i = PEEK_NUM_MSG - 1; i >= 0; i--) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_SUCCESS, "Peek valid tag not found"); + } + + /* Receive all unexpected sends */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + iovec.iov_base = &rx_buf[i * rx_len]; + iovec.iov_len = rx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = &rx_context[i]; + + ret = fi_trecvmsg(cxit_ep, &tmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, + "fi_trecvmsg failed %" PRId64, ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert(ret == 1); + cr_assert_eq(from, cxit_ep_fi_addr, "Invalid source address"); + validate_rx_event(&cqe, &rx_context[i], rx_len, + FI_TAGGED | FI_RECV, NULL, 0, + PEEK_TAG_BASE + i); + } + + /* Verify received data */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + ret = memcmp(&tx_buf[i * tx_len], &rx_buf[i * rx_len], tx_len); + cr_assert_eq(ret, 0, "RX buffer data mismatch for msg %d", i); + } + + /* Verify received messages have been removed from unexpected list */ + for (i = 0; i < PEEK_NUM_MSG; i++) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_ENOMSG, + "Peek after receive did not fail %" PRId64, ret); + } + + /* Wait for TX async events to complete, and validate */ + tx_comp = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + if (ret == 1) { + validate_tx_event(&cqe, FI_TAGGED | FI_SEND, + &tx_context[tx_comp]); + tx_comp++; + } + cr_assert(ret == 1 || ret == -FI_EAGAIN, + "Bad fi_cq_read return %" PRId64, ret); + } while (tx_comp < PEEK_NUM_MSG); + cr_assert_eq(tx_comp, PEEK_NUM_MSG, + "Peek tsendmsg only %d TX completions read", tx_comp); + + free(rx_buf); + free(tx_buf); +} + +/* FI_PEEK with FI_CLAIM testing */ +void test_ux_claim(int num_msgs, int msg_len) +{ + ssize_t ret; + uint8_t *rx_buf; + uint8_t *tx_buf; + ssize_t rx_len = msg_len; + ssize_t tx_len = msg_len; + struct fi_cq_tagged_entry cqe; + struct fi_context *rx_context; /* [PEEK_NUM_MSG]; */ + struct fi_context *tx_context; /* [PEEK_NUM_MSG]; */ + struct fi_msg_tagged tmsg = {}; + struct iovec iovec; + fi_addr_t from; + int i, tx_comp; + struct cxip_addr fake_ep_addrs[PEEK_NUM_FAKE_ADDRS]; + + 
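+	/* FI_PEEK | FI_CLAIM reserves a matched unexpected message: the claim
+	 * is recorded in the fi_context passed with the peek, and a later
+	 * fi_trecvmsg() with FI_CLAIM and that same context completes the
+	 * claimed message instead of re-running tag matching.
+	 */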
rx_context = calloc(num_msgs, sizeof(struct fi_context)); + cr_assert_not_null(rx_context); + tx_context = calloc(num_msgs, sizeof(struct fi_context)); + cr_assert_not_null(tx_context); + + rx_buf = aligned_alloc(s_page_size, rx_len * num_msgs); + cr_assert_not_null(rx_buf); + memset(rx_buf, 0, rx_len * num_msgs); + + tx_buf = aligned_alloc(s_page_size, tx_len * num_msgs); + cr_assert_not_null(tx_buf); + + /* Add fake AV entries to test peek for non-matching valid address */ + for (i = 0; i < PEEK_NUM_FAKE_ADDRS; i++) { + fake_ep_addrs[i].nic = i + 0x41c; + fake_ep_addrs[i].pid = i + 0x21; + } + ret = fi_av_insert(cxit_av, (void *)fake_ep_addrs, + PEEK_NUM_FAKE_ADDRS, NULL, 0, NULL); + cr_assert(ret == PEEK_NUM_FAKE_ADDRS); + + /* Send messages to build the unexpected list */ + for (i = 0; i < num_msgs; i++) { + memset(&tx_buf[i * tx_len], 0xa0 + i, tx_len); + iovec.iov_base = &tx_buf[i * tx_len]; + iovec.iov_len = tx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = &tx_context[i]; + + ret = fi_tsendmsg(cxit_ep, &tmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %" PRId64, + ret); + } + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + + /* Any address with bad tag and FI_CLAIM with no context */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + num_msgs + 1, 0, + tx_len, NULL, true); + cr_assert_eq(ret, -FI_EINVAL, + "FI_CLAIM with invalid tag and no context"); + + /* Any address with bad tag and FI_CLAIM with context */ + ret = try_peek(FI_ADDR_UNSPEC, PEEK_TAG_BASE + num_msgs + 1, 0, + tx_len, &rx_context[0], true); + cr_assert_eq(ret, FI_ENOMSG, "FI_CLAIM with invalid tag"); + + /* Non matching valid source address with valid tag and context */ + ret = try_peek(3, PEEK_TAG_BASE, 0, tx_len, &rx_context[0], true); + cr_assert_eq(ret, FI_ENOMSG, "FI_CLAIM with wrong match address"); + + /* Verify peek of all sends */ + for (i = 0; i < num_msgs; i++) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_SUCCESS, "All unexpected tags not found"); + } + + /* Verify peek of all sends in reverse order with FI_CLAIM */ + for (i = num_msgs - 1; i >= 0; i--) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], true); + cr_assert_eq(ret, FI_SUCCESS, + "FI_PEEK | FI_CLAIM valid tag not found"); + } + + /* Verify peek of previously claimed messages fail */ + for (i = 0; i < num_msgs; i++) { + ret = try_peek(cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, + tx_len, &rx_context[i], false); + cr_assert_eq(ret, FI_ENOMSG, + "Unexpected message not claimed found"); + } + + /* Receive all claimed unexpected messages */ + for (i = 0; i < num_msgs; i++) { + iovec.iov_base = &rx_buf[i * rx_len]; + iovec.iov_len = rx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = &rx_context[i]; + + ret = fi_trecvmsg(cxit_ep, &tmsg, FI_CLAIM); + cr_assert_eq(ret, FI_SUCCESS, + "fi_trecvmsg FI_CLAIM failed %" PRId64, ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert(ret == 1); + cr_assert_eq(from, cxit_ep_fi_addr, "Invalid source address"); + validate_rx_event(&cqe, &rx_context[i], rx_len, + FI_TAGGED | FI_RECV, NULL, 0, + 
PEEK_TAG_BASE + i); + } + + /* Verify received data */ + for (i = 0; i < num_msgs; i++) { + ret = memcmp(&tx_buf[i * tx_len], &rx_buf[i * rx_len], tx_len); + cr_assert_eq(ret, 0, "RX buffer data mismatch for msg %d", i); + } + + /* Wait for TX async events to complete, and validate */ + tx_comp = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + if (ret == 1) { + validate_tx_event(&cqe, FI_TAGGED | FI_SEND, + &tx_context[tx_comp]); + tx_comp++; + } + cr_assert(ret == 1 || ret == -FI_EAGAIN, + "Bad fi_cq_read return %" PRId64, ret); + } while (tx_comp < num_msgs); + cr_assert_eq(tx_comp, num_msgs, + "Peek tsendmsg only %d TX completions read", tx_comp); + + free(rx_buf); + free(tx_buf); + free(rx_context); + free(tx_context); +} + +/* Test fi_trecvmsg using FI_PEEK and FI_CLAIM flags to search unexpected + * message list and claim the message. + */ +Test(tagged, ux_claim) +{ + test_ux_claim(4, 1024); +} + +Test(tagged, ux_claim_rdzv) +{ + test_ux_claim(4, 65536); +} + +#define PEEK_ORDER_SEND_COUNT 5 +#define PEEK_ORDER_TAG 0x1234ULL + +static void verify_peek_claim_order_same_tag(size_t xfer_base_size, bool claim) +{ + void *buf; + struct fi_context context; + int i; + int ret; + struct fi_cq_tagged_entry cqe; + fi_addr_t from; + struct fi_msg_tagged tmsg = {}; + struct iovec iovec; + size_t buf_size = xfer_base_size + (PEEK_ORDER_SEND_COUNT - 1); + size_t xfer_size; + + buf = malloc(buf_size); + cr_assert_not_null(buf); + + /* Issue sends unexpected to target. Same tagged is used with different + * transfer size. Transfer size identifies operation order. + */ + for (i = 0; i < PEEK_ORDER_SEND_COUNT; i++) { + ret = fi_tsend(cxit_ep, buf, xfer_base_size + i, NULL, + cxit_ep_fi_addr, PEEK_ORDER_TAG, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed: %d", ret); + } + + /* Receives should be processed in order. Order is incrementing receive + * size. + */ + iovec.iov_base = buf; + iovec.iov_len = buf_size; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_ORDER_TAG; + tmsg.ignore = 0; + tmsg.context = &context; + + for (i = 0; i < PEEK_ORDER_SEND_COUNT; i++) { + xfer_size = xfer_base_size + i; + + ret = wait_peek(cxit_ep_fi_addr, PEEK_ORDER_TAG, 0, + xfer_size, tmsg.context, claim); + cr_assert_eq(ret, FI_SUCCESS, "try_peek failed: %d", ret); + + /* With claim, subsequent FI_PEEK without FI_CLAIM should always + * return next message. + */ + if (claim && i < (PEEK_ORDER_SEND_COUNT - 1)) { + ret = wait_peek(cxit_ep_fi_addr, PEEK_ORDER_TAG, 0, + xfer_size + 1, NULL, false); + cr_assert_eq(ret, FI_SUCCESS, "try_peek failed: %d", + ret); + } + + /* Recieve unexpected message. If message is FI_CLAIM, + * FI_CONTEXT buffer contains data to progress receive. + */ + ret = fi_trecvmsg(cxit_ep, &tmsg, claim ? FI_CLAIM : 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed: %d", ret); + + do { + /* Process TX CQ (if needed). 
*/ + fi_cq_read(cxit_tx_cq, NULL, 0); + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read failed: %d", ret); + cr_assert_eq(from, cxit_ep_fi_addr, + "Invalid user id: expected=%#lx got=%#lx", + cxit_ep_fi_addr, from); + validate_rx_event_mask(&cqe, tmsg.context, xfer_size, + FI_RECV | FI_TAGGED, + NULL, 0, PEEK_ORDER_TAG, 0); + } + + free(buf); +} + +Test(tagged, verify_peek_order_same_tag_idc) +{ + verify_peek_claim_order_same_tag(0, false); +} + +Test(tagged, verify_peek_order_same_tag_eager) +{ + verify_peek_claim_order_same_tag(257, false); +} + +Test(tagged, verify_peek_order_same_tag_rendezvous) +{ + verify_peek_claim_order_same_tag(1048576, false); +} + +Test(tagged, verify_claim_order_same_tag_idc) +{ + verify_peek_claim_order_same_tag(0, true); +} + +Test(tagged, verify_claim_order_same_tag_eager) +{ + verify_peek_claim_order_same_tag(257, true); +} + +Test(tagged, verify_claim_order_same_tag_rendezvous) +{ + verify_peek_claim_order_same_tag(1048576, true); +} + +/* Test MQD get of unexpected message list */ +void verify_ux_dump(int num, ssize_t msg_len) +{ + ssize_t ret; + size_t count; + size_t ux_count; + size_t ux_ret_count; + struct fi_cq_tagged_entry *cq_entry; + fi_addr_t *src_addr; + uint8_t *tx_buf; + ssize_t tx_len = msg_len; + uint8_t *rx_buf; + ssize_t rx_len = msg_len; + struct fi_cq_tagged_entry cqe; + struct fi_msg_tagged tmsg = {}; + struct iovec iovec; + int i; + int tx_comp = 0; + fi_addr_t from; + + rx_buf = aligned_alloc(s_page_size, rx_len * num); + cr_assert(rx_buf); + tx_buf = aligned_alloc(s_page_size, tx_len * num); + cr_assert(tx_buf); + + /* Send messages to build the unexpected list */ + for (i = 0; i < num; i++) { + memset(&tx_buf[i * tx_len], 0xa0 + i, tx_len); + iovec.iov_base = &tx_buf[i * tx_len]; + iovec.iov_len = tx_len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = cxit_ep_fi_addr; + tmsg.tag = PEEK_TAG_BASE + i; + tmsg.ignore = 0; + tmsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &tmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %" PRId64, + ret); + } + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &cqe, 0); + + /* Call first to get number of UX entries */ + ux_ret_count = dom_ops->ep_get_unexp_msgs(cxit_ep, NULL, 0, + NULL, &ux_count); + cr_assert_eq(ux_ret_count, 0, "Num entries returned"); + count = ux_count; + + cq_entry = calloc(ux_count, sizeof(*cq_entry)); + ux_ret_count = dom_ops->ep_get_unexp_msgs(cxit_ep, cq_entry, count, + NULL, &ux_count); + cr_assert(ux_ret_count <= count, "Number UX returned <= count"); + cr_assert_eq(ux_ret_count, num, "Number UX returned wrong"); + + for (i = 0; i < ux_ret_count; i++) { + cr_assert(cq_entry[i].op_context == NULL, "Context"); + cr_assert(cq_entry[i].buf == NULL, "Buf"); + cr_assert(cq_entry[i].tag == PEEK_TAG_BASE + i, "Tag match"); + cr_assert(cq_entry[i].len == tx_len, "Length %ld", + cq_entry[i].len); + cr_assert(cq_entry[i].flags & FI_TAGGED, "FI_TAGGED"); + cr_assert(!(cq_entry[i].flags & FI_REMOTE_CQ_DATA), + "FI_REMOTE_CQ_DATA"); + } + + /* Get entries with source address */ + src_addr = calloc(ux_count, sizeof(*src_addr)); + ux_ret_count = dom_ops->ep_get_unexp_msgs(cxit_ep, cq_entry, count, + src_addr, &ux_count); + cr_assert(ux_ret_count <= count, "Number UX returned <= count"); + cr_assert_eq(ux_ret_count, num, "Number UX returned wrong"); + + for (i = 0; i < ux_ret_count; i++) + cr_assert_eq(src_addr[i], 
cxit_ep_fi_addr, "Source address"); + + /* Receive all unexpected messages */ + for (i = 0; i < num; i++) { + ret = fi_trecv(cxit_ep, &rx_buf[i * rx_len], rx_len, NULL, + cxit_ep_fi_addr, PEEK_TAG_BASE + i, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %ld", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert(ret == 1); + cr_assert_eq(from, cxit_ep_fi_addr, "Invalid source address"); + + validate_rx_event(&cqe, NULL, rx_len, + FI_TAGGED | FI_RECV, NULL, 0, + PEEK_TAG_BASE + i); + } + + /* Verify received data */ + for (i = 0; i < num; i++) { + ret = memcmp(&tx_buf[i * tx_len], &rx_buf[i * rx_len], tx_len); + cr_assert_eq(ret, 0, "RX buffer data mismatch for msg %d", i); + } + + /* Wait for TX async events to complete, and validate */ + tx_comp = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + if (ret == 1) + tx_comp++; + cr_assert(ret == 1 || ret == -FI_EAGAIN, + "Bad fi_cq_read return %ld", ret); + } while (tx_comp < num); + cr_assert_eq(tx_comp, num, + "Peek tsendmsg only %d TX completions read", tx_comp); + + free(src_addr); + free(cq_entry); + free(tx_buf); +} + +Test(tagged, ux_dump_eager) +{ + verify_ux_dump(4, 512); +} + +Test(tagged, ux_dump_rdzv) +{ + verify_ux_dump(4, 16384); +} + +/* Test DIRECTED_RECV send/recv */ +void directed_recv(bool logical) +{ + int i, ret; + uint8_t *recv_buf, + *fake_recv_buf, + *send_buf; + int recv_len = 0x1000; + int send_len = 0x1000; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; +#define N_FAKE_ADDRS 3 + struct cxip_addr fake_ep_addrs[N_FAKE_ADDRS+1]; + fi_addr_t from; + + if (logical) + cxit_av_attr.flags = FI_SYMMETRIC; + cxit_setup_enabled_ep(); + + /* Create multiple logical names for the local EP address */ + for (i = 0; i < N_FAKE_ADDRS; i++) { + fake_ep_addrs[i].nic = i + 0x41c; + fake_ep_addrs[i].pid = i + 0x21; + } + + ret = fi_av_insert(cxit_av, (void *)fake_ep_addrs, 3, NULL, 0, NULL); + cr_assert(ret == 3); + + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + recv_buf = calloc(recv_len, 1); + cr_assert(recv_buf); + + fake_recv_buf = calloc(recv_len, 1); + cr_assert(fake_recv_buf); + + send_buf = malloc(send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post an RX buffer matching each EP name that won't be targeted */ + for (i = 0; i < N_FAKE_ADDRS; i++) { + ret = fi_trecv(cxit_ep, fake_recv_buf, recv_len, NULL, i, 0, 0, + NULL); + cr_assert(ret == FI_SUCCESS); + } + + /* Post short RX buffer matching EP name 3 */ + ret = fi_trecv(cxit_ep, recv_buf, 64, NULL, 3, 0, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Post long RX buffer matching EP name 3 */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, 3, 0, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Send short message to self (FI address 3) */ + send_len = 64; + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, 3, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == 3, "Invalid source address, exp: 3 got: %lu", from); + + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == 
-FI_EAGAIN); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(fake_recv_buf[i], 0, + "fake data corrupted, element[%d] err=%d\n", + i, err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Send long message to self (FI address 3) */ + memset(recv_buf, 0, recv_len); + send_len = 0x1000; + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, 3, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == 3, "Invalid source address, exp: 3 got: %lu", from); + + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(fake_recv_buf[i], 0, + "fake data corrupted, element[%d] err=%d\n", + i, err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Send long UX message to self (FI address 3) */ + memset(recv_buf, 0, recv_len); + send_len = 0x1000; + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, 3, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + sleep(1); + + /* Post long RX buffer matching EP name 3 */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, 3, 0, 0, NULL); + cr_assert(ret == FI_SUCCESS); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + + /* Progress */ + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == 3, "Invalid source address, exp: 3 got: %lu", from); + + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(fake_recv_buf[i], 0, + "fake data corrupted, element[%d] err=%d\n", + i, err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(fake_recv_buf); + free(recv_buf); + + cxit_teardown_tagged(); +} + +Test(tagged_directed, directed) +{ + directed_recv(false); +} + +Test(tagged_directed, directed_logical) +{ + directed_recv(true); +} + +/* Test unexpected send/recv */ +#define RDZV_TAG (46) + +struct tagged_thread_args { + uint8_t *buf; + size_t len; + struct fi_cq_tagged_entry *cqe; + fi_addr_t src_addr; + size_t io_num; + size_t tag; + void *context; +}; + +static void *tsend_worker(void *data) +{ + int ret; + struct tagged_thread_args *args; + uint64_t tag; + + args = (struct tagged_thread_args *)data; + tag = args->tag; + + /* Send 64 bytes to FI address 0 
(self) */ + ret = fi_tsend(cxit_ep, args->buf, args->len, NULL, cxit_ep_fi_addr, + tag, NULL); + cr_assert_eq(ret, FI_SUCCESS, "%s %ld: unexpected ret %d", __func__, + args->io_num, ret); + + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(cxit_tx_cq, args->cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "%s %ld: unexpected ret %d", __func__, + args->io_num, ret); + + pthread_exit(NULL); +} + +static void *trecv_worker(void *data) +{ + int ret; + struct tagged_thread_args *args; + uint64_t tag; + + args = (struct tagged_thread_args *)data; + tag = args->tag; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, args->buf, args->len, NULL, FI_ADDR_UNSPEC, tag, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "%s %ld: unexpected ret %d", __func__, + args->io_num, ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, args->cqe, 1, &args->src_addr); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "%s %ld: unexpected ret %d", __func__, + args->io_num, ret); + + pthread_exit(NULL); +} + +Test(tagged, ux_sw_rdzv) +{ + size_t i; + int ret; + uint8_t *recv_buf, *send_buf; + size_t buf_len = 2 * 1024 * 1024; + int recv_len = 4 * 1024; + int send_len = 4 * 1024; + struct fi_cq_tagged_entry rx_cqe, tx_cqe; + pthread_t threads[2]; + struct tagged_thread_args args[2]; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(recv_buf); + memset(recv_buf, 0, buf_len); + + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(send_buf); + + for (i = 0; i < buf_len; i++) + send_buf[i] = i + 0xa0; + + args[0].buf = send_buf; + args[0].len = send_len; + args[0].cqe = &tx_cqe; + args[0].io_num = 0; + args[0].tag = RDZV_TAG; + args[1].buf = recv_buf; + args[1].len = recv_len; + args[1].cqe = &rx_cqe; + args[1].io_num = 1; + args[1].tag = RDZV_TAG; + + /* Give some time for the message to move */ + cr_assert_arr_neq(recv_buf, send_buf, buf_len); + + /* start tsend thread */ + ret = pthread_create(&threads[0], &attr, tsend_worker, + (void *)&args[0]); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* start trecv thread */ + ret = pthread_create(&threads[1], &attr, trecv_worker, + (void *)&args[1]); + cr_assert_eq(ret, 0, "Recv thread create failed %d", ret); + + /* Wait for the threads to complete */ + ret = pthread_join(threads[0], NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + ret = pthread_join(threads[1], NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + pthread_attr_destroy(&attr); + + /* Validate sent data */ + cr_expect_arr_eq(recv_buf, send_buf, recv_len); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + validate_rx_event(&rx_cqe, NULL, recv_len, FI_TAGGED | FI_RECV, NULL, + 0, args[0].tag); + cr_assert_eq(args[1].src_addr, cxit_ep_fi_addr, + "Invalid source address"); + + free(send_buf); + free(recv_buf); +} + +Test(tagged, expected_sw_rdzv) +{ + size_t i; + int ret; + uint8_t *recv_buf, *send_buf; + size_t buf_len = 2 * 1024 * 1024; + int recv_len = 4 * 1024; + int send_len = 4 * 1024; + struct fi_cq_tagged_entry rx_cqe, tx_cqe; + pthread_t threads[2]; + struct tagged_thread_args args[2]; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(recv_buf); + 
memset(recv_buf, 0, buf_len); + + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(send_buf); + + for (i = 0; i < buf_len; i++) + send_buf[i] = i + 0xa0; + + args[0].buf = send_buf; + args[0].len = send_len; + args[0].cqe = &tx_cqe; + args[0].io_num = 0; + args[0].tag = RDZV_TAG; + args[1].buf = recv_buf; + args[1].len = recv_len; + args[1].cqe = &rx_cqe; + args[1].io_num = 1; + args[1].tag = RDZV_TAG; + + /* Give some time for the message to move */ + cr_assert_arr_neq(recv_buf, send_buf, buf_len); + + /* Start trecv thread first so the buffer is ready when the data is sent + */ + ret = pthread_create(&threads[1], &attr, trecv_worker, + (void *)&args[1]); + cr_assert_eq(ret, 0, "Recv thread create failed %d", ret); + + sleep(1); + + /* Start tsend thread to send the data into the ready buffer */ + ret = pthread_create(&threads[0], &attr, tsend_worker, + (void *)&args[0]); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + /* Wait for the threads to complete */ + ret = pthread_join(threads[0], NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + ret = pthread_join(threads[1], NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + pthread_attr_destroy(&attr); + + /* Validate sent data */ + cr_expect_arr_eq(recv_buf, send_buf, recv_len); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + validate_rx_event(&rx_cqe, NULL, recv_len, FI_TAGGED | FI_RECV, NULL, + 0, args[0].tag); + cr_assert_eq(args[1].src_addr, cxit_ep_fi_addr, + "Invalid source address"); + + free(send_buf); + free(recv_buf); +} + +#define NUM_IOS (12) + +struct tagged_event_args { + struct fid_cq *cq; + struct fi_cq_tagged_entry *cqe; + size_t io_num; +}; + +static void *tagged_evt_worker(void *data) +{ + int ret; + struct tagged_event_args *args; + + args = (struct tagged_event_args *)data; + + for (size_t i = 0; i < args->io_num; i++) { + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(args->cq, &args->cqe[i], 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "%ld: unexpected ret %d", i, + ret); + } + + pthread_exit(NULL); +} + +Test(tagged, multitudes_sw_rdzv, .timeout=60) +{ + int ret; + size_t buf_len = 4 * 1024; + struct fi_cq_tagged_entry rx_cqe[NUM_IOS]; + struct fi_cq_tagged_entry tx_cqe[NUM_IOS]; + struct tagged_thread_args tx_args[NUM_IOS]; + struct tagged_thread_args rx_args[NUM_IOS]; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .cqe = tx_cqe, + .io_num = NUM_IOS, + }; + struct tagged_event_args rx_evt_args = { + .cq = cxit_rx_cq, + .cqe = rx_cqe, + .io_num = NUM_IOS, + }; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + /* Issue the Sends */ + for (size_t tx_io = 0; tx_io < NUM_IOS; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + ret = fi_tsend(cxit_ep, tx_args[tx_io].buf, tx_args[tx_io].len, + NULL, cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, tagged_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* Force onloading of UX entries if 
operating in SW EP mode */
+	fi_cq_read(cxit_rx_cq, &rx_cqe, 0);
+
+	/* Peek for each tag on UX list */
+	for (size_t rx_io = 0; rx_io < NUM_IOS; rx_io++) {
+		ret = try_peek(FI_ADDR_UNSPEC, rx_io, 0, buf_len, NULL, false);
+		cr_assert_eq(ret, FI_SUCCESS, "peek of UX message failed");
+	}
+
+	/* Issue the Receives */
+	for (size_t rx_io = 0; rx_io < NUM_IOS; rx_io++) {
+		rx_args[rx_io].len = buf_len;
+		rx_args[rx_io].tag = rx_io;
+		rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len);
+		cr_assert_not_null(rx_args[rx_io].buf);
+		memset(rx_args[rx_io].buf, 0, buf_len);
+
+		ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, rx_args[rx_io].len,
+			       NULL, FI_ADDR_UNSPEC, rx_args[rx_io].tag,
+			       0, NULL);
+		cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d",
+			     rx_io, ret);
+	}
+
+	/* Start processing Receive events */
+	ret = pthread_create(&rx_thread, &attr, tagged_evt_worker,
+			     (void *)&rx_evt_args);
+	cr_assert_eq(ret, 0, "Receive thread create failed %d", ret);
+
+	/* Wait for the RX/TX event threads to complete */
+	ret = pthread_join(tx_thread, NULL);
+	cr_assert_eq(ret, 0, "Send thread join failed %d", ret);
+
+	ret = pthread_join(rx_thread, NULL);
+	cr_assert_eq(ret, 0, "Recv thread join failed %d", ret);
+
+	/* Validate results */
+	for (size_t io = 0; io < NUM_IOS; io++) {
+		/* Validate sent data */
+		cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len);
+		validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL);
+		validate_rx_event(&rx_cqe[io], NULL, buf_len,
+				  FI_TAGGED | FI_RECV, NULL,
+				  0, tx_args[rx_cqe[io].tag].tag);
+
+		free(tx_args[io].buf);
+		free(rx_args[io].buf);
+	}
+
+	pthread_attr_destroy(&attr);
+}
+
+struct multitudes_params {
+	size_t length;
+	size_t num_ios;
+	bool peek;
+	bool claim;
+};
+
+/* This is a parameterized test that executes an arbitrary set of tagged
+ * send/recv operations. The test is configurable via two parameters: length
+ * is the size of the data to be transferred, and num_ios sets the number of
+ * matching send/recv operations launched in each test.
+ *
+ * The test first executes fi_tsend() for `num_ios` buffers. A background
+ * thread is launched to start processing the Cassini events for the Send
+ * operations. The test then pauses for 1 second. After the pause, the test
+ * optionally uses fi_trecvmsg() with FI_PEEK to search the unexpected list
+ * and verify the sent messages are on it. The test then executes fi_trecv()
+ * to receive the buffers that were previously sent. Another background
+ * thread is launched to process the receive events. When all send and
+ * receive operations have completed, the threads exit and the results are
+ * compared to ensure the expected data was returned.
+ *
+ * The length parameter determines how the send and receive operations are
+ * processed: lengths of 2 kiB and below use the eager data path, while
+ * larger buffers cause the SW rendezvous data path to be used.
+ */ +void do_multitudes(struct multitudes_params *param) +{ + int ret; + size_t buf_len = param->length; + struct fi_cq_tagged_entry *rx_cqe; + struct fi_cq_tagged_entry *tx_cqe; + struct tagged_thread_args *tx_args; + struct tagged_thread_args *rx_args; + struct fi_context *rx_ctxts; + struct iovec iovec; + struct fi_msg_tagged tmsg = {}; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .io_num = param->num_ios, + }; + struct tagged_event_args rx_evt_args = { + .cq = cxit_rx_cq, + .io_num = param->num_ios, + }; + char *rx_mode; + bool claim = param->claim; + + /* TODO: Remove after HW FI_CLAIM support is implemented */ + rx_mode = getenv("FI_CXI_RX_MATCH_MODE"); + if (claim && (!rx_mode || strcmp(rx_mode, "software"))) { + cr_assert(1); + return; + } + + tx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(tx_cqe); + + rx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(rx_cqe); + + tx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(tx_args); + + rx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(rx_args); + + rx_ctxts = calloc(param->num_ios, sizeof(struct fi_context)); + cr_assert_not_null(rx_ctxts); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + tx_evt_args.cqe = tx_cqe; + rx_evt_args.cqe = rx_cqe; + + /* Issue the Sends */ + for (size_t tx_io = 0; tx_io < param->num_ios; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + do { + ret = fi_tsend(cxit_ep, tx_args[tx_io].buf, + tx_args[tx_io].len, NULL, + cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, tagged_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + /* Optional peek to see if all send tags are found on ux list */ + if (param->peek) { + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + if (claim) + rx_args[rx_io].context = &rx_ctxts[rx_io]; + + ret = try_peek(FI_ADDR_UNSPEC, rx_io, 0, buf_len, + claim ? 
&rx_ctxts[rx_io] : NULL, claim); + cr_assert_eq(ret, FI_SUCCESS, + "peek of UX message failed"); + } + } + + /* Issue the Receives */ + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + if (claim) { + iovec.iov_base = rx_args[rx_io].buf; + iovec.iov_len = rx_args[rx_io].len; + + tmsg.msg_iov = &iovec; + tmsg.iov_count = 1; + tmsg.addr = FI_ADDR_UNSPEC; + tmsg.tag = rx_args[rx_io].tag; + tmsg.ignore = 0; + tmsg.context = &rx_ctxts[rx_io]; + + ret = fi_trecvmsg(cxit_ep, &tmsg, FI_CLAIM); + } else { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, + rx_args[rx_io].tag, 0, NULL); + } + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Start processing Receive events */ + ret = pthread_create(&rx_thread, &attr, tagged_evt_worker, + (void *)&rx_evt_args); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Wait for the RX/TX event threads to complete */ + ret = pthread_join(tx_thread, NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + + ret = pthread_join(rx_thread, NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + /* Validate results */ + for (size_t io = 0; io < param->num_ios; io++) { + /* Validate sent data */ + cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len); + + validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL); + validate_rx_event(&rx_cqe[io], claim ? + rx_args[rx_cqe[io].tag].context : NULL, + buf_len, FI_TAGGED | FI_RECV, NULL, + 0, tx_args[rx_cqe[io].tag].tag); + free(tx_args[io].buf); + free(rx_args[io].buf); + } + + pthread_attr_destroy(&attr); + free(rx_cqe); + free(tx_cqe); + free(tx_args); + free(rx_args); + free(rx_ctxts); +} + +ParameterizedTestParameters(tagged, multitudes) +{ + size_t param_sz; + + static struct multitudes_params params[] = { + {.length = 1024, /* Eager */ + .num_ios = 10, + .peek = true}, + {.length = 2 * 1024, /* Eager */ + .num_ios = 15, + .peek = true}, + {.length = 4 * 1024, /* Rendezvous */ + .num_ios = 12, + .peek = true}, + {.length = 128 * 1024, /* Rendezvous */ + .num_ios = 25, + .peek = true}, + {.length = 1024, /* Eager */ + .num_ios = 10, + .peek = true, + .claim = true, + }, + {.length = 2 * 1024, /* Eager */ + .num_ios = 15, + .peek = true, + .claim = true, + }, + {.length = 4 * 1024, /* Rendezvous */ + .num_ios = 12, + .peek = true, + .claim = true, + }, + {.length = 128 * 1024, /* Rendezvous */ + .num_ios = 25, + .peek = true, + .claim = true, + }, + {.length = 8 * 1024, /* Rendezvous ID > 8 bits */ + .num_ios = 350, + .peek = true, + .claim = false, + }, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct multitudes_params, params, + param_sz); +} + +ParameterizedTest(struct multitudes_params *param, tagged, multitudes, .timeout=60) +{ + do_multitudes(param); +} + +/* Use multitudes test to force transition from hardware + * matching to software matching. LE_POOL resources should + * be set to 60. 
+ */ +ParameterizedTestParameters(tagged, hw2sw_multitudes) +{ + size_t param_sz; + + static struct multitudes_params params[] = { + {.length = 1024, /* Eager */ + .num_ios = 100, + .peek = true + }, + {.length = 2 * 2048, /* Rendezvous */ + .num_ios = 100, + .peek = true + }, + {.length = 8 * 2048, /* Rendezvous */ + .num_ios = 100, + .peek = true + }, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct multitudes_params, params, + param_sz); +} + +/* This test will only require HW to SW matching transition if the + * LE pool resources have been limited (60) and if running in HW offloaded + * mode with RDZV offloaded and the eager long protocol is not used. + */ +ParameterizedTest(struct multitudes_params *param, tagged, hw2sw_multitudes, + .timeout=60, .disabled=false) +{ + do_multitudes(param); +} + +/* This will only test hybrid matching transition when LE resources + * are restricted to no more than 60. + */ +Test(tagged, hw2sw_hybrid_matching, .timeout=60) +{ + int ret; + size_t buf_len = 4096; + struct fi_cq_tagged_entry *rx_cqe; + struct fi_cq_tagged_entry *tx_cqe; + struct tagged_thread_args *tx_args; + struct tagged_thread_args *rx_args; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .io_num = 100, + }; + struct tagged_event_args rx_evt_args = { + .cq = cxit_rx_cq, + .io_num = 100, + }; + + tx_cqe = calloc(100, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(tx_cqe); + + rx_cqe = calloc(100, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(rx_cqe); + + tx_args = calloc(100, sizeof(struct tagged_thread_args)); + cr_assert_not_null(tx_args); + + rx_args = calloc(100, sizeof(struct tagged_thread_args)); + cr_assert_not_null(rx_args); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + tx_evt_args.cqe = tx_cqe; + rx_evt_args.cqe = rx_cqe; + + /* Issue 25 receives for tags 25-49 to pre-load priority list */ + for (size_t rx_io = 25; rx_io < 50; rx_io++) { + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, rx_args[rx_io].tag, + 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Start processing Receive events */ + ret = pthread_create(&rx_thread, &attr, tagged_evt_worker, + (void *)&rx_evt_args); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Issue all of the Sends exhausting resources */ + for (size_t tx_io = 0; tx_io < 100; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + do { + ret = fi_tsend(cxit_ep, tx_args[tx_io].buf, + tx_args[tx_io].len, NULL, + cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, 
tagged_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + /* Issue the remainder of the receives */ + for (size_t rx_io = 0; rx_io < 100; rx_io++) { + if (rx_io >= 25 && rx_io < 50) + continue; + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, rx_args[rx_io].tag, + 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Wait for the RX/TX event threads to complete */ + ret = pthread_join(tx_thread, NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + + ret = pthread_join(rx_thread, NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + /* Validate results */ + for (size_t io = 0; io < 100; io++) { + /* Validate sent data */ + cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len); + + validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL); + + validate_rx_event(&rx_cqe[io], NULL, buf_len, + FI_TAGGED | FI_RECV, NULL, + 0, tx_args[rx_cqe[io].tag].tag); + + free(tx_args[io].buf); + free(rx_args[io].buf); + } + + pthread_attr_destroy(&attr); + free(rx_cqe); + free(tx_cqe); + free(tx_args); + free(rx_args); +} + +#define RECV_INIT 0x77 +#define SEND_INIT ~RECV_INIT + +void do_msg(uint8_t *send_buf, size_t send_len, uint64_t send_tag, + uint8_t *recv_buf, size_t recv_len, uint64_t recv_tag, + uint64_t recv_ignore, bool send_first, size_t buf_size, + bool tagged, bool wdata, uint64_t data, bool match_complete) +{ + int i, ret; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + bool sent = false, + recved = false, + truncated = false; + struct fi_cq_err_entry err_cqe = {}; + size_t recved_len; + static int send_cnt; + static int recv_cnt; + static int recv_errcnt; + + struct fi_msg_tagged tsmsg = {}; + struct fi_msg smsg = {}; + struct iovec siovec; + uint64_t send_flags = 0; + + memset(recv_buf, RECV_INIT, buf_size); + + for (i = 0; i < buf_size; i++) { + if (i < send_len) + send_buf[i] = i + 0xa0; + else + send_buf[i] = SEND_INIT; + } + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + smsg.data = data; + + tsmsg.msg_iov = &siovec; + tsmsg.iov_count = 1; + tsmsg.addr = cxit_ep_fi_addr; + tsmsg.tag = send_tag; + tsmsg.ignore = 0; + tsmsg.context = NULL; + tsmsg.data = data; + + /* FI_REMOTE_CQ_DATA flag is not strictly necessary. 
*/ + if (wdata) + send_flags |= FI_REMOTE_CQ_DATA; + if (match_complete) + send_flags |= FI_MATCH_COMPLETE; + + if (send_first) { + if (tagged) { + ret = fi_tsendmsg(cxit_ep, &tsmsg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsendmsg failed %d", ret); + } else { + ret = fi_sendmsg(cxit_ep, &smsg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, + "fi_sendmsg failed %d", ret); + } + + /* Progress send to ensure it arrives unexpected */ + i = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + sent = true; + break; + } + cr_assert_eq(ret, -FI_EAGAIN, + "send failed %d", ret); + } while (i++ < 100000); + } + + /* Post RX buffer */ + + if (tagged) { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, recv_tag, recv_ignore, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + } else { + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + } + + if (!send_first) { + if (tagged) { + ret = fi_tsendmsg(cxit_ep, &tsmsg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsendmsg failed %d", ret); + } else { + ret = fi_sendmsg(cxit_ep, &smsg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, + "fi_sendmsg failed %d", ret); + } + } + + /* Gather both events, ensure progress on both sides. */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + cr_assert_eq(recved, false); + recved = true; + } else if (ret == -FI_EAVAIL) { + cr_assert_eq(recved, false); + recved = true; + truncated = true; + + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + cr_assert_eq(sent, false); + sent = true; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } + } while (!(sent && recved)); + + if (truncated) { + cr_assert(err_cqe.op_context == NULL, + "Error RX CQE Context mismatch"); + cr_assert(err_cqe.flags == + ((tagged ? FI_TAGGED : FI_MSG) | FI_RECV | + (wdata ? FI_REMOTE_CQ_DATA : 0UL)), + "Error RX CQE flags mismatch"); + cr_assert(err_cqe.len == recv_len, + "Invalid Error RX CQE length, got: %ld exp: %ld", + err_cqe.len, recv_len); + cr_assert(err_cqe.buf == 0, "Invalid Error RX CQE address"); + cr_assert(err_cqe.data == (wdata ? data : 0UL), + "Invalid Error RX CQE data"); + cr_assert(err_cqe.tag == send_tag, "Invalid Error RX CQE tag"); + cr_assert(err_cqe.olen == (send_len - recv_len), + "Invalid Error RX CQE olen, got: %ld exp: %ld", + err_cqe.olen, send_len - recv_len); + cr_assert(err_cqe.err == FI_ETRUNC, + "Invalid Error RX CQE code\n"); + cr_assert(err_cqe.prov_errno == C_RC_OK, + "Invalid Error RX CQE errno"); + cr_assert(err_cqe.err_data == NULL); + cr_assert(err_cqe.err_data_size == 0); + recved_len = err_cqe.len; + } else { + validate_rx_event(&rx_cqe, NULL, send_len, + (tagged ? FI_TAGGED : FI_MSG) | FI_RECV + | (wdata ? FI_REMOTE_CQ_DATA : 0UL), + NULL, wdata ? data : 0UL, send_tag); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + recved_len = rx_cqe.len; + } + + validate_tx_event(&tx_cqe, (tagged ? 
FI_TAGGED : FI_MSG) | FI_SEND, + NULL); + + /* Validate sent data */ + for (i = 0; i < buf_size; i++) { + uint8_t cmp = RECV_INIT; + if (i < recved_len) + cmp = send_buf[i]; + + cr_expect_eq(recv_buf[i], cmp, + "data mismatch, len: %ld, element[%d], exp=0x%x saw=0x%x, err=%d\n", + recv_len, i, cmp, recv_buf[i], err++); + if (err >= 10) + break; + } + cr_assert_eq(err, 0, "%d data errors seen\n", err); + + /* Check counters */ + send_cnt++; + + if (truncated) + recv_errcnt++; + else + recv_cnt++; + + while (fi_cntr_read(cxit_send_cntr) != send_cnt) + ; + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + while (fi_cntr_readerr(cxit_recv_cntr) != recv_errcnt) + ; + + /* Error count is 7 bits */ + if (recv_errcnt == 127) { + recv_errcnt = 0; + fi_cntr_seterr(cxit_recv_cntr, 0); + } +} + +#define BUF_SIZE (8*1024) +#define SEND_MIN 64 +#define SEND_INC 64 +#define TAG 0x333333333333 +#define IGNORE_ALL (-1ULL & CXIP_TAG_MASK) +#define HDR_DATA 0xabcdabcdabcdabcd + +struct tagged_rx_params { + size_t buf_size; + size_t send_min; + size_t send_inc; + uint64_t send_tag; + int recv_len_off; + uint64_t recv_tag; + uint64_t ignore; + bool ux; + bool tagged; + bool wdata; + uint64_t data; +}; + +static struct tagged_rx_params params[] = { + {.buf_size = BUF_SIZE, /* equal length no data */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true}, + + /* Use CQ data */ + + {.buf_size = BUF_SIZE, /* truncate */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = -8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = -8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = 0, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = 0, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = true}, + {.buf_size = BUF_SIZE, /* excess */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, 
+ .recv_len_off = 8, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = 8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = TAG, + .recv_len_off = 8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + + /* Un-tagged variants */ + + {.buf_size = BUF_SIZE, /* equal length no data */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = false}, + + /* Use CQ data */ + + {.buf_size = BUF_SIZE, /* truncate */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = true, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* truncate ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = -8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* equal length ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = 0, + .ignore = 0, + .ux = true, + 
.tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess ignore */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = false, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, + {.buf_size = BUF_SIZE, /* excess ignore UX */ + .send_min = SEND_MIN, + .send_inc = SEND_INC, + .send_tag = 0, + .recv_len_off = 8, + .recv_tag = ~TAG & CXIP_TAG_MASK, + .ignore = IGNORE_ALL, + .ux = true, + .tagged = false, + .wdata = true, + .data = HDR_DATA}, +}; + +ParameterizedTestParameters(tagged, rx) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct tagged_rx_params, params, + param_sz); +} + +ParameterizedTest(struct tagged_rx_params *param, tagged, rx, .timeout=30) +{ + uint8_t *recv_buf, + *send_buf; + size_t send_len; + + recv_buf = aligned_alloc(s_page_size, param->buf_size); + cr_assert(recv_buf); + + send_buf = aligned_alloc(s_page_size, param->buf_size); + cr_assert(send_buf); + + for (send_len = param->send_min; + send_len <= param->buf_size; + send_len += param->send_inc) { + do_msg(send_buf, send_len, param->send_tag, + recv_buf, send_len + param->recv_len_off, + param->recv_tag, param->ignore, param->ux, + param->buf_size, param->tagged, + param->wdata, param->data, false); + do_msg(send_buf, send_len, param->send_tag, + recv_buf, send_len + param->recv_len_off, + param->recv_tag, param->ignore, param->ux, + param->buf_size, param->tagged, + param->wdata, param->data, true); + } + + free(send_buf); + free(recv_buf); +} + +#define GB 1024*1024*1024 +Test(tagged, rput_abort, .disabled=true) +{ + size_t recv_len = GB; + size_t send_len = GB; + void *recv_buf; + void *send_buf; + int ret; + uint64_t val __attribute__((unused)); + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + sleep(1); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + ret = fi_tsend(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsend failed %d", ret); + + sleep(1); + val = *(uint64_t *)0; +} + + +Test(tagged, oflow_replenish, .timeout=180) +{ + uint8_t *recv_buf, + *send_buf; + size_t send_len = 1024; + int i; + + recv_buf = aligned_alloc(s_page_size, send_len); + cr_assert(recv_buf); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < 6*1024+1; i++) { + do_msg(send_buf, send_len, 0, + recv_buf, send_len, 0, 0, + true, send_len, true, false, 0, false); + } + + free(send_buf); + free(recv_buf); +} + +/* Test outstanding send cleanup */ +Test(tagged, cleanup_sends) +{ + int i, ret; + uint8_t *send_buf; + int send_len = 64; + int sends = 5; + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + /* Send 64 bytes to self */ + for (i = 0; i < sends; i++) { + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + /* Close Endpoint with outstanding Sends */ +} + +/* Test UX cleanup */ +Test(tagged, ux_cleanup) +{ + int i, ret; + uint8_t *send_buf; + int send_len = 64; + struct fi_cq_tagged_entry cqe; + int sends = 5; + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + /* Send 64 bytes to self */ + for (i = 0; i < 
sends; i++) { + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + } + + validate_tx_event(&cqe, FI_TAGGED | FI_SEND, NULL); + + /* Wait for async event indicating data has been received */ + for (i = 0 ; i < 1000; i++) + fi_cq_readfrom(cxit_rx_cq, &cqe, 1, NULL); + + free(send_buf); + + /* Close Endpoint with UX sends on the RX Queue */ +} + +/* Test outstanding recv cleanup */ +Test(tagged, cleanup_recvs) +{ + int i, ret; + uint8_t *recv_buf; + int recv_len = 64; + int recvs = 5; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < recvs; i++) { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0x0, 0x0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + /* Close Endpoint with outstanding Receives */ +} + +/* Test outstanding recv cancel */ +Test(tagged, cancel_recvs) +{ + int i, ret; + uint8_t *recv_buf; + int recv_len = 64; + int recvs = 5; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < recvs; i++) { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0x0, 0x0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + for (i = 0; i < recvs; i++) { + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cancel failed %d", ret); + } +} + +/* Test cancel with no matching recv */ +Test(tagged, cancel_nomatch) +{ + int ret; + + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_ENOENT, "fi_cancel failed to fail %d", ret); +} + +/* Test outstanding recv cancel events */ +Test(tagged, cancel_recvs_sync) +{ + int i, ret; + uint8_t *recv_buf; + int recv_len = 64; + int recvs = 5; + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_err_entry err_cqe; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < recvs; i++) { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0x0, 0x0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + } + + for (i = 0; i < recvs; i++) { + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_cancel failed %d", ret); + } + + ret = fi_cancel(&cxit_ep->fid, NULL); + cr_assert_eq(ret, -FI_ENOENT, "fi_cancel failed to fail %d", ret); + + for (i = 0; i < recvs; i++) { + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + if (ret == -FI_EAVAIL) + break; + + cr_assert_eq(ret, -FI_EAGAIN, + "unexpected event %d", ret); + } while (1); + + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + cr_assert(err_cqe.op_context == NULL, + "Error RX CQE Context mismatch"); + cr_assert(err_cqe.flags == (FI_TAGGED | FI_RECV), + "Error RX CQE flags mismatch"); + cr_assert(err_cqe.err == FI_ECANCELED, + "Invalid Error RX CQE code\n"); + cr_assert(err_cqe.prov_errno == 0, + "Invalid Error RX CQE errno"); + } +} + +void cxit_setup_selective_completion(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + cxit_rx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = FI_COMPLETION; + cxit_fi_hints->rx_attr->op_flags = FI_COMPLETION; + cxit_setup_tagged(); +} + +/* Test selective completion behavior with RMA. 
*/ +Test(tagged_sel, selective_completion, + .init = cxit_setup_selective_completion, + .fini = cxit_teardown_tagged) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int buf_len = 0x1000; + int send_len; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged smsg = {}; + struct fi_msg_tagged rmsg = {}; + struct iovec siovec; + struct iovec riovec; + int recv_cnt = 0; + + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(recv_buf); + + riovec.iov_base = recv_buf; + riovec.iov_len = buf_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(send_buf); + + siovec.iov_base = send_buf; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + /* Normal writes generate completions */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + bool sent = false; + bool rcved = false; + + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, send_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + recv_cnt++; + + /* Send to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async events indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) + rcved = true; + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) + sent = true; + } while (!(sent && rcved)); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + } + + /* Request completions from fi_writemsg */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + bool sent = false; + bool rcved = false; + + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecvmsg(cxit_ep, &rmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + recv_cnt++; + + /* Send to self */ + siovec.iov_len = send_len; + ret = fi_tsendmsg(cxit_ep, &smsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async events indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) + rcved = true; + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) + sent = true; + } while (!(sent && rcved)); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + } + + /* 
Suppress completions using fi_writemsg */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_len = send_len; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + recv_cnt++; + + /* Send to self */ + siovec.iov_len = send_len; + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async events indicating data has been received */ + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Inject never generates an event */ + + send_len = 8; + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, send_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + recv_cnt++; + + /* Send 64 bytes to self */ + ret = fi_tinject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +void cxit_setup_selective_completion_suppress(void) +{ + cxit_tx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + cxit_rx_cq_bind_flags |= FI_SELECTIVE_COMPLETION; + + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->op_flags = 0; + cxit_fi_hints->rx_attr->op_flags = 0; + cxit_setup_tagged(); +} + +/* Test selective completion behavior with RMA. 
*/ +Test(tagged_sel, selective_completion_suppress, + .init = cxit_setup_selective_completion_suppress, + .fini = cxit_teardown_tagged) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int buf_len = 0x1000; + int send_len; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged smsg = {}; + struct fi_msg_tagged rmsg = {}; + struct iovec siovec; + struct iovec riovec; + int recv_cnt = 0; + + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(recv_buf); + + riovec.iov_base = recv_buf; + riovec.iov_len = buf_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert(send_buf); + + siovec.iov_base = send_buf; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + /* Normal writes do not generate completions */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, send_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + recv_cnt++; + + /* Send to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async events indicating data has been received */ + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Request completions from fi_writemsg */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + bool sent = false; + bool rcved = false; + + memset(recv_buf, 0, send_len); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_len = send_len; + ret = fi_trecvmsg(cxit_ep, &rmsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + recv_cnt++; + + /* Send to self */ + siovec.iov_len = send_len; + ret = fi_tsendmsg(cxit_ep, &smsg, FI_COMPLETION); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async events indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) + rcved = true; + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) + sent = true; + } while (!(sent && rcved)); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + } + + /* Suppress completions using fi_writemsg */ + for (send_len = 1; send_len <= buf_len; send_len <<= 1) { + memset(recv_buf, 0, send_len); + for (i = 0; i 
< send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_len = send_len; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + recv_cnt++; + + /* Send to self */ + siovec.iov_len = send_len; + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async events indicating data has been received */ + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Ensure no events were generated */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + } + + /* Inject never generates an event */ + + send_len = 8; + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, send_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + recv_cnt++; + + /* Send 64 bytes to self */ + ret = fi_tinject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Wait for async events indicating data has been received */ + while (fi_cntr_read(cxit_recv_cntr) != recv_cnt) + ; + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + free(send_buf); + free(recv_buf); +} + +/* Test match complete */ +Test(tagged, match_comp) +{ + int i, j, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + for (j = 0; j < 100; j++) { + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_MATCH_COMPLETE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid 
source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + /* UX */ + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_MATCH_COMPLETE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Ensure no TX event is generated */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + } + + free(send_buf); + free(recv_buf); +} + +/* Test eager Send with FI_MORE */ +Test(tagged, esend_more) +{ + int i, ret; + uint8_t *recv_buf, + *recv_buf2, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + recv_buf2 = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf2); + memset(recv_buf2, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + /* Post two Receives */ + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + riovec.iov_base = recv_buf2; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Gather 2 Receive events */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | 
FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Gather 2 Send events */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(recv_buf2[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf2[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test rendezvous Send with FI_MORE */ +Test(tagged, rsend_more) +{ + int i, ret; + uint8_t *recv_buf, + *recv_buf2, + *send_buf; + int recv_len = 0x1000; + int send_len = 0x1000; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + recv_buf2 = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf2); + memset(recv_buf2, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + /* Post two Receives */ + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + riovec.iov_base = recv_buf2; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Gather 2 Receive events */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, 
FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Gather 2 Send events */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(recv_buf2[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf2[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test Receive with FI_MORE */ +Test(tagged, recv_more) +{ + int i, ret; + uint8_t *recv_buf, + *recv_buf2, + *send_buf; + int recv_len = 0x2000; + int send_len = 0x2000; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + struct cxip_ep *ep = container_of(cxit_ep, struct cxip_ep, ep.fid); + + /* FI_MORE has no meaning if receives are not offloaded */ + if (!ep->ep_obj->rxc.msg_offload) { + cr_assert(1); + return; + } + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + recv_buf2 = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf2); + memset(recv_buf2, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + /* Perform 2 Sends */ + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + ret = fi_tsendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Post two Receives */ + ret = fi_trecvmsg(cxit_ep, &rmsg, FI_MORE); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Ensure no completion before the doorbell ring */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "write failed %d", ret); + } while (i++ < 100000); + + riovec.iov_base = recv_buf2; + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Gather 2 Receive events */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, 
"Invalid source address"); + + /* Gather 2 Send events */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + cr_expect_eq(recv_buf2[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf2[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test flow control. + * + * Perform enough Sends to overwhelm target LEs. Flow control recovery is + * transparent. + * + * Post matching Receives and check data to validate correct ordering amid flow + * control recovery. + */ +Test(tagged, fc, .timeout = 180) +{ + int i, j, ret, tx_ret; + uint8_t *send_bufs; + uint8_t *send_buf; + int send_len = 64; + uint8_t *recv_buf; + int recv_len = 64; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + int nsends_concurrent = 3; /* must be less than the LE pool min. */ + int nsends = 14000; + int sends = 0; + uint64_t tag = 0xbeef; + fi_addr_t from; + + send_bufs = aligned_alloc(s_page_size, send_len * nsends_concurrent); + cr_assert(send_bufs); + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < nsends_concurrent - 1; i++) { + send_buf = send_bufs + (i % nsends_concurrent) * send_len; + memset(send_buf, i, send_len); + + tx_ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, tag, NULL); + } + + for (i = nsends_concurrent - 1; i < nsends; i++) { + send_buf = send_bufs + (i % nsends_concurrent) * send_len; + memset(send_buf, i, send_len); + + do { + tx_ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, tag, NULL); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + + /* Just progress */ + fi_cq_read(cxit_tx_cq, NULL, 0); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, FI_SUCCESS, "fi_tsend failed %d", tx_ret); + + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + if (!(++sends % 1000)) + printf("%u Sends complete.\n", sends); + } + + for (i = 0; i < nsends_concurrent - 1; i++) { + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + if (!(++sends % 1000)) + printf("%u Sends complete.\n", sends); + } + + for (i = 0; i < nsends; i++) { + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + assert(ret == -FI_EAGAIN); + + ret = fi_trecv(cxit_ep, recv_buf, 
recv_len, NULL, + FI_ADDR_UNSPEC, tag, 0, NULL); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, recv_len, FI_TAGGED | FI_RECV, + NULL, 0, tag); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + for (j = 0; j < recv_len; j++) { + cr_assert_eq(recv_buf[j], (uint8_t)i, + "data mismatch, recv: %d element[%d], exp=%d saw=%d\n", + i, j, (uint8_t)i, recv_buf[j]); + } + } + + free(send_bufs); + free(recv_buf); +} + +#define FC_TRANS 100 + +static void *fc_sender(void *data) +{ + int i, tx_ret; + uint8_t *send_buf; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe; + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < FC_TRANS; i++) { + memset(send_buf, i, send_len); + + /* Send 64 bytes to self */ + do { + tx_ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 0xa, NULL); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, FI_SUCCESS, "fi_tsend failed %d", tx_ret); + + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + } + + free(send_buf); + + pthread_exit(NULL); +} + +static void *fc_recver(void *data) +{ + int i, j, ret; + uint8_t *recv_buf; + int recv_len = 64; + struct fi_cq_tagged_entry rx_cqe; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < 5; i++) { + sleep(1); + + /* Progress RX to avoid EQ drops */ + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + for (i = 0; i < FC_TRANS; i++) { + memset(recv_buf, 0, recv_len); + + /* Send 64 bytes to self */ + + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + assert(ret == -FI_EAGAIN); + + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0xa, 0, NULL); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, recv_len, FI_TAGGED | FI_RECV, + NULL, 0, 0xa); + + for (j = 0; j < recv_len; j++) { + cr_assert_eq(recv_buf[j], i, + "data mismatch, element[%d], exp=%d saw=%d\n", + j, i, recv_buf[j]); + } + } + + free(recv_buf); + + pthread_exit(NULL); +} + +/* + * Multi-threaded flow control test. + * + * Run sender and receiver threads. Start sender first to allow it to overwhelm + * target LEs (set artificially low). Software matching is exercised while the + * receiver catches up. Matching is a hybrid of SW/HW as threads race to + * finish. + * + * Run with driver le_pool_max set below FC_TRANS. 
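+ * (The LE pool limit can be lowered through the driver; test.sh's constrained_le_test does this by storing a small le_pools[] max_alloc value via csrutil.)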
+ */ +Test(tagged, fc_mt) +{ + pthread_t send_thread; + pthread_t recv_thread; + pthread_attr_t attr; + int ret; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + ret = pthread_create(&send_thread, &attr, fc_sender, NULL); + cr_assert_eq(ret, 0); + + ret = pthread_create(&recv_thread, &attr, fc_recver, NULL); + cr_assert_eq(ret, 0); + + ret = pthread_join(recv_thread, NULL); + cr_assert_eq(ret, 0); + + ret = pthread_join(send_thread, NULL); + cr_assert_eq(ret, 0); + + pthread_attr_destroy(&attr); +} + +/* Post a bunch of receives to cause append failures. */ +Test(tagged, fc_too_many_recv_early_close) +{ + void *recv_buf; + size_t recv_len = 1; + int i; + int ret; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + + for (i = 0; i < 50; i++) { + do { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0xa, 0, NULL); + /* Just progress */ + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + + assert(ret == FI_SUCCESS); + } + + /* Early endpoint close. */ + ret = fi_close(&cxit_ep->fid); + cr_assert(ret == FI_SUCCESS, "fi_close endpoint"); + cxit_ep = NULL; + + free(recv_buf); +} + +#define RDZV_FC_ITERS 100 +#define RDZV_FC_BATCH 5 + +static void *rdzv_fc_sender(void *data) +{ + int i, j, tx_ret; + int send_id; + uint8_t *send_bufs; + uint8_t *send_buf; + long send_len = (long)data; + struct fi_cq_tagged_entry tx_cqe; + int batch_size = RDZV_FC_BATCH; + + send_bufs = aligned_alloc(s_page_size, send_len * batch_size); + cr_assert(send_bufs); + + for (i = 0; i < RDZV_FC_ITERS; i++) { + for (j = 0; j < batch_size; j++) { + send_id = i * batch_size + j; + send_buf = &send_bufs[j * send_len]; + memset(send_buf, send_id, send_len); + + do { + tx_ret = fi_tsend(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, + send_id, NULL); + + if (tx_ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + sched_yield(); + } + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, FI_SUCCESS, "fi_tsend failed %d", + tx_ret); + } + + for (j = 0; j < batch_size; j++) { + do { + tx_ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + + if (tx_ret == -FI_EAGAIN) + sched_yield(); + } while (tx_ret == -FI_EAGAIN); + + cr_assert_eq(tx_ret, 1, + "fi_cq_read unexpected value %d", + tx_ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + } + } + + free(send_bufs); + + pthread_exit(NULL); +} + +static void *rdzv_fc_recver(void *data) +{ + int i, j, k, ret; + int recv_id; + uint8_t *recv_bufs; + uint8_t *recv_buf; + long recv_len = (long)data; + struct fi_cq_tagged_entry rx_cqe; + int batch_size = RDZV_FC_BATCH; + + recv_bufs = aligned_alloc(s_page_size, recv_len * batch_size); + cr_assert(recv_bufs); + + /* Let Sender get ahead and land some UX messages */ + sleep(1); + + for (i = 0; i < RDZV_FC_ITERS; i++) { + + for (j = 0; j < batch_size; j++) { + recv_id = i * batch_size + j; + recv_buf = &recv_bufs[j * recv_len]; + memset(recv_buf, -1, recv_len); + + do { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, + NULL, FI_ADDR_UNSPEC, recv_id, + 0, NULL); + + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + sched_yield(); + } + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", + ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + + if (ret == -FI_EAGAIN) + sched_yield(); + } while (ret == -FI_EAGAIN); + + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", + ret); + + validate_rx_event(&rx_cqe, NULL, recv_len, + FI_TAGGED 
| FI_RECV, + NULL, 0, rx_cqe.tag); + + recv_id = rx_cqe.tag % batch_size; + recv_buf = &recv_bufs[recv_id * recv_len]; + for (k = 0; k < recv_len; k++) { + cr_assert_eq(recv_buf[k], (uint8_t)rx_cqe.tag, + "data mismatch, element[%d], exp=%d saw=%d\n", + k, (uint8_t)rx_cqe.tag, + recv_buf[k]); + } + } + } + + free(recv_bufs); + + pthread_exit(NULL); +} + +/* + * Rendezvous Send multi-threaded flow control test. + * + * Run with driver le_pool_max set just above RDZV_FC_BATCH. + */ +Test(tagged, rdzv_fc_mt, .timeout = 60) +{ + pthread_t send_thread; + pthread_t recv_thread; + pthread_attr_t attr; + int ret; + long xfer_len; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + for (xfer_len = 64; xfer_len <= 4*1024; xfer_len <<= 2) { + ret = pthread_create(&send_thread, &attr, rdzv_fc_sender, + (void *)xfer_len); + cr_assert_eq(ret, 0); + + ret = pthread_create(&recv_thread, &attr, rdzv_fc_recver, + (void *)xfer_len); + cr_assert_eq(ret, 0); + + ret = pthread_join(recv_thread, NULL); + cr_assert_eq(ret, 0); + + ret = pthread_join(send_thread, NULL); + cr_assert_eq(ret, 0); + + printf("%ld byte Sends complete\n", xfer_len); + } + + pthread_attr_destroy(&attr); +} + +Test(tagged, NC2192) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int send_len = CXIP_RDZV_THRESHOLD - 1; + int recv_len = send_len; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + int sends = (CXIP_OFLOW_BUF_SIZE - CXIP_RDZV_THRESHOLD) / send_len + 1; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + /* Consume 1 oflow byte */ + ret = fi_tsend(cxit_ep, send_buf, 1, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert(ret == FI_SUCCESS); + + for (i = 0; i < sends; i++) { + do { + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, + cxit_ep_fi_addr, 1, NULL); + /* progress */ + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + } + + + /* Force processing in software mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + for (i = 0; i < sends + 1; i++) { + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert(ret == 1); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + } + + for (i = 0; i < sends; i++) { + do { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 1, 0, NULL); + /* progress */ + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, &tx_cqe, 0); + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + } + + for (i = 0; i < sends; i++) { + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, + NULL, 0, 1); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + } + + /* Match the 1 byte Send */ + do { + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + /* progress */ + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + } while (ret == -FI_EAGAIN); + cr_assert(ret == FI_SUCCESS); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert(ret == 1); + + 
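+	/* The 1-byte unexpected Send posted first should now match; its completion must report a length of 1 rather than recv_len. */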
validate_rx_event(&rx_cqe, NULL, 1, FI_TAGGED | FI_RECV, NULL, 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + free(send_buf); + free(recv_buf); +} + +TestSuite(tagged_tclass, .init = cxit_setup_tx_alias_tagged, + .fini = cxit_teardown_tx_alias_tagged, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Simple send using both the EP and alias EP with new TC */ +Test(tagged_tclass, ping) +{ + int ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + uint32_t tclass = FI_TC_LOW_LATENCY; + fi_addr_t from; + + recv_buf = aligned_alloc(s_page_size, recv_len * 2); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len * 2); + + send_buf = aligned_alloc(s_page_size, send_len * 2); + cr_assert(send_buf); + + /* Post RX buffers */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + ret = fi_trecv(cxit_ep, recv_buf + recv_len, recv_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Update EP alias traffic class */ + ret = fi_set_val(&cxit_tx_alias_ep->fid, FI_OPT_CXI_SET_TCLASS, + (void *)&tclass); + cr_assert_eq(ret, FI_SUCCESS, "fi_set_val failed %d for tc %d\n", + ret, tclass); + + + /* Send 64 bytes to self */ + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + ret = fi_tsend(cxit_tx_alias_ep, send_buf + send_len, send_len, NULL, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend for alias failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + free(send_buf); + free(recv_buf); +} + +/* Various tagged protocols using both the original endpoint + * and an alias endpoint modified to use the specified tclass. + * + * Note that receive order is not expected between the original + * and alias EP; tags are used for getting completions. 
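+ * The alias EP's traffic class is changed at runtime via fi_set_val(FI_OPT_CXI_SET_TCLASS), and alias_mask selects which Sends are issued on the alias EP rather than the original EP.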
+ */ +struct multi_tc_params { + size_t length; + size_t num_ios; + uint32_t tclass; + uint32_t alias_mask; + bool peek; +}; + +void do_multi_tc(struct multi_tc_params *param) +{ + int ret; + size_t buf_len = param->length; + struct fid_ep *ep; + struct fi_cq_tagged_entry *rx_cqe; + struct fi_cq_tagged_entry *tx_cqe; + struct tagged_thread_args *tx_args; + struct tagged_thread_args *rx_args; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .io_num = param->num_ios, + }; + struct tagged_event_args rx_evt_args = { + .cq = cxit_rx_cq, + .io_num = param->num_ios, + }; + + tx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(tx_cqe); + + rx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(rx_cqe); + + tx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(tx_args); + + rx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(rx_args); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + tx_evt_args.cqe = tx_cqe; + rx_evt_args.cqe = rx_cqe; + + /* Set alias EP traffic class */ + ret = fi_set_val(&cxit_tx_alias_ep->fid, FI_OPT_CXI_SET_TCLASS, + ¶m->tclass); + cr_assert_eq(ret, FI_SUCCESS, "fi_set_val traffic class"); + + /* Issue the Sends */ + for (size_t tx_io = 0; tx_io < param->num_ios; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + ep = tx_io & param->alias_mask ? cxit_tx_alias_ep : cxit_ep; + do { + ret = fi_tsend(ep, tx_args[tx_io].buf, + tx_args[tx_io].len, NULL, + cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, tagged_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + sleep(1); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + /* Optional peek to see if all send tags are found on ux list */ + if (param->peek) { + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + ret = try_peek(FI_ADDR_UNSPEC, rx_io, 0, buf_len, + NULL, false); + cr_assert_eq(ret, FI_SUCCESS, + "peek of UX message failed"); + } + } + + /* Issue the Receives */ + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, rx_args[rx_io].tag, + 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Start processing Receive events */ + ret = pthread_create(&rx_thread, &attr, tagged_evt_worker, + (void *)&rx_evt_args); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Wait for the 
RX/TX event threads to complete */ + ret = pthread_join(tx_thread, NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + + ret = pthread_join(rx_thread, NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + /* Validate results */ + for (size_t io = 0; io < param->num_ios; io++) { + /* Validate sent data */ + cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len); + + validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL); + + validate_rx_event(&rx_cqe[io], NULL, buf_len, + FI_TAGGED | FI_RECV, NULL, + 0, tx_args[rx_cqe[io].tag].tag); + + free(tx_args[io].buf); + free(rx_args[io].buf); + } + + pthread_attr_destroy(&attr); + free(rx_cqe); + free(tx_cqe); + free(tx_args); + free(rx_args); +} + +ParameterizedTestParameters(tagged_tclass, multi_tc) +{ + size_t param_sz; + + static struct multi_tc_params params[] = { + {.length = 64, /* Eager IDC */ + .num_ios = 10, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x1}, + {.length = 64, /* Eager IDC */ + .num_ios = 10, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x3}, + {.length = 2 * 1024, /* Eager */ + .num_ios = 15, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x1}, + {.length = 4 * 1024, /* Rendezvous */ + .num_ios = 12, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x1}, + {.length = 128 * 1024, /* Rendezvous */ + .num_ios = 25, + .tclass = FI_TC_LOW_LATENCY, + .peek = true, + .alias_mask = 0x1}, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct multi_tc_params, params, + param_sz); +} + +ParameterizedTest(struct multi_tc_params *param, tagged_tclass, multi_tc, + .timeout = 60) +{ + do_multi_tc(param); +} + +TestSuite(tagged_src_err, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(tagged_src_err, cap_not_requested) +{ + struct fi_info *info; + int ret; + + /* No hints, both FI_SOURCE and FI_SOURCE_ERR should be removed + * since they are secondary capabilities that impact performance. + */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, NULL, + &info); + cr_assert(ret == FI_SUCCESS); + cr_assert_eq(info->caps & FI_SOURCE, 0, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, 0, "FI_SOURCE_ERR"); + fi_freeinfo(info); + + cxit_setup_getinfo(); + cxit_fi_hints->caps = 0; + + /* No caps, both FI_SOURCE and FI_SOURCE_ERR should not be set since + * they are secondary capabilities and they impact performance. 
+ */ + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + cr_assert_eq(info->caps & FI_SOURCE, 0, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, 0, "FI_SOURCE_ERR"); + + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(tagged_src_err, hints_check) +{ + struct fi_info *info; + int ret; + + /* If only FI_SOURCE then FI_SOURCE_ERR should not be set */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_SOURCE; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + cr_assert_eq(info->caps & FI_SOURCE, FI_SOURCE, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, 0, "FI_SOURCE_ERR"); + + fi_freeinfo(info); + cxit_teardown_getinfo(); + + /* Validate FI_SOURCE are set if FI_SOURCE_ERR specified in hints */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG | FI_SOURCE | FI_SOURCE_ERR; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + cr_assert_eq(info->caps & FI_SOURCE, FI_SOURCE, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, FI_SOURCE_ERR, + "FI_SOURCE_ERR"); + fi_freeinfo(info); + cxit_teardown_getinfo(); + + /* Verify that if hints are specified, but do not include FI_SOURCE + * FI_SOURCE_ERR in capabilities they are not returned. + */ + cxit_setup_getinfo(); + cxit_fi_hints->caps = FI_MSG; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == FI_SUCCESS); + + cr_assert_eq(info->caps & FI_SOURCE, 0, "FI_SOURCE"); + cr_assert_eq(info->caps & FI_SOURCE_ERR, 0, "FI_SOURCE_ERR"); + fi_freeinfo(info); + cxit_teardown_getinfo(); +} + +Test(tagged_src_err, invalid_use) +{ + struct fi_info *info; + int ret; + + cxit_setup_getinfo(); + + /* If no FI_SOURCE then FI_SOURCE_ERR is not allowed */ + cxit_fi_hints->caps = FI_MSG | FI_SOURCE_ERR; + ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + cxit_node, cxit_service, cxit_flags, cxit_fi_hints, + &info); + cr_assert(ret == -FI_ENODATA); + + cxit_teardown_getinfo(); +} + +Test(tagged_src_err, addr) +{ + struct fid_ep *fid_ep; + struct fid_eq *fid_eq; + struct fi_eq_attr eq_attr = { + .size = 32, + .flags = FI_WRITE, + .wait_obj = FI_WAIT_NONE + }; + struct fid_cq *fid_tx_cq; + struct fid_cq *fid_rx_cq; + struct fid_av *fid_av; + struct cxip_addr ep_addr; + fi_addr_t fi_dest_ep_addr; + fi_addr_t fi_src_err_ep_addr; + size_t addr_len = sizeof(ep_addr); + int ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + struct fi_cq_err_entry err_entry; + int i; + + /* Create first EP - adds itself to the AV */ + cxit_setup_enabled_ep(); + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, NULL, 0, NULL); + cr_assert_eq(ret, 1, "First EP AV insert of self %d\n", ret); + + /* Create second EP and resources */ + cr_assert_eq(cxit_fi->caps & + (FI_TAGGED | FI_SOURCE | FI_SOURCE_ERR | FI_DIRECTED_RECV), + (FI_TAGGED | FI_SOURCE | FI_SOURCE_ERR | FI_DIRECTED_RECV), + "info->caps"); + ret = fi_endpoint(cxit_domain, cxit_fi, &fid_ep, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second EP %d", ret); + ret = fi_eq_open(cxit_fabric, &eq_attr, &fid_eq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second EP EQ %d", 
ret); + ret = fi_ep_bind(fid_ep, &fid_eq->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "Second EP EQ bind %d", ret); + ret = fi_cq_open(cxit_domain, &cxit_tx_cq_attr, &fid_tx_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second EP TXCQ %d", ret); + ret = fi_cq_open(cxit_domain, &cxit_rx_cq_attr, &fid_rx_cq, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second EP RXCQ %d", ret); + ret = fi_ep_bind(fid_ep, &fid_tx_cq->fid, FI_TRANSMIT); + cr_assert_eq(ret, FI_SUCCESS, "Second EP bind TXCQ %d", ret); + ret = fi_ep_bind(fid_ep, &fid_rx_cq->fid, FI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "Second EP bind RXCQ %d", ret); + + /* Needs its own AV */ + ret = fi_av_open(cxit_domain, &cxit_av_attr, &fid_av, NULL); + cr_assert_eq(ret, FI_SUCCESS, "Second AV %d\n", ret); + ret = fi_ep_bind(fid_ep, &fid_av->fid, 0); + cr_assert_eq(ret, FI_SUCCESS, "Second AV bind %d\n", ret); + + ret = fi_enable(fid_ep); + cr_assert_eq(ret, FI_SUCCESS, "Second EP enable %d\n", ret); + ret = fi_getname(&fid_ep->fid, &ep_addr, &addr_len); + cr_assert_eq(ret, FI_SUCCESS, "Second EP getname %d\n", ret); + + /* Insert the second EP address into both AVs, but do not insert + * the first EP address into the second EP's AV. + */ + ret = fi_av_insert(fid_av, (void *)&ep_addr, 1, 0, + 0, NULL); + cr_assert_eq(ret, 1, "Second EP AV insert local %d\n", ret); + + ret = fi_av_insert(cxit_av, (void *)&ep_addr, 1, &fi_dest_ep_addr, + 0, NULL); + cr_assert_eq(ret, 1, "First EP AV insert second EP %d\n", ret); + + /* Setup buffers */ + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Test address not found EP1->EP2 */ + ret = fi_trecv(fid_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + sleep(1); + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, fi_dest_ep_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Receive should get an -FI_EAVAIL with source error info */ + ret = cxit_await_completion(fid_rx_cq, &rx_cqe); + cr_assert_eq(ret, -FI_EAVAIL); + err_entry.err_data_size = sizeof(uint32_t); + err_entry.err_data = malloc(sizeof(uint32_t)); + cr_assert(err_entry.err_data); + + ret = fi_cq_readerr(fid_rx_cq, &err_entry, 0); + cr_assert_eq(ret, 1, "Readerr CQ %d\n", ret); + + /* Insert address from FI_SOURCE_ERR into AV */ + ret = fi_av_insert(fid_av, (void *)err_entry.err_data, 1, + &fi_src_err_ep_addr, 0, NULL); + + cr_assert_eq(ret, 1, "Second EP AV add src address %d\n", ret); + + /* Wait for TX */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "Send completion %d\n", ret); + + /* First EP address should now be found EP1->EP2 */ + ret = fi_trecv(fid_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + sleep(1); + + ret = fi_tsend(cxit_ep, send_buf, send_len, NULL, fi_dest_ep_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Receive should complete successfully */ + ret = cxit_await_completion(fid_rx_cq, &rx_cqe); + cr_assert_eq(ret, 1); + + /* Wait for TX */ + ret = cxit_await_completion(fid_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "Send completion %d\n", ret); + + /* Validate that the inserted address may be used in send, + * i.e. EP2 can now send to EP1. 
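+ * (fi_src_err_ep_addr was produced by inserting the raw address returned in the FI_SOURCE_ERR err_data into the second EP's AV.)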
+ */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + sleep(1); + + ret = fi_tsend(fid_ep, send_buf, send_len, NULL, fi_src_err_ep_addr, 0, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend failed %d", ret); + + /* Receive should complete successfully */ + ret = cxit_await_completion(cxit_rx_cq, &rx_cqe); + cr_assert_eq(ret, 1); + + /* Wait for TX */ + ret = cxit_await_completion(fid_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "Send completion %d\n", ret); + + /* Cleanup Second EP */ + fi_close(&fid_ep->fid); + fi_close(&fid_av->fid); + fi_close(&fid_tx_cq->fid); + fi_close(&fid_rx_cq->fid); + + /* Cleanup First EP */ + cxit_teardown_tagged(); + cxit_teardown_getinfo(); + + free(err_entry.err_data); +} + +TestSuite(tagged_cq_wait, .init = cxit_setup_rma_fd, + .fini = cxit_teardown_rma_fd, + .timeout = CXIT_DEFAULT_TIMEOUT); + +struct fd_params { + size_t length; + size_t num_ios; + int timeout; + bool poll; + bool ux_msg; +}; + +struct tagged_cq_wait_event_args { + struct fid_cq *cq; + struct fi_cq_tagged_entry *cqe; + size_t io_num; + int timeout; + bool poll; +}; + +static void *tagged_cq_wait_evt_worker(void *data) +{ + int ret; + struct tagged_cq_wait_event_args *args; + struct fid *fids[1]; + int cq_fd; + size_t completions = 0; + + args = (struct tagged_cq_wait_event_args *)data; + + if (args->poll) { + ret = fi_control(&args->cq->fid, FI_GETWAIT, &cq_fd); + cr_assert_eq(ret, FI_SUCCESS, "Get CQ wait FD %d", ret); + fids[0] = &args->cq->fid; + } + + while (completions < args->io_num) { + if (args->poll) { + ret = fi_trywait(cxit_fabric, fids, 1); + if (ret == FI_SUCCESS) { + struct pollfd fds; + + fds.fd = cq_fd; + fds.events = POLLIN; + + ret = poll(&fds, 1, args->timeout); + cr_assert_neq(ret, 0, "Poll timed out"); + cr_assert_eq(ret, 1, "Poll error"); + } + ret = fi_cq_read(args->cq, + &args->cqe[completions], 1); + if (ret == 1) + completions++; + } else { + ret = fi_cq_sread(args->cq, &args->cqe[completions], + 1, NULL, args->timeout); + cr_assert_eq(ret, 1, "Completion not received\n"); + completions++; + } + } + + pthread_exit(NULL); +} + +static void cq_wait_post_sends(struct tagged_thread_args *tx_args, + struct fd_params *param) +{ + int ret; + size_t buf_len = param->length; + + /* Issue the Sends */ + for (size_t tx_io = 0; tx_io < param->num_ios; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + do { + ret = fi_tsend(cxit_ep, tx_args[tx_io].buf, + tx_args[tx_io].len, NULL, + cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } +} + +void do_cq_wait(struct fd_params *param) +{ + int ret; + size_t buf_len = param->length; + struct fi_cq_tagged_entry *rx_cqe; + struct fi_cq_tagged_entry *tx_cqe; + struct tagged_thread_args *tx_args; + struct tagged_thread_args *rx_args; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_cq_wait_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .io_num = param->num_ios, + .timeout = param->timeout, + .poll = param->poll, + }; + struct tagged_cq_wait_event_args rx_evt_args = { + .cq = cxit_rx_cq, + 
.io_num = param->num_ios, + .timeout = param->timeout, + .poll = param->poll, + }; + + tx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(tx_cqe); + + rx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(rx_cqe); + + tx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(tx_args); + + rx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(rx_args); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + tx_evt_args.cqe = tx_cqe; + rx_evt_args.cqe = rx_cqe; + + /* Sends first if testing unexpected message operation */ + if (param->ux_msg) { + cq_wait_post_sends(tx_args, param); + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, + tagged_cq_wait_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + /* Force onloading of UX entries if operating in SW EP mode */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + } + + /* Issue the Receives */ + for (size_t rx_io = 0; rx_io < param->num_ios; rx_io++) { + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, + FI_ADDR_UNSPEC, rx_args[rx_io].tag, + 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Start processing Receive events */ + ret = pthread_create(&rx_thread, &attr, tagged_cq_wait_evt_worker, + (void *)&rx_evt_args); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Sends last for expected messaging */ + if (!param->ux_msg) { + /* Make sure receive has blocked */ + sleep(1); + cq_wait_post_sends(tx_args, param); + + /* Start processing Send events */ + ret = pthread_create(&tx_thread, &attr, + tagged_cq_wait_evt_worker, + (void *)&tx_evt_args); + } + + /* Wait for the RX/TX event threads to complete */ + ret = pthread_join(tx_thread, NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + + ret = pthread_join(rx_thread, NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + /* Validate results */ + for (size_t io = 0; io < param->num_ios; io++) { + /* Validate sent data */ + cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len); + + validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL); + + validate_rx_event(&rx_cqe[io], NULL, buf_len, + FI_TAGGED | FI_RECV, NULL, + 0, tx_args[rx_cqe[io].tag].tag); + + free(tx_args[io].buf); + free(rx_args[io].buf); + } + + pthread_attr_destroy(&attr); + free(rx_cqe); + free(tx_cqe); + free(tx_args); + free(rx_args); +} + +ParameterizedTestParameters(tagged_cq_wait, wait_fd) +{ + size_t param_sz; + + static struct fd_params params[] = { + {.length = 1024, + .num_ios = 4, + .timeout = 5000, + .poll = true}, + {.length = 8192, + .num_ios = 4, + .timeout = 5000, + .poll = true}, + {.length = 1024, + .num_ios = 4, + .timeout = 5000, + .poll = false}, + {.length = 8192, + .num_ios = 4, + .timeout = 5000, + .poll = false}, + }; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct fd_params, params, + param_sz); +} + +ParameterizedTest(struct fd_params *param, tagged_cq_wait, wait_fd, + .timeout = 60) +{ + 
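+	/* Each parameter set exercises either FD-based polling (fi_trywait plus poll) or blocking fi_cq_sread completion handling. */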
do_cq_wait(param); +} + +TestSuite(tagged_tx_size, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(tagged_tx_size, force_progress) +{ + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_tagged_entry tx_cqe; + fi_addr_t from; + char *send_buf; + char *recv_buf; + size_t buf_len; + int ret; + int tx_posted; + int rx_posted; + int i; + + /* Limit the TX queue size to 32 */ + cxit_setup_getinfo(); + cxit_fi_hints->tx_attr->size = 32; + cxit_setup_rma(); + + cr_assert_eq(cxit_fi_hints->tx_attr->size, + cxit_fi->tx_attr->size, "tx_attr->size"); + + /* Send unexpected rendezvous messages so that completions + * will not occur and verify we get resource management + * at tx_attr->size. + */ + buf_len = 32 * 1024; + send_buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(send_buf); + recv_buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(recv_buf); + + ret = 0; + for (tx_posted = 0; tx_posted < cxit_fi->tx_attr->size + 1; + tx_posted++) { + ret = fi_tsend(cxit_ep, send_buf, buf_len, NULL, + cxit_ep_fi_addr, 0, NULL); + if (ret == -FI_EAGAIN) + break; + } + cr_assert_eq(ret, -FI_EAGAIN, "-FI_EAGAIN expected"); + cr_assert(tx_posted <= cxit_fi->tx_attr->size, + "Too many I/O initiated\n"); + + /* Post the receives and get RX completions */ + ret = 0; + for (rx_posted = 0; rx_posted < tx_posted; rx_posted++) { + do { + ret = fi_trecv(cxit_ep, recv_buf, buf_len, NULL, + FI_ADDR_UNSPEC, 0, 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, + "fi_trecv %d: unexpected ret %d", rx_posted, ret); + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + } + + /* Get TX completions */ + ret = 0; + for (i = 0; i < tx_posted; i++) { + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + } + cr_assert_eq(ret, 1, "bad completion status"); + cr_assert_eq(i, tx_posted, "bad TX completion count"); + + cxit_teardown_rma(); + + free(send_buf); + free(recv_buf); +} diff --git a/prov/cxi/test/tagged_stress.c b/prov/cxi/test/tagged_stress.c new file mode 100644 index 00000000000..4addafaf52a --- /dev/null +++ b/prov/cxi/test/tagged_stress.c @@ -0,0 +1,224 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018 Hewlett Packard Enterprise Development LP + */ + +#include +#include + +#include +#include +#include + +#include "cxip.h" +#include "cxip_test_common.h" + +#define RECV_INIT 0x77 +#define SEND_INIT ~RECV_INIT + +TestSuite(tagged_stress, .init = cxit_setup_tagged, + .fini = cxit_teardown_tagged, + .timeout = CXIT_DEFAULT_TIMEOUT); + +static void do_msg(uint8_t *send_buf, size_t send_len, uint64_t send_tag, + uint8_t *recv_buf, size_t recv_len, uint64_t recv_tag, + uint64_t recv_ignore, bool send_first, size_t buf_size, + bool tagged, size_t ntrans) +{ + int i, j, ret; + int err = 0; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + fi_addr_t from; + int sent = false; + int recved = false; + struct fi_cq_err_entry err_cqe = {}; + + memset(recv_buf, RECV_INIT, send_len * ntrans); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + if (send_first) { + for (j = 0; j < ntrans; j++) { + /* Send 64 bytes to self */ + if (tagged) { + ret = fi_tsend(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, send_tag, + NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsend failed %d", + ret); + } else { + ret = fi_send(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_send 
failed %d", + ret); + } + + /* Progress send to ensure it arrives unexpected */ + i = 0; + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + sent++; + break; + } + cr_assert_eq(ret, -FI_EAGAIN, + "send failed %d", ret); + } while (i++ < 10000); + } + } + + /* Post RX buffer */ + + for (j = 0; j < ntrans; j++) { + if (tagged) { + ret = fi_trecv(cxit_ep, recv_buf + j * send_len, + recv_len, NULL, + FI_ADDR_UNSPEC, recv_tag, recv_ignore, + NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", + ret); + } else { + ret = fi_recv(cxit_ep, recv_buf + j * send_len, + recv_len, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", + ret); + } + } + + if (!send_first) { + for (j = 0; j < ntrans; j++) { + if (tagged) { + ret = fi_tsend(cxit_ep, send_buf, send_len, + NULL, cxit_ep_fi_addr, send_tag, + NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_tsend failed %d", + ret); + } else { + ret = fi_send(cxit_ep, send_buf, send_len, + NULL, + cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, + "fi_send failed %d", + ret); + } + } + } + + /* Gather both events, ensure progress on both sides. */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + if (ret == 1) { + cr_assert_lt(recved, ntrans); + recved++; + } else if (ret == -FI_EAVAIL) { + cr_assert_lt(recved, ntrans); + + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + recved++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + if (ret == 1) { + cr_assert_lt(sent, ntrans); + sent++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", ret); + } + } while (sent < ntrans || recved < ntrans); + + for (i = 0; i < ntrans; i++) { + for (j = 0; j < send_len; j++) { + uint8_t *r = recv_buf + i * send_len; + + cr_expect_eq(r[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + j, send_buf[j], r[j], err++); + } + cr_assert_eq(err, 0, "trans[%d] Data errors seen\n", i); + } +} + +#define BUF_SIZE (128*1024) +#define SEND_MIN 64 +#define TAG 0x333333333333 + +struct tagged_rx_params { + size_t buf_size; + size_t send_min; + uint64_t send_tag; + int recv_len_off; + uint64_t recv_tag; + uint64_t ignore; + bool ux; + bool tagged; + size_t ntrans; +}; + +static struct tagged_rx_params params[] = { + {.buf_size = BUF_SIZE, /* equal length */ + .send_min = SEND_MIN, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = false, + .tagged = true, + .ntrans = 200}, + {.buf_size = BUF_SIZE, /* equal length UX */ + .send_min = SEND_MIN, + .send_tag = 0, + .recv_len_off = 0, + .recv_tag = 0, + .ignore = 0, + .ux = true, + .tagged = true, + .ntrans = 200}, +}; + +ParameterizedTestParameters(tagged_stress, rx) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(params); + return cr_make_param_array(struct tagged_rx_params, params, + param_sz); +} + +ParameterizedTest(struct tagged_rx_params *param, tagged_stress, rx, + .timeout = 60*10, .disabled = true) +{ + uint8_t *recv_buf, + *send_buf; + size_t send_len; + + recv_buf = aligned_alloc(s_page_size, param->buf_size * param->ntrans); + cr_assert(recv_buf); + + send_buf = aligned_alloc(s_page_size, param->buf_size * param->ntrans); + cr_assert(send_buf); + + for (send_len = param->send_min; + send_len <= param->buf_size; + send_len <<= 1) { + do_msg(send_buf, send_len, param->send_tag, + recv_buf, send_len + param->recv_len_off, + param->recv_tag, param->ignore, param->ux, + 
param->buf_size, param->tagged, param->ntrans); + printf("send_len: %ld completed\n", send_len); + } + + free(send_buf); + free(recv_buf); +} diff --git a/prov/cxi/test/test.sh b/prov/cxi/test/test.sh new file mode 100644 index 00000000000..ea6a913703f --- /dev/null +++ b/prov/cxi/test/test.sh @@ -0,0 +1,450 @@ +#!/bin/bash +# +# set -x +# +# Run CXI unit tests. +# +# ################################################################ +# +# Tests are declared as an array with up to 3 strings: +# 1) the test body +# 2) an optional prolog +# 3) an optional epilog +# +# The strings are executed with the shell 'eval' function. +# They may contain more than one statement separated by semi-colons. +# You probably want to escape your '\$' and '\"' in the strings.... +# Prologs and epilogs may be "". Or absent if both at "". +# Output from test body is captured in $TEST_OUTPUT automatically. +# Output from prologs and epilogs is not captured by default. +# +# Tests are grouped into suites. Since Bash does not +# really have 2 dimensional arrays, the suites are arrays +# of test names, which match the variable names of the tests. +# +# The long suite is selected by default. +# The short and dummy suites can be selected with -s or -d. +# +# A no-execute mode is selected with -n. This prints the +# test name, prolog, body and epilog for every test in the +# selected suite. +# +# See default_env for environment variables common to all tests. +# Overriding for a particular test is supported in the test body. +# +# To disable a test, comment out the name in the suite. +# ################################################################ +# +# The examples: + +dummy_test1=( + "echo \"dummy test\"" + "echo \"dummy prolog\"; echo \$(hostname)" + "echo \"dummy epilog\"" +) + +dummy_test2=( + "echo \"dummy test with epilog but no prolog\"" + "" + "echo \"dummy epilog\"") + +dummy_test3=( + "echo \"simple dummy test\"") + +dummy_test_suite=( + "dummy_test1" + "dummy_test2" + "dummy_test3" +) + +# ################################################################ +# The short tests and short test suite + +short_test1=( + "./cxitest --verbose --filter=\"@(msg*|tagged*|rma*|atomic*)/*\" -j 1 -f --tap=cxitest-short.tap") + +short_test_suite=( + "short_test1" +) + +# ################################################################ +# the long tests and long test suite + +basic_test=("./cxitest --verbose --tap=cxitest.tap -j 1") + +swget_test=( + "FI_CXI_RGET_TC=BULK_DATA ./cxitest --verbose --filter=\"@(tagged|msg)/*\" --tap=cxitest-swget.tap -j1" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=0 > /dev/null" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=1 > /dev/null") + +swget_unaligned_test=( + "FI_CXI_RDZV_THRESHOLD=2036 ./cxitest --verbose --filter=\"@(tagged|msg)/*\" --tap=cxitest-swget-unaligned.tap -j1" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=0 > /dev/null" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=1 > /dev/null") + +constrained_le_test=( + "FI_CXI_DEFAULT_CQ_SIZE=16384 ./cxitest --verbose --filter=\"@(tagged|msg)/fc*\" --tap=cxitest-constrained-le.tap -j1" + "MAX_ALLOC=\$(csrutil dump csr le_pools[63] | grep max_alloc | awk '{print \$3}'); echo \"Saving MAX_ALLOC=\$MAX_ALLOC\"; csrutil store csr le_pools[] max_alloc=10 > /dev/null" + "echo \"Restoring MAX_ALLOC=\$MAX_ALLOC\"; csrutil store csr le_pools[] max_alloc=\$MAX_ALLOC > /dev/null") + +hw_matching_rendezvous_test=( + "FI_CXI_DEVICE_NAME=\"cxi1,cxi0\" FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --verbose -j 1 
--filter=\"tagged_directed/*\" --tap=cxitest-hw-rdzv-tag-matching.tap") + +sw_matching_rendezvous_test=( + "FI_CXI_RX_MATCH_MODE=\"software\" FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --verbose -j 1 --filter=\"@(tagged|msg)/*\" --tap=cxitest-sw-ep-mode.tap") + +fc_eq_space_test=( + "FI_CXI_DEFAULT_CQ_SIZE=64 FI_CXI_DISABLE_EQ_HUGETLB=1 FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --filter=\"msg/fc_no_eq_space_expected_multi_recv\" --verbose -j 1 --tap=cxitest-fc-eq-space.tap") + +fc_eq_20_percent_test=( + "FI_CXI_CQ_FILL_PERCENT=20 FI_CXI_DEFAULT_CQ_SIZE=64 FI_CXI_DISABLE_EQ_HUGETLB=1 FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --filter=\"msg/fc_no_eq_space_expected_multi_recv\" --verbose -j 1 --tap=cxitest-fc-20%-eq-space.tap") + +fi_info_test=( + "./fi_info_test.sh --tap=fi_info.tap") + +unoptimized_mr_test=( + "FI_CXI_OPTIMIZED_MRS=0 ./cxitest --filter=\"amo_hybrid_mr_desc/*\" -j 1 -f --verbose --tap=cxitest-hybrid_mr_desc_unopt_mrs.tap") + +provider_keys_mr_test=( + "CXIP_TEST_PROV_KEY=1 ./cxitest -j 1 -f --verbose --tap=cxitest-prov_key_mrs.tap") + +unoptimized_provider_keys_mr_test=( + "CXIP_TEST_PROV_KEY=1 FI_CXI_OPTIMIZED_MRS=0 ./cxitest --filter=\"@(rma|mr)/*\" -j 1 -f --verbose --tap=cxitest-prov_key_no_opt_mrs.tap") + +provider_keys_std_fallback_test=( + "CXIP_TEST_PROV_KEY=1 FI_MR_CACHE_MONITOR=\"disabled\" ./cxitest --filter=\"mr_resources/opt_fallback\" -j 1 -f --verbose --tap=cxitest-prov_key_opt_to_std.tap") + +zero_eager_size_test=( + "FI_CXI_RDZV_EAGER_SIZE=0 ./cxitest --filter=\"@(tagged|msg)/*\" -j 1 -f --verbose --tap=cxitest-zero-rdzv-eager-size.tap") + +alt_read_rendezvous_test=( + "FI_CXI_RDZV_PROTO=\"alt_read\" ./cxitest --filter=\"tagged/*rdzv\" -j 1 -f --verbose --tap=cxitest-alt-read-rdzv.tap" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=0 > /dev/null" + "csrutil store csr C_LPE_CFG_GET_CTRL get_en=1 > /dev/null") + +mr_mode_no_compat_test=( + "FI_CXI_COMPAT=0 ./cxitest -j 1 --filter=\"getinfo_infos/*\" -f --verbose --tap=cxitest-mr-mode-no-compat.tap") + +mr_mode_with_odp_test=( + "FI_CXI_ODP=1 ./cxitest -j 1 --filter=\"getinfo_infos/*\" -f --verbose --tap=cxitest-mr-mode-with-odp.tap") + +mr_mode_with_prov_keys_odp_test=( + "FI_CXI_ODP=1 CXIP_TEST_PROV_KEY=1 ./cxitest -j 1 --filter=\"getinfo_infos/*\" -f --verbose --tap=cxitest-mr-mode-with-prov-key-odp.tap") + +cxi_fork_safe_test=( + "CXI_FORK_SAFE=1 CXI_FORK_SAFE_HP=1 ./cxitest --verbose --tap=cxitest-fork-safe.tap --filter=\"@(rma*|tagged*|msg*|atomic*)/*\" -j 1") + +fork_safe_monitor_disabled_test=( + "FI_MR_CACHE_MONITOR=\"disabled\" ./cxitest --verbose --tap=cxitest-fork_tests-mr_cache_disabled.tap --filter=\"fork/*\" -j 1") + +fork_safe_uffd_test=( + "FI_MR_CACHE_MONITOR=\"uffd\" ./cxitest --verbose --tap=cxitest-fork_tests-mr_cache_uffd.tap --filter=\"fork/*\" -j 1") + +fork_safe_memhooks_test=( + "FI_MR_CACHE_MONITOR=\"memhooks\" ./cxitest --verbose --tap=cxitest-fork_tests-mr_cache_memhooks.tap --filter=\"fork/*\" -j 1") + +fork_safe_kdreg2_test=( + "FI_MR_CACHE_MONITOR=\"kdreg2\" ./cxitest --verbose --tap=cxitest-fork_tests-mr_cache_kdreg2.tap --filter=\"fork/*\" -j 1") + +unlimited_triggered_ops_test=( + "FI_CXI_ENABLE_TRIG_OP_LIMIT=0 ./cxitest -j 1 --verbose --filter=\"deferred_work_trig_op_limit/*\" --tap=cxitest-disable-trig-op-limit.tap") + +long_test_suite=( + "basic_test" + "swget_test" + "swget_unaligned_test" + "constrained_le_test" + "hw_matching_rendezvous_test" + "sw_matching_rendezvous_test" + "fc_eq_space_test" + "fc_eq_20_percent_test" + 
"fi_info_test" + "unoptimized_mr_test" + "provider_keys_mr_test" + "unoptimized_provider_keys_mr_test" + "provider_keys_std_fallback_test" + "zero_eager_size_test" + "alt_read_rendezvous_test" + "mr_mode_no_compat_test" + "mr_mode_with_odp_test" + "mr_mode_with_prov_keys_odp_test" + "cxi_fork_safe_test" + "fork_safe_monitor_disabled_test" + "fork_safe_uffd_test" + "fork_safe_memhooks_test" + "fork_safe_kdreg2_test" + "unlimited_triggered_ops_test" +) + +# ################################################################ + +known_suites=( + "short_test_suite" + "long_test_suite" + "dummy_test_suite" +) + +default_test_suite="long_test_suite" + +# ################################################################ + +default_env=( + "DMA_FAULT_RATE=0.1" + "MALLOC_FAULT_RATE=0.1" + "FI_LOG_LEVEL=warn" + "FI_CXI_FC_RECOVERY=1" + "FI_CXI_ENABLE_TRIG_OP_LIMIT=1" + "FI_MR_CACHE_MONITOR=uffd" +) + +# ################################################################ + +dashes="----------------------------------------------------------------" + +# ################################################################ + +print_suites() { + + for suite in "${known_suites[@]}"; do + echo "Suite: $suite" + local -n tests="$suite" + for test in "${tests[@]}"; do + echo " $test" + done; + done; + + return 0 +} + +# ################################################################ +# Function to run one test +# It expects the following argument: +# test name + +run_one_test() { + if [ $# -eq 0 ]; then + echo "$0 called with no arguments (?)" + exit 1 + fi + local name="$1" + + local -n elements="$name" + local -i num_elements=${#elements[@]} + + if [ $no_execute -ne 0 ]; then + echo $dashes + fi + + if [ $num_elements -lt 1 ]; then + echo "Test $name not found" + return 1 + elif [ $num_elements -gt 3 ]; then + echo "test $1 malformed: maximum 3 elements in array: test prolog epilog" + exit 1 + fi + + local test_body="${elements[0]}" + if [ $num_elements -ge 2 ]; then + local prolog="${elements[1]}" + else + local prolog="" + fi + if [ $num_elements -ge 3 ]; then + local epilog="${elements[2]}" + else + local epilog="" + fi + + local full_test_string="$test_body >> $TEST_OUTPUT 2>&1" + + if [ $no_execute -ne 0 ]; then + echo "Test name: $name" + echo "Prolog: $prolog" + echo "Test body: $full_test_string" + echo "Epilog: $epilog" + return 0 + fi + + if [ -n "$prolog" ]; then + echo "Running $name prolog: $prolog" + eval $prolog + fi + + echo "Running $name: $full_test_string" | tee -a $TEST_OUTPUT + eval $full_test_string + local -i test_result=$? + + if [ $test_result -ne 0 ]; then + echo "Test $name returns non-zero exit code. Possible failures in test teardown." + fi + + if [ -n "$epilog" ]; then + echo "Running $name epilog: $epilog" + eval $epilog + fi + + return $test_result +} + +# ################################################################ +# Function to run a list of tests + +run_tests() { + local ret=0 + for test in $@; do + run_one_test "$test" + local r=$? + if [ $r -ne 0 ]; then + ret=$r + if [ $fail_fast -ne 0 ]; then + break + fi + fi + done + return $ret +} + +# ################################################################ +# Function to run all the tests in a suite +# It expects the following argument: +# suite name + +run_test_suite() { + if [ $# -ne 1 ]; then + echo "$0 called with no arguments (?)" + exit 1 + fi + local suite=$1 + + echo "Running Suite: $suite" + + local -n tests=$suite + + run_tests "${tests[@]}" + local ret=$? 
+
+    if [ $no_execute -ne 0 ]; then
+        echo $dashes
+    fi
+
+    return $ret
+}
+
+# ################################################################
+
+print_help() {
+cat < $TEST_OUTPUT 2>&1"
+#FI_CXI_RX_MATCH_MODE=hybrid ./cxitest --verbose --tap=cxitest.tap -j2 > $TEST_OUTPUT 2>&1
+#if [[ $? -ne 0 ]]; then
+#    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+#    exit 1
+#fi
+
+# Run tests with constrained LE count - Using Flow Control recovery
+MAX_ALLOC=`csrutil dump csr le_pools[63] |grep max_alloc |awk '{print $3}'`
+csrutil store csr le_pools[] max_alloc=10 > /dev/null
+echo "running;FI_CXI_RX_MATCH_MODE=hardware ./cxitest --verbose --filter=\"tagged/fc*\" --tap=cxitest-fc.tap -j1 > $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=hardware ./cxitest --verbose --filter="tagged/fc*" --tap=cxitest-fc.tap -j1 > $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+csrutil store csr le_pools[] max_alloc=$MAX_ALLOC > /dev/null
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+    exit 1
+fi
+
+# Run tests with constrained LE count - Using hybrid operation instead
+# of flow control recovery
+MAX_ALLOC=`csrutil dump csr le_pools[63] |grep max_alloc |awk '{print $3}'`
+csrutil store csr le_pools[] max_alloc=10 > /dev/null
+echo "running;FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 ./cxitest --verbose --filter=\"tagged/fc*\" --tap=cxitest-sw-transition.tap -j1 >> $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 ./cxitest --verbose --filter="tagged/fc*" --tap=cxitest-sw-transition.tap -j1 >> $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+csrutil store csr le_pools[] max_alloc=$MAX_ALLOC > /dev/null
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+    exit 1
+fi
+
+# Run HW to SW hybrid test with constrained LE count and forcing both
+# eager and rendezvous processing
+MAX_ALLOC=`csrutil dump csr le_pools[63] |grep max_alloc |awk '{print $3}'`
+csrutil store csr le_pools[] max_alloc=60 > /dev/null
+echo "running;FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --verbose --filter=\"tagged/hw2sw_*\" --tap=cxitest-hw2sw-transition.tap -j1 >> $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=2048 ./cxitest --verbose --filter="tagged/hw2sw_*" --tap=cxitest-hw2sw-transition.tap -j1 >> $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+csrutil store csr le_pools[] max_alloc=$MAX_ALLOC > /dev/null
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+    exit 1
+fi
+
+# Run HW to SW hybrid test with constrained LE count and forcing only eager processing
+MAX_ALLOC=`csrutil dump csr le_pools[63] |grep max_alloc |awk '{print $3}'`
+echo "running;FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=16384 ./cxitest --verbose --filter=\"tagged/hw2sw_*\" --tap=cxitest-hw2sw-eager-transition.tap -j1 >> $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=hybrid FI_CXI_RDZV_GET_MIN=0 FI_CXI_RDZV_THRESHOLD=16384 ./cxitest --verbose --filter="tagged/hw2sw_*" --tap=cxitest-hw2sw-eager-transition.tap -j1 >> $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+csrutil store csr le_pools[] max_alloc=$MAX_ALLOC > /dev/null
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. 
Possible failures in test teardown"
+    exit 1
+fi
+
+# Test scaling of request buffers
+echo "running; FI_CXI_RX_MATCH_MODE=software FI_CXI_REQ_BUF_MIN_POSTED=2 FI_CXI_REQ_BUF_MAX_COUNT=10 ./cxitest --verbose --filter=\"tagged/*fc_mt\" --tap=cxitest-sw-req_buf.tap -j1 >> $TEST_OUTPUT 2>&1"
+FI_CXI_RX_MATCH_MODE=software FI_CXI_REQ_BUF_MIN_POSTED=2 FI_CXI_REQ_BUF_MAX_COUNT=10 ./cxitest --verbose --filter="tagged/*fc_mt" --tap=cxitest-sw-req_buf.tap -j1 >> $TEST_OUTPUT 2>&1
+cxitest_exit_status=$?
+if [[ $cxitest_exit_status -ne 0 ]]; then
+    echo "cxitest returned a non-zero exit code. Possible failures in test teardown"
+    exit 1
+fi
+
+grep "Tested" $TEST_OUTPUT
diff --git a/prov/cxi/test/ze.c b/prov/cxi/test/ze.c
new file mode 100644
index 00000000000..4d08d103966
--- /dev/null
+++ b/prov/cxi/test/ze.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023 Hewlett Packard Enterprise Development LP
+ * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "libcxi/libcxi.h"
+#include "cxip.h"
+#include "cxip_test_common.h"
+
+static uint32_t ze_driver_count = 1;
+static ze_driver_handle_t ze_driver;
+static ze_context_handle_t ze_context;
+static uint32_t ze_device_count = 1;
+static ze_device_handle_t ze_device;
+static ze_command_queue_handle_t ze_cq;
+static const ze_device_mem_alloc_desc_t device_desc = {
+	.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
+	.pNext = NULL,
+	.flags = 0,
+	.ordinal = 0,
+};
+static const ze_host_mem_alloc_desc_t host_desc = {
+	.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+	.pNext = NULL,
+	.flags = 0,
+};
+static const ze_command_queue_desc_t cq_desc = {
+	.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+	.pNext = NULL,
+	.ordinal = 0,
+	.index = 0,
+	.flags = 0,
+	.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
+	.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
+};
+static const ze_command_list_desc_t cl_desc = {
+	.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
+	.pNext = NULL,
+	.commandQueueGroupOrdinal = 0,
+	.flags = 0,
+};
+
+static void ze_init(void)
+{
+	ze_result_t ze_ret;
+	ze_context_desc_t ze_context_desc = {};
+
+	ze_ret = zeInit(ZE_INIT_FLAG_GPU_ONLY);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeInit failed: %d", ze_ret);
+
+	ze_ret = zeDriverGet(&ze_driver_count, &ze_driver);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeDriverGet failed: %d",
+		     ze_ret);
+
+	ze_ret = zeContextCreate(ze_driver, &ze_context_desc, &ze_context);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeContextCreate failed: %d",
+		     ze_ret);
+
+	/* Only support a single device. 
*/
+	ze_ret = zeDeviceGet(ze_driver, &ze_device_count, &ze_device);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeDeviceGet failed: %d",
+		     ze_ret);
+
+	ze_ret = zeCommandQueueCreate(ze_context, ze_device, &cq_desc, &ze_cq);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandQueueCreate failed: %d", ze_ret);
+}
+
+static void ze_fini(void)
+{
+	ze_result_t ze_ret;
+
+	ze_ret = zeCommandQueueDestroy(ze_cq);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandQueueDestroy failed: %d", ze_ret);
+
+	ze_ret = zeContextDestroy(ze_context);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeContextDestroy failed: %d", ze_ret);
+}
+
+static void ze_copy(void *dst, const void *src, size_t size)
+{
+	ze_command_list_handle_t cmd_list;
+	ze_result_t ze_ret;
+
+	ze_ret = zeCommandListCreate(ze_context, ze_device, &cl_desc,
+				     &cmd_list);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandListCreate failed: %d", ze_ret);
+
+	ze_ret = zeCommandListAppendMemoryCopy(cmd_list, dst, src, size, NULL,
+					       0, NULL);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandListAppendMemoryCopy failed: %d", ze_ret);
+
+	ze_ret = zeCommandListClose(cmd_list);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandListClose failed: %d", ze_ret);
+
+	ze_ret = zeCommandQueueExecuteCommandLists(ze_cq, 1, &cmd_list, NULL);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS,
+		     "zeCommandQueueExecuteCommandLists failed: %d", ze_ret);
+}
+
+TestSuite(ze, .timeout = CXIT_DEFAULT_TIMEOUT);
+
+static void ze_message_runner(void *ze_send_buf, void *ze_recv_buf,
+			      size_t buf_size)
+{
+	int ret;
+	char *send_buf;
+	char *recv_buf;
+	struct fi_cq_tagged_entry cqe;
+	int i;
+
+	cxit_setup_msg();
+
+	/* Send and recv buffers are used as bounce buffers for their ze
+	 * counterparts. This is not true for zeMemAllocHost.
+	 */
+	send_buf = malloc(buf_size);
+	cr_assert_neq(send_buf, NULL, "Failed to allocate memory");
+
+	ret = open("/dev/urandom", O_RDONLY);
+	cr_assert_neq(ret, -1, "open failed: %d", -errno);
+	read(ret, send_buf, buf_size);
+	close(ret);
+
+	recv_buf = calloc(1, buf_size);
+	cr_assert_neq(recv_buf, NULL, "Failed to allocate memory");
+
+	ze_copy(ze_send_buf, send_buf, buf_size);
+
+	ret = fi_recv(cxit_ep, ze_recv_buf, buf_size, NULL, cxit_ep_fi_addr,
+		      NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed: %d", ret);
+
+	ret = fi_send(cxit_ep, ze_send_buf, buf_size, NULL, cxit_ep_fi_addr,
+		      NULL);
+	cr_assert_eq(ret, FI_SUCCESS, "fi_send failed: %d", ret);
+
+	do {
+		ret = fi_cq_read(cxit_rx_cq, &cqe, 1);
+	} while (ret == -FI_EAGAIN);
+	cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret);
+
+	do {
+		ret = fi_cq_read(cxit_tx_cq, &cqe, 1);
+	} while (ret == -FI_EAGAIN);
+	cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret);
+
+	ze_copy(recv_buf, ze_recv_buf, buf_size);
+
+	for (i = 0; i < buf_size; i++)
+		cr_assert_eq(send_buf[i], recv_buf[i],
+			     "Data corruption at byte %d", i);
+
+	free(recv_buf);
+	free(send_buf);
+
+	cxit_teardown_msg();
+}
+
+Test(ze, messaging_devMemory)
+{
+	ze_result_t ze_ret;
+	void *ze_send_buf;
+	void *ze_recv_buf;
+	size_t buf_size = 1048576;
+
+	ze_init();
+
+	/* Ze buffers will be used for RDMA. 
*/
+	ze_ret = zeMemAllocDevice(ze_context, &device_desc, buf_size, 0,
+				  ze_device, &ze_send_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemAllocDevice failed: %d",
+		     ze_ret);
+
+	ze_ret = zeMemAllocDevice(ze_context, &device_desc, buf_size, 0,
+				  ze_device, &ze_recv_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemAllocDevice failed: %d",
+		     ze_ret);
+
+	ze_message_runner(ze_send_buf, ze_recv_buf, buf_size);
+
+	ze_ret = zeMemFree(ze_context, ze_recv_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemFree failed: %d",
+		     ze_ret);
+
+	ze_ret = zeMemFree(ze_context, ze_send_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemFree failed: %d",
+		     ze_ret);
+
+	ze_fini();
+}
+
+Test(ze, messaging_hostMemory)
+{
+	ze_result_t ze_ret;
+	void *ze_send_buf;
+	void *ze_recv_buf;
+	size_t buf_size = 1048576;
+
+	ze_init();
+
+	/* Ze buffers will be used for RDMA. */
+	ze_ret = zeMemAllocHost(ze_context, &host_desc, buf_size, 0,
+				&ze_send_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemAllocHost failed: %d",
+		     ze_ret);
+
+	ze_ret = zeMemAllocHost(ze_context, &host_desc, buf_size, 0,
+				&ze_recv_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemAllocHost failed: %d",
+		     ze_ret);
+
+	ze_message_runner(ze_send_buf, ze_recv_buf, buf_size);
+
+	ze_ret = zeMemFree(ze_context, ze_recv_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemFree failed: %d",
+		     ze_ret);
+
+	ze_ret = zeMemFree(ze_context, ze_send_buf);
+	cr_assert_eq(ze_ret, ZE_RESULT_SUCCESS, "zeMemFree failed: %d",
+		     ze_ret);
+
+	ze_fini();
+}
diff --git a/src/fabric.c b/src/fabric.c
index b33c6068907..849909fd0d8 100644
--- a/src/fabric.c
+++ b/src/fabric.c
@@ -445,7 +445,7 @@ static struct fi_provider *ofi_get_hook(const char *name)
 static void ofi_ordered_provs_init(void)
 {
 	char *ordered_prov_names[] = {
-		"efa", "psm2", "opx", "verbs",
+		"efa", "psm2", "opx", "verbs", "cxi",
 		"netdir", "psm3", "ucx", "ofi_rxm", "ofi_rxd", "shm",
 		/* Initialize the socket based providers last of the
@@ -545,6 +545,7 @@ static void ofi_register_provider(struct fi_provider *provider, void *dlhandle)
 	    !strcasecmp(provider->name, "efa") ||
 	    !strcasecmp(provider->name, "psm3") ||
 	    !strcasecmp(provider->name, "ucx") ||
+	    !strcasecmp(provider->name, "cxi") ||
 	    ofi_is_util_prov(provider))
 		ofi_prov_ctx(provider)->disable_layering = true;

@@ -898,6 +899,7 @@ void fi_ini(void)

 	ofi_register_provider(PSM3_INIT, NULL);
 	ofi_register_provider(PSM2_INIT, NULL);
+	ofi_register_provider(CXI_INIT, NULL);
 	ofi_register_provider(SHM_INIT, NULL);
 	ofi_register_provider(SM2_INIT, NULL);
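
The src/fabric.c hunks above only register the provider constructor; applications still discover the provider through the normal fi_getinfo() path. The following standalone sketch is not part of the patch: the program structure, the chosen FI_VERSION, the prov_name hint, and the minimal error handling are illustrative assumptions, shown only to demonstrate how a registered cxi provider becomes visible to ordinary libfabric discovery.

/* Hypothetical check (not from this patch): list fi_info entries reported
 * by the cxi provider once fi_ini() has registered CXI_INIT.
 */
#include <stdio.h>
#include <string.h>
#include <rdma/fabric.h>

int main(void)
{
	struct fi_info *hints, *info, *cur;
	int ret;

	hints = fi_allocinfo();
	if (!hints)
		return 1;

	/* Restrict discovery to the cxi provider; fi_freeinfo() later
	 * releases the strdup'd name along with the rest of the hints.
	 */
	hints->fabric_attr->prov_name = strdup("cxi");

	ret = fi_getinfo(FI_VERSION(1, 15), NULL, NULL, 0, hints, &info);
	if (ret) {
		fprintf(stderr, "fi_getinfo failed: %d\n", ret);
		fi_freeinfo(hints);
		return 1;
	}

	for (cur = info; cur; cur = cur->next)
		printf("provider: %s, fabric: %s, domain: %s\n",
		       cur->fabric_attr->prov_name,
		       cur->fabric_attr->name,
		       cur->domain_attr->name);

	fi_freeinfo(info);
	fi_freeinfo(hints);
	return 0;
}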