diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 88ff147dce..647e4b3008 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -17,4 +17,4 @@ on: jobs: call-workflow-passing-data: name: Documentation - uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@develop + uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@update_spelling_rst diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml index 41785ea504..20621f0606 100644 --- a/.markdownlint-cli2.yaml +++ b/.markdownlint-cli2.yaml @@ -12,5 +12,5 @@ config: MD041: false MD051: false ignores: - - CHANGELOG.md - - "{,docs/}{RELEASE,release}.md" + - RELEASE.md + - docs/doxygen/mainpage.md diff --git a/.wordlist.txt b/.wordlist.txt index 31db060a15..45af247c0d 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -1,688 +1,92 @@ -AAC -ABI -ACE -ACEs -AccVGPR -AccVGPRs ALU -AMD -AMDGPU -AMDGPUs -AMDMIGraphX -AMI -AOCC -AOMP -APIC -APIs +ALUs +AmgX APU -ASIC -ASICs -ASan -ASAN -ASm -ATI -AddressSanitizer -AlexNet -Arb -BLAS -BMC -BitCode -Blit -Bluefield -CCD -CDNA -CIFAR -CLI -CLion -CMake -CMakeLists -CMakePackage -CP -CPC -CPF -CPP -CPU -CPUs -CSC -CSE -CSV -CSn -CTests -CU -CUDA -CUs -CXX -Cavium -CentOS -ChatGPT -CoRR -Codespaces -Commitizen -CommonMark -Concretized -Conda -ConnectX -DGEMM -DKMS -DL -DMA -DNN -DNNL -DPM -DRI -DW -DWORD -Dask -DataFrame -DataLoader -DataParallel -DeepSpeed -Dependabot -DevCap -Dockerfile -Doxygen -ELMo -ENDPGM -EPYC -ESXi -FFT -FFTs -FFmpeg -FHS -FMA -FP -Filesystem -Flang -Fortran -Fuyu -GALB -GCD -GCDs -GCN -GDB -GDDR -GDR -GDS -GEMM -GEMMs -GFortran -GiB -GIM -GL -GLXT -GMI -GPG -GPR -GPT -GPU -GPU's -GPUs -GRBM -GenAI -GenZ -GitHub -Gitpod -HBM -HCA -HIPCC -HIPExtension -HIPIFY -HPC -HPCG -HPE -HPL -HSA -HWE -Haswell -Higgs -Hyperparameters -ICV -IDE -IDEs -IMDb -IOMMU -IOP -IOPM -IOV -IRQ -ISA -ISV -ISVs -ImageNet -InfiniBand -Inlines -IntelliSense -Intersphinx -Intra -Ioffe -JSON -Jupyter -KFD -KiB -KVM -Keras -Khronos -LAPACK -LCLK -LDS -LLM -LLMs -LLVM -LM -LSAN -LSan -LTS -LoRA -MEM -MERCHANTABILITY -MFMA -MiB -MIGraphX -MIOpen -MIOpenGEMM -MIVisionX -MLM -MMA -MMIO -MMIOH -MNIST -MPI -MSVC -MVAPICH -MVFFR -Makefile -Makefiles -Matplotlib -Megatron -Mellanox -Mellanox's -Meta's -MirroredStrategy -Multicore -Multithreaded -MyEnvironment -MyST -NBIO -NBIOs -NIC -NICs -NLI -NLP -NPS -NSP -NUMA -NVCC -NVIDIA -NVPTX -NaN -Nano -Navi -Noncoherently -NousResearch's -NumPy -OAM -OAMs -OCP -OEM -OFED -OMP -OMPI -OMPT -OMPX -ONNX -OSS -OSU -OpenCL -OpenCV -OpenFabrics -OpenGL -OpenMP -OpenSSL -OpenVX -PCI -PCIe -PEFT -PIL -PILImage -PRNG -PRs -PaLM -Pageable -PeerDirect -Perfetto -PipelineParallel -PnP -PowerShell -PyPi -PyTorch -Qcycles -RAII -RCCL -RDC -RDMA -RDNA -RHEL -ROC -ROCProfiler -ROCTracer -ROCclr -ROCdbgapi -ROCgdb -ROCk -ROCm -ROCmCC -ROCmSoftwarePlatform -ROCmValidationSuite -ROCr -RST -RW -Radeon -RelWithDebInfo -Req -Rickle -RoCE -Ryzen -SALU -SBIOS -SCA -SDK -SDMA -SDRAM -SENDMSG -SGPR -SGPRs -SHA -SIGQUIT -SIMD -SIMDs -SKU -SKUs -SLES -SMEM -SMI -SMT -SPI -SQs -SRAM -SRAMECC -SVD -SWE -SerDes -Shlens -Skylake -Softmax -Spack -Supermicro -Szegedy -TCA -TCC -TCI -TCIU -TCP -TCR -TF -TFLOPS -TPU -TPUs -TensorBoard -TensorFlow -TensorParallel -ToC -TorchAudio -TorchMIGraphX -TorchScript -TorchServe -TorchVision -TransferBench -TrapStatus -UAC -UC -UCC -UCX -UIF -USM -UTCL -UTIL -Uncached -Unhandled -VALU -VBIOS -VGPR -VGPRs -VM -VMEM -VMWare -VRAM -VSIX -VSkipped -Vanhoucke -Vulkan -WGP -WGPs -WX -WikiText -Wojna -Workgroups -Writebacks -XCD -XCDs -XGBoost -XGBoost's -XGMI -XT -XTX -Xeon -Xilinx -Xnack -Xteam -YAML -YML -YModel -ZeRO -ZenDNN -accuracies -activations -addr -alloc -allocator -allocators -amdgpu -api -atmi -atomics -autogenerated -avx -awk -backend -backends -benchmarking -bfloat -bilinear -bitsandbytes -blit -boson -bosons -buildable -bursty -bzip -cacheable -cd -centos -centric -changelog -chiplet -cmake -cmd -coalescable -codename -collater -comgr -completers -composable -concretization -config -conformant -convolutional -convolves -cpp -csn -cuBLAS -cuFFT -cuLIB -cuRAND -cuSOLVER -cuSPARSE -dataset -datasets -dataspace -datatype -datatypes -dbgapi -de -deallocation -denoise -denoised -denoises -denormalize -deserializers -detections -dev -devicelibs -devsel -dimensionality -disambiguates -distro -el -embeddings -enablement -endpgm -encodings -env -epilog -etcetera -ethernet -exascale -executables -ffmpeg -filesystem -fortran -galb -gcc -gdb -gfortran -gfx -githooks -github -gnupg -grayscale -gzip -heterogenous -hipBLAS -hipBLASLt -hipCUB -hipFFT -hipLIB -hipRAND -hipSOLVER -hipSPARSE -hipSPARSELt -hipTensor -hipamd -hipblas -hipcub -hipfft -hipfort -hipify -hipsolver -hipsparse -hpp -hsa -hsakmt -hyperparameter -ib_core -inband -incrementing -inferencing -inflight -init -initializer -inlining -installable -interprocedural -intra -invariants -invocating -ipo -kdb +AQL +Asynchrony +backtrace +Bitcode +bitcode +bitcodes +builtins +Builtins +CAS +clr +cuBLASLt +cuCtx +cuDNN +deallocate +denormal +dll +DirectX +EIGEN +EIGEN's +enqueue +enqueues +enum +embeded +extern +fatbinary +frontends +gedit +GPGPU +hardcoded +HC +HIP's +hipcc +hipexamine +hipified +hipother +HIPRTC +hcBLAS +icc +inplace +Interoperation +interoperate +Intrinsics +intrinsics +IPC +isa +Lapack latencies -libfabric -libjpeg -libs -linearized -linter -linux -llvm -localscratch -logits -lossy -macOS -matchers -microarchitecture -migraphx -miopen -miopengemm -mivisionx -mkdir -mlirmiopen -mtypes -mvffr -namespace -namespaces -numref -ocl -opencl -opencv -openmp -openssl -optimizers -os -pageable -parallelization -parameterization -passthrough -perfcounter -performant -perl -pragma -pre -prebuilt -precompiled -prefetch -prefetchable -preprocess -preprocessed -preprocessing -prequantized -prerequisites -profiler -protobuf -pseudorandom -py -quasirandom -queueing -rccl -rdc -reStructuredText -reformats -repos -representativeness -req -resampling -rescaling -reusability -roadmap -roc -rocAL -rocALUTION -rocBLAS -rocFFT -rocLIB -rocMLIR -rocPRIM -rocRAND -rocSOLVER -rocSPARSE -rocThrust -rocWMMA -rocalution -rocblas -rocclr -rocfft -rocm -rocminfo -rocprim -rocprof -rocprofiler -rocr -rocrand -rocsolver -rocsparse -rocthrust -roctracer -runtime -runtimes -sL -scalability -scalable -sendmsg -serializers -shader -sharding -sigmoid -sm -smi -softmax -spack -src -stochastically -strided -subdirectory -subexpression -subfolder -subfolders -supercomputing -tensorfloat -th -tokenization -tokenize -tokenized -tokenizer -tokenizes -toolchain -toolchains -toolset -toolsets -torchvision -tqdm -tracebacks -txt -uarch -uncached -uncorrectable -uninstallation -unsqueeze -unstacking -unswitching -untrusted -untuned -upvote -USM -UTCL -UTIL -utils -vL -variational -vdi -vectorizable -vectorization -vectorize -vectorized -vectorizer -vectorizes -vjxb -walkthrough -walkthroughs -wavefront -wavefronts -whitespaces -workgroup -workgroups -writeback -writebacks -wrreq -wzo -xargs -xz -yaml -ysvmadyb -zypper \ No newline at end of file +libc +libstdc +lifecycle +linearizing +LOC +LUID +ltrace +makefile +Malloc +malloc +multicore +multigrid +multithreading +NCCL +NDRange +nonnegative +Numa +Nsight +oversubscription +preconditioners +prefetched +preprocessor +PTX +PyHIP +queryable +representable +RMW +ROCm's +rocTX +RTC +RTTI +scalarizing +sceneries +SIMT +SPMV +structs +SYCL +syntaxes +typedefs +WinGDB +zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz \ No newline at end of file diff --git a/docs/how-to/debugging.rst b/docs/how-to/debugging.rst index 340c4c8df6..c90f7ec7d8 100644 --- a/docs/how-to/debugging.rst +++ b/docs/how-to/debugging.rst @@ -1,13 +1,13 @@ .. meta:: :description: How to debug using HIP. - :keywords: AMD, ROCm, HIP, debugging, ltrace, ROCdgb, Windgb + :keywords: AMD, ROCm, HIP, debugging, ltrace, ROCgdb, WinGDB ************************************************************************* Debugging with HIP ************************************************************************* -AMD debugging tools include *ltrace* and *ROCdgb*. External tools are available and can be found -online. For example, if you're using Windows, you can use *Microsoft Visual Studio* and *Windgb*. +AMD debugging tools include *ltrace* and *ROCgdb*. External tools are available and can be found +online. For example, if you're using Windows, you can use *Microsoft Visual Studio* and *WinGDB*. You can trace and debug your code using the following tools and techniques. @@ -23,7 +23,7 @@ can use ltrace to visualize the runtime behavior of the entire ROCm software sta Here's a simple command-line example that uses ltrace to trace HIP APIs and output: -.. code:: console +.. code-block:: console $ ltrace -C -e "hip*" ./hipGetChanDesc hipGetChanDesc->hipCreateChannelDesc(0x7ffdc4b66860, 32, 0, 0) = 0x7ffdc4b66860 @@ -36,7 +36,7 @@ Here's a simple command-line example that uses ltrace to trace HIP APIs and outp Here's another example that uses ltrace to trace hsa APIs and output: -.. code:: console +.. code-block:: console $ ltrace -C -e "hsa*" ./hipGetChanDesc libamdhip64.so.4->hsa_init(0, 0x7fff325a69d0, 0x9c80e0, 0 @@ -94,12 +94,12 @@ Debugging You can use ROCgdb for debugging and profiling. ROCgdb is the ROCm source-level debugger for Linux and is based on GNU Project debugger (GDB). -the GNU source-level debugger, equivalent of cuda-gdb, can be used with debugger frontends, such as eclipse, vscode, or gdb-dashboard. +the GNU source-level debugger, equivalent of CUDA-GDB, can be used with debugger frontends, such as Eclipse, Visual Studio Code, or GDB dashboard. For details, see (https://github.com/ROCm/ROCgdb). -Below is a sample how to use ROCgdb run and debug HIP application, rocgdb is installed with ROCM package in the folder /opt/rocm/bin. +Below is a sample how to use ROCgdb run and debug HIP application, ROCgdb is installed with ROCM package in the folder /opt/rocm/bin. -.. code:: console +.. code-block:: console $ export PATH=$PATH:/opt/rocm/bin $ rocgdb ./hipTexObjPitch @@ -132,7 +132,7 @@ Debugging HIP applications The following Linux example shows how to get useful information from the debugger while running a simple memory copy test, which caused a segmentation fault issue. -.. code:: console +.. code-block:: console test: simpleTest2 numElements=4194304 sizeElements=4194304 bytes Segmentation fault (core dumped) @@ -231,13 +231,13 @@ For systems with multiple devices, you can choose to make only certain device(s) ``HIP_VISIBLE_DEVICES`` (or ``CUDA_VISIBLE_DEVICES`` on an NVIDIA platform). Once enabled, HIP can only view devices that have indices present in the sequence. For example: -.. code:: console +.. code-block:: console $ HIP_VISIBLE_DEVICES=0,1 Or in the application: -.. code:: cpp +.. code-block:: cpp if (totalDeviceNum > 2) { setenv("HIP_VISIBLE_DEVICES", "0,1,2", 1); @@ -272,11 +272,15 @@ HIP environment variable summary Here are some of the more commonly used environment variables: +.. + .. # COMMENT: The following lines define a break for use in the table below. -.. |br| raw:: html +.. |break| raw:: html
+.. + .. list-table:: * - **Environment variable** @@ -284,80 +288,80 @@ Here are some of the more commonly used environment variables: - **Usage** * - AMD_LOG_LEVEL - |br| Enable HIP log on different Level + |break| Enable HIP log on different Level - 0 - 0: Disable log. - |br| 1: Enable log on error level - |br| 2: Enable log on warning and below levels - |br| 0x3: Enable log on information and below levels - |br| 0x4: Decode and display AQL packets + |break| 1: Enable log on error level + |break| 2: Enable log on warning and below levels + |break| 0x3: Enable log on information and below levels + |break| 0x4: Decode and display AQL packets * - AMD_LOG_MASK - |br| Enable HIP log on different Level + |break| Enable HIP log on different Level - 0x7FFFFFFF - 0x1: Log API calls - |br| 0x02: Kernel and Copy Commands and Barriers - |br| 0x4: Synchronization and waiting for commands to finish - |br| 0x8: Enable log on information and below levels - |br| 0x20: Queue commands and queue contents - |br| 0x40: Signal creation, allocation, pool - |br| 0x80: Locks and thread-safety code - |br| 0x100: Copy debug - |br| 0x200: Detailed copy debug - |br| 0x400: Resource allocation, performance-impacting events - |br| 0x800: Initialization and shutdown - |br| 0x1000: Misc debug, not yet classified - |br| 0x2000: Show raw bytes of AQL packet - |br| 0x4000: Show code creation debug - |br| 0x8000: More detailed command info, including barrier commands - |br| 0x10000: Log message location - |br| 0xFFFFFFFF: Log always even mask flag is zero + |break| 0x02: Kernel and Copy Commands and Barriers + |break| 0x4: Synchronization and waiting for commands to finish + |break| 0x8: Enable log on information and below levels + |break| 0x20: Queue commands and queue contents + |break| 0x40: Signal creation, allocation, pool + |break| 0x80: Locks and thread-safety code + |break| 0x100: Copy debug + |break| 0x200: Detailed copy debug + |break| 0x400: Resource allocation, performance-impacting events + |break| 0x800: Initialization and shutdown + |break| 0x1000: Misc debug, not yet classified + |break| 0x2000: Show raw bytes of AQL packet + |break| 0x4000: Show code creation debug + |break| 0x8000: More detailed command info, including barrier commands + |break| 0x10000: Log message location + |break| 0xFFFFFFFF: Log always even mask flag is zero * - HIP_LAUNCH_BLOCKING - |br| Used for serialization on kernel execution. + |break| Used for serialization on kernel execution. - 0 - 0: Disable. Kernel executes normally. - |br| 1: Enable. Serializes kernel enqueue, behaves the same as AMD_SERIALIZE_KERNEL. + |break| 1: Enable. Serializes kernel enqueue, behaves the same as AMD_SERIALIZE_KERNEL. * - HIP_VISIBLE_DEVICES (or CUDA_VISIBLE_DEVICES) - |br| Only devices whose index is present in the sequence are visible to HIP + |break| Only devices whose index is present in the sequence are visible to HIP - - 0,1,2: Depending on the number of devices on the system * - GPU_DUMP_CODE_OBJECT - |br| Dump code object + |break| Dump code object - 0 - 0: Disable - |br| 1: Enable + |break| 1: Enable * - AMD_SERIALIZE_KERNEL - |br| Serialize kernel enqueue + |break| Serialize kernel enqueue - 0 - 1: Wait for completion before enqueue - |br| 2: Wait for completion after enqueue - |br| 3: Both + |break| 2: Wait for completion after enqueue + |break| 3: Both * - AMD_SERIALIZE_COPY - |br| Serialize copies + |break| Serialize copies - 0 - 1: Wait for completion before enqueue - |br| 2: Wait for completion after enqueue - |br| 3: Both + |break| 2: Wait for completion after enqueue + |break| 3: Both * - HIP_HOST_COHERENT - |br| Coherent memory in hipHostMalloc + |break| Coherent memory in hipHostMalloc - 0 - 0: memory is not coherent between host and GPU - |br| 1: memory is coherent with host + |break| 1: memory is coherent with host * - AMD_DIRECT_DISPATCH - |br| Enable direct kernel dispatch (Currently for Linux; under development for Windows) + |break| Enable direct kernel dispatch (Currently for Linux; under development for Windows) - 1 - 0: Disable - |br| 1: Enable + |break| 1: Enable * - GPU_MAX_HW_QUEUES - |br| The maximum number of hardware queues allocated per device + |break| The maximum number of hardware queues allocated per device - 4 - The variable controls how many independent hardware queues HIP runtime can create per process, per device. If an application allocates more HIP streams than this number, then HIP runtime reuses @@ -371,7 +375,7 @@ General debugging tips * ``gdb --args`` can be used to pass the executable and arguments to ``gdb``. * You can set environment variables (``set env``) from within GDB on Linux: - .. code:: bash + .. code-block:: bash (gdb) set env AMD_SERIALIZE_KERNEL 3 @@ -379,7 +383,7 @@ General debugging tips This ``gdb`` command does not use an equal (=) sign. * The GDB backtrace shows a path in the runtime. This is because a fault is caught by the runtime, but it is generated by an asynchronous command running on the GPU. -* To determine the true location of a fault, you can force the kernels to run synchronously by setting the environment variables ``AMD_SERIALIZE_KERNEL=3`` and ``AMD_SERIALIZE_COPY=3``. This forces HIP runtime to wait for the kernel to finish running before retuning. If the fault occurs when a kernel is running, you can see the code that launched the kernel inside the backtrace. The thread that's causing the issue is typically the one inside ``libhsa-runtime64.so``. +* To determine the true location of a fault, you can force the kernels to run synchronously by setting the environment variables ``AMD_SERIALIZE_KERNEL=3`` and ``AMD_SERIALIZE_COPY=3``. This forces HIP runtime to wait for the kernel to finish running before returning. If the fault occurs when a kernel is running, you can see the code that launched the kernel inside the backtrace. The thread that's causing the issue is typically the one inside ``libhsa-runtime64.so``. * VM faults inside kernels can be caused by: * Incorrect code (e.g., a for loop that extends past array boundaries) diff --git a/docs/how-to/faq.md b/docs/how-to/faq.md index 6bfed815a4..348fd3e732 100644 --- a/docs/how-to/faq.md +++ b/docs/how-to/faq.md @@ -4,16 +4,16 @@ HIP provides the following: -* Devices (hipSetDevice(), hipGetDeviceProperties(), etc.) -* Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.) -* Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(), etc.) -* Events (hipEventRecord(), hipEventElapsedTime(), etc.) -* Kernel launching (hipLaunchKernel/hipLaunchKernelGGL is the preferred way of launching kernels. hipLaunchKernelGGL is a standard C/C++ macro that can serve as an alternative way to launch kernels, replacing the CUDA triple-chevron (<<< >>>) syntax). -* HIP Module API to control when adn how code is loaded. -* CUDA-style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim) -* Cross-lane instructions including shfl, ballot, any, all +* Devices (`hipSetDevice()`, `hipGetDeviceProperties()`, etc.) +* Memory management (`hipMalloc()`, `hipMemcpy()`, `hipFree()`, etc.) +* Streams (`hipStreamCreate()`, `hipStreamSynchronize()`, `hipStreamWaitEvent()`, etc.) +* Events (`hipEventRecord()`, `hipEventElapsedTime()`, etc.) +* Kernel launching (`hipLaunchKernel`/`hipLaunchKernelGGL` is the preferred way of launching kernels. `hipLaunchKernelGGL` is a standard C/C++ macro that can serve as an alternative way to launch kernels, replacing the CUDA triple-chevron (`<<< >>>`) syntax). +* HIP Module API to control when and how code is loaded. +* CUDA-style kernel coordinate functions (`threadIdx`, `blockIdx`, `blockDim`, `gridDim`) +* Cross-lane instructions including `shfl`, `ballot`, `any`, `all` * Most device-side math built-ins -* Error reporting (hipGetLastError(), hipGetErrorString()) +* Error reporting (`hipGetLastError()`, `hipGetErrorString()`) The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API. @@ -27,7 +27,7 @@ At a high-level, the following features are not supported: * Dynamic parallelism (CUDA 5.0) * Graphics interoperability with OpenGL or Direct3D * CUDA IPC Functions (Under Development) -* CUDA array, mipmappedArray and pitched memory +* CUDA array, `mipmappedArray` and pitched memory * Queue priority controls See the [API Support Table](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/tables/CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information. @@ -37,8 +37,8 @@ See the [API Support Table](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs * C++-style device-side dynamic memory allocations (free, new, delete) (CUDA 4.0) * Virtual functions, indirect functions and try/catch (CUDA 4.0) * `__prof_trigger` -* PTX assembly (CUDA 4.0). HIP-Clang supports inline GCN assembly. -* Several kernel features are under development. See the {doc}`/reference/kernel_language` for more information. +* PTX assembly (CUDA 4.0). HIP-Clang supports inline GCN assembly. +* Several kernel features are under development. See the {doc}`/reference/kernel_language` for more information. ## Is HIP a drop-in replacement for CUDA? @@ -56,44 +56,44 @@ However, we can provide a rough summary of the features included in each CUDA SD * HIP supports CUDA 4.0 except for the limitations described above. * CUDA 5.0 : * Dynamic Parallelism (not supported) - * cuIpc functions (under development). + * `cuIpc` functions (under development). * CUDA 6.0 : * Managed memory (under development) * CUDA 6.5 : - * __shfl intrinsic (supported) + * `__shfl` intrinsic (supported) * CUDA 7.0 : * Per-thread default streams (supported) * C++11 (Hip-Clang supports all of C++11, all of C++14 and some C++17 features) * CUDA 7.5 : * float16 (supported) * CUDA 8.0 : - * Page Migration including cudaMemAdvise, cudaMemPrefetch, other cudaMem* APIs(not supported) + * Page Migration including `cudaMemAdvise`, `cudaMemPrefetch`, other `cudaMem*` APIs(not supported) * CUDA 9.0 : * Cooperative Launch, Surface Object Management, Version Management ## What libraries does HIP support? -HIP includes growing support for the four key math libraries using hipBlas, hipFFt, hipRAND and hipSPARSE, as well as MIOpen for machine intelligence applications. +HIP includes growing support for the four key math libraries using hipBLAS, hipFFT, hipRAND and hipSPARSE, as well as MIOpen for machine intelligence applications. These offer pointer-based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HIP applications. The hip interfaces support both ROCm and CUDA paths, with familiar library interfaces. -* [hipBlas](https://github.com/ROCmSoftwarePlatform/hipBLAS), which utilizes [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS). -* [hipFFt](https://github.com/ROCmSoftwarePlatform/hipfft) +* [hipBLAS](https://github.com/ROCmSoftwarePlatform/hipBLAS), which utilizes [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS). +* [hipFFT](https://github.com/ROCmSoftwarePlatform/hipfft) * [hipsSPARSE](https://github.com/ROCmSoftwarePlatform/hipsparse) * [hipRAND](https://github.com/ROCmSoftwarePlatform/hipRAND) * [MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen) -Additionally, some of the cublas routines are automatically converted to hipblas equivalents by the HIPIFY tools. These APIs use cublas or hcblas depending on the platform and replace the need to use conditional compilation. +Additionally, some of the cuBLAS routines are automatically converted to hipblas equivalents by the HIPIFY tools. These APIs use cuBLAS or hcBLAS depending on the platform and replace the need to use conditional compilation. ## How does HIP compare with OpenCL? -Both AMD and Nvidia support OpenCL 1.2 on their devices so that developers can write portable code. +Both AMD and NVIDIA support OpenCL 1.2 on their devices so that developers can write portable code. HIP offers several benefits over OpenCL: * Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on. * The HIP API is less verbose than OpenCL and is familiar to CUDA developers. * Because both CUDA and HIP are C++ languages, porting from CUDA to HIP is significantly easier than porting from CUDA to OpenCL. -* HIP uses the best available development tools on each platform: on Nvidia GPUs, HIP code compiles using NVCC and can employ the nSight profiler and debugger (unlike OpenCL on Nvidia GPUs). +* HIP uses the best available development tools on each platform: on NVIDIA GPUs, HIP code compiles using NVCC and can employ the Nsight profiler and debugger (unlike OpenCL on NVIDIA GPUs). * HIP provides pointers and host-side pointer arithmetic. * HIP provides device-level control over memory allocation and placement. * HIP offers an offline compilation model. @@ -103,17 +103,17 @@ HIP offers several benefits over OpenCL: Both HIP and CUDA are dialects of C++, and thus porting between them is relatively straightforward. Both dialects support templates, classes, lambdas, and other C++ constructs. As one example, the hipify-perl tool was originally a Perl script that used simple text conversions from CUDA to HIP. -HIP and CUDA provide similar math library calls as well. In summary, the HIP philosophy was to make the HIP language close enough to CUDA that the porting effort is relatively simple. -This reduces the potential for error, and also makes it easy to automate the translation. HIP's goal is to quickly get the ported program running on both platforms with little manual intervention, so that the programmer can focus on performance optimizations. +HIP and CUDA provide similar math library calls as well. In summary, the HIP philosophy was to make the HIP language close enough to CUDA that the porting effort is relatively simple. +This reduces the potential for error, and also makes it easy to automate the translation. HIP goal is to quickly get the ported program running on both platforms with little manual intervention, so that the programmer can focus on performance optimizations. -There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99-based kernel language (rather than C++) and also does not support single-source compilation. +There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99-based kernel language (rather than C++) and also does not support single-source compilation. As a result, the OpenCL syntax is different from CUDA, and the porting tools have to perform some heroic transformations to bridge this gap. The tools also struggle with more complex CUDA applications, in particular, those that use templates, classes, or other C++ features inside the kernel. ## What hardware does HIP support? * For AMD platforms, see the [ROCm documentation](https://github.com/RadeonOpenCompute/ROCm#supported-gpus) for the list of supported platforms. -* For Nvidia platforms, HIP requires unified memory and should run on any device supporting CUDA SDK 6.0 or newer. We have tested the Nvidia Titan and Tesla K40. +* For NVIDIA platforms, HIP requires unified memory and should run on any device supporting CUDA SDK 6.0 or newer. We have tested the NVIDIA Titan and Tesla K40. ## Do HIPIFY tools automatically convert all source code? @@ -125,7 +125,7 @@ In general, developers should always expect to perform some platform-specific tu ## What is NVCC? -NVCC is Nvidia's compiler driver for compiling "CUDA C++" code into PTX or device code for Nvidia GPUs. It's a closed-source binary compiler that is provided by the CUDA SDK. +NVCC is NVIDIA's compiler driver for compiling "CUDA C++" code into PTX or device code for NVIDIA GPUs. It's a closed-source binary compiler that is provided by the CUDA SDK. ## What is HIP-Clang? @@ -133,13 +133,13 @@ HIP-Clang is a Clang/LLVM based compiler to compile HIP programs which can run o ## Why use HIP rather than supporting CUDA directly? -While HIP is a strong subset of the CUDA, it is a subset. The HIP layer allows that subset to be clearly defined and documented. -Developers who code to the HIP API can be assured their code will remain portable across Nvidia and AMD platforms. -In addition, HIP defines portable mechanisms to query architectural features and supports a larger 64-bit wavesize which expands the return type for cross-lane functions like ballot and shuffle from 32-bit ints to 64-bit ints. +While HIP is a strong subset of the CUDA, it is a subset. The HIP layer allows that subset to be clearly defined and documented. +Developers who code to the HIP API can be assured their code will remain portable across NVIDIA and AMD platforms. +In addition, HIP defines portable mechanisms to query architectural features and supports a larger 64-bit `WaveSize` which expands the return type for cross-lane functions like ballot and shuffle from 32-bit integers to 64-bit integers. -## Can I develop HIP code on an Nvidia CUDA platform? +## Can I develop HIP code on an NVIDIA CUDA platform? -Yes. HIP's CUDA path only exposes the APIs and functionality that work on both NVCC and AMDGPU back-ends. +Yes. HIP's CUDA path only exposes the APIs and functionality that work on both NVCC and AMDGPU back-ends. "Extra" APIs, parameters, and features which exist in CUDA but not in HIP-Clang will typically result in compile-time or run-time errors. Developers need to use the HIP API for most accelerator code and bracket any CUDA-specific code with preprocessor conditionals. Developers concerned about portability should, of course, run on both platforms, and should expect to tune for performance. @@ -177,14 +177,14 @@ hipother supports the HIP back-end implementation on some non-AMD platforms, lik No, there is no HIP repository open publicly on Windows. -## Can a HIP binary run on both AMD and Nvidia platforms? +## Can a HIP binary run on both AMD and NVIDIA platforms? HIP is a source-portable language that can be compiled to run on either AMD or NVIDIA platform. HIP tools don't create a "fat binary" that can run on either platform, however. ## On HIP-Clang, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang? -Yes. HIP generates the object code which conforms to the GCC ABI, and also links with libstdc++. This means you can compile host code with the compiler of your choice and link the generated object code -with GPU code compiled with HIP. Larger projects often contain a mixture of accelerator code (initially written in CUDA with nvcc) and host code (compiled with gcc, icc, or clang). These projects +Yes. HIP generates the object code which conforms to the GCC ABI, and also links with libstdc++. This means you can compile host code with the compiler of your choice and link the generated object code +with GPU code compiled with HIP. Larger projects often contain a mixture of accelerator code (initially written in CUDA with NVCC) and host code (compiled with gcc, icc, or clang). These projects can convert the accelerator code to HIP, compile that code with hipcc, and link with object code from their preferred compiler. ## Can HIP API support C style application? What is the difference between C and C++? @@ -216,7 +216,7 @@ dim3 grid1; x=1, y=1, z=1 dim3 grid2 = {1,1,1}; x=1, y=1, z=1 ``` -In which "dim3 grid1;" will yield a dim3 grid with all dimensional members x,y,z initalized to 1, as the default constructor behaves that way. +In which "dim3 grid1;" will yield a dim3 grid with all dimensional members x,y,z initialized to 1, as the default constructor behaves that way. Further, if written: ```cpp @@ -242,9 +242,9 @@ dim3 grid = {1,1,1}; // initialized as in C++ ## Can I install both CUDA SDK and HIP-Clang on the same machine? -Yes. You can use HIP_PLATFORM to choose which path hipcc targets. This configuration can be useful when using HIP to develop an application which is portable to both AMD and NVIDIA. +Yes. You can use HIP_PLATFORM to choose which path hipcc targets. This configuration can be useful when using HIP to develop an application which is portable to both AMD and NVIDIA. -## HIP detected my platform (HIP-Clang vs nvcc) incorrectly * what should I do? +## HIP detected my platform (HIP-Clang vs NVCC) incorrectly * what should I do? HIP will set the platform to AMD and use HIP-Clang as compiler if it sees that the AMD graphics driver is installed and has detected an AMD GPU. Sometimes this isn't what you want * you can force HIP to recognize the platform by setting the following, @@ -264,21 +264,24 @@ export HIP_PLATFORM=nvidia ``` In this case, HIP will set and use the following, + +```shell HIP_COMPILER=cuda HIP_RUNTIME=nvcc +``` -One symptom of this problem is the message "error: 'unknown error'(11) at square.hipref.cpp:56". This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as nvcc. HIP may be able to compile the application using the nvcc tool-chain but will generate this error at runtime since the platform does not have a CUDA device. +One symptom of this problem is the message "error: 'unknown error'(11) at `square.hipref.cpp:56`. This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as NVCC. HIP may be able to compile the application using the NVCC tool-chain but will generate this error at runtime since the platform does not have a CUDA device. ## On CUDA, can I mix CUDA code with HIP code? -Yes. Most HIP data structures (hipStream_t, hipEvent_t) are typedefs to CUDA equivalents and can be intermixed. Both CUDA and HIP use integer device ids. -One notable exception is that hipError_t is a new type, and cannot be used where a cudaError_t is expected. In these cases, refactor the code to remove the expectation. Alternatively, hip_runtime_api.h defines functions which convert between the error code spaces: +Yes. Most HIP data structures (`hipStream_t`, `hipEvent_t`) are typedefs to CUDA equivalents and can be intermixed. Both CUDA and HIP use integer device ids. +One notable exception is that `hipError_t` is a new type, and cannot be used where a `cudaError_t` is expected. In these cases, refactor the code to remove the expectation. Alternatively, hip_runtime_api.h defines functions which convert between the error code spaces: -hipErrorToCudaError -hipCUDAErrorTohipError -hipCUResultTohipError +`hipErrorToCudaError` +`hipCUDAErrorTohipError` +`hipCUResultTohipError` -If platform portability is important, use #ifdef __HIP_PLATFORM_NVIDIA__ to guard the CUDA-specific code. +If platform portability is important, use `#ifdef __HIP_PLATFORM_NVIDIA__` to guard the CUDA-specific code. ## How do I trace HIP application flow? @@ -289,17 +292,17 @@ See {doc}`/how-to/logging` for more information. Product of block.x, block.y, and block.z should be less than 1024. Please note, HIP does not support kernel launch with total work items defined in dimension with size `gridDim x blockDim >= 2^32`, so `gridDim.x * blockDim.x, gridDim.y * blockDim.y and gridDim.z * blockDim.z` are always less than 2^32. -## Are __shfl_*_sync functions supported on HIP platform? +## Are ``__shfl_*_sync`` functions supported on HIP platform? -__shfl_*_sync is not supported on HIP but for nvcc path CUDA 9.0 and above all shuffle calls get redirected to it's sync version. +``__shfl_*_sync`` is not supported on HIP but for NVCC path CUDA 9.0 and above all shuffle calls get redirected to it's sync version. ## How to create a guard for code that is specific to the host or the GPU? -The compiler defines the `__HIP_DEVICE_COMPILE__` macro only when compiling the code for the GPU. It could be used to guard code that is specific to the host or the GPU. +The compiler defines the `__HIP_DEVICE_COMPILE__` macro only when compiling the code for the GPU. It could be used to guard code that is specific to the host or the GPU. -## Why _OpenMP is undefined when compiling with -fopenmp? +## Why _OpenMP is undefined when compiling with `-fopenmp`? -When compiling an OpenMP source file with `hipcc -fopenmp`, the compiler may generate error if there is a reference to the `_OPENMP` macro. This is due to a limitation in hipcc that treats any source file type (for example `.cpp`) as an HIP translation unit leading to some conflicts with the OpenMP language switch. If the OpenMP source file doesn't contain any HIP language constructs you could work around this issue by adding the `-x c++` switch to force the compiler to treat the file as regular C++. Another approach would be to guard the OpenMP code with `#ifdef _OPENMP` so that the code block is disabled when compiling for the GPU. The `__HIP_DEVICE_COMPILE__` macro defined by the HIP compiler when compiling GPU code could also be used for guarding code paths specific to the host or the GPU. +When compiling an OpenMP source file with `hipcc -fopenmp`, the compiler may generate error if there is a reference to the `_OPENMP` macro. This is due to a limitation in hipcc that treats any source file type (for example `.cpp`) as an HIP translation unit leading to some conflicts with the OpenMP language switch. If the OpenMP source file doesn't contain any HIP language constructs you could work around this issue by adding the `-x c++` switch to force the compiler to treat the file as regular C++. Another approach would be to guard the OpenMP code with `#ifdef _OPENMP` so that the code block is disabled when compiling for the GPU. The `__HIP_DEVICE_COMPILE__` macro defined by the HIP compiler when compiling GPU code could also be used for guarding code paths specific to the host or the GPU. ## Does the HIP-Clang compiler support extern shared declarations? @@ -308,19 +311,19 @@ Previously, it was essential to declare dynamic shared memory using the HIP_DYNA Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required. You may use the standard extern definition: extern __shared__ type var[]; -## I have multiple HIP enabled devices and I am getting an error code hipErrorSharedObjectInitFailed with the message "Error: shared object initialization failed"? +## I have multiple HIP enabled devices and I am getting an error code `hipErrorSharedObjectInitFailed` with the message "Error: shared object initialization failed"? This error message is seen due to the fact that you do not have valid code object for all of your devices. If you have compiled the application yourself, make sure you have given the correct device name(s) and its features via: `--offload-arch`. If you are not mentioning the `--offload-arch`, make sure that `hipcc` is using the correct offload arch by verifying the hipcc output generated by setting the environment variable `HIPCC_VERBOSE=1`. -If you have a precompiled application/library (like rocblas, tensorflow etc) which gives you such error, there are one of two possibilities. +If you have a precompiled application/library (like rocblas, TensorFlow etc) which gives you such error, there are one of two possibilities. * The application/library does not ship code object bundles for __all__ of your device(s): in this case you need to recompile the application/library yourself with correct `--offload-arch`. -* The application/library does not ship code object bundles for __some__ of your device(s), for example you have a system with an APU + GPU and the library does not ship code objects for your APU. For this you can set the environment variable `HIP_VISIBLE_DEVICES` or `CUDA_VISIBLE_DEVICES` on NVdia platform, to only enable GPUs for which code object is available. This will limit the GPUs visible to your application and allow it to run. +* The application/library does not ship code object bundles for __some__ of your device(s), for example you have a system with an APU + GPU and the library does not ship code objects for your APU. For this you can set the environment variable `HIP_VISIBLE_DEVICES` or `CUDA_VISIBLE_DEVICES` on NVIDIA platform, to only enable GPUs for which code object is available. This will limit the GPUs visible to your application and allow it to run. -Note: In previous releases, the error code is hipErrorNoBinaryForGpu with message "Unable to find code object for all current devices". -The error code handling behavior is changed. HIP runtime shows the error code hipErrorSharedObjectInitFailed with message "Error: shared object initialization failed" on unsupported GPU. +Note: In previous releases, the error code is `hipErrorNoBinaryForGpu` with message "Unable to find code object for all current devices". +The error code handling behavior is changed. HIP runtime shows the error code `hipErrorSharedObjectInitFailed` with message "Error: shared object initialization failed" on unsupported GPU. ## How to use per-thread default stream in HIP? @@ -329,21 +332,24 @@ The per-thread default stream is an implicit stream local to both the thread and The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program. In ROCm, a compilation option should be added in order to compile the translation unit with per-thread default stream enabled. -"-fgpu-default-stream=per-thread". +`-fgpu-default-stream=per-thread`. Once source is compiled with per-thread default stream enabled, all APIs will be executed on per thread default stream, hence there will not be any implicit synchronization with other streams. Besides, per-thread default stream be enabled per translation unit, users can compile some files with feature enabled and some with feature disabled. Feature enabled translation unit will have default stream as per thread and there will not be any implicit synchronization done but other modules will have legacy default stream which will do implicit synchronization. -## How to use complex muliplication and division operations? +## How to use complex multiplication and division operations? + +In HIP, `hipFloatComplex` and `hipDoubleComplex` are defined as complex data types, -In HIP, hipFloatComplex and hipDoubleComplex are defined as complex data types, +```c++ typedef float2 hipFloatComplex; typedef double2 hipDoubleComplex; +``` Any application uses complex multiplication and division operations, need to replace '*' and '/' operators with the following, -* hipCmulf() and hipCdivf() for hipFloatComplex -* hipCmul() and hipCdiv() for hipDoubleComplex +* `hipCmulf()` and `hipCdivf()` for `hipFloatComplex` +* `hipCmul()` and `hipCdiv()` for `hipDoubleComplex` Note: These complex operations are equivalent to corresponding types/functions on the NVIDIA platform. @@ -355,7 +361,7 @@ Due to different working mechanisms on operating systems like Windows vs Linux, ## Does HIP support LUID? Starting ROCm 6.0, HIP runtime supports Locally Unique Identifier (LUID). -This feature enables the local physical device(s) to interoperate with other devices. For example, DX12. +This feature enables the local physical device(s) to interoperate with other devices. For example, DirectX 12. HIP runtime sets device LUID properties so the driver can query LUID to identify each device for interoperability. @@ -370,7 +376,10 @@ HIP_VERSION=HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERS ``` HIP version can be queried from HIP API call, + +```cpp hipRuntimeGetVersion(&runtimeVersion); +``` The version returned will always be greater than the versions in previous ROCm releases. diff --git a/docs/how-to/hip_porting_driver_api.md b/docs/how-to/hip_porting_driver_api.md index 99847dbd11..d42353b631 100644 --- a/docs/how-to/hip_porting_driver_api.md +++ b/docs/how-to/hip_porting_driver_api.md @@ -9,9 +9,9 @@ CUDA provides a separate CUDA Driver and Runtime APIs. The two APIs have signifi * Driver APIs calls begin with the prefix `cu` while Runtime APIs begin with the prefix `cuda`. For example, the Driver API API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality. * The Driver API defines a different but largely overlapping error code space than the Runtime API, and uses a different coding convention. For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue` -The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs. +The Driver API offers two additional pieces of functionality not provided by the Runtime API: `cuModule` and `cuCtx` APIs. -### cuModule API +### `cuModule` API The Module section of the Driver API provides additional control over how and when accelerator code objects are loaded. For example, the driver API allows code objects to be loaded from files or memory pointers. @@ -30,7 +30,7 @@ Other environments have many kernels and do not want them to be all loaded autom The Module functions can be used to load the generated code objects and launch kernels. As we will see below, HIP defines a Module API which provides similar explicit control over code object management. -### cuCtx API +### `cuCtx` API The Driver API defines "Context" and "Devices" as separate entities. Contexts contain a single device, and a device can theoretically have multiple contexts. @@ -41,33 +41,33 @@ HIP as well as a recent versions of CUDA Runtime provide other mechanisms to acc The CUDA Runtime API unifies the Context API with the Device API. This simplifies the APIs and has little loss of functionality since each Context can contain a single device, and the benefits of multiple contexts has been replaced with other interfaces. HIP provides a context API to facilitate easy porting from existing Driver codes. -In HIP, the Ctx functions largely provide an alternate syntax for changing the active device. +In HIP, the `Ctx` functions largely provide an alternate syntax for changing the active device. -Most new applications will prefer to use `hipSetDevice` or the stream APIs , therefore HIP has marked hipCtx APIs as **deprecated**. Support for these APIs may not be available in future releases. For more details on deprecated APIs please refer [HIP deprecated APIs](https://github.com/ROCm/HIP/blob/develop/docs/reference/deprecated_api_list.md). +Most new applications will prefer to use `hipSetDevice` or the stream APIs , therefore HIP has marked `hipCtx` APIs as **deprecated**. Support for these APIs may not be available in future releases. For more details on deprecated APIs please refer [HIP deprecated APIs](https://github.com/ROCm/HIP/blob/develop/docs/reference/deprecated_api_list.md). -## HIP Module and Ctx APIs +## HIP Module and `Ctx` APIs -Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules and Ctx control. +Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules and `Ctx` control. -### hipModule API +### `hipModule` API Like the CUDA Driver API, the Module API provides additional control over how code is loaded, including options to load code from files or from in-memory pointers. NVCC and HIP-Clang target different architectures and use different code object formats: NVCC is `cubin` or `ptx` files, while the HIP-Clang path is the `hsaco` format. The external compilers which generate these code objects are responsible for generating and loading the correct code object for each platform. Notably, there is not a fat binary format that can contain code for both NVCC and HIP-Clang platforms. The following table summarizes the formats used on each platform: -| Format | APIs | NVCC | HIP-CLANG | -| --- | --- | --- | --- | -| Code Object | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco | -| Fat Binary | hipModuleLoadFatBin | .fatbin | .hip_fatbin | +| Format | APIs | NVCC | HIP-CLANG | +| --- | --- | --- | --- | +| Code Object | `hipModuleLoad`, `hipModuleLoadData` | `.cubin` or PTX text | `.hsaco` | +| Fat Binary | `hipModuleLoadFatBin` | `.fatbin` | `.hip_fatbin` | `hipcc` uses HIP-Clang or NVCC to compile host codes. Both of these may embed code objects into the final executable, and these code objects will be automatically loaded when the application starts. -The hipModule API can be used to load additional code objects, and in this way provides an extended capability to the automatically loaded code objects. +The `hipModule` API can be used to load additional code objects, and in this way provides an extended capability to the automatically loaded code objects. HIP-Clang allows both of these capabilities to be used together, if desired. Of course it is possible to create a program with no kernels and thus no automatic loading. -### hipCtx API +### `hipCtx` API -HIP provides a `Ctx` API as a thin layer over the existing Device functions. This Ctx API can be used to set the current context, or to query properties of the device associated with the context. +HIP provides a `Ctx` API as a thin layer over the existing Device functions. This `Ctx` API can be used to set the current context, or to query properties of the device associated with the context. The current context is implicitly used by other APIs such as `hipStreamCreate`. ### hipify translation of CUDA Driver API @@ -75,7 +75,7 @@ The current context is implicitly used by other APIs such as `hipStreamCreate`. The HIPIFY tools convert CUDA Driver APIs for streams, events, modules, devices, memory management, context, profiler to the equivalent HIP driver calls. For example, `cuEventCreate` will be translated to `hipEventCreate`. HIPIFY tools also convert error codes from the Driver namespace and coding convention to the equivalent HIP error code. Thus, HIP unifies the APIs for these common functions. -The memory copy API requires additional explanation. The CUDA driver includes the memory direction in the name of the API (ie `cuMemcpyH2D`) while the CUDA driver API provides a single memory copy API with a parameter that specifies the direction and additionally supports a "default" direction where the runtime determines the direction automatically. +The memory copy API requires additional explanation. The CUDA driver includes the memory direction in the name of the API (`cuMemcpyH2D`) while the CUDA driver API provides a single memory copy API with a parameter that specifies the direction and additionally supports a "default" direction where the runtime determines the direction automatically. HIP provides APIs with both styles: for example, `hipMemcpyH2D` as well as `hipMemcpy`. The first flavor may be faster in some cases since they avoid host overhead to detect the different memory directions. @@ -86,7 +86,7 @@ HIP defines a single error space, and uses camel-case for all errors (i.e. `hipE HIP-Clang defines a process-wide address space where the CPU and all devices allocate addresses from a single unified pool. Thus addresses may be shared between contexts, and unlike the original CUDA definition a new context does not create a new address space for the device. -#### Using hipModuleLaunchKernel +#### Using `hipModuleLaunchKernel` `hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`. @@ -95,26 +95,26 @@ Thus addresses may be shared between contexts, and unlike the original CUDA defi * HIP-Clang creates a primary context when the HIP API is called. So in a pure driver API code, HIP-Clang will create a primary context while HIP/NVCC will have empty context stack. HIP-Clang will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs. -### hip-clang Implementation Notes +### `hip-clang` Implementation Notes -#### .hip_fatbin +#### `.hip_fatbin` -hip-clang links device code from different translation units together. For each device target, a code object is generated. Code objects for different device targets are bundled by clang-offload-bundler as one fatbinary, which is embeded as a global symbol `__hip_fatbin` in the .hip_fatbin section of the ELF file of the executable or shared object. +hip-clang links device code from different translation units together. For each device target, a code object is generated. Code objects for different device targets are bundled by `clang-offload-bundler` as one fatbinary, which is embeded as a global symbol `__hip_fatbin` in the `.hip_fatbin` section of the ELF file of the executable or shared object. #### Initialization and Termination Functions -hip-clang generates initializatiion and termination functions for each translation unit for host code compilation. The initialization functions call `__hipRegisterFatBinary` to register the fatbinary embeded in the ELF file. They also call `__hipRegisterFunction` and `__hipRegisterVar` to register kernel functions and device side global variables. The termination functions call `__hipUnregisterFatBinary`. +hip-clang generates initialization and termination functions for each translation unit for host code compilation. The initialization functions call `__hipRegisterFatBinary` to register the fatbinary embeded in the ELF file. They also call `__hipRegisterFunction` and `__hipRegisterVar` to register kernel functions and device side global variables. The termination functions call `__hipUnregisterFatBinary`. hip-clang emits a global variable `__hip_gpubin_handle` of void** type with linkonce linkage and inital value 0 for each host translation unit. Each initialization function checks `__hip_gpubin_handle` and register the fatbinary only if `__hip_gpubin_handle` is 0 and saves the return value of `__hip_gpubin_handle` to `__hip_gpubin_handle`. This is to guarantee that the fatbinary is only registered once. Similar check is done in the termination functions. #### Kernel Launching hip-clang supports kernel launching by CUDA `<<<>>>` syntax, hipLaunchKernelGGL. The latter one is macro which expand to CUDA `<<<>>>` syntax. -When the executable or shared library is loaded by the dynamic linker, the initilization functions are called. In the initialization functions, when `__hipRegisterFatBinary` is called, the code objects containing all kernels are loaded; when `__hipRegisterFunction` is called, the stub functions are associated with the corresponding kernels in code objects. +When the executable or shared library is loaded by the dynamic linker, the initialization functions are called. In the initialization functions, when `__hipRegisterFatBinary` is called, the code objects containing all kernels are loaded; when `__hipRegisterFunction` is called, the stub functions are associated with the corresponding kernels in code objects. hip-clang implements two sets of kernel launching APIs. -By default, in the host code, for the `<<<>>>` statement, hip-clang first emits call of hipConfigureCall to set up the threads and grids, then emits call of the stub function with the given arguments. In the stub function, hipSetupArgument is called for each kernel argument, then hipLaunchByPtr is called with a function pointer to the stub function. In hipLaunchByPtr, the real kernel associated with the stub function is launched. +By default, in the host code, for the `<<<>>>` statement, hip-clang first emits call of `hipConfigureCall` to set up the threads and grids, then emits call of the stub function with the given arguments. In the stub function, `hipSetupArgument` is called for each kernel argument, then `hipLaunchByPtr` is called with a function pointer to the stub function. In `hipLaunchByPtr`, the real kernel associated with the stub function is launched. ### NVCC Implementation Notes @@ -122,15 +122,15 @@ By default, in the host code, for the `<<<>>>` statement, hip-clang first emits CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction. -|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| -| ---- | ---- | ---- | -| hipModule_t | CUmodule | | -| hipFunction_t | CUfunction | | -| hipCtx_t | CUcontext | | -| hipDevice_t | CUdevice | | -| hipStream_t | CUstream | cudaStream_t | -| hipEvent_t | CUevent | cudaEvent_t | -| hipArray | CUarray | cudaArray | +|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| +| ---- | ---- | ---- | +| `hipModule_t` | `CUmodule` | | +| `hipFunction_t` | `CUfunction` | | +| `hipCtx_t` | `CUcontext` | | +| `hipDevice_t` | `CUdevice` | | +| `hipStream_t` | `CUstream` | `cudaStream_t` | +| `hipEvent_t` | `CUevent` | `cudaEvent_t` | +| `hipArray` | `CUarray` | `cudaArray` | #### Compilation Options diff --git a/docs/how-to/hip_porting_guide.md b/docs/how-to/hip_porting_guide.md index 99e7c6511d..1a51339b66 100644 --- a/docs/how-to/hip_porting_guide.md +++ b/docs/how-to/hip_porting_guide.md @@ -8,9 +8,9 @@ and provides practical suggestions on how to port CUDA code and work through com ### General Tips -* Starting the port on a CUDA machine is often the easiest approach, since you can incrementally port pieces of the code to HIP while leaving the rest in CUDA. (Recall that on CUDA machines HIP is just a thin layer over CUDA, so the two code types can interoperate on nvcc platforms.) Also, the HIP port can be compared with the original CUDA code for function and performance. +* Starting the port on a CUDA machine is often the easiest approach, since you can incrementally port pieces of the code to HIP while leaving the rest in CUDA. (Recall that on CUDA machines HIP is just a thin layer over CUDA, so the two code types can interoperate on NVCC platforms.) Also, the HIP port can be compared with the original CUDA code for function and performance. * Once the CUDA code is ported to HIP and is running on the CUDA machine, compile the HIP code using the HIP compiler on an AMD machine. -* HIP ports can replace CUDA versions: HIP can deliver the same performance as a native CUDA implementation, with the benefit of portability to both Nvidia and AMD architectures as well as a path to future C++ standard support. You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure. +* HIP ports can replace CUDA versions: HIP can deliver the same performance as a native CUDA implementation, with the benefit of portability to both NVIDIA and AMD architectures as well as a path to future C++ standard support. You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure. * Use **[hipconvertinplace-perl.sh](https://github.com/ROCm/HIPIFY/blob/amd-staging/bin/hipconvertinplace-perl.sh)** to hipify all code files in the CUDA source directory. ### Scanning existing CUDA code to scope the porting effort @@ -38,17 +38,17 @@ info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 s hipexamine-perl scans each code file (cpp, c, h, hpp, etc.) found in the specified directory: -* Files with no CUDA code (ie kmeans.h) print one line summary just listing the source file name. -* Files with CUDA code print a summary of what was found - for example the kmeans_cuda_kernel.cu file: +* Files with no CUDA code (`kmeans.h`) print one line summary just listing the source file name. +* Files with CUDA code print a summary of what was found - for example the `kmeans_cuda_kernel.cu` file: ```shell info: hipify ./kmeans_cuda_kernel.cu =====> info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 ``` -* Interesting information in kmeans_cuda_kernel.cu : +* Interesting information in `kmeans_cuda_kernel.cu` : * How many CUDA calls were converted to HIP (40) - * Breakdown of the CUDA functionality used (dev:0 mem:0 etc). This file uses many CUDA builtins (37) and texture functions (3). + * Breakdown of the CUDA functionality used (`dev:0 mem:0` etc). This file uses many CUDA builtins (37) and texture functions (3). * Warning for code that looks like CUDA API but was not converted (0 in this file). * Count Lines-of-Code (LOC) - 185 for this file. @@ -67,8 +67,8 @@ info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 s For each input file FILE, this script will: -* If "FILE.prehip file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file. -* If "FILE.prehip" file exists, hipify FILE.prehip and save to FILE. +* If `FILE.prehip` file does not exist, copy the original code to a new file with extension `.prehip`. Then hipify the code file. +* If `FILE.prehip` file exists, hipify `FILE.prehip` and save to FILE. This is useful for testing improvements to the hipify toolset. @@ -96,8 +96,8 @@ Most CUDA libraries have a corresponding ROCm library with similar functionality | cuBLASLt | hipBLASLt | N/A | Basic Linear Algebra Subroutines, lightweight and new flexible API | cuFFT | hipFFT | rocFFT | Fast Fourier Transfer Library | cuSPARSE | hipSPARSE | rocSPARSE | Sparse BLAS + SPMV -| cuSolver | hipSOLVER | rocSOLVER | Lapack library -| AMG-X | N/A | rocALUTION | Sparse iterative solvers and preconditioners with Geometric and Algebraic MultiGrid +| cuSOLVER | hipSOLVER | rocSOLVER | Lapack library +| AmgX | N/A | rocALUTION | Sparse iterative solvers and preconditioners with algebraic multigrid | Thrust | N/A | rocThrust | C++ parallel algorithms library | CUB | hipCUB | rocPRIM | Low Level Optimized Parallel Primitives | cuDNN | N/A | MIOpen | Deep learning Solver Library @@ -116,9 +116,9 @@ Note, `HIP_PLATFORM_HCC` was previously defined if the HIP platform targeted AMD * `HIP_PLATFORM_NVDIA` is defined if the HIP platform targets NVIDIA. Note, `HIP_PLATFORM_NVCC` was previously defined if the HIP platform targeted NVIDIA, it is deprecated. -### Identifying the Compiler: hip-clang or nvcc +### Identifying the Compiler: hip-clang or NVCC -Often, it's useful to know whether the underlying compiler is HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning. +Often, it's useful to know whether the underlying compiler is HIP-Clang or NVCC. This knowledge can guard platform-specific code or aid in platform-specific performance tuning. ```cpp #ifdef __HIP_PLATFORM_AMD__ @@ -139,13 +139,13 @@ Often, it's useful to know whether the underlying compiler is HIP-Clang or nvcc. // Compiled with nvcc (CUDA language extensions enabled) ``` -Compiler directly generates the host code (using the Clang x86 target) and passes the code to another host compiler. Thus, they have no equivalent of the \__CUDA_ACC define. +Compiler directly generates the host code (using the Clang x86 target) and passes the code to another host compiler. Thus, they have no equivalent of the `__CUDACC__` define. ### Identifying Current Compilation Pass: Host or Device -nvcc makes two passes over the code: one for host code and one for device code. +NVCC makes two passes over the code: one for host code and one for device code. HIP-Clang will have multiple passes over the code: one for the host code, and one for each architecture on the device code. -`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define. +`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (HIP-Clang or NVCC) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace `#ifdef` checks on the `__CUDA_ARCH__` define. ```cpp // #ifdef __CUDA_ARCH__ @@ -164,8 +164,8 @@ Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, an |`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host | 1 if compiling for device; undefined if compiling for host | Undefined |`__HIPCC__` | Defined | Defined | Undefined |`__HIP_ARCH_*` | 0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0 -|nvcc-related defines:| -|`__CUDACC__` | Defined if source code is compiled by nvcc; undefined otherwise | Undefined +|NVCC-related defines:| +|`__CUDACC__` | Defined if source code is compiled by NVCC; undefined otherwise | Undefined |`__NVCC__` Undefined | Defined | Undefined |`__CUDA_ARCH__` | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code | Undefined |hip-clang-related defines:| @@ -199,7 +199,7 @@ For host code, the `__HIP_ARCH__*` defines are set to 0. You should only use the ### Device-Architecture Properties -Host code should query the architecture feature flags in the device properties that hipGetDeviceProperties returns, rather than testing the "major" and "minor" fields directly: +Host code should query the architecture feature flags in the device properties that `hipGetDeviceProperties` returns, rather than testing the "major" and "minor" fields directly: ```cpp hipGetDeviceProperties(&deviceProp, device); @@ -213,31 +213,31 @@ if (deviceProp.arch.hasSharedInt32Atomics) { // portable HIP feature The table below shows the full set of architectural properties that HIP supports. -|Define (use only in device code) | Device Property (run-time query) | Comment | -|------- | --------- | ----- | -|32-bit atomics:|| -|`__HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__` | hasGlobalInt32Atomics |32-bit integer atomics for global memory -|`__HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__` | hasGlobalFloatAtomicExch |32-bit float atomic exchange for global memory -|`__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__` | hasSharedInt32Atomics |32-bit integer atomics for shared memory -|`__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__` | hasSharedFloatAtomicExch |32-bit float atomic exchange for shared memory -|`__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__` | hasFloatAtomicAdd |32-bit float atomic add in global and shared memory -|64-bit atomics: | | -|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__` | hasGlobalInt64Atomics |64-bit integer atomics for global memory -|`__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__` | hasSharedInt64Atomics |64-bit integer atomics for shared memory -|Doubles: | | -|`__HIP_ARCH_HAS_DOUBLES__` | hasDoubles |Double-precision floating point -|Warp cross-lane operations: | | -|`__HIP_ARCH_HAS_WARP_VOTE__` | hasWarpVote |Warp vote instructions (any, all) -|`__HIP_ARCH_HAS_WARP_BALLOT__` | hasWarpBallot |Warp ballot instructions -|`__HIP_ARCH_HAS_WARP_SHUFFLE__` | hasWarpShuffle |Warp shuffle operations (shfl\_\*) -|`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__` | hasFunnelShift |Funnel shift two input words into one -|Sync: | | -|`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__` | hasThreadFenceSystem |threadfence\_system -|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__` | hasSyncThreadsExt |syncthreads\_count, syncthreads\_and, syncthreads\_or -|Miscellaneous: | | -|`__HIP_ARCH_HAS_SURFACE_FUNCS__` | hasSurfaceFuncs | -|`__HIP_ARCH_HAS_3DGRID__` | has3dGrid | Grids and groups are 3D -|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__` | hasDynamicParallelism | +|Define (use only in device code) | Device Property (run-time query) | Comment | +|------- | --------- | ----- | +|32-bit atomics: | | +|`__HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__` | `hasGlobalInt32Atomics` |32-bit integer atomics for global memory +|`__HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__` | `hasGlobalFloatAtomicExch` |32-bit float atomic exchange for global memory +|`__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__` | `hasSharedInt32Atomics` |32-bit integer atomics for shared memory +|`__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__` | `hasSharedFloatAtomicExch` |32-bit float atomic exchange for shared memory +|`__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__` | `hasFloatAtomicAdd` |32-bit float atomic add in global and shared memory +|64-bit atomics: | | +|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__` | `hasGlobalInt64Atomics` |64-bit integer atomics for global memory +|`__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__` | `hasSharedInt64Atomics` |64-bit integer atomics for shared memory +|Doubles: | | +|`__HIP_ARCH_HAS_DOUBLES__` | `hasDoubles` |Double-precision floating point +|Warp cross-lane operations: | | +|`__HIP_ARCH_HAS_WARP_VOTE__` | `hasWarpVote` |Warp vote instructions (`any`, `all`) +|`__HIP_ARCH_HAS_WARP_BALLOT__` | `hasWarpBallot` |Warp ballot instructions +|`__HIP_ARCH_HAS_WARP_SHUFFLE__` | `hasWarpShuffle` |Warp shuffle operations (`shfl_*`) +|`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__` | `hasFunnelShift` |Funnel shift two input words into one +|Sync: | | +|`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__` | `hasThreadFenceSystem` |`threadfence_system` +|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__` | `hasSyncThreadsExt` |`syncthreads_count`, `syncthreads_and`, `syncthreads_or` +|Miscellaneous: | | +|`__HIP_ARCH_HAS_SURFACE_FUNCS__` | `hasSurfaceFuncs` | +|`__HIP_ARCH_HAS_3DGRID__` | `has3dGrid` | Grids and groups are 3D +|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__` | `hasDynamicParallelism` | ## Finding HIP @@ -249,57 +249,57 @@ HIP_PATH ?= $(shell hipconfig --path) ## Identifying HIP Runtime -HIP can depend on rocclr, or cuda as runtime +HIP can depend on rocclr, or CUDA as runtime * AMD platform On AMD platform, HIP uses Radeon Open Compute Common Language Runtime, called ROCclr. ROCclr is a virtual device interface that HIP runtimes interact with different backends which allows runtimes to work on Linux , as well as Windows without much efforts. * NVIDIA platform -On Nvidia platform, HIP is just a thin layer on top of CUDA. -On non-AMD platform, HIP runtime determines if cuda is available and can be used. If available, HIP_PLATFORM is set to nvidia and underneath CUDA path is used. +On NVIDIA platform, HIP is just a thin layer on top of CUDA. +On non-AMD platform, HIP runtime determines if CUDA is available and can be used. If available, HIP_PLATFORM is set to `nvidia` and underneath CUDA path is used. -## hipLaunchKernelGGL +## `hipLaunchKernelGGL` -hipLaunchKernelGGL is a macro that can serve as an alternative way to launch kernel, which accepts parameters of launch configurations (grid dims, group dims, stream, dynamic shared size) followed by a variable number of kernel arguments. +`hipLaunchKernelGGL` is a macro that can serve as an alternative way to launch kernel, which accepts parameters of launch configurations (grid dims, group dims, stream, dynamic shared size) followed by a variable number of kernel arguments. It can replace <<< >>>, if the user so desires. ## Compiler Options -hipcc is a portable compiler driver that will call nvcc or HIP-Clang (depending on the target system) and attach all required include and library options. It passes options through to the target compiler. Tools that call hipcc must ensure the compiler options are appropriate for the target compiler. +hipcc is a portable compiler driver that will call NVCC or HIP-Clang (depending on the target system) and attach all required include and library options. It passes options through to the target compiler. Tools that call hipcc must ensure the compiler options are appropriate for the target compiler. The `hipconfig` script may helpful in identifying the target platform, compiler and runtime. It can also help set options appropriately. ### Compiler options supported on AMD platforms Here are the main compiler options supported on AMD platforms by HIP-Clang. -| Option | Description | -| ------ | ----------- | -| --amdgpu-target= | [DEPRECATED] This option is being replaced by `--offload-arch=`. Generate code for the given GPU target. Supported targets are gfx701, gfx801, gfx802, gfx803, gfx900, gfx906, gfx908, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031. This option could appear multiple times on the same command line to generate a fat binary for multiple targets. | -| --fgpu-rdc | Generate relocatable device code, which allows kernels or device functions calling device functions in different translation units. | -| -ggdb | Equivalent to `-g` plus tuning for GDB. This is recommended when using ROCm's GDB to debug GPU code. | -| --gpu-max-threads-per-block= | Generate code to support up to the specified number of threads per block. | -| -O | Specify the optimization level. | -| -offload-arch= | Specify the AMD GPU [target ID](https://clang.llvm.org/docs/ClangOffloadBundler.html#target-id). | -| -save-temps | Save the compiler generated intermediate files. | -| -v | Show the compilation steps. | +| Option | Description | +| ------ | ----------- | +| `--amdgpu-target=` | [DEPRECATED] This option is being replaced by `--offload-arch=`. Generate code for the given GPU target. Supported targets are gfx701, gfx801, gfx802, gfx803, gfx900, gfx906, gfx908, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031. This option could appear multiple times on the same command line to generate a fat binary for multiple targets. | +| `--fgpu-rdc` | Generate relocatable device code, which allows kernels or device functions calling device functions in different translation units. | +| `-ggdb` | Equivalent to `-g` plus tuning for GDB. This is recommended when using ROCm's GDB to debug GPU code. | +| `--gpu-max-threads-per-block=` | Generate code to support up to the specified number of threads per block. | +| `-O` | Specify the optimization level. | +| `-offload-arch=` | Specify the AMD GPU [target ID](https://clang.llvm.org/docs/ClangOffloadBundler.html#target-id). | +| `-save-temps` | Save the compiler generated intermediate files. | +| `-v` | Show the compilation steps. | ## Linking Issues ### Linking With hipcc -hipcc adds the necessary libraries for HIP as well as for the accelerator compiler (nvcc or AMD compiler). We recommend linking with hipcc since it automatically links the binary to the necessary HIP runtime libraries. It also has knowledge on how to link and to manage the GPU objects. +hipcc adds the necessary libraries for HIP as well as for the accelerator compiler (NVCC or AMD compiler). We recommend linking with hipcc since it automatically links the binary to the necessary HIP runtime libraries. It also has knowledge on how to link and to manage the GPU objects. -### -lm Option +### `-lm` Option -hipcc adds -lm by default to the link command. +hipcc adds `-lm` by default to the link command. ## Linking Code With Other Compilers -CUDA code often uses nvcc for accelerator code (defining and launching kernels, typically defined in .cu or .cuh files). -It also uses a standard compiler (g++) for the rest of the application. nvcc is a preprocessor that employs a standard host compiler (gcc) to generate the host code. -Code compiled using this tool can employ only the intersection of language features supported by both nvcc and the host compiler. -In some cases, you must take care to ensure the data types and alignment of the host compiler are identical to those of the device compiler. Only some host compilers are supported---for example, recent nvcc versions lack Clang host-compiler capability. +CUDA code often uses NVCC for accelerator code (defining and launching kernels, typically defined in `.cu` or `.cuh` files). +It also uses a standard compiler (g++) for the rest of the application. NVCC is a preprocessor that employs a standard host compiler (gcc) to generate the host code. +Code compiled using this tool can employ only the intersection of language features supported by both NVCC and the host compiler. +In some cases, you must take care to ensure the data types and alignment of the host compiler are identical to those of the device compiler. Only some host compilers are supported---for example, recent NVCC versions lack Clang host-compiler capability. HIP-Clang generates both device and host code using the same Clang-based compiler. The code uses the same API as gcc, which allows code generated by different gcc-compatible compilers to be linked together. For example, code compiled using HIP-Clang can link with code compiled using "standard" compilers (such as gcc, ICC and Clang). Take care to ensure all compilers use the same standard C++ header and library formats. @@ -307,64 +307,64 @@ HIP-Clang generates both device and host code using the same Clang-based compile hipcc links to libstdc++ by default. This provides better compatibility between g++ and HIP. -If you pass "--stdlib=libc++" to hipcc, hipcc will use the libc++ library. Generally, libc++ provides a broader set of C++ features while libstdc++ is the standard for more compilers (notably including g++). +If you pass `--stdlib=libc++` to hipcc, hipcc will use the libc++ library. Generally, libc++ provides a broader set of C++ features while libstdc++ is the standard for more compilers (notably including g++). When cross-linking C++ code, any C++ functions that use types from the C++ standard library (including std::string, std::vector and other containers) must use the same standard-library implementation. They include the following: * Functions or kernels defined in HIP-Clang that are called from a standard compiler -* Functions defined in a standard compiler that are called from HIP-Clanng. +* Functions defined in a standard compiler that are called from HIP-Clang. Applications with these interfaces should use the default libstdc++ linking. -Applications which are compiled entirely with hipcc, and which benefit from advanced C++ features not supported in libstdc++, and which do not require portability to nvcc, may choose to use libc++. +Applications which are compiled entirely with hipcc, and which benefit from advanced C++ features not supported in libstdc++, and which do not require portability to NVCC, may choose to use libc++. -### HIP Headers (hip_runtime.h, hip_runtime_api.h) +### HIP Headers (`hip_runtime.h`, `hip_runtime_api.h`) -The hip_runtime.h and hip_runtime_api.h files define the types, functions and enumerations needed to compile a HIP program: +The `hip_runtime.h` and `hip_runtime_api.h` files define the types, functions and enumerations needed to compile a HIP program: -* hip_runtime_api.h: defines all the HIP runtime APIs (e.g., hipMalloc) and the types required to call them. A source file that is only calling HIP APIs but neither defines nor launches any kernels can include hip_runtime_api.h. hip_runtime_api.h uses no custom hc language features and can be compiled using a standard C++ compiler. -* hip_runtime.h: included in hip_runtime_api.h. It additionally provides the types and defines required to create and launch kernels. hip_runtime.h can be compiled using a standard C++ compiler but will expose a subset of the available functions. +* `hip_runtime_api.h`: defines all the HIP runtime APIs (e.g., `hipMalloc`) and the types required to call them. A source file that is only calling HIP APIs but neither defines nor launches any kernels can include `hip_runtime_api.h`. `hip_runtime_api.h` uses no custom Heterogeneous Compute (HC) language features and can be compiled using a standard C++ compiler. +* `hip_runtime.h`: included in `hip_runtime_api.h`. It additionally provides the types and defines required to create and launch kernels. hip_runtime.h can be compiled using a standard C++ compiler but will expose a subset of the available functions. -CUDA has slightly different contents for these two files. In some cases you may need to convert hipified code to include the richer hip_runtime.h instead of hip_runtime_api.h. +CUDA has slightly different contents for these two files. In some cases you may need to convert hipified code to include the richer `hip_runtime.h` instead of `hip_runtime_api.h`. ### Using a Standard C++ Compiler -You can compile hip\_runtime\_api.h using a standard C or C++ compiler (e.g., gcc or ICC). The HIP include paths and defines (`__HIP_PLATFORM_AMD__` or `__HIP_PLATFORM_NVIDIA__`) must pass to the standard compiler; hipconfig then returns the necessary options: +You can compile `hip_runtime_api.h` using a standard C or C++ compiler (e.g., gcc or ICC). The HIP include paths and defines (`__HIP_PLATFORM_AMD__` or `__HIP_PLATFORM_NVIDIA__`) must pass to the standard compiler; `hipconfig` then returns the necessary options: ```bash > hipconfig --cxx_config -D__HIP_PLATFORM_AMD__ -I/home/user1/hip/include ``` -You can capture the hipconfig output and passed it to the standard compiler; below is a sample makefile syntax: +You can capture the `hipconfig` output and passed it to the standard compiler; below is a sample makefile syntax: ```bash CPPFLAGS += $(shell $(HIP_PATH)/bin/hipconfig --cpp_config) ``` -nvcc includes some headers by default. However, HIP does not include default headers, and instead all required files must be explicitly included. +NVCC includes some headers by default. However, HIP does not include default headers, and instead all required files must be explicitly included. Specifically, files that call HIP run-time APIs or define HIP kernels must explicitly include the appropriate HIP headers. -If the compilation process reports that it cannot find necessary APIs (for example, "error: identifier hipSetDevice is undefined"), +If the compilation process reports that it cannot find necessary APIs (for example, `error: identifier hipSetDevice is undefined`), ensure that the file includes hip_runtime.h (or hip_runtime_api.h, if appropriate). -The hipify-perl script automatically converts "cuda_runtime.h" to "hip_runtime.h," and it converts "cuda_runtime_api.h" to "hip_runtime_api.h", but it may miss nested headers or macros. +The hipify-perl script automatically converts `cuda_runtime.h` to `hip_runtime.h`, and it converts `cuda_runtime_api.h` to `hip_runtime_api.h`, but it may miss nested headers or macros. -#### cuda.h +#### `cuda.h` -The HIP-Clang path provides an empty cuda.h file. Some existing CUDA programs include this file but don't require any of the functions. +The HIP-Clang path provides an empty `cuda.h` file. Some existing CUDA programs include this file but don't require any of the functions. ### Choosing HIP File Extensions -Many existing CUDA projects use the ".cu" and ".cuh" file extensions to indicate code that should be run through the nvcc compiler. +Many existing CUDA projects use the `.cu` and `.cuh` file extensions to indicate code that should be run through the NVCC compiler. For quick HIP ports, leaving these file extensions unchanged is often easier, as it minimizes the work required to change file names in the directory and #include statements in the files. -For new projects or ports which can be re-factored, we recommend the use of the extension ".hip.cpp" for source files, and -".hip.h" or ".hip.hpp" for header files. +For new projects or ports which can be re-factored, we recommend the use of the extension `.hip.cpp` for source files, and +`.hip.h` or `.hip.hpp` for header files. This indicates that the code is standard C++ code, but also provides a unique indication for make tools to run hipcc when appropriate. ## Workarounds -### warpSize +### ``warpSize`` Code should not assume a warp size of 32 or 64. See [Warp Cross-Lane Functions](https://rocm.docs.amd.com/projects/HIP/en/latest/reference/kernel_language.html#warp-cross-lane-functions) for information on how to write portable wave-aware code. @@ -378,13 +378,13 @@ For example: __global__ void dot(double *a,double *b,const int n) __attribute__((amdgpu_flat_work_group_size(1, 512))) ``` -## memcpyToSymbol +## `memcpyToSymbol` -HIP support for hipMemcpyToSymbol is complete. This feature allows a kernel +HIP support for `hipMemcpyToSymbol` is complete. This feature allows a kernel to define a device-side data symbol which can be accessed on the host side. The symbol can be in __constant or device space. -Note that the symbol name needs to be encased in the HIP_SYMBOL macro, as shown in the code example below. This also applies to hipMemcpyFromSymbol, hipGetSymbolAddress, and hipGetSymbolSize. +Note that the symbol name needs to be encased in the HIP_SYMBOL macro, as shown in the code example below. This also applies to `hipMemcpyFromSymbol`, `hipGetSymbolAddress`, and `hipGetSymbolSize`. For example: @@ -436,7 +436,7 @@ int main() ## CU_POINTER_ATTRIBUTE_MEMORY_TYPE -To get pointer's memory type in HIP/HIP-Clang, developers should use hipPointerGetAttributes API. First parameter of the API is hipPointerAttribute_t which has 'type' as member variable. 'type' indicates input pointer is allocated on device or host. +To get pointer's memory type in HIP/HIP-Clang, developers should use `hipPointerGetAttributes` API. First parameter of the API is `hipPointerAttribute_t` which has 'type' as member variable. 'type' indicates input pointer is allocated on device or host. For example: @@ -452,9 +452,9 @@ hipPointerAttribute_t attr; hipPointerGetAttributes(&attr, ptrHost); /*attr.type will have value as hipMemoryTypeHost*/ ``` -Please note, hipMemoryType enum values are different from cudaMemoryType enum values. +Please note, `hipMemoryType` enum values are different from `cudaMemoryType` enum values. -For example, on AMD platform, hipMemoryType is defined in hip_runtime_api.h, +For example, on AMD platform, `hipMemoryType` is defined in `hip_runtime_api.h`, ```cpp typedef enum hipMemoryType { @@ -466,7 +466,7 @@ typedef enum hipMemoryType { } hipMemoryType; ``` -Looking into CUDA toolkit, it defines cudaMemoryType as following, +Looking into CUDA toolkit, it defines `cudaMemoryType` as following, ```cpp enum cudaMemoryType @@ -478,31 +478,31 @@ enum cudaMemoryType } ``` -In this case, memory type translation for hipPointerGetAttributes needs to be handled properly on nvidia platform to get the correct memory type in CUDA, which is done in the file nvidia_hip_runtime_api.h. +In this case, memory type translation for `hipPointerGetAttributes` needs to be handled properly on NVIDIA platform to get the correct memory type in CUDA, which is done in the file `nvidia_hip_runtime_api.h`. -So in any HIP applications which use HIP APIs involving memory types, developers should use #ifdef in order to assign the correct enum values depending on Nvidia or AMD platform. +So in any HIP applications which use HIP APIs involving memory types, developers should use `#ifdef` in order to assign the correct enum values depending on NVIDIA or AMD platform. As an example, please see the code from the [link](https://github.com/ROCm/hip-tests/tree/develop/catch/unit/memory/hipMemcpyParam2D.cc). -With the #ifdef condition, HIP APIs work as expected on both AMD and NVIDIA platforms. +With the `#ifdef` condition, HIP APIs work as expected on both AMD and NVIDIA platforms. -Note, cudaMemoryTypeUnregstered is currently not supported in hipMemoryType enum, due to HIP functionality backward compatibility. +Note, `cudaMemoryTypeUnregstered` is currently not supported in `hipMemoryType` enum, due to HIP functionality backward compatibility. -## threadfence_system +## `threadfence_system` -Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices. +`threadfence_system` makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices. Some implementations can provide this behavior by flushing the GPU L2 cache. HIP/HIP-Clang does not provide this functionality. As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have a performance impact. ### Textures and Cache Control -Compute programs sometimes use textures either to access dedicated texture caches or to use the texture-sampling hardware for interpolation and clamping. The former approach uses simple point samplers with linear interpolation, essentially only reading a single point. The latter approach uses the sampler hardware to interpolate and combine multiple samples. AMD hardware, as well as recent competing hardware, has a unified texture/L1 cache, so it no longer has a dedicated texture cache. But the nvcc path often caches global loads in the L2 cache, and some programs may benefit from explicit control of the L1 cache contents. We recommend the __ldg instruction for this purpose. +Compute programs sometimes use textures either to access dedicated texture caches or to use the texture-sampling hardware for interpolation and clamping. The former approach uses simple point samplers with linear interpolation, essentially only reading a single point. The latter approach uses the sampler hardware to interpolate and combine multiple samples. AMD hardware, as well as recent competing hardware, has a unified texture/L1 cache, so it no longer has a dedicated texture cache. But the NVCC path often caches global loads in the L2 cache, and some programs may benefit from explicit control of the L1 cache contents. We recommend the `__ldg` instruction for this purpose. -AMD compilers currently load all data into both the L1 and L2 caches, so __ldg is treated as a no-op. +AMD compilers currently load all data into both the L1 and L2 caches, so `__ldg` is treated as a no-op. We recommend the following for functional portability: -* For programs that use textures only to benefit from improved caching, use the __ldg instruction +* For programs that use textures only to benefit from improved caching, use the `__ldg` instruction * Programs that use texture object and reference APIs, work well on HIP ## More Tips @@ -553,7 +553,7 @@ enum LogMask { ### Debugging hipcc -To see the detailed commands that hipcc issues, set the environment variable HIPCC_VERBOSE to 1. Doing so will print to stderr the HIP-clang (or nvcc) commands that hipcc generates. +To see the detailed commands that hipcc issues, set the environment variable HIPCC_VERBOSE to 1. Doing so will print to ``stderr`` the HIP-clang (or NVCC) commands that hipcc generates. ```bash export HIPCC_VERBOSE=1 diff --git a/docs/how-to/hip_rtc.md b/docs/how-to/hip_rtc.md index 5e86e83ce0..b2b76d1ac0 100644 --- a/docs/how-to/hip_rtc.md +++ b/docs/how-to/hip_rtc.md @@ -6,7 +6,7 @@ Kernels can be stored as a text string and can be passed to HIPRTC APIs alongsid NOTE: * This library can be used on systems without HIP installed nor AMD GPU driver installed at all (offline compilation). Therefore, it does not depend on any HIP runtime library. -* But it does depend on COMGr. You may try to statically link COMGr into HIPRTC to avoid any ambiguity. +* But it does depend on comgr. You may try to statically link comgr into HIPRTC to avoid any ambiguity. * Developers can decide to bundle this library with their application. ## Example @@ -29,7 +29,7 @@ R"( )"}; ``` -Now to compile this kernel, it needs to be associated with hiprtcProgram type, which is done by declaring `hiprtcProgram prog;` and associating the string of kernel with this program: +Now to compile this kernel, it needs to be associated with `hiprtcProgram` type, which is done by declaring `hiprtcProgram prog;` and associating the string of kernel with this program: ```cpp hiprtcCreateProgram(&prog, // HIPRTC program @@ -40,10 +40,10 @@ hiprtcCreateProgram(&prog, // HIPRTC program &header_names[0]); // Name of header files ``` -hiprtcCreateProgram API also allows you to add headers which can be included in your rtc program. -For online compilation, the compiler pre-defines HIP device API functions, HIP specific types and macros for device compilation, but does not include standard C/C++ headers by default. Users can only include header files provided to hiprtcCreateProgram. +`hiprtcCreateProgram` API also allows you to add headers which can be included in your RTC program. +For online compilation, the compiler pre-defines HIP device API functions, HIP specific types and macros for device compilation, but does not include standard C/C++ headers by default. Users can only include header files provided to `hiprtcCreateProgram`. -After associating the kernel string with hiprtcProgram, you can now compile this program using: +After associating the kernel string with `hiprtcProgram`, you can now compile this program using: ```cpp hiprtcCompileProgram(prog, // hiprtcProgram @@ -51,7 +51,7 @@ hiprtcCompileProgram(prog, // hiprtcProgram options); // Clang Options [Supported Clang Options](clang_options.md) ``` -hiprtcCompileProgram returns a status value which can be converted to string via `hiprtcGetErrorString`. If compilation is successful, hiprtcCompileProgram will return `HIPRTC_SUCCESS`. +`hiprtcCompileProgram` returns a status value which can be converted to string via `hiprtcGetErrorString`. If compilation is successful, `hiprtcCompileProgram` will return `HIPRTC_SUCCESS`. If the compilation fails, you can look up the logs via: @@ -76,7 +76,7 @@ vector kernel_binary(codeSize); hiprtcGetCode(prog, kernel_binary.data()); ``` -After loading the binary, hiprtcProgram can be destroyed. +After loading the binary, `hiprtcProgram` can be destroyed. `hiprtcDestroyProgram(&prog);` The binary present in `kernel_binary` can now be loaded via `hipModuleLoadData` API. @@ -89,7 +89,7 @@ hipModuleLoadData(&module, kernel_binary.data()); hipModuleGetFunction(&kernel, module, "vector_add"); ``` -And now this kernel can be launched via hipModule APIs. +And now this kernel can be launched via `hipModule` APIs. The full example is below: @@ -231,11 +231,11 @@ HIPRTC provides a few HIPRTC specific flags * `--gpu-architecture` : This flag can guide the code object generation for a specific gpu arch. Example: `--gpu-architecture=gfx906:sramecc+:xnack-`, its equivalent to `--offload-arch`. * This option is compulsory if compilation is done on a system without AMD GPUs supported by HIP runtime. * Otherwise, HIPRTC will load the hip runtime and gather the current device and its architecture info and use it as option. -* `-fgpu-rdc` : This flag when provided during the hiprtcCompileProgram generates the bitcode (HIPRTC doesn't convert this bitcode into ISA and binary). This bitcode can later be fetched using hiprtcGetBitcode and hiprtcGetBitcodeSize APIs. +* `-fgpu-rdc` : This flag when provided during the `hiprtcCompileProgram` generates the bitcode (HIPRTC doesn't convert this bitcode into ISA and binary). This bitcode can later be fetched using `hiprtcGetBitcode` and `hiprtcGetBitcodeSize` APIs. ### Bitcode -In the usual scenario, the kernel associated with hiprtcProgram is compiled into the binary which can be loaded and run. However, if -fpu-rdc option is provided in the compile options, HIPRTC calls comgr and generates only the LLVM bitcode. It doesn't convert this bitcode to ISA and generate the final binary. +In the usual scenario, the kernel associated with `hiprtcProgram` is compiled into the binary which can be loaded and run. However, if `-fpu-rdc` option is provided in the compile options, HIPRTC calls comgr and generates only the LLVM bitcode. It doesn't convert this bitcode to ISA and generate the final binary. ```cpp std::string sarg = std::string("-fgpu-rdc"); @@ -270,11 +270,11 @@ HIPRTC assumes **WGP mode by default** for gfx10+. This can be overridden by pas ## Linker APIs -The bitcode generated using the HIPRTC Bitcode APIs can be loaded using hipModule APIs and also can be linked with other generated bitcodes with appropriate linker flags using the HIPRTC linker APIs. This also provides more flexibility and optimizations to the applications who want to generate the binary dynamically according to their needs. The input bitcodes can be generated only for a specific architecture or it can be a bundled bitcode which is generated for multiple architectures. +The bitcode generated using the HIPRTC Bitcode APIs can be loaded using `hipModule` APIs and also can be linked with other generated bitcodes with appropriate linker flags using the HIPRTC linker APIs. This also provides more flexibility and optimizations to the applications who want to generate the binary dynamically according to their needs. The input bitcodes can be generated only for a specific architecture or it can be a bundled bitcode which is generated for multiple architectures. ### Example -Firstly, HIPRTC link instance or a pending linker invocation must be created using hiprtcLinkCreate, with the appropriate linker options provided. +Firstly, HIPRTC link instance or a pending linker invocation must be created using `hiprtcLinkCreate`, with the appropriate linker options provided. ```cpp hiprtcLinkCreate( num_options, // number of options @@ -283,7 +283,7 @@ hiprtcLinkCreate( num_options, // number of options &rtc_link_state ); // HIPRTC link state created upon success ``` -Following which, the bitcode data can be added to this link instance via hiprtcLinkAddData (if the data is present as a string) or hiprtcLinkAddFile (if the data is present as a file) with the appropriate input type according to the data or the bitcode used. +Following which, the bitcode data can be added to this link instance via `hiprtcLinkAddData` (if the data is present as a string) or `hiprtcLinkAddFile` (if the data is present as a file) with the appropriate input type according to the data or the bitcode used. ```cpp hiprtcLinkAddData(rtc_link_state, // HIPRTC link state @@ -305,7 +305,7 @@ hiprtcLinkAddFile(rtc_link_state, // HIPRTC link state 0); // Array of option values cast to void* ``` -Once the bitcodes for multiple architectures are added to the link instance, the linking of the device code must be completed using hiprtcLinkComplete which generates the final binary. +Once the bitcodes for multiple architectures are added to the link instance, the linking of the device code must be completed using `hiprtcLinkComplete` which generates the final binary. ```cpp hiprtcLinkComplete(rtc_link_state, // HIPRTC link state @@ -313,7 +313,7 @@ hiprtcLinkComplete(rtc_link_state, // HIPRTC link state &binarySize); // size of the binary is stored (optional) ``` -If the hiprtcLinkComplete returns successfully, the generated binary can be loaded and run using the hipModule* APIs. +If the `hiprtcLinkComplete` returns successfully, the generated binary can be loaded and run using the `hipModule*` APIs. ```cpp hipModuleLoadData(&module, binary); @@ -321,19 +321,19 @@ hipModuleLoadData(&module, binary); #### Note -* The compiled binary must be loaded before HIPRTC link instance is destroyed using the hiprtcLinkDestroy API. +* The compiled binary must be loaded before HIPRTC link instance is destroyed using the `hiprtcLinkDestroy` API. ```cpp hiprtcLinkDestroy(rtc_link_state); ``` -* The correct sequence of calls is : hiprtcLinkCreate, hiprtcLinkAddData or hiprtcLinkAddFile, hiprtcLinkComplete, hiprtcModuleLoadData, hiprtcLinkDestroy. +* The correct sequence of calls is : `hiprtcLinkCreate`, `hiprtcLinkAddData` or `hiprtcLinkAddFile`, `hiprtcLinkComplete`, `hiprtcModuleLoadData`, `hiprtcLinkDestroy`. ### Input Types -HIPRTC provides hiprtcJITInputType enumeration type which defines the input types accepted by the Linker APIs. Here are the enum values of hiprtcJITInputType. However only the input types HIPRTC_JIT_INPUT_LLVM_BITCODE, HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE and HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE are supported currently. +HIPRTC provides `hiprtcJITInputType` enumeration type which defines the input types accepted by the Linker APIs. Here are the `enum` values of `hiprtcJITInputType`. However only the input types `HIPRTC_JIT_INPUT_LLVM_BITCODE`, `HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE` and `HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE` are supported currently. -HIPRTC_JIT_INPUT_LLVM_BITCODE can be used to load both LLVM bitcode or LLVM IR assembly code. However, HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE and HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE are only for bundled bitcode and archive of bundled bitcode. +`HIPRTC_JIT_INPUT_LLVM_BITCODE` can be used to load both LLVM bitcode or LLVM IR assembly code. However, `HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE` and `HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE` are only for bundled bitcode and archive of bundled bitcode. ```cpp HIPRTC_JIT_INPUT_CUBIN = 0, @@ -351,11 +351,11 @@ HIPRTC_JIT_NUM_INPUT_TYPES = (HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES + 3) ### Backward Compatibility of LLVM Bitcode/IR -For HIP applications utilizing HIPRTC to compile LLVM bitcode/IR, compatibility is assured only when the ROCm or HIP SDK version used for generating the LLVM bitcode/IR matches the version used during the runtime compilation. When an application requires the ingestion of bitcode/IR not derived from the currently installed AMD compiler, it must run with HIPRTC and COMgr dynamic libraries that are compatible with the version of the bitcode/IR. +For HIP applications utilizing HIPRTC to compile LLVM bitcode/IR, compatibility is assured only when the ROCm or HIP SDK version used for generating the LLVM bitcode/IR matches the version used during the runtime compilation. When an application requires the ingestion of bitcode/IR not derived from the currently installed AMD compiler, it must run with HIPRTC and comgr dynamic libraries that are compatible with the version of the bitcode/IR. -COMgr, a shared library, incorporates the LLVM/Clang compiler that HIPRTC relies on. To identify the bitcode/IR version that COMgr is compatible with, one can execute "clang -v" using the clang binary from the same ROCm or HIP SDK package. For instance, if compiling bitcode/IR version 14, the HIPRTC and COMgr libraries released by AMD around mid 2022 would be the best choice, assuming the LLVM/Clang version included in the package is also version 14. +comgr, a shared library, incorporates the LLVM/Clang compiler that HIPRTC relies on. To identify the bitcode/IR version that comgr is compatible with, one can execute "clang -v" using the clang binary from the same ROCm or HIP SDK package. For instance, if compiling bitcode/IR version 14, the HIPRTC and comgr libraries released by AMD around mid 2022 would be the best choice, assuming the LLVM/Clang version included in the package is also version 14. -To ensure smooth operation and compatibility, an application may choose to ship the specific versions of HIPRTC and COMgr dynamic libraries, or it may opt to clearly specify the version requirements and dependencies. This approach guarantees that the application can correctly compile the specified version of bitcode/IR. +To ensure smooth operation and compatibility, an application may choose to ship the specific versions of HIPRTC and comgr dynamic libraries, or it may opt to clearly specify the version requirements and dependencies. This approach guarantees that the application can correctly compile the specified version of bitcode/IR. ### Link Options @@ -376,9 +376,9 @@ hiprtcLinkCreate(2, jit_options.data(), (void**)lopts, &linkstate); ## Error Handling -HIPRTC defines the hiprtcResult enumeration type and a function hiprtcGetErrorString for API call error handling. hiprtcResult enum defines the API result codes. HIPRTC APIs return hiprtcResult to indicate the call result. hiprtcGetErrorString function returns a string describing the given hiprtcResult code, e.g., HIPRTC_SUCCESS to "HIPRTC_SUCCESS". For unrecognized enumeration values, it returns "Invalid HIPRTC error code". +HIPRTC defines the `hiprtcResult` enumeration type and a function `hiprtcGetErrorString` for API call error handling. `hiprtcResult` `enum` defines the API result codes. HIPRTC APIs return `hiprtcResult` to indicate the call result. `hiprtcGetErrorString` function returns a string describing the given `hiprtcResult` code, e.g., HIPRTC_SUCCESS to "HIPRTC_SUCCESS". For unrecognized enumeration values, it returns "Invalid HIPRTC error code". -hiprtcResult enum supported values and the hiprtcGetErrorString usage are mentioned below. +`hiprtcResult` `enum` supported values and the `hiprtcGetErrorString` usage are mentioned below. ```cpp HIPRTC_SUCCESS = 0, @@ -416,13 +416,13 @@ Currently, it returns hardcoded value. This should be implemented to return HIP HIPRTC mangles the `__global__` function names and names of `__device__` and `__constant__` variables. If the generated binary is being loaded using the HIP Runtime API, the kernel function or `__device__/__constant__` variable must be looked up by name, but this is very hard when the name has been mangled. To overcome this, HIPRTC provides API functions that map `__global__` function or `__device__/__constant__` variable names in the source to the mangled names present in the generated binary. -The two APIs hiprtcAddNameExpression and hiprtcGetLoweredName provide this functionality. First, a 'name expression' string denoting the address for the `__global__` function or `__device__/__constant__` variable is provided to hiprtcAddNameExpression. Then, the program is compiled with hiprtcCompileProgram. During compilation, HIPRTC will parse the name expression string as a C++ constant expression at the end of the user program. Finally, the function hiprtcGetLoweredName is called with the original name expression and it returns a pointer to the lowered name. The lowered name can be used to refer to the kernel or variable in the HIP Runtime API. +The two APIs `hiprtcAddNameExpression` and `hiprtcGetLoweredName` provide this functionality. First, a 'name expression' string denoting the address for the `__global__` function or `__device__/__constant__` variable is provided to `hiprtcAddNameExpression`. Then, the program is compiled with `hiprtcCompileProgram`. During compilation, HIPRTC will parse the name expression string as a C++ constant expression at the end of the user program. Finally, the function `hiprtcGetLoweredName` is called with the original name expression and it returns a pointer to the lowered name. The lowered name can be used to refer to the kernel or variable in the HIP Runtime API. ### Note -* The identical name expression string must be provided on a subsequent call to hiprtcGetLoweredName to extract the lowered name. -* The correct sequence of calls is : hiprtcAddNameExpression, hiprtcCompileProgram, hiprtcGetLoweredName, hiprtcDestroyProgram. -* The lowered names must be fetched using hiprtcGetLoweredName only after the HIPRTC program has been compiled, and before it has been destroyed. +* The identical name expression string must be provided on a subsequent call to `hiprtcGetLoweredName` to extract the lowered name. +* The correct sequence of calls is : `hiprtcAddNameExpression`, `hiprtcCompileProgram`, `hiprtcGetLoweredName`, `hiprtcDestroyProgram`. +* The lowered names must be fetched using `hiprtcGetLoweredName` only after the HIPRTC program has been compiled, and before it has been destroyed. ### Example @@ -444,7 +444,7 @@ __global__ void f3(int *result) { *result = sizeof(T); } )"}; ``` -hiprtcAddNameExpression is called with various name expressions referring to the address of `__global__` functions and `__device__/__constant__` variables. +`hiprtcAddNameExpression` is called with various name expressions referring to the address of `__global__` functions and `__device__/__constant__` variables. ```cpp kernel_name_vec.push_back("&f1"); @@ -456,7 +456,7 @@ variable_name_vec.push_back("&N1::N2::V2"); for (auto&& x : variable_name_vec) hiprtcAddNameExpression(prog, x.c_str()); ``` -After which, the program is compiled using hiprtcCompileProgram and the generated binary is loaded using hipModuleLoadData. And the mangled names can be fetched using hirtcGetLoweredName. +After which, the program is compiled using `hiprtcCompileProgram` and the generated binary is loaded using `hipModuleLoadData`. And the mangled names can be fetched using `hirtcGetLoweredName`. ```cpp for (decltype(variable_name_vec.size()) i = 0; i != variable_name_vec.size(); ++i) { @@ -481,7 +481,7 @@ hipModuleGetGlobal(&variable_addr, &bytes, module, name); hipMemcpyHtoD(variable_addr, &initial_value, sizeof(initial_value)); ``` -Finally, the mangled name of the kernel is used to launch it using the hipModule APIs. +Finally, the mangled name of the kernel is used to launch it using the `hipModule` APIs. ```cpp hipFunction_t kernel; @@ -489,7 +489,7 @@ hipModuleGetFunction(&kernel, module, name); hipModuleLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, nullptr, nullptr, config); ``` -Please have a look at hiprtcGetLoweredName.cpp for the detailed example. +Please have a look at `hiprtcGetLoweredName.cpp` for the detailed example. ## Versioning @@ -497,9 +497,9 @@ HIPRTC follows the below versioning. * Linux * HIPRTC follows the same versioning as HIP runtime library. - * The `so` name field for the shared library is set to MAJOR version. For example, for HIP 5.3 the `so` name is set to 5 (hiprtc.so.5). + * The `so` name field for the shared library is set to MAJOR version. For example, for HIP 5.3 the `so` name is set to 5 (`hiprtc.so.5`). * Windows - * HIPRTC dll is named as hiprtcXXYY.dll where XX is MAJOR version and YY is MINOR version. For example, for HIP 5.3 the name is hiprtc0503.dll. + * HIPRTC dll is named as `hiprtcXXYY.dll` where XX is MAJOR version and YY is MINOR version. For example, for HIP 5.3 the name is `hiprtc0503.dll`. ## HIP header support @@ -507,5 +507,5 @@ HIPRTC follows the below versioning. ## Deprecation notice -* Currently HIPRTC APIs are separated from HIP APIs and HIPRTC is available as a separate library libhiprtc.so/libhiprtc.dll. But on Linux, HIPRTC symbols are also present in libhipamd64.so in order to support the existing applications. Gradually, these symbols will be removed from HIP library and applications using HIPRTC will be required to explicitly link to HIPRTC library. However, on Windows hiprtc.dll must be used as the hipamd64.dll doesn't contain the HIPRTC symbols. -* Data types such as uint32_t, uint64_t, int32_t, int64_t defined in std namespace in HIPRTC are deprecated earlier and are being removed from ROCm release 6.1 since these can conflict with the standard C++ data types. These data types are now prefixed with `__hip__`, e.g. `__hip_uint32_t`. Applications previously using std::uint32_t or similar types can use `__hip_` prefixed types to avoid conflicts with standard std namespace or application can have their own definitions for these types. Also, type_traits templates previously defined in std namespace are moved to `__hip_internal` namespace as implementation details. +* Currently HIPRTC APIs are separated from HIP APIs and HIPRTC is available as a separate library `libhiprtc.so`/`libhiprtc.dll`. But on Linux, HIPRTC symbols are also present in `libhipamd64.so` in order to support the existing applications. Gradually, these symbols will be removed from HIP library and applications using HIPRTC will be required to explicitly link to HIPRTC library. However, on Windows `hiprtc.dll` must be used as the `hipamd64.dll` doesn't contain the HIPRTC symbols. +* Data types such as `uint32_t`, `uint64_t`, `int32_t`, `int64_t` defined in std namespace in HIPRTC are deprecated earlier and are being removed from ROCm release 6.1 since these can conflict with the standard C++ data types. These data types are now prefixed with `__hip__`, e.g. `__hip_uint32_t`. Applications previously using `std::uint32_t` or similar types can use `__hip_` prefixed types to avoid conflicts with standard std namespace or application can have their own definitions for these types. Also, type_traits templates previously defined in std namespace are moved to `__hip_internal` namespace as implementation details. diff --git a/docs/how-to/logging.rst b/docs/how-to/logging.rst index d996243000..4a97332f1e 100644 --- a/docs/how-to/logging.rst +++ b/docs/how-to/logging.rst @@ -20,7 +20,7 @@ Refer to the following sections for examples. Logging works for the release and debug versions of HIP. If you want to save logging output in a file, define the file when running the application via command line. For example: - .. code-block:: bash + .. code-block:: bash user@user-test:~/hip/bin$ ./hipinfo > ~/hip_log.txt @@ -30,7 +30,7 @@ Logging level HIP logging is disabled by default. You can enable it via the ``AMD_LOG_LEVEL`` environment variable. The value of this variable controls your logging level. Levels are defined as follows: -.. code-block:: cpp +.. code-block:: cpp enum LogLevel { LOG_NONE = 0, @@ -52,7 +52,7 @@ The logging mask is designed to print functionality types when you're running a Once you set ``AMD_LOG_LEVEL``, the logging mask is set as the default value (``0x7FFFFFFF``). You can change this to any of the valid values: -.. code-block:: cpp +.. code-block:: cpp enum LogMask { LOG_API = 0x00000001, //!< API call @@ -84,7 +84,7 @@ Logging command You can use the following code to print HIP logging information: -.. code-block:: cpp +.. code-block:: cpp #define ClPrint(level, mask, format, ...) \ do { \ @@ -102,7 +102,7 @@ You can use the following code to print HIP logging information: Using HIP code, call the ``ClPrint()`` function with the desired input variables. For example: -.. code-block:: cpp +.. code-block:: cpp ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack."); @@ -112,7 +112,7 @@ Logging examples On **Linux**, you can enable HIP logging and retrieve logging information when you run ``hipinfo``. -.. code-block:: console +.. code-block:: console user@user-test:~/hip/bin$ export AMD_LOG_LEVEL=4 user@user-test:~/hip/bin$ ./hipinfo @@ -192,7 +192,7 @@ On **Windows**, you can set ``AMD_LOG_LEVEL`` via environment variable from the settings or the command prompt (when run as administrator). The following example shows debug log information when calling the backend runtime. -.. code-block:: bash +.. code-block:: bash C:\hip\bin>set AMD_LOG_LEVEL=4 C:\hip\bin>hipinfo diff --git a/docs/how-to/programming_manual.md b/docs/how-to/programming_manual.md index 0c4dfe3a5b..df6a80261c 100644 --- a/docs/how-to/programming_manual.md +++ b/docs/how-to/programming_manual.md @@ -4,22 +4,22 @@ ### Introduction -hipHostMalloc allocates pinned host memory which is mapped into the address space of all GPUs in the system, the memory can be accessed directly by the GPU device, and can be read or written with much higher bandwidth than pageable memory obtained with functions such as malloc(). +`hipHostMalloc` allocates pinned host memory which is mapped into the address space of all GPUs in the system, the memory can be accessed directly by the GPU device, and can be read or written with much higher bandwidth than pageable memory obtained with functions such as `malloc()`. There are two use cases for this host memory: -* Faster HostToDevice and DeviceToHost Data Transfers: -The runtime tracks the hipHostMalloc allocations and can avoid some of the setup required for regular unpinned memory. For exact measurements on a specific system, experiment with --unpinned and --pinned switches for the hipBusBandwidth tool. +* Faster `HostToDevice` and `DeviceToHost` Data Transfers: +The runtime tracks the `hipHostMalloc` allocations and can avoid some of the setup required for regular unpinned memory. For exact measurements on a specific system, experiment with `--unpinned` and `--pinned` switches for the `hipBusBandwidth` tool. * Zero-Copy GPU Access: GPU can directly access the host memory over the CPU/GPU interconnect, without need to copy the data. This avoids the need for the copy, but during the kernel access each memory access must traverse the interconnect, which can be tens of times slower than accessing the GPU's local device memory. Zero-copy memory can be a good choice when the memory accesses are infrequent (perhaps only once). Zero-copy memory is typically "Coherent" and thus not cached by the GPU but this can be overridden if desired. ### Memory allocation flags There are flags parameter which can specify options how to allocate the memory, for example, -hipHostMallocPortable, the memory is considered allocated by all contexts, not just the one on which the allocation is made. -hipHostMallocMapped, will map the allocation into the address space for the current device, and the device pointer can be obtained with the API hipHostGetDevicePointer(). -hipHostMallocNumaUser is the flag to allow host memory allocation to follow Numa policy by user. Please note this flag is currently only applicable on Linux, under development on Windows. +`hipHostMallocPortable`, the memory is considered allocated by all contexts, not just the one on which the allocation is made. +`hipHostMallocMapped`, will map the allocation into the address space for the current device, and the device pointer can be obtained with the API `hipHostGetDevicePointer()`. +`hipHostMallocNumaUser` is the flag to allow host memory allocation to follow Numa policy by user. Please note this flag is currently only applicable on Linux, under development on Windows. -All allocation flags are independent, and can be used in any combination without restriction, for instance, hipHostMalloc can be called with both hipHostMallocPortable and hipHostMallocMapped flags set. Both usage models described above use the same allocation flags, and the difference is in how the surrounding code uses the host memory. +All allocation flags are independent, and can be used in any combination without restriction, for instance, `hipHostMalloc` can be called with both `hipHostMallocPortable` and `hipHostMallocMapped` flags set. Both usage models described above use the same allocation flags, and the difference is in how the surrounding code uses the host memory. ### Numa-aware host memory allocation @@ -27,56 +27,56 @@ Numa policy determines how memory is allocated. Target of Numa policy is to select a CPU that is closest to each GPU. Numa distance is the measurement of how far between GPU and CPU devices. -By default, each GPU selects a Numa CPU node that has the least Numa distance between them, that is, host memory will be automatically allocated closest on the memory pool of Numa node of the current GPU device. Using hipSetDevice API to a different GPU will still be able to access the host allocation, but can have longer Numa distance. +By default, each GPU selects a Numa CPU node that has the least Numa distance between them, that is, host memory will be automatically allocated closest on the memory pool of Numa node of the current GPU device. Using `hipSetDevice` API to a different GPU will still be able to access the host allocation, but can have longer Numa distance. Note, Numa policy is so far implemented on Linux, and under development on Windows. ### Coherency Controls ROCm defines two coherency options for host memory: -* Coherent memory : Supports fine-grain synchronization while the kernel is running. For example, a kernel can perform atomic operations that are visible to the host CPU or to other (peer) GPUs. Synchronization instructions include threadfence_system and C++11-style atomic operations. +* Coherent memory : Supports fine-grain synchronization while the kernel is running. For example, a kernel can perform atomic operations that are visible to the host CPU or to other (peer) GPUs. Synchronization instructions include `threadfence_system` and C++11-style atomic operations. In order to achieve this fine-grained coherence, many AMD GPUs use a limited cache policy, such as leaving these allocations uncached by the GPU, or making them read-only. * Non-coherent memory : Can be cached by GPU, but cannot support synchronization while the kernel is running. Non-coherent memory can be optionally synchronized only at command (end-of-kernel or copy command) boundaries. This memory is appropriate for high-performance access when fine-grain synchronization is not required. -HIP provides the developer with controls to select which type of memory is used via allocation flags passed to hipHostMalloc and the HIP_HOST_COHERENT environment variable. By default, the environment variable HIP_HOST_COHERENT is set to 0 in HIP. +HIP provides the developer with controls to select which type of memory is used via allocation flags passed to `hipHostMalloc` and the `HIP_HOST_COHERENT` environment variable. By default, the environment variable HIP_HOST_COHERENT is set to 0 in HIP. The control logic in the current version of HIP is as follows: * No flags are passed in: the host memory allocation is coherent, the HIP_HOST_COHERENT environment variable is ignored. -* hipHostMallocCoherent=1: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored. -* hipHostMallocMapped=1: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored. -* hipHostMallocNonCoherent=1, hipHostMallocCoherent=0, and hipHostMallocMapped=0: The host memory will be non-coherent, the HIP_HOST_COHERENT environment variable is ignored. -* hipHostMallocCoherent=0, hipHostMallocNonCoherent=0, hipHostMallocMapped=0, but one of the other HostMalloc flags is set: - * If HIP_HOST_COHERENT is defined as 1, the host memory allocation is coherent. - * If HIP_HOST_COHERENT is not defined, or defined as 0, the host memory allocation is non-coherent. -* hipHostMallocCoherent=1, hipHostMallocNonCoherent=1: Illegal. +* `hipHostMallocCoherent=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored. +* `hipHostMallocMapped=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored. +* `hipHostMallocNonCoherent=1`, `hipHostMallocCoherent=0`, and `hipHostMallocMapped=0`: The host memory will be non-coherent, the HIP_HOST_COHERENT environment variable is ignored. +* `hipHostMallocCoherent=0`, `hipHostMallocNonCoherent=0`, `hipHostMallocMapped=0`, but one of the other `HostMalloc` flags is set: + * If `HIP_HOST_COHERENT` is defined as 1, the host memory allocation is coherent. + * If `HIP_HOST_COHERENT` is not defined, or defined as 0, the host memory allocation is non-coherent. +* `hipHostMallocCoherent=1`, `hipHostMallocNonCoherent=1`: Illegal. ### Visibility of Zero-Copy Host Memory Coherent host memory is automatically visible at synchronization points. Non-coherent -| HIP API | Synchronization Effect | Fence | Coherent Host Memory Visibiity | Non-Coherent Host Memory Visibility| +| HIP API | Synchronization Effect | Fence | Coherent Host Memory Visibility | Non-Coherent Host Memory Visibility| | --- | --- | --- | --- | --- | -| hipStreamSynchronize | host waits for all commands in the specified stream to complete | system-scope release | yes | yes | -| hipDeviceSynchronize | host waits for all commands in all streams on the specified device to complete | system-scope release | yes | yes | -| hipEventSynchronize | host waits for the specified event to complete | device-scope release | yes | depends - see below| -| hipStreamWaitEvent | stream waits for the specified event to complete | none | yes | no | +| `hipStreamSynchronize` | host waits for all commands in the specified stream to complete | system-scope release | yes | yes | +| `hipDeviceSynchronize` | host waits for all commands in all streams on the specified device to complete | system-scope release | yes | yes | +| `hipEventSynchronize` | host waits for the specified event to complete | device-scope release | yes | depends - see below| +| `hipStreamWaitEvent` | stream waits for the specified event to complete | none | yes | no | -### hipEventSynchronize +### `hipEventSynchronize` -Developers can control the release scope for hipEvents: +Developers can control the release scope for `hipEvents`: * By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. -A stronger system-level fence can be specified when the event is created with hipEventCreateWithFlags: +A stronger system-level fence can be specified when the event is created with `hipEventCreateWithFlags`: -* hipEventReleaseToSystem : Perform a system-scope release operation when the event is recorded. This will make both Coherent and Non-Coherent host memory visible to other agents in the system, but may involve heavyweight operations such as cache flushing. Coherent memory will typically use lighter-weight in-kernel synchronization mechanisms such as an atomic operation and thus does not need to use hipEventReleaseToSystem. -* hipEventDisableTiming: Events created with this flag will not record profiling data and provide the best performance if used for synchronization. +* `hipEventReleaseToSystem`: Perform a system-scope release operation when the event is recorded. This will make both Coherent and Non-Coherent host memory visible to other agents in the system, but may involve heavyweight operations such as cache flushing. Coherent memory will typically use lighter-weight in-kernel synchronization mechanisms such as an atomic operation and thus does not need to use `hipEventReleaseToSystem`. +* `hipEventDisableTiming`: Events created with this flag will not record profiling data and provide the best performance if used for synchronization. ### Summary and Recommendations -* Coherent host memory is the default and is the easiest to use since the memory is visible to the CPU at typical synchronization points. This memory allows in-kernel synchronization commands such as threadfence_system to work transparently. +* Coherent host memory is the default and is the easiest to use since the memory is visible to the CPU at typical synchronization points. This memory allows in-kernel synchronization commands such as `threadfence_system` to work transparently. * HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization. ### Managed memory allocation @@ -84,7 +84,7 @@ A stronger system-level fence can be specified when the event is created with hi Managed memory, including the `__managed__` keyword, is supported in HIP combined host/device compilation, on Linux, not on Windows (under development). Managed memory, via unified memory allocation, allows data be shared and accessible to both the CPU and GPU using a single pointer. -The allocation will be managed by AMD GPU driver using the Linux HMM (Heterogeneous Memory Management) mechanism, the user can call managed memory API hipMallocManaged to allocate a large chunk of HMM memory, execute kernels on device and fetch data between the host and device as needed. +The allocation will be managed by AMD GPU driver using the Linux HMM (Heterogeneous Memory Management) mechanism, the user can call managed memory API `hipMallocManaged` to allocate a large chunk of HMM memory, execute kernels on device and fetch data between the host and device as needed. In HIP application, it is recommended to do the capability check before calling the managed memory APIs. For example: @@ -110,15 +110,15 @@ Note, managed memory management is implemented on Linux, not supported on Window ### HIP Stream Memory Operations HIP supports Stream Memory Operations to enable direct synchronization between Network Nodes and GPU. Following new APIs are added, - hipStreamWaitValue32 - hipStreamWaitValue64 - hipStreamWriteValue32 - hipStreamWriteValue64 + `hipStreamWaitValue32` + `hipStreamWaitValue64` + `hipStreamWriteValue32` + `hipStreamWriteValue64` Note, CPU access to the semaphore's memory requires volatile keyword to disable CPU compiler's optimizations on memory access. -For more details, please check the documentation HIP-API.pdf. +For more details, please check the documentation `HIP-API.pdf`. -Please note, HIP stream does not guarantee concurrency on AMD hardware for the case of multiple (at least 6) long-running streams executing concurrently, using hipStreamSynchronize(nullptr) for synchronization. +Please note, HIP stream does not guarantee concurrency on AMD hardware for the case of multiple (at least 6) long-running streams executing concurrently, using `hipStreamSynchronize(nullptr)` for synchronization. ## Direct Dispatch @@ -136,13 +136,13 @@ Note, Direct Dispatch is implemented on Linux. It is currently not supported on ## HIP Runtime Compilation -HIP now supports runtime compilation (HIPRTC), the usage of which will provide the possibility of optimizations and performance improvement compared with other APIs via regular offline static compilation. +HIP now supports runtime compilation (HIP RTC), the usage of which will provide the possibility of optimizations and performance improvement compared with other APIs via regular offline static compilation. -HIPRTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes. +HIP RTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes. -For more details on HIPRTC APIs, refer to [HIP Runtime API Reference](https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/index.html). +For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/index.html). -For Linux developers, the link [here](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows an example how to program HIP application using runtime compilation mechanism, and a detailed [HIPRTC programming guide](./hip_rtc) is also available. +For Linux developers, the link [here](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows an example how to program HIP application using runtime compilation mechanism, and a detailed [HIP RTC programming guide](./hip_rtc) is also available. ## HIP Graph @@ -158,9 +158,9 @@ This implementation does not require the use of `hipDeviceSetLimit(hipLimitMallo The per-thread default stream is supported in HIP. It is an implicit stream local to both the thread and the current device. This means that the command issued to the per-thread default stream by the thread does not implicitly synchronize with other streams (like explicitly created streams), or default per-thread stream on other threads. The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program. The per-thread default stream can be enabled via adding a compilation option, -"-fgpu-default-stream=per-thread". +`-fgpu-default-stream=per-thread`. -And users can explicitly use "hipStreamPerThread" as per-thread default stream handle as input in API commands. There are test codes as examples in the [link](https://github.com/ROCm/hip-tests/tree/develop/catch/unit/streamperthread). +And users can explicitly use `hipStreamPerThread` as per-thread default stream handle as input in API commands. There are test codes as examples in the [link](https://github.com/ROCm/hip-tests/tree/develop/catch/unit/streamperthread). ## Use of Long Double Type @@ -172,36 +172,36 @@ If a host function is to be used between clang (or hipcc) and gcc for x86_64, i. ## FMA and contractions -By default HIP-Clang assumes -ffp-contract=fast-honor-pragmas. -Users can use '#pragma clang fp contract(on|off|fast)' to control fp contraction of a block of code. +By default HIP-Clang assumes `-ffp-contract=fast-honor-pragmas`. +Users can use `#pragma clang fp contract(on|off|fast)` to control `fp` contraction of a block of code. For x86_64, FMA is off by default since the generic x86_64 target does not -support FMA by default. To turn on FMA on x86_64, either use -mfma or -march=native +support FMA by default. To turn on FMA on x86_64, either use `-mfma` or `-march=native` on CPU's supporting FMA. When contractions are enabled and the CPU has not enabled FMA instructions, the GPU can produce different numerical results than the CPU for expressions that -can be contracted. Tolerance should be used for floating point comparsions. +can be contracted. Tolerance should be used for floating point comparisons. ## Math functions with special rounding modes -Note: Currently, HIP only supports basic math functions with rounding modern (round to nearest). HIP does not support basic math functions with rounding modes ru (round up), rd (round down), and rz (round towards zero). +Note: Currently, HIP only supports basic math functions with rounding modern (round to nearest). HIP does not support basic math functions with rounding modes `ru` (round up), `rd` (round down), and `rz` (round towards zero). ## Creating Static Libraries HIP-Clang supports generating two types of static libraries. The first type of static library does not export device functions, and only exports and launches host functions within the same library. The advantage of this type is the ability to link with a non-hipcc compiler such as gcc. The second type exports device functions to be linked by other code objects. However, this requires using hipcc as the linker. -In addition, the first type of library contains host objects with device code embedded as fat binaries. It is generated using the flag --emit-static-lib. The second type of library contains relocatable device objects and is generated using ar. +In addition, the first type of library contains host objects with device code embedded as fat binaries. It is generated using the flag --emit-static-lib. The second type of library contains relocatable device objects and is generated using `ar`. Here is an example to create and use static libraries: -* Type 1 using --emit-static-lib: +* Type 1 using `--emit-static-lib`: ```cpp hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a gcc test.cpp -L. -lhipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out ``` -* Type 2 using system ar: +* Type 2 using system `ar`: ```cpp hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o diff --git a/docs/install/build.rst b/docs/install/build.rst index 7d85f7ad2b..a4785e9230 100644 --- a/docs/install/build.rst +++ b/docs/install/build.rst @@ -11,13 +11,13 @@ Before building and running HIP, make sure drivers and prebuilt packages are ins You also need to install Python 3, which includes the ``CppHeaderParser`` package. Install Python 3 using the following command: -.. code:: shell +.. code-block:: shell apt-get install python3 Check and install ``CppHeaderParser`` package using the command: -.. code:: shell +.. code-block:: shell pip3 install CppHeaderParser @@ -29,7 +29,7 @@ Building the HIP runtime Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for ROCm 6.1, use: -.. code:: shell +.. code-block:: shell export ROCM_BRANCH=rocm-6.1.x @@ -49,7 +49,7 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for ``hipother`` provides files required to support the HIP back-end implementation on some non-AMD platforms, like NVIDIA. - .. code:: shell + .. code-block:: shell git clone -b "$ROCM_BRANCH" https://github.com/ROCm/clr.git git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hip.git @@ -66,7 +66,7 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for #. Set the environment variables. - .. code:: shell + .. code-block:: shell export CLR_DIR="$(readlink -f clr)" export HIP_DIR="$(readlink -f hip)" @@ -74,7 +74,7 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for #. Build HIP. - .. code:: shell + .. code-block:: shell cd "$CLR_DIR" mkdir -p build; cd build @@ -116,7 +116,7 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for Usage: - .. code:: shell + .. code-block:: shell `hip_prof_gen.py [-v] []` @@ -131,7 +131,7 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for Example usage: - .. code:: shell + .. code-block:: shell hip_prof_gen.py -v -p -t --priv /include/hip/hip_runtime_api.h \ /src /include/hip/amd_detail/hip_prof_str.h \ @@ -142,7 +142,7 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for #. Get the HIP source code. - .. code:: shell + .. code-block:: shell git clone -b "$ROCM_BRANCH" https://github.com/ROCm/clr.git git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hip.git @@ -150,7 +150,7 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for #. Set the environment variables. - .. code:: shell + .. code-block:: shell export CLR_DIR="$(readlink -f clr)" export HIP_DIR="$(readlink -f hip)" @@ -158,7 +158,7 @@ Set the repository branch using the variable: ``ROCM_BRANCH``. For example, for #. Build HIP. - .. code:: shell + .. code-block:: shell cd "$CLR_DIR" mkdir -p build; cd build @@ -180,13 +180,13 @@ Build HIP tests * Get HIP tests source code. - .. code:: shell + .. code-block:: shell git clone -b "$ROCM_BRANCH" https://github.com/ROCm/hip-tests.git * Build HIP tests from source. - .. code:: shell + .. code-block:: shell export HIPTESTS_DIR="$(readlink -f hip-tests)" cd "$HIPTESTS_DIR" @@ -199,14 +199,14 @@ Build HIP tests To run any single catch test, use this example: - .. code:: shell + .. code-block:: shell cd $HIPTESTS_DIR/build/catch_tests/unit/texture ./TextureTest * Build a HIP Catch2 standalone test. - .. code:: shell + .. code-block:: shell cd "$HIPTESTS_DIR" hipcc $HIPTESTS_DIR/catch/unit/memory/hipPointerGetAttributes.cc \ diff --git a/docs/install/install.rst b/docs/install/install.rst index b6168bfc5d..d88ba6596c 100644 --- a/docs/install/install.rst +++ b/docs/install/install.rst @@ -48,7 +48,7 @@ Installation #. Install the NVIDIA driver. - .. code:: shell + .. code-block:: shell sudo apt-get install ubuntu-drivers-common && sudo ubuntu-drivers autoinstall sudo reboot @@ -59,7 +59,7 @@ Installation #. Install the ``hip-runtime-nvidia`` and ``hip-dev`` packages. This installs the CUDA SDK and HIP porting layer. - .. code:: shell + .. code-block:: shell apt-get install hip-runtime-nvidia hip-dev @@ -74,6 +74,6 @@ Verify your installation Run ``hipconfig`` in your installation path. -.. code:: shell +.. code-block:: shell /opt/rocm/bin/hipconfig --full diff --git a/docs/reference/kernel_language.rst b/docs/reference/kernel_language.rst index ae87e9e11e..0a73b147e8 100644 --- a/docs/reference/kernel_language.rst +++ b/docs/reference/kernel_language.rst @@ -36,7 +36,7 @@ Function-type qualifiers ``__device__`` ----------------------------------------------------------------------- -Supported ``__device__`` functions are: +Supported ``__device__`` functions are: * Run on the device * Called from the device only @@ -91,18 +91,18 @@ configuration to the kernel. However, you can also use the CUDA ``<<< >>>`` synt When using ``hipLaunchKernelGGL``, your first five parameters must be: - * **symbol kernelName**: The name of the kernel you want to launch. To support template kernels + * ``symbol kernelName``: The name of the kernel you want to launch. To support template kernels that contain ``","``, use the ``HIP_KERNEL_NAME`` macro (HIPIFY tools insert this automatically). - * **dim3 gridDim**: 3D-grid dimensions that specify the number of blocks to launch. - * **dim3 blockDim**: 3D-block dimensions that specify the number of threads in each block. - * **size_t dynamicShared**: The amount of additional shared memory that you want to allocate + * ``dim3 gridDim``: 3D-grid dimensions that specify the number of blocks to launch. + * ``dim3 blockDim``: 3D-block dimensions that specify the number of threads in each block. + * ``size_t dynamicShared``: The amount of additional shared memory that you want to allocate when launching the kernel (see :ref:`shared-variable-type`). - * **hipStream_t**: The stream where you want to run the kernel. A value of ``0`` corresponds to the + * ``hipStream_t``: The stream where you want to run the kernel. A value of ``0`` corresponds to the NULL stream (see :ref:`synchronization functions`). You can include your kernel arguments after these parameters. -.. code:: cpp +.. code-block:: cpp // Example hipLaunchKernelGGL pseudocode: __global__ MyKernel(hipLaunchParm lp, float *A, float *B, float *C, size_t N) @@ -128,7 +128,7 @@ parameters. Kernel launch example ========================================================== -.. code:: cpp +.. code-block:: cpp // Example showing device function, __device__ __host__ // <- compile for both device and host @@ -221,7 +221,7 @@ Coordinate variable definitions for built-ins are the same for HIP and CUDA. For Coordinate built-ins are implemented as structures for improved performance. When used with ``printf``, they must be explicitly cast to integer types. -warpSize +``warpSize`` ----------------------------------------------------------------------------- The ``warpSize`` variable type is ``int``. It contains the warp size (in threads) for the target device. ``warpSize`` should only be used in device functions that develop portable wave-aware code. @@ -279,7 +279,7 @@ dimensions. The dim3 constructor accepts between zero and three arguments. By default, it initializes unspecified dimensions to 1. -.. code:: cpp +.. code-block:: cpp typedef struct dim3 { uint32_t x; @@ -697,7 +697,7 @@ Following is the list of supported single precision mathematical functions. - ✓ - ✓ - * - | ``float scalbnf(float x, int n)`` + * - | ``float scalbnf(float x, int n)`` | Scale :math:`x` by :math:`2^n`. - ✓ - ✓ @@ -780,7 +780,7 @@ Following is the list of supported single precision mathematical functions. Double precision mathematical functions -------------------------------------------------------------------------------------------- -Following is the list of supported double precision mathematical functions. +Following is the list of supported double precision mathematical functions. .. list-table:: Double precision mathematical functions @@ -1153,7 +1153,7 @@ Following is the list of supported double precision mathematical functions. - ✓ - ✓ - * - | ``double scalbn(double x, int n)`` + * - | ``double scalbn(double x, int n)`` | Scale :math:`x` by :math:`2^n`. - ✓ - ✓ @@ -1248,7 +1248,7 @@ Following is the list of supported integer intrinsics. Note that intrinsics are * - | ``unsigned long long int __brevll(unsigned long long int x)`` | Reverse the bit order of a 64 bit unsigned integer. - * - | ``unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int z)`` + * - | ``unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int z)`` | Return selected bytes from two 32-bit unsigned integers. * - | ``unsigned int __clz(int x)`` @@ -1258,10 +1258,10 @@ Following is the list of supported integer intrinsics. Note that intrinsics are | Return the number of consecutive high-order zero bits in 64 bit integer. * - | ``unsigned int __ffs(int x)`` - | Find the position of least signigicant bit set to 1 in a 32 bit integer. + | Find the position of least significant bit set to 1 in a 32 bit integer. * - | ``unsigned int __ffsll(long long int x)`` - | Find the position of least signigicant bit set to 1 in a 64 bit signed integer. + | Find the position of least significant bit set to 1 in a 64 bit signed integer. * - | ``unsigned int __fns32(unsigned long long mask, unsigned int base, int offset)`` | Find the position of the n-th set to 1 bit in a 32-bit integer. @@ -1290,7 +1290,7 @@ Following is the list of supported integer intrinsics. Note that intrinsics are * - | ``unsigned int __uhadd(int x, int y)`` | Compute average of unsigned input arguments, avoiding overflow in the intermediate sum. - * - | ``unsigned int __urhadd (unsigned int x, unsigned int y)`` + * - | ``unsigned int __urhadd (unsigned int x, unsigned int y)`` | Compute rounded average of unsigned input arguments, avoiding overflow in the intermediate sum. * - | ``int __sad(int x, int y, int z)`` @@ -1323,10 +1323,10 @@ Following is the list of supported integer intrinsics. Note that intrinsics are * - | ``unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y)`` | Returns the most significant 64 bits of the product of the two 64 unsigned bit integers. -The HIP-Clang implementation of ``__ffs()`` and ``__ffsll()`` contains code to add a constant +1 to produce the ffs result format. +The HIP-Clang implementation of ``__ffs()`` and ``__ffsll()`` contains code to add a constant +1 to produce the ``ffs`` result format. For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform, HIP-Clang provides ``__lastbit_u32_u32(unsigned int input)`` and ``__lastbit_u32_u64(unsigned long long int input)``. -The index returned by ``__lastbit_`` instructions starts at -1, while for ffs the index starts at 0. +The index returned by ``__lastbit_`` instructions starts at -1, while for ``ffs`` the index starts at 0. Floating-point Intrinsics -------------------------------------------------------------------------------------------- @@ -1422,7 +1422,7 @@ Following is the list of supported floating-point intrinsics. Note that intrinsi * - | ``double __dsub_rn(double x, double y)`` | Subtract two floating-point values in round-to-nearest-even mode. - * - | ``double __fma_rn(double x, double y, double z)`` + * - | ``double __fma_rn(double x, double y, double z)`` | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode. @@ -1450,7 +1450,7 @@ To read a high-resolution timer from the device, HIP provides the following buil * Returning the incremental counter value for every clock cycle on a device: - .. code:: cpp + .. code-block:: cpp clock_t clock() long long int clock64() @@ -1459,14 +1459,14 @@ To read a high-resolution timer from the device, HIP provides the following buil * Returning the wall clock count at a constant frequency on the device: - .. code:: cpp + .. code-block:: cpp long long int wall_clock64() This can be queried using the HIP API with the ``hipDeviceAttributeWallClockRate`` attribute of the device in HIP application code. For example: - .. code:: cpp + .. code-block:: cpp int wallClkRate = 0; //in kilohertz HIPCHECK(hipDeviceGetAttribute(&wallClkRate, hipDeviceAttributeWallClockRate, deviceId)); @@ -1794,7 +1794,7 @@ RMW functions produce unsafe atomic RMW instructions: Warp cross-lane functions ======================================================== -Threads in a warp are referred to as `lanes` and are numbered from 0 to warpSize - 1. +Threads in a warp are referred to as ``lanes`` and are numbered from ``0`` to ``warpSize - 1``. Warp cross-lane functions operate across all lanes in a warp. The hardware guarantees that all warp lanes will execute in lockstep, so additional synchronization is unnecessary, and the instructions use no shared memory. @@ -1809,7 +1809,7 @@ portable code to query the warp size. To get the default warp size of a GPU device, use ``hipGetDeviceProperties`` in you host functions. -.. code:: cpp +.. code-block:: cpp cudaDeviceProp props; cudaGetDeviceProperties(&props, deviceID); @@ -1835,7 +1835,7 @@ the correct type for the mask. Warp vote and ballot functions ------------------------------------------------------------------------------------------------------------- -.. code:: cpp +.. code-block:: cpp int __all(int predicate) int __any(int predicate) @@ -1849,7 +1849,7 @@ Warp vote and ballot functions You can use ``__any`` and ``__all`` to get a summary view of the predicates evaluated by the participating lanes. -* ``__any()``: Returns 1 if the predicate is non-zero for any participating lane, otherwise it returns 0. +* ``__any()``: Returns 1 if the predicate is non-zero for any participating lane, otherwise it returns 0. * ``__all()``: Returns 1 if the predicate is non-zero for all participating lanes, otherwise it returns 0. @@ -1883,7 +1883,7 @@ undefined. Warp match functions ------------------------------------------------------------------------------------------------------------- -.. code:: cpp +.. code-block:: cpp unsigned long long __match_any(T value) unsigned long long __match_all(T value, int *pred) @@ -1915,7 +1915,7 @@ Warp shuffle functions The default width is ``warpSize`` (see :ref:`warp-cross-lane`). Half-float shuffles are not supported. -.. code:: cpp +.. code-block:: cpp int __shfl (T var, int srcLane, int width=warpSize); int __shfl_up (T var, unsigned int delta, int width=warpSize); @@ -2096,14 +2096,15 @@ HIP does not support this type of scheduling. Profiler Counter Function ============================================================ -The CUDA `__prof_trigger()` instruction is not supported. +The CUDA ``__prof_trigger()`` instruction is not supported. Assert ============================================================ The assert function is supported in HIP. Assert function is used for debugging purpose, when the input expression equals to zero, the execution will be stopped. -.. code:: cpp + +.. code-block:: cpp void assert(int input) @@ -2112,7 +2113,7 @@ There are two kinds of implementations for assert functions depending on the use - Another is the device version of assert, which is implemented in ``hip/hip_runtime.h``. Users need to include ``assert.h`` to use ``assert``. For assert to work in both device and host functions, users need to include ``"hip/hip_runtime.h"``. -HIP provides the function ``abort()`` which can be used to terminate the application when terminal failures are detected. It is implemented using the ``__builtin_trap()`` function. +HIP provides the function ``abort()`` which can be used to terminate the application when terminal failures are detected. It is implemented using the ``__builtin_trap()`` function. This function produces a similar effect of using ``asm("trap")`` in the CUDA code. @@ -2121,13 +2122,13 @@ This function produces a similar effect of using ``asm("trap")`` in the CUDA cod In HIP, the function terminates the entire application, while in CUDA, ``asm("trap")`` only terminates the dispatch and the application continues to run. -Printf +``printf`` ============================================================ -Printf function is supported in HIP. +``printf`` function is supported in HIP. The following is a simple example to print information in the kernel. -.. code:: cpp +.. code-block:: cpp #include @@ -2141,29 +2142,29 @@ The following is a simple example to print information in the kernel. Device-Side Dynamic Global Memory Allocation ============================================================ -Device-side dynamic global memory allocation is under development. HIP now includes a preliminary +Device-side dynamic global memory allocation is under development. HIP now includes a preliminary implementation of malloc and free that can be called from device functions. -`__launch_bounds__` +``__launch_bounds__`` ============================================================ -GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) which are shared by the actively running warps. Using more resources can increase IPC of the kernel but reduces the resources available for other warps and limits the number of warps that can be simulaneously running. Thus GPUs have a complex relationship between resource usage and performance. +GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) which are shared by the actively running warps. Using more resources can increase IPC of the kernel but reduces the resources available for other warps and limits the number of warps that can be simultaneously running. Thus GPUs have a complex relationship between resource usage and performance. -__launch_bounds__ allows the application to provide usage hints that influence the resources (primarily registers) used by the generated code. It is a function attribute that must be attached to a __global__ function: +``__launch_bounds__`` allows the application to provide usage hints that influence the resources (primarily registers) used by the generated code. It is a function attribute that must be attached to a __global__ function: -.. code:: cpp +.. code-block:: cpp __global__ void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EXECUTION_UNIT) MyKernel(hipGridLaunch lp, ...) ... -__launch_bounds__ supports two parameters: -- MAX_THREADS_PER_BLOCK - The programmers guarantees that kernel will be launched with threads less than MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying MAX_THREADS_PER_BLOCK less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation that supports all possible block sizes at launch time. -The threads-per-block is the product of (blockDim.x * blockDim.y * blockDim.z). +``__launch_bounds__`` supports two parameters: +- MAX_THREADS_PER_BLOCK - The programmers guarantees that kernel will be launched with threads less than MAX_THREADS_PER_BLOCK. (On NVCC this maps to the ``.maxntid`` PTX directive). If no launch_bounds is specified, MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying MAX_THREADS_PER_BLOCK less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation that supports all possible block sizes at launch time. +The threads-per-block is the product of (``blockDim.x * blockDim.y * blockDim.z``). - MIN_WARPS_PER_EXECUTION_UNIT - directs the compiler to minimize resource usage so that the requested number of warps can be simultaneously active on a multi-processor. Since active warps compete for the same fixed pool of resources, the compiler must reduce resources required by each warp(primarily registers). MIN_WARPS_PER_EXECUTION_UNIT is optional and defaults to 1 if not specified. Specifying a MIN_WARPS_PER_EXECUTION_UNIT greater than the default 1 effectively constrains the compiler's resource usage. -When launch kernel with HIP APIs, for example, hipModuleLaunchKernel(), HIP will do validation to make sure input kernel dimension size is not larger than specified launch_bounds. -In case exceeded, HIP would return launch failure, if AMD_LOG_LEVEL is set with proper value (for details, please refer to docs/markdown/hip_logging.md), detail information will be shown in the error log message, including +When launch kernel with HIP APIs, for example, ``hipModuleLaunchKernel()``, HIP will do validation to make sure input kernel dimension size is not larger than specified launch_bounds. +In case exceeded, HIP would return launch failure, if AMD_LOG_LEVEL is set with proper value (for details, please refer to ``docs/markdown/hip_logging.md``), detail information will be shown in the error log message, including launch parameters of kernel dim size, launch bounds, and the name of the faulting kernel. It's helpful to figure out which is the faulting kernel, besides, the kernel dim size and launch bounds values will also assist in debugging such failures. Compiler Impact @@ -2173,44 +2174,44 @@ The compiler uses these parameters as follows: - The compiler uses the hints only to manage register usage, and does not automatically reduce shared memory or other resources. - Compilation fails if compiler cannot generate a kernel which meets the requirements of the specified launch bounds. - From MAX_THREADS_PER_BLOCK, the compiler derives the maximum number of warps/block that can be used at launch time. -Values of MAX_THREADS_PER_BLOCK less than the default allows the compiler to use a larger pool of registers : each warp uses registers, and this hint constains the launch to a warps/block size which is less than maximum. +Values of MAX_THREADS_PER_BLOCK less than the default allows the compiler to use a larger pool of registers : each warp uses registers, and this hint constrains the launch to a warps/block size which is less than maximum. - From MIN_WARPS_PER_EXECUTION_UNIT, the compiler derives a maximum number of registers that can be used by the kernel (to meet the required #simultaneous active blocks). If MIN_WARPS_PER_EXECUTION_UNIT is 1, then the kernel can use all registers supported by the multiprocessor. - The compiler ensures that the registers used in the kernel is less than both allowed maximums, typically by spilling registers (to shared or global memory), or by using more instructions. -- The compiler may use hueristics to increase register usage, or may simply be able to avoid spilling. The MAX_THREADS_PER_BLOCK is particularly useful in this cases, since it allows the compiler to use more registers and avoid situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size that is never used at launch time. +- The compiler may use heuristics to increase register usage, or may simply be able to avoid spilling. The MAX_THREADS_PER_BLOCK is particularly useful in this cases, since it allows the compiler to use more registers and avoid situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size that is never used at launch time. CU and EU Definitions -------------------------------------------------------------------------------------------- A compute unit (CU) is responsible for executing the waves of a work-group. It is composed of one or more execution units (EU) which are responsible for executing waves. An EU can have enough resources to maintain the state of more than one executing wave. This allows an EU to hide latency by switching between waves in a similar way to symmetric multithreading on a CPU. In order to allow the state for multiple waves to fit on an EU, the resources used by a single wave have to be limited. Limiting such resources can allow greater latency hiding, but can result in having to spill some register state to memory. This attribute allows an advanced developer to tune the number of waves that are capable of fitting within the resources of an EU. It can be used to ensure at least a certain number will fit to help hide latency, and can also be used to ensure no more than a certain number will fit to limit cache thrashing. -Porting from CUDA `__launch_bounds` +Porting from CUDA ``__launch_bounds`` -------------------------------------------------------------------------------------------- -CUDA defines a __launch_bounds which is also designed to control occupancy: +CUDA defines a ``__launch_bounds`` which is also designed to control occupancy: -.. code:: cpp +.. code-block:: cpp __launch_bounds(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR) -- The second parameter __launch_bounds parameters must be converted to the format used __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors (this conversion is performed automatically by HIPIFY tools). +- The second parameter ``__launch_bounds`` parameters must be converted to the format used __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors (this conversion is performed automatically by HIPIFY tools). -.. code:: cpp +.. code-block:: cpp MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK) / 32 The key differences in the interface are: - Warps (rather than blocks): -The developer is trying to tell the compiler to control resource utilization to guarantee some amount of active Warps/EU for latency hiding. Specifying active warps in terms of blocks appears to hide the micro-architectural details of the warp size, but makes the interface more confusing since the developer ultimately needs to compute the number of warps to obtain the desired level of control. -- Execution Units (rather than multiProcessor): -The use of execution units rather than multiprocessors provides support for architectures with multiple execution units/multi-processor. For example, the AMD GCN architecture has 4 execution units per multiProcessor. The hipDeviceProps has a field executionUnitsPerMultiprocessor. -Platform-specific coding techniques such as #ifdef can be used to specify different launch_bounds for NVCC and HIP-Clang platforms, if desired. +The developer is trying to tell the compiler to control resource utilization to guarantee some amount of active Warps/EU for latency hiding. Specifying active warps in terms of blocks appears to hide the micro-architectural details of the warp size, but makes the interface more confusing since the developer ultimately needs to compute the number of warps to obtain the desired level of control. +- Execution Units (rather than multiprocessor): +The use of execution units rather than multiprocessors provides support for architectures with multiple execution units/multi-processor. For example, the AMD GCN architecture has 4 execution units per multiprocessor. The ``hipDeviceProps`` has a field ``executionUnitsPerMultiprocessor``. +Platform-specific coding techniques such as ``#ifdef`` can be used to specify different launch_bounds for NVCC and HIP-Clang platforms, if desired. -maxregcount +``maxregcount`` -------------------------------------------------------------------------------------------- -Unlike nvcc, HIP-Clang does not support the "--maxregcount" option. Instead, users are encouraged to use the hip_launch_bounds directive since the parameters are more intuitive and portable than -micro-architecture details like registers, and also the directive allows per-kernel control rather than an entire file. hip_launch_bounds works on both HIP-Clang and nvcc targets. +Unlike NVCC, HIP-Clang does not support the ``--maxregcount`` option. Instead, users are encouraged to use the hip_launch_bounds directive since the parameters are more intuitive and portable than +micro-architecture details like registers, and also the directive allows per-kernel control rather than an entire file. hip_launch_bounds works on both HIP-Clang and NVCC targets. Asynchronous Functions ============================================================ @@ -2245,24 +2246,24 @@ External Resource Interoperability Register Keyword ============================================================ -The register keyword is deprecated in C++, and is silently ignored by both nvcc and HIP-Clang. You can pass the option `-Wdeprecated-register` the compiler warning message. +The register keyword is deprecated in C++, and is silently ignored by both NVCC and HIP-Clang. You can pass the option ``-Wdeprecated-register`` the compiler warning message. Pragma Unroll ============================================================ -Unroll with a bounds that is known at compile-time is supported. For example: +Unroll with a bounds that is known at compile-time is supported. For example: -.. code:: cpp +.. code-block:: cpp #pragma unroll 16 /* hint to compiler to unroll next loop by 16 */ for (int i=0; i<16; i++) ... -.. code:: cpp +.. code-block:: cpp #pragma unroll 1 /* tell compiler to never unroll the loop */ for (int i=0; i<16; i++) ... -.. code:: cpp +.. code-block:: cpp #pragma unroll /* hint to compiler to completely unroll next loop. */ for (int i=0; i<16; i++) ... @@ -2272,16 +2273,16 @@ In-Line Assembly GCN ISA In-line assembly, is supported. For example: -.. code:: cpp +.. code-block:: cpp asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i])); -We insert the GCN isa into the kernel using `asm()` Assembler statement. -`volatile` keyword is used so that the optimizers must not change the number of volatile operations or change their order of execution relative to other volatile operations. -`v_mac_f32_e32` is the GCN instruction, for more information please refer - [AMD GCN3 ISA architecture manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/) +We insert the GCN isa into the kernel using ``asm()`` Assembler statement. +``volatile`` keyword is used so that the optimizers must not change the number of volatile operations or change their order of execution relative to other volatile operations. +``v_mac_f32_e32`` is the GCN instruction, for more information please refer - [AMD GCN3 ISA architecture manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/) Index for the respective operand in the ordered fashion is provided by `%` followed by position in the list of operands `"v"` is the constraint code (for target-specific AMDGPU) for 32-bit VGPR register, for more info please refer - [Supported Constraint Code List for AMDGPU](https://llvm.org/docs/LangRef.html#supported-constraint-code-list) -Output Constraints are specified by an `"="` prefix as shown above ("=v"). This indicate that assemby will write to this operand, and the operand will then be made available as a return value of the asm expression. Input constraints do not have a prefix - just the constraint code. The constraint string of `"0"` says to use the assigned register for output as an input as well (it being the 0'th constraint). +Output Constraints are specified by an `"="` prefix as shown above ("=v"). This indicate that assembly will write to this operand, and the operand will then be made available as a return value of the ``asm`` expression. Input constraints do not have a prefix - just the constraint code. The constraint string of `"0"` says to use the assigned register for output as an input as well (it being the 0'th constraint). ## C++ Support The following C++ features are not supported: @@ -2292,10 +2293,11 @@ Virtual functions are not supported if objects containing virtual function table Kernel Compilation ============================================================ + hipcc now supports compiling C++/HIP kernels to binary code objects. -The file format for binary is `.co` which means Code Object. The following command builds the code object using `hipcc`. +The file format for binary is ``.co`` which means Code Object. The following command builds the code object using ``hipcc``. -.. code:: bash +.. code-block:: bash hipcc --genco --offload-arch=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE] @@ -2309,4 +2311,5 @@ The file format for binary is `.co` which means Code Object. The following comma gfx-arch-specific-kernel ============================================================ -Clang defined '__gfx*__' macros can be used to execute gfx arch specific codes inside the kernel. Refer to the sample in `HIP 14_gpu_arch sample `_. + +Clang defined '__gfx*__' macros can be used to execute gfx arch specific codes inside the kernel. Refer to the sample in `HIP 14_gpu_arch sample `_. diff --git a/docs/reference/terms.md b/docs/reference/terms.md index ce6d51f3ec..4d4be12296 100644 --- a/docs/reference/terms.md +++ b/docs/reference/terms.md @@ -12,10 +12,10 @@ | |thread|thread|work-item| | |warp|warp|sub-group| ||||| -|Thread-
index | threadIdx.x | threadIdx.x | get_local_id(0) | -|Block-
index | blockIdx.x | blockIdx.x | get_group_id(0) | -|Block-
dim | blockDim.x | blockDim.x | get_local_size(0) | -|Grid-dim | gridDim.x | gridDim.x | get_num_groups(0) | +|Thread-
index | `threadIdx.x` | `threadIdx.x` | `get_local_id(0)` | +|Block-
index | `blockIdx.x` | `blockIdx.x` | `get_group_id(0)` | +|Block-
dim | `blockDim.x` | `blockDim.x` | `get_local_size(0)` | +|Grid-dim | `gridDim.x` | `gridDim.x` | `get_num_groups(0)` | ||||| |Device Kernel|`__global__`|`__global__`|`__kernel`| |Device Function|`__device__`|`__device__`|Implied in device compilation| @@ -35,4 +35,4 @@ ## Notes -The indexing functions (starting with `thread-index`) show the terminology for a 1D grid. Some APIs use reverse order of xyz / 012 indexing for 3D grids. +The indexing functions (starting with `thread-index`) show the terminology for a 1D grid. Some APIs use reverse order of `xyz` / 012 indexing for 3D grids. diff --git a/docs/understand/glossary.md b/docs/understand/glossary.md index cb211efbb7..272acd4beb 100644 --- a/docs/understand/glossary.md +++ b/docs/understand/glossary.md @@ -1,10 +1,10 @@ # Glossary of terms -* **host**, **host cpu** : Executes the HIP runtime API and is capable of initiating kernel launches to one or more devices. +* **host**, **host CPU** : Executes the HIP runtime API and is capable of initiating kernel launches to one or more devices. * **default device** : Each host thread maintains a default device. Most HIP runtime APIs (including memory allocation, copy commands, kernel launches) do not accept an explicit device argument but instead implicitly use the default device. -The default device can be set with ```hipSetDevice```. +The default device can be set with `hipSetDevice`. * **active host thread** - the thread which is running the HIP APIs. @@ -13,12 +13,12 @@ The default device can be set with ```hipSetDevice```. * **clr** - a repository for AMD Common Language Runtime, contains source codes for AMD's compute languages runtimes: HIP and OpenCL. clr (https://github.com/ROCm/clr) contains the following three parts, - * ```hipamd```: contains implementation of HIP language on AMD platform. - * ```rocclr```: contains common runtime used in HIP and OpenCL, which provides virtual device interfaces that compute runtimes interact with different backends such as ROCr on Linux or PAL on Windows. - * ```opencl```: contains implementation of OpenCL on AMD platform. + * `hipamd`: contains implementation of HIP language on AMD platform. + * `rocclr`: contains common runtime used in HIP and OpenCL, which provides virtual device interfaces that compute runtimes interact with different backends such as ROCr on Linux or PAL on Windows. + * `opencl`: contains implementation of OpenCL on AMD platform. * **hipify tools** - tools to convert CUDA code to portable C++ code (https://github.com/ROCm/HIPIFY). -* **hipconfig** - tool to report various configuration properties of the target platform. +* **`hipconfig`** - tool to report various configuration properties of the target platform. -* **nvcc** - NVIDIA CUDA ```nvcc``` compiler, do not capitalize. +* **`nvcc`** - NVIDIA CUDA `nvcc` compiler, do not capitalize. diff --git a/docs/understand/programming_model.rst b/docs/understand/programming_model.rst index d7f223830f..d3c09b9676 100644 --- a/docs/understand/programming_model.rst +++ b/docs/understand/programming_model.rst @@ -84,7 +84,7 @@ identical instructions over the available SIMD engines. Consider the following kernel: -.. code:: cu +.. code-block:: cpp __global__ void k(float4* a, const float4* b) { diff --git a/docs/understand/programming_model_reference.rst b/docs/understand/programming_model_reference.rst index e8e0216bb1..582086c2c4 100644 --- a/docs/understand/programming_model_reference.rst +++ b/docs/understand/programming_model_reference.rst @@ -10,7 +10,7 @@ Programming model reference HIP defines a model for mapping single instruction, multiple threads (SIMT) programs onto various architectures, primarily GPUs. While the model may be expressed -in most imperative languages, (eg. Python via PyHIP) this document will focus on +in most imperative languages, (for example Python via PyHIP) this document will focus on the original C/C++ API of HIP. Threading Model