Merge pull request #18 from dipy/csaodf_debug_2

Generalizes probabilistic tracking to any ODF
dipy · Nov 14, 2024 · 785c1f9 · 785c1f9
2 parents c839c96 + c482d86
commit 785c1f9
Show file tree

Hide file tree

Showing 19 changed files with 3,914 additions and 1,298 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,45 +1,39 @@
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.24)
 
-#project definition
-cmake_policy(SET CMP0104 NEW)
 project(cuslines LANGUAGES CUDA CXX VERSION 1.0)
 
-#include the external project stuff
-include(ExternalProject)
-
-#global settings
-set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+# Build settings
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -Werror=reorder")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 
-#determine type of build
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-#now, add the right flags
 if (CMAKE_BUILD_TYPE STREQUAL "Debug" )
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
 else()
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
 endif()
 
-#decide whether to use CUDA or not
-#find_package(CUDAToolkit REQUIRED)
-if(NOT CUDA_COMPUTE_CAPABILITY)
-  set(CUDA_COMPUTE_CAPABILITY 70 80)
+# CUDA
+find_package(CUDAToolkit REQUIRED)
+
+# Set default CUDA compute capabilities if unset
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 70 80 90)
 endif()
 
-#Find OpenMP
+# OpenMP
 find_package(OpenMP)
 if(OPENMP_FOUND)
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
 
-    #check which compiler we have to determine openmp runtime
+    # Set OMP runtime based on compiler
     if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
       set(OMP_RUNTIME "INTEL")
     elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@@ -50,43 +44,18 @@ if(OPENMP_FOUND)
     message(STATUS "OpenMP runtime used: ${OMP_RUNTIME}")
 endif()
 
-# add object lib
-add_library(cuslines_kernels-object OBJECT
-  "${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu"
-)
-target_include_directories( cuslines_kernels-object PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" )
-target_link_libraries(cuslines_kernels-object PUBLIC cudart)
-set_target_properties(cuslines_kernels-object PROPERTIES POSITION_INDEPENDENT_CODE TRUE
-                                                         CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
-
-##############################################
-# PyBind11/Python build
-##############################################
+# Build library and pybind11 module
 add_subdirectory(external/pybind11)
-
-FIND_PACKAGE( PythonLibs )
-INCLUDE_DIRECTORIES( ${PYTHON_INCLUDE_PATH} )
-
-add_library(cuslines_kernels-shared STATIC $<TARGET_OBJECTS:cuslines_kernels-object>)
-set_target_properties(cuslines_kernels-shared PROPERTIES OUTPUT_NAME cuslines_kernels 
-                                                         POSITION_INDEPENDENT_CODE TRUE
-                                                         CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
+add_library(cuslines_kernels)
+target_sources(cuslines_kernels
+	       PRIVATE
+	       ${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu)
+set_target_properties(cuslines_kernels PROPERTIES OUTPUT_NAME cuslines_kernels
+                                                  POSITION_INDEPENDENT_CODE TRUE)
 
 pybind11_add_module(cuslines ${CMAKE_SOURCE_DIR}/cuslines/cuslines.cpp)
-target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
-target_link_libraries(cuslines PRIVATE cuslines_kernels-shared cudart)
-set_target_properties(cuslines PROPERTIES CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
-
-# custom target for install
-#add_custom_target(mvcmd_py ALL
-#  COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CMAKE_SOURCE_DIR}/cuslines/__init__.py" "${CMAKE_INSTALL_PREFIX}/cuslines/__init__.py"
-#  DEPENDS "${CMAKE_SOURCE_DIR}/cuslines/__init__.py"
-#)
+target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CUDAToolkit_INCLUDE_DIRS}")
+target_link_libraries(cuslines PRIVATE cuslines_kernels CUDA::cudart_static)
 
-# install
-install(TARGETS cuslines cuslines_kernels-shared LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
-install(FILES "${CMAKE_SOURCE_DIR}/cuslines/__init__.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_gpu.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_cpu.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_gpu_hardi.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_cpu_hardi.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
+# Install
+install(TARGETS cuslines cuslines_kernels LIBRARY DESTINATION .)
diff --git a/README.md b/README.md
@@ -3,14 +3,16 @@
 ## Installation
 When cloning the repo, please use "git clone --recursive" to pull all the requirements.
 
-To install, simply run `./build_cuslines.sh`. This will place the cuslines library and files within a new subdirectory `./install` as well as copy the example script.
+To install, simply run `pip install .` in the top-level repository directory.
 
 ## Running the examples
-First, navigate into the `./builds` subdirectory. There are two example scripts using the HARDI dataset, `run_dipy_cpu_hardi.py` and `run_dipy_gpu_hardi.py`, which run on CPU and GPU respectively.
+This repository contains several example usage scripts.
+
+The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If not data is passed, it will donaload and use the HARDI dataset.
 
 To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
 ```
-$ python run_dipy_cpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 1000
+$ python run_gpu_streamlines.py --device=cpu --output-prefix small --nseeds 1000
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -28,7 +30,7 @@ Streamline generation total time: 6.9404990673065186 sec
 
 To run the same case on a single GPU, this is the command and example output:
 ```
-$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 1000 --ngpus 1
+$ python run_gpu_streamlines.py --output-prefix small --nseeds 1000 --ngpus 1
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -46,11 +48,13 @@ Streamline generation total time: 0.3834989070892334 sec
 Destroy GPUTracker...
 ```
 
+Note that if you experience memory errors, you can adjust the `--chunk-size` flag.
+
 To run on more seeds, we suggest enabling the `--use-fast-write` flag in the GPU script to not get bottlenecked by writing files. Here is a comparison running on 500K seeds on 1 GPU with and without this flag:
 
 Without `--use-fast-write`:
 ```
-$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 500000 --ngpus 1
+$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -78,7 +82,7 @@ Destroy GPUTracker...
 
 With `--use-fast-write`:
 ```
-$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
+$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -118,11 +122,10 @@ $ docker pull docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest
 4. Run the code, mounting the current directory into the container for easy result retrieval:
 ```
 $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
- python run_dipy_gpu_hardi.py --chunk-size 100000 --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
+ python run_gpu_streamlines.py --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
 ```
 5. The code produces a number of independent track files (one per processed "chunk"), but we have provided a merge script to combine them into a single trk file. To merge files, run:
 ```
 $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
  ./merge_trk.sh -o output/hardi_tracks.trk output/hardi_gpu_full*
-```
-
+```
diff --git a/cuslines/Makefile b/cuslines/Makefile
@@ -27,28 +27,29 @@
 
 
 CUDA_HOME=/usr/local/cuda
-CUDACC=$(CUDA_HOME)/bin/nvcc
+CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on
 CXX=g++
 LD=g++
 
 CXXFLAGS= -c -O3 -std=c++11 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include
 
-SMS ?= 60 70
+SMS ?= 70
 CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM))
 LASTSM := $(lastword $(sort $(SMS)))
 CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM)
 CUDACFLAGS=-c -O3 -lineinfo -Xptxas=-v -std=c++11 -Xcompiler -fPIC -Xcompiler=-fopenmp $(CUDA_ARCH)
+LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
 
 all: cuslines
 
 cuslines: generate_streamlines_cuda.o cuslines.o
-	$(LD) -shared cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` -fopenmp -L$(CUDA_HOME)/lib64 -lcudart
+	$(LD) cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS)
 
-%.o: %.cu
-	$(CUDACC) $(CUDACFLAGS) $<
+%.o : %.cu
+	$(CUDACC) $(CUDACFLAGS) $< -o $@
 
 %.o: %.cpp
 	$(CXX) $(CXXFLAGS) $< -o $@
 
 clean:
-	rm *.o cuslines`python3-config --extension-suffix`
+	rm *.o cuslines`python3-config --extension-suffix` __pycache__/*.pyc
diff --git a/cuslines/__init__.py b/cuslines/__init__.py
diff --git a/cuslines/cudamacro.h b/cuslines/cudamacro.h
@@ -45,4 +45,30 @@
         exit(EXIT_FAILURE);                                                  \
     }}
 
+#ifdef USE_NVTX
+#include "nvToolsExt.h"
+
+const uint32_t colors4[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff};
+const int num_colors4 = sizeof(colors4)/sizeof(colors4[0]);
+
+#define START_RANGE(name,cid) { \
+        int color_id = cid; \
+        color_id = color_id%num_colors4;\
+        nvtxEventAttributes_t eventAttrib = {0}; \
+        eventAttrib.version = NVTX_VERSION; \
+        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+        eventAttrib.colorType = NVTX_COLOR_ARGB; \
+        eventAttrib.color = colors4[color_id]; \
+        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+        eventAttrib.message.ascii = name; \
+        nvtxRangePushEx(&eventAttrib); \
+}
+#define END_RANGE { \
+        nvtxRangePop(); \
+}
+#else
+#define START_RANGE(name,cid)
+#define END_RANGE
+#endif
+
 #endif
diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
@@ -37,17 +37,17 @@ namespace py = pybind11;
 
 #include <cuda_runtime.h>
 
+// #define USE_NVTX
+
 #include "globals.h"
 #include "cudamacro.h"
 #include "generate_streamlines_cuda.h"
 
 using np_array = py::array_t<REAL>;
-using np_array_bool = py::array_t<bool>;
 using np_array_int = py::array_t<int>;
-using np_array_short = py::array_t<short>;
 
 using np_array_cast = py::array_t<REAL, py::array::c_style | py::array::forcecast>;
-using np_array_short_cast = py::array_t<short, py::array::c_style | py::array::forcecast>;
+using np_array_int_cast = py::array_t<int, py::array::c_style | py::array::forcecast>;
 
 // Handle to cleanup returned host allocations when associated Python object is destroyed
 template <typename T>
@@ -60,20 +60,23 @@ py::capsule cleanup(T* ptr) {
 
 class GPUTracker {
   public:
-    GPUTracker(double max_angle,
+    GPUTracker(ModelType model_type,
+               double max_angle,
                double min_signal,
                double tc_threshold,
                double step_size,
+               double relative_peak_thresh,
+               double min_separation_angle,
                np_array_cast dataf,
-               np_array H,
-               np_array R,
-               np_array delta_b,
-               np_array delta_q,
-               np_array_int b0s_mask,
-               np_array metric_map,
-               np_array sampling_matrix,
-               np_array sphere_vertices,
-               np_array_int sphere_edges,
+               np_array_cast H,
+               np_array_cast R,
+               np_array_cast delta_b,
+               np_array_cast delta_q,
+               np_array_int_cast b0s_mask,
+               np_array_cast metric_map,
+               np_array_cast sampling_matrix,
+               np_array_cast sphere_vertices,
+               np_array_int_cast sphere_edges,
                int ngpus = 1,
                int rng_seed = 0,
                int rng_offset = 0) {
@@ -121,10 +124,13 @@ class GPUTracker {
       std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl;
       ngpus_ = ngpus;
 
+      model_type_ = model_type;
       max_angle_ = max_angle;
       min_signal_ = min_signal;
       tc_threshold_ = tc_threshold;
       step_size_ = step_size;
+      relative_peak_thresh_ = relative_peak_thresh,
+      min_separation_angle_ = min_separation_angle,
 
       // Allocate/copy constant problem data on GPUs
       dataf_d.resize(ngpus_, nullptr);
@@ -220,7 +226,8 @@ class GPUTracker {
       std::vector<int> nSlines(ngpus_);
 
       // Call GPU routine
-      generate_streamlines_cuda_mgpu(max_angle_, min_signal_, tc_threshold_, step_size_,
+      generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_,
+                                     relative_peak_thresh_, min_separation_angle_,
                                      nseeds, seeds_d,
                                      dimx_, dimy_, dimz_, dimt_,
                                      dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d,
@@ -263,6 +270,9 @@ class GPUTracker {
       auto roi_shape_info = roi_shape.request();
       auto voxel_size_info = voxel_size.request();
       auto vox_to_ras_info = vox_to_ras.request();
+
+      START_RANGE("filewrite", 0);
+
       //#pragma omp parallel for
       for (int n = 0; n < ngpus_; ++n) {
         std::stringstream ss;
@@ -271,6 +281,8 @@ class GPUTracker {
                   voxel_order.c_str(), reinterpret_cast<REAL *>(vox_to_ras_info.ptr), nSlines_old_[n], slinesLen_[n],
                   reinterpret_cast<REAL3 *>(slines_[n]));
       }
+
+      END_RANGE;
     }
 
   private:
@@ -281,10 +293,13 @@ class GPUTracker {
     int nedges_;
     int delta_nr_, samplm_nr_;
 
+    ModelType model_type_;
     double max_angle_;
     double tc_threshold_;
     double min_signal_;
     double step_size_;
+    double relative_peak_thresh_;
+    double min_separation_angle_;
 
     std::vector<int> nSlines_old_;
     std::vector<REAL*> slines_;
@@ -307,15 +322,26 @@ class GPUTracker {
 
 
 PYBIND11_MODULE(cuslines, m) {
+  m.attr("MAX_SLINE_LEN") = py::int_(MAX_SLINE_LEN);
+  m.attr("REAL_SIZE") = py::int_(REAL_SIZE);
+
+  py::enum_<ModelType>(m, "ModelType")
+    .value("OPDT", OPDT)
+    .value("CSA", CSA)
+    .value("PROB", PROB)
+    .value("PTT", PTT);
+
   py::class_<GPUTracker>(m, "GPUTracker")
-    .def(py::init<double, double, double, double,
-		  np_array_cast, np_array,
-                  np_array, np_array,
-                  np_array, np_array_int,
-                  np_array, np_array,
-                  np_array, np_array_int,
+    .def(py::init<ModelType, double, double, double, double,
+                  double, double,
+		              np_array_cast, np_array_cast,
+                  np_array_cast, np_array_cast,
+                  np_array_cast, np_array_int_cast,
+                  np_array_cast, np_array_cast,
+                  np_array_cast, np_array_int_cast,
                   int, int, int>(),
-                  py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
+                  py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
+                  py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),