Skip to content

Commit

Permalink
Merge pull request #18 from dipy/csaodf_debug_2
Browse files Browse the repository at this point in the history
Generalizes probabilistic tracking to any ODF
  • Loading branch information
arokem authored Nov 14, 2024
2 parents c839c96 + c482d86 commit 785c1f9
Show file tree
Hide file tree
Showing 19 changed files with 3,914 additions and 1,298 deletions.
73 changes: 21 additions & 52 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,45 +1,39 @@
cmake_minimum_required(VERSION 3.10)
cmake_minimum_required(VERSION 3.24)

#project definition
cmake_policy(SET CMP0104 NEW)
project(cuslines LANGUAGES CUDA CXX VERSION 1.0)

#include the external project stuff
include(ExternalProject)

#global settings
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
# Build settings
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -Werror=reorder")
set(CMAKE_CXX_FLAGS_RELEASE "-O3")

#determine type of build
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release")
endif()

#now, add the right flags
if (CMAKE_BUILD_TYPE STREQUAL "Debug" )
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
else()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
endif()

#decide whether to use CUDA or not
#find_package(CUDAToolkit REQUIRED)
if(NOT CUDA_COMPUTE_CAPABILITY)
set(CUDA_COMPUTE_CAPABILITY 70 80)
# CUDA
find_package(CUDAToolkit REQUIRED)

# Set default CUDA compute capabilities if unset
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 70 80 90)
endif()

#Find OpenMP
# OpenMP
find_package(OpenMP)
if(OPENMP_FOUND)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")

#check which compiler we have to determine openmp runtime
# Set OMP runtime based on compiler
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(OMP_RUNTIME "INTEL")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
Expand All @@ -50,43 +44,18 @@ if(OPENMP_FOUND)
message(STATUS "OpenMP runtime used: ${OMP_RUNTIME}")
endif()

# add object lib
add_library(cuslines_kernels-object OBJECT
"${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu"
)
target_include_directories( cuslines_kernels-object PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" )
target_link_libraries(cuslines_kernels-object PUBLIC cudart)
set_target_properties(cuslines_kernels-object PROPERTIES POSITION_INDEPENDENT_CODE TRUE
CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")

##############################################
# PyBind11/Python build
##############################################
# Build library and pybind11 module
add_subdirectory(external/pybind11)

FIND_PACKAGE( PythonLibs )
INCLUDE_DIRECTORIES( ${PYTHON_INCLUDE_PATH} )

add_library(cuslines_kernels-shared STATIC $<TARGET_OBJECTS:cuslines_kernels-object>)
set_target_properties(cuslines_kernels-shared PROPERTIES OUTPUT_NAME cuslines_kernels
POSITION_INDEPENDENT_CODE TRUE
CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
add_library(cuslines_kernels)
target_sources(cuslines_kernels
PRIVATE
${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu)
set_target_properties(cuslines_kernels PROPERTIES OUTPUT_NAME cuslines_kernels
POSITION_INDEPENDENT_CODE TRUE)

pybind11_add_module(cuslines ${CMAKE_SOURCE_DIR}/cuslines/cuslines.cpp)
target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
target_link_libraries(cuslines PRIVATE cuslines_kernels-shared cudart)
set_target_properties(cuslines PROPERTIES CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")

# custom target for install
#add_custom_target(mvcmd_py ALL
# COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CMAKE_SOURCE_DIR}/cuslines/__init__.py" "${CMAKE_INSTALL_PREFIX}/cuslines/__init__.py"
# DEPENDS "${CMAKE_SOURCE_DIR}/cuslines/__init__.py"
#)
target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CUDAToolkit_INCLUDE_DIRS}")
target_link_libraries(cuslines PRIVATE cuslines_kernels CUDA::cudart_static)

# install
install(TARGETS cuslines cuslines_kernels-shared LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
install(FILES "${CMAKE_SOURCE_DIR}/cuslines/__init__.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_gpu.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_cpu.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_gpu_hardi.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_cpu_hardi.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
# Install
install(TARGETS cuslines cuslines_kernels LIBRARY DESTINATION .)
21 changes: 12 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
## Installation
When cloning the repo, please use "git clone --recursive" to pull all the requirements.

To install, simply run `./build_cuslines.sh`. This will place the cuslines library and files within a new subdirectory `./install` as well as copy the example script.
To install, simply run `pip install .` in the top-level repository directory.

## Running the examples
First, navigate into the `./builds` subdirectory. There are two example scripts using the HARDI dataset, `run_dipy_cpu_hardi.py` and `run_dipy_gpu_hardi.py`, which run on CPU and GPU respectively.
This repository contains several example usage scripts.

The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If not data is passed, it will donaload and use the HARDI dataset.

To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
```
$ python run_dipy_cpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 1000
$ python run_gpu_streamlines.py --device=cpu --output-prefix small --nseeds 1000
parsing arguments
Fitting Tensor
Computing anisotropy measures (FA,MD,RGB)
Expand All @@ -28,7 +30,7 @@ Streamline generation total time: 6.9404990673065186 sec

To run the same case on a single GPU, this is the command and example output:
```
$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 1000 --ngpus 1
$ python run_gpu_streamlines.py --output-prefix small --nseeds 1000 --ngpus 1
parsing arguments
Fitting Tensor
Computing anisotropy measures (FA,MD,RGB)
Expand All @@ -46,11 +48,13 @@ Streamline generation total time: 0.3834989070892334 sec
Destroy GPUTracker...
```

Note that if you experience memory errors, you can adjust the `--chunk-size` flag.

To run on more seeds, we suggest enabling the `--use-fast-write` flag in the GPU script to not get bottlenecked by writing files. Here is a comparison running on 500K seeds on 1 GPU with and without this flag:

Without `--use-fast-write`:
```
$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 500000 --ngpus 1
$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1
parsing arguments
Fitting Tensor
Computing anisotropy measures (FA,MD,RGB)
Expand Down Expand Up @@ -78,7 +82,7 @@ Destroy GPUTracker...

With `--use-fast-write`:
```
$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
parsing arguments
Fitting Tensor
Computing anisotropy measures (FA,MD,RGB)
Expand Down Expand Up @@ -118,11 +122,10 @@ $ docker pull docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest
4. Run the code, mounting the current directory into the container for easy result retrieval:
```
$ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
python run_dipy_gpu_hardi.py --chunk-size 100000 --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
python run_gpu_streamlines.py --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
```
5. The code produces a number of independent track files (one per processed "chunk"), but we have provided a merge script to combine them into a single trk file. To merge files, run:
```
$ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
./merge_trk.sh -o output/hardi_tracks.trk output/hardi_gpu_full*
```

```
13 changes: 7 additions & 6 deletions cuslines/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,29 @@


CUDA_HOME=/usr/local/cuda
CUDACC=$(CUDA_HOME)/bin/nvcc
CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on
CXX=g++
LD=g++

CXXFLAGS= -c -O3 -std=c++11 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include

SMS ?= 60 70
SMS ?= 70
CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM))
LASTSM := $(lastword $(sort $(SMS)))
CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM)
CUDACFLAGS=-c -O3 -lineinfo -Xptxas=-v -std=c++11 -Xcompiler -fPIC -Xcompiler=-fopenmp $(CUDA_ARCH)
LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt

all: cuslines

cuslines: generate_streamlines_cuda.o cuslines.o
$(LD) -shared cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` -fopenmp -L$(CUDA_HOME)/lib64 -lcudart
$(LD) cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS)

%.o: %.cu
$(CUDACC) $(CUDACFLAGS) $<
%.o : %.cu
$(CUDACC) $(CUDACFLAGS) $< -o $@

%.o: %.cpp
$(CXX) $(CXXFLAGS) $< -o $@

clean:
rm *.o cuslines`python3-config --extension-suffix`
rm *.o cuslines`python3-config --extension-suffix` __pycache__/*.pyc
1 change: 0 additions & 1 deletion cuslines/__init__.py

This file was deleted.

26 changes: 26 additions & 0 deletions cuslines/cudamacro.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,30 @@
exit(EXIT_FAILURE); \
}}

#ifdef USE_NVTX
#include "nvToolsExt.h"

const uint32_t colors4[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff};
const int num_colors4 = sizeof(colors4)/sizeof(colors4[0]);

#define START_RANGE(name,cid) { \
int color_id = cid; \
color_id = color_id%num_colors4;\
nvtxEventAttributes_t eventAttrib = {0}; \
eventAttrib.version = NVTX_VERSION; \
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
eventAttrib.colorType = NVTX_COLOR_ARGB; \
eventAttrib.color = colors4[color_id]; \
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
eventAttrib.message.ascii = name; \
nvtxRangePushEx(&eventAttrib); \
}
#define END_RANGE { \
nvtxRangePop(); \
}
#else
#define START_RANGE(name,cid)
#define END_RANGE
#endif

#endif
68 changes: 47 additions & 21 deletions cuslines/cuslines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,17 @@ namespace py = pybind11;

#include <cuda_runtime.h>

// #define USE_NVTX

#include "globals.h"
#include "cudamacro.h"
#include "generate_streamlines_cuda.h"

using np_array = py::array_t<REAL>;
using np_array_bool = py::array_t<bool>;
using np_array_int = py::array_t<int>;
using np_array_short = py::array_t<short>;

using np_array_cast = py::array_t<REAL, py::array::c_style | py::array::forcecast>;
using np_array_short_cast = py::array_t<short, py::array::c_style | py::array::forcecast>;
using np_array_int_cast = py::array_t<int, py::array::c_style | py::array::forcecast>;

// Handle to cleanup returned host allocations when associated Python object is destroyed
template <typename T>
Expand All @@ -60,20 +60,23 @@ py::capsule cleanup(T* ptr) {

class GPUTracker {
public:
GPUTracker(double max_angle,
GPUTracker(ModelType model_type,
double max_angle,
double min_signal,
double tc_threshold,
double step_size,
double relative_peak_thresh,
double min_separation_angle,
np_array_cast dataf,
np_array H,
np_array R,
np_array delta_b,
np_array delta_q,
np_array_int b0s_mask,
np_array metric_map,
np_array sampling_matrix,
np_array sphere_vertices,
np_array_int sphere_edges,
np_array_cast H,
np_array_cast R,
np_array_cast delta_b,
np_array_cast delta_q,
np_array_int_cast b0s_mask,
np_array_cast metric_map,
np_array_cast sampling_matrix,
np_array_cast sphere_vertices,
np_array_int_cast sphere_edges,
int ngpus = 1,
int rng_seed = 0,
int rng_offset = 0) {
Expand Down Expand Up @@ -121,10 +124,13 @@ class GPUTracker {
std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl;
ngpus_ = ngpus;

model_type_ = model_type;
max_angle_ = max_angle;
min_signal_ = min_signal;
tc_threshold_ = tc_threshold;
step_size_ = step_size;
relative_peak_thresh_ = relative_peak_thresh,
min_separation_angle_ = min_separation_angle,

// Allocate/copy constant problem data on GPUs
dataf_d.resize(ngpus_, nullptr);
Expand Down Expand Up @@ -220,7 +226,8 @@ class GPUTracker {
std::vector<int> nSlines(ngpus_);

// Call GPU routine
generate_streamlines_cuda_mgpu(max_angle_, min_signal_, tc_threshold_, step_size_,
generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_,
relative_peak_thresh_, min_separation_angle_,
nseeds, seeds_d,
dimx_, dimy_, dimz_, dimt_,
dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d,
Expand Down Expand Up @@ -263,6 +270,9 @@ class GPUTracker {
auto roi_shape_info = roi_shape.request();
auto voxel_size_info = voxel_size.request();
auto vox_to_ras_info = vox_to_ras.request();

START_RANGE("filewrite", 0);

//#pragma omp parallel for
for (int n = 0; n < ngpus_; ++n) {
std::stringstream ss;
Expand All @@ -271,6 +281,8 @@ class GPUTracker {
voxel_order.c_str(), reinterpret_cast<REAL *>(vox_to_ras_info.ptr), nSlines_old_[n], slinesLen_[n],
reinterpret_cast<REAL3 *>(slines_[n]));
}

END_RANGE;
}

private:
Expand All @@ -281,10 +293,13 @@ class GPUTracker {
int nedges_;
int delta_nr_, samplm_nr_;

ModelType model_type_;
double max_angle_;
double tc_threshold_;
double min_signal_;
double step_size_;
double relative_peak_thresh_;
double min_separation_angle_;

std::vector<int> nSlines_old_;
std::vector<REAL*> slines_;
Expand All @@ -307,15 +322,26 @@ class GPUTracker {


PYBIND11_MODULE(cuslines, m) {
m.attr("MAX_SLINE_LEN") = py::int_(MAX_SLINE_LEN);
m.attr("REAL_SIZE") = py::int_(REAL_SIZE);

py::enum_<ModelType>(m, "ModelType")
.value("OPDT", OPDT)
.value("CSA", CSA)
.value("PROB", PROB)
.value("PTT", PTT);

py::class_<GPUTracker>(m, "GPUTracker")
.def(py::init<double, double, double, double,
np_array_cast, np_array,
np_array, np_array,
np_array, np_array_int,
np_array, np_array,
np_array, np_array_int,
.def(py::init<ModelType, double, double, double, double,
double, double,
np_array_cast, np_array_cast,
np_array_cast, np_array_cast,
np_array_cast, np_array_int_cast,
np_array_cast, np_array_cast,
np_array_cast, np_array_int_cast,
int, int, int>(),
py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
py::arg().noconvert(), py::arg().noconvert(),
py::arg().noconvert(), py::arg().noconvert(),
py::arg().noconvert(), py::arg().noconvert(),
py::arg().noconvert(), py::arg().noconvert(),
Expand Down
Loading

0 comments on commit 785c1f9

Please sign in to comment.