diff --git a/CMakeLists.txt b/CMakeLists.txt
index 229ea23..2a694b4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,45 +1,39 @@
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.24)
 
-#project definition
-cmake_policy(SET CMP0104 NEW)
 project(cuslines LANGUAGES CUDA CXX VERSION 1.0)
 
-#include the external project stuff
-include(ExternalProject)
-
-#global settings
-set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+# Build settings
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -Werror=reorder")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 
-#determine type of build
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-#now, add the right flags
 if (CMAKE_BUILD_TYPE STREQUAL "Debug" )
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
 else()
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
 endif()
 
-#decide whether to use CUDA or not
-#find_package(CUDAToolkit REQUIRED)
-if(NOT CUDA_COMPUTE_CAPABILITY)
-  set(CUDA_COMPUTE_CAPABILITY 70 80)
+# CUDA
+find_package(CUDAToolkit REQUIRED)
+
+# Set default CUDA compute capabilities if unset
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 70 80 90)
 endif()
 
-#Find OpenMP
+# OpenMP
 find_package(OpenMP)
 if(OPENMP_FOUND)
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     
-    #check which compiler we have to determine openmp runtime
+    # Set OMP runtime based on compiler
     if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
       set(OMP_RUNTIME "INTEL")
     elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@@ -50,43 +44,18 @@ if(OPENMP_FOUND)
     message(STATUS "OpenMP runtime used: ${OMP_RUNTIME}")
 endif()
 
-# add object lib
-add_library(cuslines_kernels-object OBJECT
-  "${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu"
-)
-target_include_directories( cuslines_kernels-object PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" )
-target_link_libraries(cuslines_kernels-object PUBLIC cudart)
-set_target_properties(cuslines_kernels-object PROPERTIES POSITION_INDEPENDENT_CODE TRUE
-                                                         CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
-
-##############################################
-# PyBind11/Python build
-##############################################
+# Build library and pybind11 module
 add_subdirectory(external/pybind11)
-
-FIND_PACKAGE( PythonLibs )
-INCLUDE_DIRECTORIES( ${PYTHON_INCLUDE_PATH} )
-
-add_library(cuslines_kernels-shared STATIC $<TARGET_OBJECTS:cuslines_kernels-object>)
-set_target_properties(cuslines_kernels-shared PROPERTIES OUTPUT_NAME cuslines_kernels 
-                                                         POSITION_INDEPENDENT_CODE TRUE
-                                                         CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
+add_library(cuslines_kernels)
+target_sources(cuslines_kernels
+	       PRIVATE
+	       ${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu)
+set_target_properties(cuslines_kernels PROPERTIES OUTPUT_NAME cuslines_kernels
+                                                  POSITION_INDEPENDENT_CODE TRUE)
 
 pybind11_add_module(cuslines ${CMAKE_SOURCE_DIR}/cuslines/cuslines.cpp)
-target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
-target_link_libraries(cuslines PRIVATE cuslines_kernels-shared cudart)
-set_target_properties(cuslines PROPERTIES CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
-
-# custom target for install
-#add_custom_target(mvcmd_py ALL
-#  COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CMAKE_SOURCE_DIR}/cuslines/__init__.py" "${CMAKE_INSTALL_PREFIX}/cuslines/__init__.py"
-#  DEPENDS "${CMAKE_SOURCE_DIR}/cuslines/__init__.py"
-#)
+target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CUDAToolkit_INCLUDE_DIRS}")
+target_link_libraries(cuslines PRIVATE cuslines_kernels CUDA::cudart_static)
 
-# install
-install(TARGETS cuslines cuslines_kernels-shared LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
-install(FILES "${CMAKE_SOURCE_DIR}/cuslines/__init__.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_gpu.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_cpu.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_gpu_hardi.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_cpu_hardi.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
+# Install
+install(TARGETS cuslines cuslines_kernels LIBRARY DESTINATION .)
diff --git a/README.md b/README.md
index 4768c84..37661cc 100644
--- a/README.md
+++ b/README.md
@@ -3,14 +3,16 @@
 ## Installation
 When cloning the repo, please use "git clone --recursive" to pull all the requirements.
 
-To install, simply run `./build_cuslines.sh`. This will place the cuslines library and files within a new subdirectory `./install` as well as copy the example script.
+To install, simply run `pip install .` in the top-level repository directory.
 
 ## Running the examples
-First, navigate into the `./builds` subdirectory. There are two example scripts using the HARDI dataset, `run_dipy_cpu_hardi.py` and `run_dipy_gpu_hardi.py`, which run on CPU and GPU respectively.
+This repository contains several example usage scripts.
+
+The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If not data is passed, it will donaload and use the HARDI dataset.
 
 To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
 ```
-$ python run_dipy_cpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 1000
+$ python run_gpu_streamlines.py --device=cpu --output-prefix small --nseeds 1000
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -28,7 +30,7 @@ Streamline generation total time: 6.9404990673065186 sec
 
 To run the same case on a single GPU, this is the command and example output:
 ```
-$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 1000 --ngpus 1
+$ python run_gpu_streamlines.py --output-prefix small --nseeds 1000 --ngpus 1
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -46,11 +48,13 @@ Streamline generation total time: 0.3834989070892334 sec
 Destroy GPUTracker...
 ```
 
+Note that if you experience memory errors, you can adjust the `--chunk-size` flag.
+
 To run on more seeds, we suggest enabling the `--use-fast-write` flag in the GPU script to not get bottlenecked by writing files. Here is a comparison running on 500K seeds on 1 GPU with and without this flag:
 
 Without `--use-fast-write`:
 ```
-$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 500000 --ngpus 1
+$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -78,7 +82,7 @@ Destroy GPUTracker...
 
 With `--use-fast-write`:
 ```
-$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
+$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -118,11 +122,10 @@ $ docker pull docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest
 4. Run the code, mounting the current directory into the container for easy result retrieval:
 ```
 $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
- python run_dipy_gpu_hardi.py --chunk-size 100000 --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
+ python run_gpu_streamlines.py --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
 ```
 5. The code produces a number of independent track files (one per processed "chunk"), but we have provided a merge script to combine them into a single trk file. To merge files, run:
 ```
 $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
  ./merge_trk.sh -o output/hardi_tracks.trk output/hardi_gpu_full*
-```
-
+```
\ No newline at end of file
diff --git a/cuslines/Makefile b/cuslines/Makefile
index d167bc1..1061a16 100644
--- a/cuslines/Makefile
+++ b/cuslines/Makefile
@@ -27,28 +27,29 @@
 
 
 CUDA_HOME=/usr/local/cuda
-CUDACC=$(CUDA_HOME)/bin/nvcc
+CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on
 CXX=g++
 LD=g++
 
 CXXFLAGS= -c -O3 -std=c++11 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include
 
-SMS ?= 60 70
+SMS ?= 70
 CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM))
 LASTSM := $(lastword $(sort $(SMS)))
 CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM)
 CUDACFLAGS=-c -O3 -lineinfo -Xptxas=-v -std=c++11 -Xcompiler -fPIC -Xcompiler=-fopenmp $(CUDA_ARCH)
+LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
 
 all: cuslines
 
 cuslines: generate_streamlines_cuda.o cuslines.o
-	$(LD) -shared cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` -fopenmp -L$(CUDA_HOME)/lib64 -lcudart
+	$(LD) cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS)
 
-%.o: %.cu
-	$(CUDACC) $(CUDACFLAGS) $<
+%.o : %.cu
+	$(CUDACC) $(CUDACFLAGS) $< -o $@
 
 %.o: %.cpp
 	$(CXX) $(CXXFLAGS) $< -o $@
 
 clean:
-	rm *.o cuslines`python3-config --extension-suffix`
+	rm *.o cuslines`python3-config --extension-suffix` __pycache__/*.pyc
diff --git a/cuslines/__init__.py b/cuslines/__init__.py
deleted file mode 100644
index f3d06a8..0000000
--- a/cuslines/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-import cuslines
diff --git a/cuslines/cudamacro.h b/cuslines/cudamacro.h
index c447529..49ac24c 100644
--- a/cuslines/cudamacro.h
+++ b/cuslines/cudamacro.h
@@ -45,4 +45,30 @@
         exit(EXIT_FAILURE);                                                  \
     }}
 
+#ifdef USE_NVTX
+#include "nvToolsExt.h"
+
+const uint32_t colors4[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff};
+const int num_colors4 = sizeof(colors4)/sizeof(colors4[0]);
+
+#define START_RANGE(name,cid) { \
+        int color_id = cid; \
+        color_id = color_id%num_colors4;\
+        nvtxEventAttributes_t eventAttrib = {0}; \
+        eventAttrib.version = NVTX_VERSION; \
+        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+        eventAttrib.colorType = NVTX_COLOR_ARGB; \
+        eventAttrib.color = colors4[color_id]; \
+        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+        eventAttrib.message.ascii = name; \
+        nvtxRangePushEx(&eventAttrib); \
+}
+#define END_RANGE { \
+        nvtxRangePop(); \
+}
+#else
+#define START_RANGE(name,cid)
+#define END_RANGE
+#endif
+
 #endif
diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
index c190019..4e8bc30 100644
--- a/cuslines/cuslines.cpp
+++ b/cuslines/cuslines.cpp
@@ -37,17 +37,17 @@ namespace py = pybind11;
 
 #include <cuda_runtime.h>
 
+// #define USE_NVTX
+
 #include "globals.h"
 #include "cudamacro.h"
 #include "generate_streamlines_cuda.h"
 
 using np_array = py::array_t<REAL>;
-using np_array_bool = py::array_t<bool>;
 using np_array_int = py::array_t<int>;
-using np_array_short = py::array_t<short>;
 
 using np_array_cast = py::array_t<REAL, py::array::c_style | py::array::forcecast>;
-using np_array_short_cast = py::array_t<short, py::array::c_style | py::array::forcecast>;
+using np_array_int_cast = py::array_t<int, py::array::c_style | py::array::forcecast>;
 
 // Handle to cleanup returned host allocations when associated Python object is destroyed
 template <typename T>
@@ -60,20 +60,23 @@ py::capsule cleanup(T* ptr) {
 
 class GPUTracker {
   public:
-    GPUTracker(double max_angle,
+    GPUTracker(ModelType model_type,
+               double max_angle,
                double min_signal,
                double tc_threshold,
                double step_size,
+               double relative_peak_thresh,
+               double min_separation_angle,
                np_array_cast dataf,
-               np_array H,
-               np_array R,
-               np_array delta_b,
-               np_array delta_q,
-               np_array_int b0s_mask,
-               np_array metric_map,
-               np_array sampling_matrix,
-               np_array sphere_vertices,
-               np_array_int sphere_edges,
+               np_array_cast H,
+               np_array_cast R,
+               np_array_cast delta_b,
+               np_array_cast delta_q,
+               np_array_int_cast b0s_mask,
+               np_array_cast metric_map,
+               np_array_cast sampling_matrix,
+               np_array_cast sphere_vertices,
+               np_array_int_cast sphere_edges,
                int ngpus = 1,
                int rng_seed = 0,
                int rng_offset = 0) {
@@ -121,10 +124,13 @@ class GPUTracker {
       std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl;
       ngpus_ = ngpus;
 
+      model_type_ = model_type;
       max_angle_ = max_angle;
       min_signal_ = min_signal;
       tc_threshold_ = tc_threshold;
       step_size_ = step_size;
+      relative_peak_thresh_ = relative_peak_thresh,
+      min_separation_angle_ = min_separation_angle,
 
       // Allocate/copy constant problem data on GPUs
       dataf_d.resize(ngpus_, nullptr);
@@ -220,7 +226,8 @@ class GPUTracker {
       std::vector<int> nSlines(ngpus_);
 
       // Call GPU routine
-      generate_streamlines_cuda_mgpu(max_angle_, min_signal_, tc_threshold_, step_size_,
+      generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_,
+                                     relative_peak_thresh_, min_separation_angle_,
                                      nseeds, seeds_d,
                                      dimx_, dimy_, dimz_, dimt_,
                                      dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d,
@@ -263,6 +270,9 @@ class GPUTracker {
       auto roi_shape_info = roi_shape.request();
       auto voxel_size_info = voxel_size.request();
       auto vox_to_ras_info = vox_to_ras.request();
+
+      START_RANGE("filewrite", 0);
+
       //#pragma omp parallel for
       for (int n = 0; n < ngpus_; ++n) {
         std::stringstream ss;
@@ -271,6 +281,8 @@ class GPUTracker {
                   voxel_order.c_str(), reinterpret_cast<REAL *>(vox_to_ras_info.ptr), nSlines_old_[n], slinesLen_[n],
                   reinterpret_cast<REAL3 *>(slines_[n]));
       }
+
+      END_RANGE;
     }
 
   private:
@@ -281,10 +293,13 @@ class GPUTracker {
     int nedges_;
     int delta_nr_, samplm_nr_;
 
+    ModelType model_type_;
     double max_angle_;
     double tc_threshold_;
     double min_signal_;
     double step_size_;
+    double relative_peak_thresh_;
+    double min_separation_angle_;
 
     std::vector<int> nSlines_old_;
     std::vector<REAL*> slines_;
@@ -307,15 +322,26 @@ class GPUTracker {
 
 
 PYBIND11_MODULE(cuslines, m) {
+  m.attr("MAX_SLINE_LEN") = py::int_(MAX_SLINE_LEN);
+  m.attr("REAL_SIZE") = py::int_(REAL_SIZE);
+
+  py::enum_<ModelType>(m, "ModelType")
+    .value("OPDT", OPDT)
+    .value("CSA", CSA)
+    .value("PROB", PROB)
+    .value("PTT", PTT);
+
   py::class_<GPUTracker>(m, "GPUTracker")
-    .def(py::init<double, double, double, double,
-		  np_array_cast, np_array,
-                  np_array, np_array,
-                  np_array, np_array_int,
-                  np_array, np_array,
-                  np_array, np_array_int,
+    .def(py::init<ModelType, double, double, double, double,
+                  double, double,
+		              np_array_cast, np_array_cast,
+                  np_array_cast, np_array_cast,
+                  np_array_cast, np_array_int_cast,
+                  np_array_cast, np_array_cast,
+                  np_array_cast, np_array_int_cast,
                   int, int, int>(),
-                  py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
+                  py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
+                  py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
diff --git a/cuslines/disc.h b/cuslines/disc.h
new file mode 100644
index 0000000..0ae36f1
--- /dev/null
+++ b/cuslines/disc.h
@@ -0,0 +1,1886 @@
+
+/*
+This code from: https://github.com/nibrary/nibrary/blob/main/src/math/disc.h
+
+BSD 3-Clause License
+
+Copyright (c) 2024, Dogu Baran Aydogan All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define DISC_2_VERT_CNT 24
+#define DISC_2_FACE_CNT 31
+
+#define DISC_2_VERT {\
+	-0.99680788,-0.07983759,\
+	-0.94276539,0.33345677,\
+	-0.87928469,-0.47629658,\
+	-0.72856617,0.68497542,\
+	-0.60006556,-0.79995082,\
+	-0.54129995,-0.02761342,\
+	-0.39271207,0.37117272,\
+	-0.39217391,0.91989110,\
+	-0.36362884,-0.40757367,\
+	-0.22391316,-0.97460910,\
+	-0.00130022,0.53966106,\
+	0.00000000,0.00000000,\
+	0.00973999,0.99995257,\
+	0.01606516,-0.54289908,\
+	0.21342395,-0.97695968,\
+	0.38192071,-0.38666136,\
+	0.38897094,0.37442837,\
+	0.40696681,0.91344295,\
+	0.54387161,-0.01477123,\
+	0.59119367,-0.80652963,\
+	0.73955688,0.67309406,\
+	0.87601150,-0.48229022,\
+	0.94617928,0.32364298,\
+	0.99585368,-0.09096944}
+
+
+
+#define DISC_2_FACE {\
+	9,8,4,\
+	11,16,10,\
+	5,8,11,\
+	5,1,0,\
+	18,16,11,\
+	11,15,18,\
+	13,8,9,\
+	11,8,13,\
+	13,15,11,\
+	22,18,23,\
+	22,20,16,\
+	16,18,22,\
+	16,20,17,\
+	12,10,17,\
+	17,10,16,\
+	15,19,21,\
+	23,18,21,\
+	21,18,15,\
+	2,4,8,\
+	2,5,0,\
+	8,5,2,\
+	7,10,12,\
+	6,7,3,\
+	10,7,6,\
+	3,1,6,\
+	1,5,6,\
+	11,10,6,\
+	6,5,11,\
+	14,19,15,\
+	15,13,14,\
+	14,13,9}
+
+
+
+#define DISC_3_VERT_CNT 36
+#define DISC_3_FACE_CNT 52
+
+#define DISC_3_VERT {\
+	-0.98798409,-0.15455565,\
+	-0.98026530,0.19768646,\
+	-0.87061458,-0.49196570,\
+	-0.85315536,0.52165691,\
+	-0.67948751,0.12870519,\
+	-0.65830249,-0.75275350,\
+	-0.60977645,0.79257345,\
+	-0.60599745,-0.25746218,\
+	-0.49175185,0.45085081,\
+	-0.39584766,-0.56449807,\
+	-0.37151031,0.02599657,\
+	-0.34538749,-0.93846017,\
+	-0.31409968,0.94939001,\
+	-0.19774331,-0.30335822,\
+	-0.18708240,0.31479263,\
+	-0.14013436,0.65112487,\
+	-0.02230445,-0.65649640,\
+	-0.01247874,-0.99992214,\
+	0,0,\
+	0.03699045,0.99931562,\
+	0.15587647,0.33306130,\
+	0.17739302,-0.31129535,\
+	0.23950456,0.64808985,\
+	0.31593561,-0.94878063,\
+	0.34839477,-0.59393230,\
+	0.35674244,0.02011329,\
+	0.38082583,0.92464679,\
+	0.52353496,0.39304489,\
+	0.57766607,-0.30046041,\
+	0.63711661,-0.77076743,\
+	0.66791137,0.74424082,\
+	0.68671421,0.06646131,\
+	0.85301727,-0.52188269,\
+	0.88617706,0.46334676,\
+	0.97866100,-0.20548151,\
+	0.98951863,0.14440528}
+
+
+
+#define DISC_3_FACE {\
+	27,30,22,\
+	9,2,5,\
+	28,32,34,\
+	22,30,26,\
+	26,19,22,\
+	30,27,33,\
+	31,34,35,\
+	28,34,31,\
+	35,33,31,\
+	31,33,27,\
+	25,31,27,\
+	28,31,25,\
+	10,14,8,\
+	10,13,18,\
+	18,14,10,\
+	15,19,12,\
+	22,19,15,\
+	8,14,15,\
+	11,9,5,\
+	13,9,16,\
+	16,11,17,\
+	9,11,16,\
+	17,23,16,\
+	23,24,16,\
+	29,24,23,\
+	29,32,28,\
+	28,24,29,\
+	6,3,8,\
+	6,15,12,\
+	8,15,6,\
+	20,27,22,\
+	20,25,27,\
+	18,25,20,\
+	20,14,18,\
+	22,15,20,\
+	20,15,14,\
+	21,24,28,\
+	28,25,21,\
+	21,25,18,\
+	18,13,21,\
+	13,16,21,\
+	21,16,24,\
+	4,10,8,\
+	8,3,4,\
+	4,3,1,\
+	4,1,0,\
+	7,9,13,\
+	13,10,7,\
+	2,9,7,\
+	10,4,7,\
+	0,2,7,\
+	7,4,0}
+
+
+
+#define DISC_4_VERT_CNT 62
+#define DISC_4_FACE_CNT 97
+
+#define DISC_4_VERT {\
+	-0.99632399,0.08566510,\
+	-0.98618071,-0.16567317,\
+	-0.94150749,0.33699206,\
+	-0.91375624,-0.40626289,\
+	-0.82498245,0.56515834,\
+	-0.78016046,-0.62557946,\
+	-0.76768368,-0.12856657,\
+	-0.73146437,0.13012819,\
+	-0.65993870,0.75131945,\
+	-0.64923474,-0.36077026,\
+	-0.64404827,0.37054571,\
+	-0.59590501,-0.80305493,\
+	-0.52723292,-0.08735736,\
+	-0.50689203,0.59055453,\
+	-0.48847116,-0.55956115,\
+	-0.45296756,0.16857820,\
+	-0.44983881,0.89310976,\
+	-0.37980292,-0.92506742,\
+	-0.37431635,-0.30508770,\
+	-0.34347782,0.41024244,\
+	-0.28695359,-0.72298195,\
+	-0.26607611,-0.04337891,\
+	-0.26300954,0.69460622,\
+	-0.20716495,0.97830603,\
+	-0.19234589,-0.49854986,\
+	-0.17059736,0.21089158,\
+	-0.13571649,-0.99074771,\
+	-0.09401357,-0.25347966,\
+	-0.08264934,0.47807857,\
+	-0.02277615,-0.74173498,\
+	-0.00823427,0.74390934,\
+	0,0,\
+	0.04408226,0.99902790,\
+	0.07601333,-0.47847223,\
+	0.09881278,0.25627739,\
+	0.12027544,-0.99274056,\
+	0.17542943,-0.20988186,\
+	0.18324588,0.50353410,\
+	0.23456374,-0.70587816,\
+	0.25179308,0.73431645,\
+	0.27247058,0.04550104,\
+	0.28231740,0.95932106,\
+	0.33582739,-0.41640371,\
+	0.36311579,-0.93174402,\
+	0.37710908,0.31092054,\
+	0.45674883,-0.17177025,\
+	0.47051745,0.57347807,\
+	0.47748339,-0.61101450,\
+	0.51751229,0.85567577,\
+	0.53030761,0.08858984,\
+	0.57509454,-0.81808696,\
+	0.63141296,-0.38867064,\
+	0.64811006,0.37237321,\
+	0.71524914,0.69886956,\
+	0.73323663,-0.14081300,\
+	0.76190940,0.12763299,\
+	0.76512049,-0.64388713,\
+	0.85941151,0.51128451,\
+	0.90165136,-0.43246368,\
+	0.95650965,0.29170069,\
+	0.97794249,-0.20887431,\
+	0.99951822,0.03103749}
+
+
+
+#define DISC_4_FACE {\
+	39,32,30,\
+	30,22,28,\
+	28,22,19,\
+	52,59,57,\
+	14,11,20,\
+	47,56,51,\
+	50,56,47,\
+	47,43,50,\
+	41,39,48,\
+	32,39,41,\
+	10,4,2,\
+	37,39,30,\
+	30,28,37,\
+	25,28,19,\
+	25,21,31,\
+	53,52,57,\
+	44,52,46,\
+	46,37,44,\
+	39,37,46,\
+	52,53,46,\
+	48,39,46,\
+	46,53,48,\
+	61,59,55,\
+	59,52,55,\
+	42,47,51,\
+	35,29,26,\
+	26,29,20,\
+	12,21,15,\
+	19,10,15,\
+	15,25,19,\
+	21,25,15,\
+	18,21,12,\
+	18,9,14,\
+	12,9,18,\
+	6,9,12,\
+	0,1,6,\
+	5,11,14,\
+	14,9,5,\
+	51,56,58,\
+	16,22,23,\
+	30,32,23,\
+	23,22,30,\
+	34,40,44,\
+	44,37,34,\
+	31,40,34,\
+	34,25,31,\
+	34,37,28,\
+	28,25,34,\
+	51,58,54,\
+	54,58,60,\
+	54,60,61,\
+	61,55,54,\
+	49,52,44,\
+	49,55,52,\
+	44,40,49,\
+	49,54,55,\
+	36,40,31,\
+	33,42,36,\
+	43,47,38,\
+	35,43,38,\
+	38,29,35,\
+	33,29,38,\
+	38,42,33,\
+	47,42,38,\
+	17,20,11,\
+	17,26,20,\
+	21,18,27,\
+	33,36,27,\
+	31,21,27,\
+	27,36,31,\
+	14,20,24,\
+	24,18,14,\
+	24,27,18,\
+	33,27,24,\
+	24,29,33,\
+	20,29,24,\
+	7,6,12,\
+	12,15,7,\
+	7,15,10,\
+	0,6,7,\
+	7,2,0,\
+	7,10,2,\
+	3,5,9,\
+	3,6,1,\
+	9,6,3,\
+	4,10,13,\
+	13,8,4,\
+	13,10,19,\
+	19,22,13,\
+	13,22,16,\
+	16,8,13,\
+	45,49,40,\
+	40,36,45,\
+	45,36,42,\
+	45,42,51,\
+	51,54,45,\
+	54,49,45}
+
+
+
+
+#define DISC_5_VERT_CNT 88
+#define DISC_5_FACE_CNT 143
+
+#define DISC_5_VERT {\
+	-0.99971936,0.02368974,\
+	-0.98497387,-0.17270345,\
+	-0.97603282,0.21762338,\
+	-0.92922869,-0.36950514,\
+	-0.90708773,0.42094161,\
+	-0.83415725,-0.55152668,\
+	-0.79951365,0.60064792,\
+	-0.79931959,-0.15614114,\
+	-0.78301036,0.22417418,\
+	-0.73599072,0.03246227,\
+	-0.70693097,-0.70728255,\
+	-0.70138379,-0.35670071,\
+	-0.67496938,0.41799949,\
+	-0.65872201,0.75238641,\
+	-0.58146330,-0.53578945,\
+	-0.58133729,-0.15872598,\
+	-0.56876870,0.21601921,\
+	-0.55866507,-0.82939335,\
+	-0.55072833,0.60394640,\
+	-0.49682087,0.86785311,\
+	-0.48121950,0.02702752,\
+	-0.46260160,-0.34991875,\
+	-0.44852900,0.41519979,\
+	-0.43296059,-0.69074037,\
+	-0.37854350,-0.92558350,\
+	-0.35947581,0.64734787,\
+	-0.34709225,-0.16018397,\
+	-0.34353985,0.21908385,\
+	-0.32747735,-0.52839636,\
+	-0.31642072,0.94861896,\
+	-0.23858273,0.43081147,\
+	-0.23727114,0.03012835,\
+	-0.22120862,-0.34920779,\
+	-0.22093401,0.78635645,\
+	-0.21959193,-0.75538583,\
+	-0.18652372,-0.98245046,\
+	-0.11983054,0.22928120,\
+	-0.11608911,0.60000621,\
+	-0.11215562,-0.15690164,\
+	-0.10768432,-0.56628902,\
+	-0.10628279,0.99433594,\
+	-0.00000000,-1.00000000,\
+	-0.00000000,-0.78706952,\
+	0.00000000,0.41097842,\
+	0.00000000,-0.36210641,\
+	0.00000000,0.04030552,\
+	0.00000000,0.79503467,\
+	0.10628279,0.99433594,\
+	0.10768432,-0.56628902,\
+	0.11215563,-0.15690164,\
+	0.11608911,0.60000621,\
+	0.11983054,0.22928120,\
+	0.18652372,-0.98245046,\
+	0.21959193,-0.75538583,\
+	0.22093402,0.78635645,\
+	0.22120862,-0.34920779,\
+	0.23727114,0.03012835,\
+	0.23858273,0.43081146,\
+	0.31642072,0.94861896,\
+	0.32747735,-0.52839636,\
+	0.34353985,0.21908385,\
+	0.34709225,-0.16018397,\
+	0.35947581,0.64734786,\
+	0.37854350,-0.92558350,\
+	0.43296060,-0.69074037,\
+	0.44852900,0.41519979,\
+	0.46260160,-0.34991875,\
+	0.48121950,0.02702752,\
+	0.49682086,0.86785311,\
+	0.55072833,0.60394640,\
+	0.55866507,-0.82939336,\
+	0.56876870,0.21601921,\
+	0.58133729,-0.15872598,\
+	0.58146330,-0.53578945,\
+	0.65872201,0.75238641,\
+	0.67496938,0.41799949,\
+	0.70138379,-0.35670071,\
+	0.70693097,-0.70728255,\
+	0.73599072,0.03246227,\
+	0.78301036,0.22417418,\
+	0.79931959,-0.15614114,\
+	0.79951365,0.60064792,\
+	0.83415725,-0.55152668,\
+	0.90708773,0.42094162,\
+	0.92922869,-0.36950514,\
+	0.97603282,0.21762338,\
+	0.98497387,-0.17270345,\
+	0.99971936,0.02368974}
+
+
+
+#define DISC_5_FACE {\
+	12,6,4,\
+	69,74,68,\
+	1,7,0,\
+	81,74,69,\
+	75,81,69,\
+	83,81,75,\
+	41,52,42,\
+	42,35,41,\
+	42,52,53,\
+	53,48,42,\
+	73,64,77,\
+	77,64,70,\
+	45,31,38,\
+	56,61,67,\
+	42,48,39,\
+	59,73,66,\
+	64,73,59,\
+	59,53,64,\
+	48,53,59,\
+	40,46,47,\
+	36,31,45,\
+	30,36,43,\
+	54,47,46,\
+	18,6,12,\
+	9,0,7,\
+	27,36,30,\
+	31,36,27,\
+	72,61,66,\
+	67,61,72,\
+	5,11,3,\
+	3,7,1,\
+	3,11,7,\
+	17,23,10,\
+	24,23,17,\
+	10,23,14,\
+	14,5,10,\
+	14,11,5,\
+	24,35,34,\
+	34,23,24,\
+	34,35,42,\
+	42,39,34,\
+	83,75,79,\
+	79,85,83,\
+	75,71,79,\
+	80,72,76,\
+	66,73,76,\
+	76,72,66,\
+	63,53,52,\
+	64,53,63,\
+	63,70,64,\
+	44,38,32,\
+	32,39,44,\
+	44,39,48,\
+	66,61,55,\
+	55,59,66,\
+	48,59,55,\
+	55,44,48,\
+	30,43,37,\
+	51,36,45,\
+	51,43,36,\
+	45,56,51,\
+	57,43,51,\
+	58,54,68,\
+	47,54,58,\
+	69,68,62,\
+	68,54,62,\
+	50,43,57,\
+	57,62,50,\
+	50,62,54,\
+	50,54,46,\
+	46,37,50,\
+	50,37,43,\
+	6,18,13,\
+	13,18,19,\
+	12,4,8,\
+	4,2,8,\
+	8,2,0,\
+	0,9,8,\
+	7,11,15,\
+	15,9,7,\
+	12,8,16,\
+	16,8,9,\
+	28,14,23,\
+	28,39,32,\
+	23,34,28,\
+	28,34,39,\
+	85,79,87,\
+	87,80,86,\
+	86,80,84,\
+	80,76,84,\
+	38,44,49,\
+	44,55,49,\
+	45,38,49,\
+	49,56,45,\
+	61,56,49,\
+	49,55,61,\
+	19,18,25,\
+	30,37,25,\
+	60,51,56,\
+	60,56,67,\
+	67,71,60,\
+	57,51,60,\
+	32,38,26,\
+	26,38,31,\
+	22,27,30,\
+	22,16,27,\
+	30,25,22,\
+	22,25,18,\
+	22,18,12,\
+	12,16,22,\
+	9,15,20,\
+	20,16,9,\
+	15,26,20,\
+	20,26,31,\
+	31,27,20,\
+	27,16,20,\
+	14,28,21,\
+	11,14,21,\
+	21,15,11,\
+	21,26,15,\
+	21,28,32,\
+	32,26,21,\
+	80,87,78,\
+	78,87,79,\
+	67,72,78,\
+	78,72,80,\
+	78,71,67,\
+	78,79,71,\
+	82,84,76,\
+	82,73,77,\
+	82,76,73,\
+	33,29,19,\
+	19,25,33,\
+	33,25,37,\
+	33,37,46,\
+	33,46,40,\
+	40,29,33,\
+	57,60,65,\
+	65,60,71,\
+	69,62,65,\
+	65,62,57,\
+	65,75,69,\
+	65,71,75}
+
+
+
+
+#define DISC_6_VERT_CNT 93
+#define DISC_6_FACE_CNT 152
+
+#define DISC_6_VERT {\
+	-0.99999594,0.00284872,\
+	-0.98015885,-0.19821361,\
+	-0.97910452,0.20335765,\
+	-0.91824742,-0.39600716,\
+	-0.91642084,0.40021599,\
+	-0.82654691,0.00183534,\
+	-0.81562261,-0.57858427,\
+	-0.81341702,0.58168096,\
+	-0.77345426,-0.20215086,\
+	-0.77255314,0.20518626,\
+	-0.68941469,-0.39022484,\
+	-0.68791137,0.39257991,\
+	-0.68013707,-0.73308497,\
+	-0.67797282,0.73508697,\
+	-0.63913549,0.00098160,\
+	-0.57136133,-0.55867323,\
+	-0.56955891,0.56029565,\
+	-0.55841446,-0.19717430,\
+	-0.55776663,0.19858763,\
+	-0.52205785,-0.85291008,\
+	-0.52029081,0.85398915,\
+	-0.45456737,-0.37917968,\
+	-0.45349521,0.38007165,\
+	-0.43551427,0.00036391,\
+	-0.42468657,-0.70786237,\
+	-0.42282245,0.70855298,\
+	-0.34062665,-0.94019864,\
+	-0.33950518,0.94060419,\
+	-0.33587158,-0.19180791,\
+	-0.33542840,0.19211031,\
+	-0.32531753,-0.54899060,\
+	-0.32390201,0.54924250,\
+	-0.22054068,-0.00005872,\
+	-0.22037192,-0.37480908,\
+	-0.21959501,-0.76658281,\
+	-0.21957535,0.37471145,\
+	-0.21830443,0.76666288,\
+	-0.14314293,0.98970203,\
+	-0.14308842,-0.98970991,\
+	-0.11182136,-0.58019243,\
+	-0.11144867,-0.19011261,\
+	-0.11115009,0.18972064,\
+	-0.11083876,0.58001267,\
+	-0.01202610,-0.79058356,\
+	-0.01146133,0.79044198,\
+	-0.00296704,-0.38655576,\
+	-0.00251985,0.38610767,\
+	0,0,\
+	0.05602383,0.99842943,\
+	0.05753595,-0.99834343,\
+	0.09929965,-0.58445955,\
+	0.09978787,0.58401640,\
+	0.10720128,-0.19034642,\
+	0.10732262,0.18978004,\
+	0.19741560,0.77640808,\
+	0.19782431,-0.77666373,\
+	0.21311365,-0.37705130,\
+	0.21324783,0.37657351,\
+	0.21679316,-0.00027758,\
+	0.24086510,0.97055861,\
+	0.24384828,-0.96981339,\
+	0.30997719,-0.55399954,\
+	0.31009251,0.55373347,\
+	0.33113518,0.19349214,\
+	0.33116892,-0.19395431,\
+	0.39954032,0.71482630,\
+	0.40007135,-0.71505270,\
+	0.41738752,0.90872859,\
+	0.42074496,-0.90717897,\
+	0.43070234,-0.00023714,\
+	0.44655149,0.38465307,\
+	0.44677948,-0.38475532,\
+	0.55227292,0.20180373,\
+	0.55259284,-0.20199491,\
+	0.55434766,-0.56898461,\
+	0.55436109,0.56926283,\
+	0.59389421,0.80454314,\
+	0.59763878,-0.80176548,\
+	0.63149306,0.00014085,\
+	0.68111571,0.39992460,\
+	0.68195669,-0.39979424,\
+	0.73619953,0.67676455,\
+	0.74034292,-0.67222940,\
+	0.76516557,0.20592197,\
+	0.76545629,-0.20472387,\
+	0.81831151,0.00107473,\
+	0.84515629,0.53451927,\
+	0.84868724,-0.52889504,\
+	0.92889024,0.37035513,\
+	0.93106261,-0.36485945,\
+	0.98208177,0.18845531,\
+	0.98320733,-0.18249206,\
+	0.99999396,0.00347432}
+
+
+
+#define DISC_6_FACE {\
+	47,41,32,\
+	29,35,22,\
+	23,32,29,\
+	29,41,35,\
+	29,32,41,\
+	8,1,3,\
+	6,12,15,\
+	15,12,24,\
+	71,80,73,\
+	73,80,84,\
+	19,24,12,\
+	87,80,82,\
+	48,37,44,\
+	22,11,18,\
+	18,29,22,\
+	23,29,18,\
+	44,37,36,\
+	22,35,31,\
+	31,36,25,\
+	16,11,22,\
+	22,31,16,\
+	16,31,25,\
+	16,25,13,\
+	13,7,16,\
+	16,7,11,\
+	11,7,4,\
+	13,25,20,\
+	69,73,78,\
+	78,73,84,\
+	49,60,55,\
+	60,68,55,\
+	47,32,40,\
+	45,33,39,\
+	45,40,33,\
+	0,1,5,\
+	1,8,5,\
+	39,33,30,\
+	33,21,30,\
+	15,24,30,\
+	30,21,15,\
+	39,30,34,\
+	34,30,24,\
+	24,19,26,\
+	26,34,24,\
+	74,80,71,\
+	74,82,80,\
+	77,82,74,\
+	0,5,2,\
+	27,36,37,\
+	25,36,27,\
+	27,20,25,\
+	89,91,84,\
+	89,80,87,\
+	84,80,89,\
+	85,83,78,\
+	85,78,84,\
+	92,90,85,\
+	85,90,83,\
+	85,91,92,\
+	84,91,85,\
+	69,58,64,\
+	64,73,69,\
+	64,56,71,\
+	71,73,64,\
+	81,76,75,\
+	63,58,69,\
+	78,83,72,\
+	70,63,72,\
+	69,78,72,\
+	72,63,69,\
+	35,41,46,\
+	28,21,33,\
+	28,32,23,\
+	28,40,32,\
+	33,40,28,\
+	15,21,10,\
+	10,6,15,\
+	10,8,3,\
+	3,6,10,\
+	47,40,52,\
+	52,58,47,\
+	52,45,56,\
+	40,45,52,\
+	56,64,52,\
+	52,64,58,\
+	56,45,50,\
+	50,45,39,\
+	9,18,11,\
+	9,2,5,\
+	11,4,9,\
+	4,2,9,\
+	23,18,14,\
+	14,5,8,\
+	18,9,14,\
+	14,9,5,\
+	34,26,38,\
+	83,90,88,\
+	79,86,81,\
+	81,75,79,\
+	79,88,86,\
+	83,88,79,\
+	79,75,70,\
+	79,72,83,\
+	70,72,79,\
+	65,76,67,\
+	65,75,76,\
+	44,36,42,\
+	36,31,42,\
+	42,31,35,\
+	35,46,42,\
+	53,46,41,\
+	58,63,53,\
+	53,41,47,\
+	47,58,53,\
+	66,55,68,\
+	66,68,77,\
+	77,74,66,\
+	55,50,43,\
+	49,55,43,\
+	39,34,43,\
+	43,50,39,\
+	43,38,49,\
+	34,38,43,\
+	21,28,17,\
+	17,14,8,\
+	17,28,23,\
+	23,14,17,\
+	8,10,17,\
+	17,10,21,\
+	67,59,54,\
+	54,65,67,\
+	54,48,44,\
+	54,59,48,\
+	56,50,61,\
+	71,56,61,\
+	61,50,55,\
+	55,66,61,\
+	61,74,71,\
+	61,66,74,\
+	65,54,62,\
+	70,75,62,\
+	75,65,62,\
+	57,63,70,\
+	70,62,57,\
+	57,53,63,\
+	46,53,57,\
+	51,62,54,\
+	44,42,51,\
+	51,54,44,\
+	51,42,46,\
+	46,57,51,\
+	51,57,62}
+
+
+
+#define DISC_7_VERT_CNT 362
+#define DISC_7_FACE_CNT 661
+
+#define DISC_7_VERT {\
+	-0.99985012,-0.01731283,\
+	-0.99556874,0.09403658,\
+	-0.99269568,-0.12064526,\
+	-0.98039504,0.19704206,\
+	-0.97660422,-0.21504466,\
+	-0.95669950,0.29107744,\
+	-0.94719722,-0.32065158,\
+	-0.91977079,0.03476785,\
+	-0.91823426,0.39603769,\
+	-0.90908498,-0.41661072,\
+	-0.90313047,-0.06293630,\
+	-0.89624049,0.13116999,\
+	-0.88570063,-0.24444376,\
+	-0.87285522,-0.15209581,\
+	-0.87078228,0.49166881,\
+	-0.86621517,0.31178476,\
+	-0.86248349,-0.50608521,\
+	-0.86017585,0.21863311,\
+	-0.84170474,-0.33621087,\
+	-0.83205774,0.03180995,\
+	-0.81606482,0.40088958,\
+	-0.81347977,0.58159321,\
+	-0.80880317,-0.58807944,\
+	-0.80047290,-0.06419645,\
+	-0.79753133,-0.42481233,\
+	-0.79434394,0.12621664,\
+	-0.79156344,-0.23872410,\
+	-0.77379916,0.30089105,\
+	-0.76620500,0.48702609,\
+	-0.75497372,-0.15055590,\
+	-0.74996331,-0.66147943,\
+	-0.74615778,0.66576915,\
+	-0.74611496,-0.50995249,\
+	-0.74439196,0.21110739,\
+	-0.73952765,-0.33215782,\
+	-0.73635629,0.02967200,\
+	-0.71613187,0.39251295,\
+	-0.70989537,0.56969253,\
+	-0.69311231,-0.06262365,\
+	-0.68888730,-0.42215438,\
+	-0.68873812,-0.23996840,\
+	-0.68789855,0.12079206,\
+	-0.68775964,-0.59169999,\
+	-0.68761735,-0.72607326,\
+	-0.67300273,0.29907892,\
+	-0.66617715,0.74579354,\
+	-0.66034349,0.48247936,\
+	-0.64930335,0.65326738,\
+	-0.64251884,-0.15112779,\
+	-0.63423915,-0.33167241,\
+	-0.63367295,0.20893327,\
+	-0.63361786,0.02834503,\
+	-0.63317442,-0.50922682,\
+	-0.62397583,-0.67129702,\
+	-0.61668934,-0.78720662,\
+	-0.61382861,0.39179584,\
+	-0.60234812,0.57529738,\
+	-0.58817577,0.80873313,\
+	-0.58517097,-0.06261579,\
+	-0.58292803,-0.24113073,\
+	-0.58101739,0.11895848,\
+	-0.58024100,-0.42164527,\
+	-0.57342995,-0.59754731,\
+	-0.57023518,0.30017942,\
+	-0.56388698,0.69225657,\
+	-0.55806447,0.48787902,\
+	-0.54436708,-0.83884712,\
+	-0.53471317,-0.71163670,\
+	-0.53277416,-0.15208460,\
+	-0.52845305,0.02779153,\
+	-0.52768053,-0.33226318,\
+	-0.52626233,0.20945175,\
+	-0.52436621,-0.51257293,\
+	-0.51322968,0.39629252,\
+	-0.51105321,0.59594241,\
+	-0.50070339,0.77148078,\
+	-0.49284021,0.87011983,\
+	-0.47766585,-0.06283164,\
+	-0.47709422,-0.61587203,\
+	-0.47617996,-0.24250847,\
+	-0.47492414,0.11881178,\
+	-0.47332691,-0.42421072,\
+	-0.46891581,-0.78503521,\
+	-0.46830639,0.30307673,\
+	-0.46266258,0.49810707,\
+	-0.45708829,-0.88942133,\
+	-0.45576672,0.68708769,\
+	-0.42465108,-0.15303707,\
+	-0.42294844,-0.52165201,\
+	-0.42277498,0.02774444,\
+	-0.42158212,-0.33447625,\
+	-0.42154903,-0.70155621,\
+	-0.42118179,0.21073835,\
+	-0.41518847,0.40003793,\
+	-0.41014166,0.59223339,\
+	-0.40595577,0.79572144,\
+	-0.40144096,0.91588490,\
+	-0.37410706,-0.80603587,\
+	-0.37388864,-0.92747360,\
+	-0.37084162,-0.06308696,\
+	-0.36985500,-0.24430894,\
+	-0.36964213,0.11912678,\
+	-0.36961888,-0.42913077,\
+	-0.36950835,-0.61160154,\
+	-0.36692104,0.30426278,\
+	-0.36331898,0.49302705,\
+	-0.36031066,0.69223951,\
+	-0.32677443,0.86283108,\
+	-0.32032475,-0.70802805,\
+	-0.31779837,-0.15404597,\
+	-0.31719767,0.02776004,\
+	-0.31689202,0.21090281,\
+	-0.31681891,-0.33738863,\
+	-0.31666303,-0.52066428,\
+	-0.31403760,0.39611874,\
+	-0.31388083,0.58661750,\
+	-0.30063918,-0.86564249,\
+	-0.29840707,0.77598627,\
+	-0.29118453,0.95666691,\
+	-0.27578713,-0.96121874,\
+	-0.26535904,-0.61453199,\
+	-0.26467581,-0.06343296,\
+	-0.26450873,0.11893627,\
+	-0.26436076,-0.24601298,\
+	-0.26386624,-0.42936315,\
+	-0.26351074,0.30183375,\
+	-0.26320615,0.48755038,\
+	-0.26150637,0.67168140,\
+	-0.25770896,-0.78444210,\
+	-0.22690147,0.87620642,\
+	-0.21182740,-0.15487168,\
+	-0.21176386,0.20951421,\
+	-0.21163724,0.02750973,\
+	-0.21161899,-0.52224104,\
+	-0.21148579,0.57469681,\
+	-0.21131707,0.39241729,\
+	-0.21124026,-0.33783321,\
+	-0.20788702,-0.69983910,\
+	-0.20780059,0.75906391,\
+	-0.20086952,-0.88206044,\
+	-0.18395374,0.98293490,\
+	-0.18078187,-0.98352322,\
+	-0.15939234,0.66160521,\
+	-0.15922292,0.48080959,\
+	-0.15893189,0.29974615,\
+	-0.15886894,0.11819996,\
+	-0.15886163,-0.06382544,\
+	-0.15865438,-0.24633350,\
+	-0.15863082,-0.43008689,\
+	-0.15661700,-0.61168462,\
+	-0.15237093,-0.79492683,\
+	-0.14769055,0.83062127,\
+	-0.11501977,0.91942011,\
+	-0.10707064,0.56845652,\
+	-0.10625023,0.38888883,\
+	-0.10607360,0.20858766,\
+	-0.10591811,-0.15506199,\
+	-0.10588439,0.02705488,\
+	-0.10583335,-0.33807967,\
+	-0.10494986,-0.52098388,\
+	-0.10231280,-0.70603494,\
+	-0.10202215,-0.89648205,\
+	-0.10135860,0.74182236,\
+	-0.08783976,-0.99613462,\
+	-0.06094913,0.99814087,\
+	-0.05373065,0.47736208,\
+	-0.05303096,0.29846240,\
+	-0.05301161,0.11775088,\
+	-0.05297384,-0.24644770,\
+	-0.05294683,-0.06402902,\
+	-0.05272878,-0.42928088,\
+	-0.05216873,0.65274562,\
+	-0.05171351,-0.61420930,\
+	-0.05042272,-0.80478224,\
+	-0.04981932,0.83515353,\
+	-0.00000007,0.92251132,\
+	-0.00000000,-1.00000000,\
+	-0.00000000,-0.52141188,\
+	-0.00000000,-0.33767237,\
+	0.00000000,-0.70961850,\
+	0.00000000,-0.15521184,\
+	0,0,\
+	0.00000000,-0.90399448,\
+	0.00000001,0.74344973,\
+	0.00000001,0.20812464,\
+	0.00000002,0.38816625,\
+	0.00000004,0.56389175,\
+	0.04981924,0.83515359,\
+	0.05042272,-0.80478224,\
+	0.05171351,-0.61420930,\
+	0.05216879,0.65274567,\
+	0.05272878,-0.42928088,\
+	0.05294684,-0.06402902,\
+	0.05297384,-0.24644770,\
+	0.05301163,0.11775088,\
+	0.05303099,0.29846241,\
+	0.05373071,0.47736209,\
+	0.06094900,0.99814088,\
+	0.08783975,-0.99613462,\
+	0.10135861,0.74182246,\
+	0.10202215,-0.89648204,\
+	0.10231280,-0.70603494,\
+	0.10494986,-0.52098388,\
+	0.10583335,-0.33807967,\
+	0.10588440,0.02705488,\
+	0.10591811,-0.15506198,\
+	0.10607362,0.20858767,\
+	0.10625027,0.38888884,\
+	0.10707069,0.56845657,\
+	0.11501967,0.91942019,\
+	0.14769048,0.83062141,\
+	0.15237093,-0.79492683,\
+	0.15661700,-0.61168462,\
+	0.15863082,-0.43008689,\
+	0.15865438,-0.24633350,\
+	0.15886163,-0.06382544,\
+	0.15886894,0.11819997,\
+	0.15893191,0.29974616,\
+	0.15922296,0.48080963,\
+	0.15939237,0.66160531,\
+	0.18078187,-0.98352322,\
+	0.18395366,0.98293492,\
+	0.20086952,-0.88206044,\
+	0.20780058,0.75906403,\
+	0.20788703,-0.69983910,\
+	0.21124026,-0.33783320,\
+	0.21131709,0.39241733,\
+	0.21148582,0.57469689,\
+	0.21161899,-0.52224104,\
+	0.21163724,0.02750974,\
+	0.21176387,0.20951422,\
+	0.21182740,-0.15487168,\
+	0.22690143,0.87620648,\
+	0.25770896,-0.78444210,\
+	0.26150638,0.67168149,\
+	0.26320616,0.48755045,\
+	0.26351075,0.30183378,\
+	0.26386624,-0.42936315,\
+	0.26436076,-0.24601298,\
+	0.26450874,0.11893627,\
+	0.26467581,-0.06343296,\
+	0.26535904,-0.61453199,\
+	0.27578713,-0.96121874,\
+	0.29118450,0.95666692,\
+	0.29840708,0.77598632,\
+	0.30063918,-0.86564249,\
+	0.31388084,0.58661757,\
+	0.31403760,0.39611878,\
+	0.31666303,-0.52066428,\
+	0.31681891,-0.33738863,\
+	0.31689202,0.21090283,\
+	0.31719767,0.02776005,\
+	0.31779837,-0.15404597,\
+	0.32032475,-0.70802805,\
+	0.32677441,0.86283111,\
+	0.36031068,0.69223956,\
+	0.36331898,0.49302709,\
+	0.36692104,0.30426281,\
+	0.36950835,-0.61160154,\
+	0.36961888,-0.42913077,\
+	0.36964213,0.11912679,\
+	0.36985500,-0.24430894,\
+	0.37084162,-0.06308696,\
+	0.37388864,-0.92747360,\
+	0.37410706,-0.80603587,\
+	0.40144095,0.91588491,\
+	0.40595578,0.79572146,\
+	0.41014167,0.59223341,\
+	0.41518846,0.40003796,\
+	0.42118179,0.21073837,\
+	0.42154903,-0.70155621,\
+	0.42158212,-0.33447625,\
+	0.42277498,0.02774444,\
+	0.42294844,-0.52165201,\
+	0.42465107,-0.15303707,\
+	0.45576674,0.68708770,\
+	0.45708829,-0.88942133,\
+	0.46266258,0.49810708,\
+	0.46830638,0.30307674,\
+	0.46891581,-0.78503521,\
+	0.47332691,-0.42421072,\
+	0.47492414,0.11881178,\
+	0.47617996,-0.24250847,\
+	0.47709422,-0.61587202,\
+	0.47766585,-0.06283163,\
+	0.49284023,0.87011983,\
+	0.50070341,0.77148078,\
+	0.51105323,0.59594241,\
+	0.51322967,0.39629252,\
+	0.52436621,-0.51257293,\
+	0.52626232,0.20945175,\
+	0.52768053,-0.33226318,\
+	0.52845304,0.02779153,\
+	0.53277416,-0.15208460,\
+	0.53471317,-0.71163670,\
+	0.54436708,-0.83884712,\
+	0.55806447,0.48787901,\
+	0.56388700,0.69225656,\
+	0.57023517,0.30017942,\
+	0.57342995,-0.59754731,\
+	0.58024100,-0.42164527,\
+	0.58101739,0.11895848,\
+	0.58292803,-0.24113073,\
+	0.58517097,-0.06261579,\
+	0.58817579,0.80873311,\
+	0.60234813,0.57529736,\
+	0.61382860,0.39179583,\
+	0.61668934,-0.78720662,\
+	0.62397583,-0.67129702,\
+	0.63317442,-0.50922682,\
+	0.63361786,0.02834503,\
+	0.63367294,0.20893326,\
+	0.63423914,-0.33167241,\
+	0.64251884,-0.15112779,\
+	0.64930338,0.65326736,\
+	0.66034350,0.48247934,\
+	0.66617718,0.74579352,\
+	0.67300273,0.29907891,\
+	0.68761735,-0.72607326,\
+	0.68775963,-0.59169999,\
+	0.68789855,0.12079206,\
+	0.68873811,-0.23996840,\
+	0.68888730,-0.42215438,\
+	0.69311230,-0.06262365,\
+	0.70989539,0.56969251,\
+	0.71613187,0.39251294,\
+	0.73635629,0.02967200,\
+	0.73952765,-0.33215782,\
+	0.74439196,0.21110738,\
+	0.74611496,-0.50995249,\
+	0.74615780,0.66576913,\
+	0.74996331,-0.66147943,\
+	0.75497372,-0.15055590,\
+	0.76620501,0.48702606,\
+	0.77379915,0.30089103,\
+	0.79156344,-0.23872410,\
+	0.79434393,0.12621664,\
+	0.79753133,-0.42481233,\
+	0.80047290,-0.06419645,\
+	0.80880317,-0.58807944,\
+	0.81347979,0.58159318,\
+	0.81606483,0.40088956,\
+	0.83205774,0.03180994,\
+	0.84170474,-0.33621087,\
+	0.86017584,0.21863310,\
+	0.86248349,-0.50608521,\
+	0.86621517,0.31178474,\
+	0.87078230,0.49166878,\
+	0.87285522,-0.15209581,\
+	0.88570063,-0.24444376,\
+	0.89624049,0.13116998,\
+	0.90313047,-0.06293631,\
+	0.90908498,-0.41661073,\
+	0.91823427,0.39603766,\
+	0.91977079,0.03476784,\
+	0.94719721,-0.32065159,\
+	0.95669950,0.29107742,\
+	0.97660422,-0.21504466,\
+	0.98039504,0.19704204,\
+	0.99269568,-0.12064527,\
+	0.99556874,0.09403657,\
+	0.99985012,-0.01731283}
+
+
+
+#define DISC_7_FACE {\
+	130,146,121,\
+	17,5,3,\
+	197,209,221,\
+	243,221,232,\
+	232,221,209,\
+	226,218,207,\
+	67,66,82,\
+	132,121,146,\
+	122,110,132,\
+	132,110,121,\
+	3,1,11,\
+	11,17,3,\
+	54,67,53,\
+	66,67,54,\
+	294,279,295,\
+	295,279,276,\
+	279,294,270,\
+	272,251,262,\
+	357,349,355,\
+	236,230,250,\
+	50,44,33,\
+	63,44,50,\
+	277,296,287,\
+	226,236,247,\
+	166,184,195,\
+	155,184,166,\
+	93,104,114,\
+	135,114,125,\
+	125,114,104,\
+	76,95,96,\
+	106,115,127,\
+	115,106,94,\
+	117,106,127,\
+	95,106,117,\
+	196,207,218,\
+	126,114,135,\
+	127,115,134,\
+	115,126,134,\
+	162,174,151,\
+	53,67,62,\
+	72,52,62,\
+	237,259,249,\
+	136,148,158,\
+	122,132,145,\
+	130,121,109,\
+	109,123,130,\
+	100,123,109,\
+	0,2,10,\
+	37,47,31,\
+	31,47,45,\
+	45,47,64,\
+	64,57,45,\
+	75,57,64,\
+	75,95,76,\
+	76,57,75,\
+	28,14,20,\
+	361,360,354,\
+	321,327,335,\
+	357,359,348,\
+	348,349,357,\
+	348,335,349,\
+	315,296,306,\
+	315,306,325,\
+	41,50,33,\
+	329,339,345,\
+	242,222,220,\
+	176,198,182,\
+	182,163,176,\
+	161,182,173,\
+	163,182,161,\
+	53,30,43,\
+	43,54,53,\
+	24,9,16,\
+	42,22,30,\
+	42,30,53,\
+	53,62,42,\
+	42,62,52,\
+	307,294,295,\
+	299,294,308,\
+	308,307,318,\
+	294,307,308,\
+	283,299,289,\
+	283,294,299,\
+	283,270,294,\
+	225,237,249,\
+	217,207,195,\
+	230,236,217,\
+	226,207,217,\
+	217,236,226,\
+	250,230,239,\
+	239,229,251,\
+	251,272,260,\
+	260,272,281,\
+	281,269,260,\
+	260,239,251,\
+	260,269,250,\
+	250,239,260,\
+	268,247,257,\
+	257,247,236,\
+	250,269,257,\
+	257,236,250,\
+	314,330,316,\
+	244,232,223,\
+	84,73,93,\
+	84,94,74,\
+	83,73,63,\
+	83,92,104,\
+	83,104,93,\
+	93,73,83,\
+	55,44,63,\
+	63,73,55,\
+	246,227,235,\
+	235,227,218,\
+	235,218,226,\
+	226,247,235,\
+	181,204,194,\
+	155,166,144,\
+	135,125,144,\
+	80,92,71,\
+	63,50,71,\
+	71,83,63,\
+	92,83,71,\
+	101,110,122,\
+	101,92,80,\
+	131,144,125,\
+	122,145,131,\
+	131,145,155,\
+	155,144,131,\
+	111,125,104,\
+	104,92,111,\
+	111,131,125,\
+	122,131,111,\
+	111,101,122,\
+	92,101,111,\
+	164,140,152,\
+	151,174,152,\
+	138,117,127,\
+	138,162,151,\
+	129,140,118,\
+	129,138,151,\
+	117,138,129,\
+	151,152,129,\
+	129,152,140,\
+	107,96,95,\
+	95,117,107,\
+	118,96,107,\
+	107,129,118,\
+	117,129,107,\
+	197,164,175,\
+	175,209,197,\
+	164,152,175,\
+	175,152,174,\
+	218,227,208,\
+	208,196,218,\
+	195,207,185,\
+	207,196,185,\
+	196,165,185,\
+	185,166,195,\
+	143,126,135,\
+	143,165,153,\
+	153,134,143,\
+	143,134,126,\
+	93,114,105,\
+	114,126,105,\
+	105,84,93,\
+	105,126,115,\
+	115,94,105,\
+	94,84,105,\
+	78,62,67,\
+	78,88,72,\
+	72,62,78,\
+	13,2,4,\
+	13,10,2,\
+	18,9,24,\
+	18,6,9,\
+	61,52,72,\
+	61,39,52,\
+	49,39,61,\
+	150,161,173,\
+	81,61,72,\
+	72,88,81,\
+	88,102,81,\
+	124,148,136,\
+	238,252,231,\
+	238,225,249,\
+	249,259,271,\
+	136,158,147,\
+	147,123,136,\
+	130,123,147,\
+	68,77,58,\
+	58,48,68,\
+	155,145,167,\
+	167,184,155,\
+	181,194,167,\
+	167,194,184,\
+	112,123,100,\
+	136,123,112,\
+	112,124,136,\
+	102,124,112,\
+	89,101,80,\
+	110,101,89,\
+	87,68,79,\
+	77,68,87,\
+	79,100,87,\
+	100,109,87,\
+	21,14,28,\
+	28,37,21,\
+	21,37,31,\
+	86,64,74,\
+	86,75,64,\
+	74,94,86,\
+	86,94,106,\
+	86,106,95,\
+	95,75,86,\
+	8,20,14,\
+	33,44,27,\
+	27,17,33,\
+	351,359,361,\
+	351,348,359,\
+	361,354,351,\
+	351,354,342,\
+	321,335,332,\
+	335,348,332,\
+	310,301,292,\
+	281,272,292,\
+	292,301,281,\
+	353,346,356,\
+	344,346,334,\
+	358,356,344,\
+	344,356,346,\
+	288,306,296,\
+	288,277,268,\
+	296,277,288,\
+	298,306,288,\
+	334,346,341,\
+	341,325,334,\
+	341,346,353,\
+	341,353,347,\
+	50,41,60,\
+	80,71,60,\
+	60,71,50,\
+	309,329,322,\
+	322,300,309,\
+	289,299,309,\
+	309,300,289,\
+	173,182,188,\
+	188,179,173,\
+	211,201,188,\
+	188,201,179,\
+	200,182,198,\
+	198,220,200,\
+	200,220,222,\
+	200,188,182,\
+	200,222,211,\
+	211,188,200,\
+	163,161,141,\
+	82,66,85,\
+	116,98,119,\
+	22,42,32,\
+	24,16,32,\
+	32,16,22,\
+	32,39,24,\
+	32,42,52,\
+	52,39,32,\
+	331,308,318,\
+	212,201,224,\
+	224,201,211,\
+	258,253,270,\
+	270,283,258,\
+	245,242,263,\
+	245,222,242,\
+	237,225,213,\
+	231,252,240,\
+	251,229,240,\
+	262,251,240,\
+	240,252,262,\
+	349,335,343,\
+	343,335,327,\
+	343,355,349,\
+	343,352,355,\
+	333,341,347,\
+	315,325,333,\
+	325,341,333,\
+	340,333,347,\
+	330,314,324,\
+	315,333,324,\
+	324,340,330,\
+	333,340,324,\
+	287,296,305,\
+	305,296,315,\
+	315,324,305,\
+	305,324,314,\
+	234,244,223,\
+	234,227,246,\
+	243,232,254,\
+	232,244,254,\
+	254,265,243,\
+	46,37,28,\
+	44,55,36,\
+	36,27,44,\
+	20,27,36,\
+	55,46,36,\
+	28,20,36,\
+	36,46,28,\
+	246,235,256,\
+	268,277,256,\
+	256,247,268,\
+	256,235,247,\
+	206,217,195,\
+	230,217,206,\
+	195,184,206,\
+	184,194,206,\
+	142,138,127,\
+	162,138,142,\
+	127,134,142,\
+	142,134,153,\
+	210,199,223,\
+	223,232,210,\
+	210,232,209,\
+	174,162,183,\
+	219,208,227,\
+	227,234,219,\
+	223,199,219,\
+	219,234,223,\
+	196,208,186,\
+	153,165,186,\
+	186,165,196,\
+	135,144,154,\
+	154,143,135,\
+	165,143,154,\
+	154,144,166,\
+	166,185,154,\
+	154,185,165,\
+	82,85,97,\
+	97,85,98,\
+	98,116,97,\
+	34,39,49,\
+	34,18,24,\
+	24,39,34,\
+	6,18,12,\
+	4,6,12,\
+	12,13,4,\
+	202,213,191,\
+	137,150,160,\
+	179,172,160,\
+	173,179,160,\
+	160,150,173,\
+	61,81,70,\
+	49,61,70,\
+	113,102,88,\
+	113,124,102,\
+	291,271,280,\
+	289,300,280,\
+	280,300,291,\
+	280,271,259,\
+	261,238,249,\
+	249,271,261,\
+	252,238,261,\
+	322,327,312,\
+	312,300,322,\
+	312,327,321,\
+	321,302,312,\
+	291,300,312,\
+	312,302,291,\
+	156,146,130,\
+	130,147,156,\
+	59,68,48,\
+	49,70,59,\
+	79,68,59,\
+	59,70,79,\
+	157,167,145,\
+	157,132,146,\
+	157,145,132,\
+	181,167,157,\
+	90,112,100,\
+	90,70,81,\
+	90,81,102,\
+	102,112,90,\
+	90,100,79,\
+	79,70,90,\
+	99,89,77,\
+	77,87,99,\
+	99,87,109,\
+	110,89,99,\
+	121,110,99,\
+	99,109,121,\
+	0,10,7,\
+	10,19,7,\
+	7,1,0,\
+	7,11,1,\
+	7,19,11,\
+	25,41,33,\
+	11,19,25,\
+	33,17,25,\
+	17,11,25,\
+	23,19,10,\
+	23,13,29,\
+	10,13,23,\
+	20,8,15,\
+	15,27,20,\
+	15,8,5,\
+	5,17,15,\
+	17,27,15,\
+	321,332,313,\
+	313,302,321,\
+	293,302,313,\
+	338,351,342,\
+	348,351,338,\
+	338,332,348,\
+	281,301,290,\
+	290,269,281,\
+	298,290,311,\
+	311,290,301,\
+	310,292,303,\
+	293,313,303,\
+	284,272,262,\
+	284,292,272,\
+	293,303,284,\
+	284,303,292,\
+	342,354,350,\
+	350,354,360,\
+	350,360,358,\
+	358,344,350,\
+	268,257,278,\
+	278,288,268,\
+	298,288,278,\
+	278,257,269,\
+	278,290,298,\
+	269,290,278,\
+	51,60,41,\
+	139,141,161,\
+	161,150,139,\
+	119,141,139,\
+	139,116,119,\
+	308,331,319,\
+	299,308,319,\
+	319,309,299,\
+	329,309,319,\
+	339,329,319,\
+	319,331,339,\
+	241,224,253,\
+	253,258,241,\
+	212,224,241,\
+	259,237,248,\
+	248,241,258,\
+	253,224,233,\
+	233,224,211,\
+	211,222,233,\
+	222,245,233,\
+	279,270,264,\
+	270,253,264,\
+	264,276,279,\
+	253,233,264,\
+	264,233,245,\
+	263,276,264,\
+	264,245,263,\
+	214,238,231,\
+	225,238,214,\
+	191,213,203,\
+	203,213,225,\
+	203,178,191,\
+	193,178,203,\
+	225,214,203,\
+	203,214,193,\
+	215,229,204,\
+	231,240,215,\
+	215,240,229,\
+	345,352,337,\
+	352,343,337,\
+	337,329,345,\
+	322,329,337,\
+	337,327,322,\
+	337,343,327,\
+	287,275,267,\
+	267,277,287,\
+	246,256,267,\
+	267,256,277,\
+	244,234,255,\
+	255,267,275,\
+	255,234,246,\
+	246,267,255,\
+	297,275,287,\
+	287,305,297,\
+	297,305,314,\
+	297,316,304,\
+	297,314,316,\
+	265,254,266,\
+	266,255,275,\
+	266,254,244,\
+	244,255,266,\
+	65,84,74,\
+	73,84,65,\
+	65,55,73,\
+	65,46,55,\
+	47,37,56,\
+	37,46,56,\
+	46,65,56,\
+	56,65,74,\
+	74,64,56,\
+	64,47,56,\
+	216,239,230,\
+	230,206,216,\
+	229,239,216,\
+	204,229,216,\
+	216,194,204,\
+	216,206,194,\
+	199,210,187,\
+	187,183,199,\
+	174,183,187,\
+	187,175,174,\
+	209,175,187,\
+	187,210,209,\
+	171,183,162,\
+	153,186,171,\
+	171,142,153,\
+	162,142,171,\
+	91,67,82,\
+	91,78,67,\
+	82,97,91,\
+	97,108,91,\
+	128,97,116,\
+	128,108,97,\
+	137,108,128,\
+	128,150,137,\
+	128,139,150,\
+	116,139,128,\
+	40,34,49,\
+	49,59,40,\
+	40,59,48,\
+	40,48,29,\
+	18,34,26,\
+	26,12,18,\
+	34,40,26,\
+	26,40,29,\
+	29,13,26,\
+	13,12,26,\
+	191,178,170,\
+	170,178,158,\
+	170,158,148,\
+	148,159,170,\
+	172,159,149,\
+	137,160,149,\
+	149,160,172,\
+	212,202,189,\
+	189,201,212,\
+	179,201,189,\
+	189,172,179,\
+	120,108,137,\
+	137,149,120,\
+	293,284,274,\
+	274,284,262,\
+	262,252,274,\
+	252,261,274,\
+	282,271,291,\
+	282,261,271,\
+	282,274,261,\
+	293,274,282,\
+	282,302,293,\
+	291,302,282,\
+	192,204,181,\
+	192,215,204,\
+	146,156,169,\
+	169,157,146,\
+	181,157,169,\
+	169,192,181,\
+	323,313,332,\
+	332,338,323,\
+	310,303,323,\
+	323,303,313,\
+	298,311,317,\
+	317,306,298,\
+	317,325,306,\
+	334,325,317,\
+	328,344,334,\
+	334,317,328,\
+	328,317,311,\
+	310,323,326,\
+	326,338,342,\
+	326,323,338,\
+	60,51,69,\
+	80,60,69,\
+	58,77,69,\
+	69,51,58,\
+	69,89,80,\
+	77,89,69,\
+	19,23,35,\
+	35,51,41,\
+	41,25,35,\
+	35,25,19,\
+	273,248,258,\
+	273,283,289,\
+	273,258,283,\
+	259,248,273,\
+	289,280,273,\
+	273,280,259,\
+	237,213,228,\
+	228,248,237,\
+	228,202,212,\
+	213,202,228,\
+	212,241,228,\
+	241,248,228,\
+	168,178,193,\
+	168,156,147,\
+	168,147,158,\
+	158,178,168,\
+	286,297,304,\
+	275,297,286,\
+	286,266,275,\
+	199,183,190,\
+	183,171,190,\
+	190,219,199,\
+	208,219,190,\
+	190,186,208,\
+	190,171,186,\
+	177,189,202,\
+	177,170,159,\
+	177,159,172,\
+	172,189,177,\
+	177,202,191,\
+	191,170,177,\
+	113,120,133,\
+	133,120,149,\
+	148,124,133,\
+	124,113,133,\
+	133,159,148,\
+	133,149,159,\
+	103,91,108,\
+	108,120,103,\
+	88,78,103,\
+	78,91,103,\
+	103,113,88,\
+	103,120,113,\
+	215,192,205,\
+	205,214,231,\
+	231,215,205,\
+	193,214,205,\
+	192,169,180,\
+	193,205,180,\
+	180,205,192,\
+	180,168,193,\
+	180,169,156,\
+	156,168,180,\
+	336,350,344,\
+	344,328,336,\
+	342,350,336,\
+	336,326,342,\
+	320,328,311,\
+	320,311,301,\
+	320,336,328,\
+	326,336,320,\
+	320,301,310,\
+	310,326,320,\
+	38,35,23,\
+	29,48,38,\
+	38,23,29,\
+	38,48,58,\
+	58,51,38,\
+	51,35,38,\
+	265,266,285,\
+	266,286,285,\
+	285,286,304}
+
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index 700860a..374c7c1 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -36,21 +36,23 @@
 #include <cfloat>
 #include <omp.h>
 #include <vector>
+#include <cmath>
+
 #include "cudamacro.h" /* for time() */
 #include "globals.h"
-//#include "utils.h"
 
 #include <iostream>
 
 #include "cuwsort.cuh"
+#include "ptt.cuh"
+
+#include "utils.cu"
+#include "ptt.cu"
 
 #define MAX_NUM_DIR (128)
 
 #define NTHR_GEN (128)
 
-#define THR_X_BL (64)
-#define THR_X_SL (32)
-
 #define MAX_DIMS        (8)
 #define MAX_STR_LEN     (256)
 
@@ -58,169 +60,22 @@ using namespace cuwsort;
 
 //#define USE_FIXED_PERMUTATION
 #ifdef USE_FIXED_PERMUTATION
-__device__ const int fixedPerm[] = {44, 47, 53,  0,  3,  3, 39,  9, 19, 21, 50, 36, 23,
-                                     6, 24, 24, 12,  1, 38, 39, 23, 46, 24, 17, 37, 25, 
-                                    13,  8,  9, 20, 51, 16, 51,  5, 15, 47,  0, 18, 35, 
-                                    24, 49, 51, 29, 19, 19, 14, 39, 32,  1,  9, 32, 31,
-                                    10, 52, 23};
-#endif
-
-
-template<int BDIM_X,
-         typename REAL_T,
-         typename REAL3_T>
-__device__ int trilinear_interp_d(const int dimx,
-                                  const int dimy,
-                                  const int dimz,
-                                  const int dimt,
-                                  const REAL_T *__restrict__ dataf,
-                                  const REAL3_T point,
-                                        REAL_T *__restrict__ __vox_data) {
-
-        const int tidx = threadIdx.x;
-        //const int tidy = threadIdx.y;
-#if 0        
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-#endif
-        const REAL_T HALF = static_cast<REAL_T>(0.5);
-
-        // all thr compute the same here
-        if (point.x < -HALF || point.x+HALF >= dimx ||
-            point.y < -HALF || point.y+HALF >= dimy ||
-               point.z < -HALF || point.z+HALF >= dimz) {
-                return -1;
-        }
-
-        long long  coo[3][2];
-        REAL wgh[3][2]; // could use just one...
-
-        const REAL_T ONE  = static_cast<REAL_T>(1.0);
-
-        const REAL3_T fl = MAKE_REAL3(FLOOR(point.x),
-                                      FLOOR(point.y),
-                                      FLOOR(point.z));
-
-        wgh[0][1] = point.x - fl.x; 
-        wgh[0][0] = ONE-wgh[0][1]; 
-        coo[0][0] = MAX(0, fl.x);
-        coo[0][1] = MIN(dimx-1, coo[0][0]+1);
-
-        wgh[1][1] = point.y - fl.y; 
-        wgh[1][0] = ONE-wgh[1][1]; 
-        coo[1][0] = MAX(0, fl.y);
-        coo[1][1] = MIN(dimy-1, coo[1][0]+1);
-
-        wgh[2][1] = point.z - fl.z; 
-        wgh[2][0] = ONE-wgh[2][1]; 
-        coo[2][0] = MAX(0, fl.z);
-        coo[2][1] = MIN(dimz-1, coo[2][0]+1);
-
-        //#pragma unroll
-        for(int t = tidx; t < dimt; t += BDIM_X) {
-
-                REAL_T __tmp = 0;
-
-                #pragma unroll
-                for(int i = 0; i < 2; i++) {
-                        #pragma unroll
-                        for(int j = 0; j < 2; j++) {
-                                #pragma unroll
-                                for(int k = 0; k < 2; k++) {
-                                        __tmp += wgh[0][i]*wgh[1][j]*wgh[2][k]*
-                                                 dataf[coo[0][i]*dimy*dimz*dimt +
-                                                       coo[1][j]*dimz*dimt +
-                                                       coo[2][k]*dimt +
-                                                       t];
-                                        /*
-                                        if (tidx == 0 && threadIdx.y == 0 && t==0) {
-                                                printf("wgh[0][%d]: %f, wgh[1][%d]: %f, wgh[2][%d]: %f\n",
-                                                        i, wgh[0][i], j, wgh[1][j], k, wgh[2][k]);
-                                                printf("dataf[%d][%d][%d][%d]: %f\n", coo[0][i], coo[1][j], coo[2][k], t+tidx,
-                                                                dataf[coo[0][i]*dimy*dimz*dimt +
-                                                                coo[1][j]*dimz*dimt +
-                                                                coo[2][k]*dimt +
-                                                                t+tidx]);
-                                        }
-                                        */
-                                }
-                        }
-                }
-                __vox_data[t] = __tmp;
-        }
-#if 0
-        __syncwarp(WMASK);
-        if (tidx == 0 && threadIdx.y == 0) {
-                printf("point: %f, %f, %f\n", point.x, point.y, point.z);
-                for(int i = 0; i < dimt; i++) {
-                        printf("__vox_data[%d]: %f\n", i, __vox_data[i]);
-                }
-        }
-#endif
-        return 0;
-}
-
-template<int BDIM_X,
-         int N,
-         typename VAL_T>
-__device__ void copy_d(      VAL_T *__restrict__ dst,
-                       const VAL_T *__restrict__ src) {
-
-        const int tidx = threadIdx.x;
-
-        #pragma unroll
-        for(int j = 0; j < N; j+= BDIM_X) {
-                if (j+tidx < N) {
-                        dst[j+tidx] = src[j+tidx];
-                }
-        }
-        return;
-}
-
-template<int BDIM_X,
-         int N,
-         int M,
-         typename VAL_T>
-__device__ void ndotp_d(const VAL_T *__restrict__ srcV,
-                        const VAL_T *__restrict__ srcM,
-                              VAL_T *__restrict__ dstV) {
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        //#pragma unroll
-        for(int i = 0; i < N; i++) {
-
-                VAL_T __tmp = 0;
-
-                #pragma unroll
-                for(int j = 0; j < M; j += BDIM_X) {
-                        if (j+tidx < M) {
-                                __tmp += srcV[j+tidx]*srcM[i*M + j+tidx];
-                        }
-                }
-#if 0
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
-                        __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
-                }
-#else
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
-                        __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
-                }
+//__device__ const int fixedPerm[] = {44, 47, 53,  0,  3,  3, 39,  9, 19, 21, 50, 36, 23,
+//                                     6, 24, 24, 12,  1, 38, 39, 23, 46, 24, 17, 37, 25, 
+//                                    13,  8,  9, 20, 51, 16, 51,  5, 15, 47,  0, 18, 35, 
+//                                    24, 49, 51, 29, 19, 19, 14, 39, 32,  1,  9, 32, 31,
+//                                    10, 52, 23};
+__device__ const int fixedPerm[] = {
+  47, 117,  67, 103,   9,  21,  36,  87,  70,  88, 140,  58,  39,  87,  88,  81,  25,  77,
+  72,   9, 148, 115,  79,  82,  99,  29, 147, 147, 142,  32,   9, 127,  32,  31, 114,  28,
+  34, 128, 128,  53, 133,  38,  17,  79, 132, 105,  42,  31, 120,   1,  65,  57,  35, 102,
+ 119,  11,  82,  91, 128, 142,  99,  53, 140, 121,  84,  68,   6,  47, 127, 131, 100,  78,
+ 143, 148,  23, 141, 117,  85,  48,  49,  69,  95,  94,   0, 113,  36,  48,  93, 131,  98,
+  42, 112, 149, 127,   0, 138, 114,  43, 127,  23, 130, 121,  98,  62, 123,  82, 148,  50,
+  14,  41,  58,  36,  10,  86,  43, 104,  11,   2,  51,  80,  32, 128,  38,  19,  42, 115,
+  77,  30,  24, 125,   2,   3,  94, 107,  13, 112,  40,  72,  19,  95,  72,  67,  61,  14,
+  96,   4, 139,  86, 121, 109};
 #endif
-                // values could be held by BDIM_X threads and written
-                // together every BDIM_X iterations...
-
-                if (tidx == 0) {
-                        dstV[i] = __tmp;
-                }
-        }
-        return;
-}
 
 template<int BDIM_X,
          typename VAL_T>
@@ -264,11 +119,12 @@ __device__ void ndotp_d(const int N,
         return;
 }
 
+
 template<int BDIM_X,
-         int N,
-         int M,
          typename VAL_T>
-__device__ void ndotp_log_d(const VAL_T *__restrict__ srcV,
+__device__ void ndotp_log_opdt_d(const int N,
+			    const int M,
+			    const VAL_T *__restrict__ srcV,
                             const VAL_T *__restrict__ srcM,
                                   VAL_T *__restrict__ dstV) {
 
@@ -284,25 +140,21 @@ __device__ void ndotp_log_d(const VAL_T *__restrict__ srcV,
 
                 VAL_T __tmp = 0;
 
-                #pragma unroll
+                //#pragma unroll
                 for(int j = 0; j < M; j += BDIM_X) {
                         if (j+tidx < M) {
-
                                 const VAL_T v = srcV[j+tidx];
                                 __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx];
                         }
                 }
-#if 0
                 #pragma unroll
                 for(int j = BDIM_X/2; j; j /= 2) {
+#if 0
                         __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
-                }
 #else
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
                         __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
-                }
 #endif
+                }
                 // values could be held by BDIM_X threads and written
                 // together every BDIM_X iterations...
 
@@ -314,48 +166,113 @@ __device__ void ndotp_log_d(const VAL_T *__restrict__ srcV,
 }
 
 template<int BDIM_X,
-         typename VAL_T>
-__device__ void ndotp_log_d(const int N,
-			    const int M,
-			    const VAL_T *__restrict__ srcV,
-                            const VAL_T *__restrict__ srcM,
-                                  VAL_T *__restrict__ dstV) {
-
-        const int tidx = threadIdx.x;
+	 typename VAL_T>
+__device__ void ndotp_log_csa_d(const int N,
+				const int M,
+				const VAL_T *__restrict__ srcV,
+				const VAL_T *__restrict__ srcM,
+				VAL_T *__restrict__ dstV) {
 
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+	const int tidx = threadIdx.x;
 
-        const VAL_T ONEP5 = static_cast<VAL_T>(1.5);
+	const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+	const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+	// Clamp values
+	constexpr VAL_T min = .001;
+	constexpr VAL_T max = .999;
 
-        //#pragma unroll
-        for(int i = 0; i < N; i++) {
+	//#pragma unroll
+	for(int i = 0; i < N; i++) {
 
-                VAL_T __tmp = 0;
+		VAL_T __tmp = 0;
 
-                //#pragma unroll
-                for(int j = 0; j < M; j += BDIM_X) {
-                        if (j+tidx < M) {
-                                const VAL_T v = srcV[j+tidx];
-                                __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx];
-                        }
-                }
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
+		//#pragma unroll
+		for(int j = 0; j < M; j += BDIM_X) {
+			if (j+tidx < M) {
+				const VAL_T v = MIN(MAX(srcV[j+tidx], min), max);
+				__tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx];
+			}
+		}
+		#pragma unroll
+		for(int j = BDIM_X/2; j; j /= 2) {
 #if 0
-                        __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
+			__tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
 #else
-                        __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
+			__tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
 #endif
-                }
-                // values could be held by BDIM_X threads and written
-                // together every BDIM_X iterations...
+		}
+		// values could be held by BDIM_X threads and written
+		// together every BDIM_X iterations...
 
-                if (tidx == 0) {
-                        dstV[i] = __tmp;
-                }
+		if (tidx == 0) {
+			dstV[i] = __tmp;
+		}
+	}
+	return;
+}
+
+
+template<int BDIM_X,
+         typename REAL_T>
+__device__ void fit_opdt(const int delta_nr,
+                         const int hr_side,
+                         const REAL_T *__restrict__ delta_q,
+                         const REAL_T *__restrict__ delta_b,
+                         const REAL_T *__restrict__ __msk_data_sh,
+                         REAL_T *__restrict__ __h_sh,
+                         REAL_T *__restrict__ __r_sh) {
+        const int tidx = threadIdx.x;
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        ndotp_log_opdt_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
+        ndotp_d         <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
+        __syncwarp(WMASK);
+        #pragma unroll
+        for(int j = tidx; j < delta_nr; j += BDIM_X) {
+                __r_sh[j] -= __h_sh[j];
+        }
+        __syncwarp(WMASK);
+}
+
+template<int BDIM_X, typename REAL_T>
+__device__ void fit_csa(const int delta_nr,
+                        const int hr_side,
+                        const REAL_T *__restrict__ fit_matrix,
+                        const REAL_T *__restrict__ __msk_data_sh,
+                        REAL_T *__restrict__ __r_sh) {
+        const int tidx = threadIdx.x;
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi)
+        ndotp_log_csa_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh);
+        __syncwarp(WMASK);
+        if (tidx == 0) {
+                __r_sh[0] = _n0_const;
+        }
+        __syncwarp(WMASK);
+}
+
+template<int BDIM_X, ModelType MODEL_T, typename REAL_T>
+__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions
+                               const int hr_side, // hr_side is number of data directions
+                               const REAL_T *__restrict__ delta_q,
+                               const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model
+                               const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block
+                               REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned
+                               REAL_T *__restrict__ __r_sh) {
+        switch(MODEL_T) {
+                case OPDT:
+                        fit_opdt<BDIM_X>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
+                        break;
+                case CSA:
+                        fit_csa<BDIM_X>(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh);
+                        break;
+                default:
+                        printf("FATAL: Invalid Model Type.\n");
+                        break;
         }
-        return;
 }
 
 template<int BDIM_X,
@@ -480,8 +397,8 @@ __device__ int peak_directions_d(const REAL_T  *__restrict__ odf,
                                  const int num_edges,
 				 int samplm_nr,
 				 int *__restrict__ __shInd,
-                                 const REAL_T relative_peak_thres=static_cast<REAL_T>(0.5),
-                                 const REAL_T min_separation_angle=static_cast<REAL_T>(0.4363323129985824)) { // 20 degrees in rads
+				 const REAL_T relative_peak_thres,
+				 const REAL_T min_separation_angle) {
 
         const int tidx = threadIdx.x;
 
@@ -787,52 +704,287 @@ __device__ void maskPut(const LEN_T n,
 	return;
 }
 
-template<typename REAL_T>
-__device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) {
-	if (!threadIdx.x && !threadIdx.y && !blockIdx.x) {
-		printf("%s:\n", name);
+template<int BDIM_X,
+         int BDIM_Y,
+         bool IS_START,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
+                                    const REAL_T *__restrict__ pmf,
+                                    const REAL_T max_angle,
+                                    const REAL_T relative_peak_thres,
+                                    const REAL_T min_separation_angle,
+                                    REAL3_T dir,
+                                    const int dimx,
+                                    const int dimy,
+                                    const int dimz,
+                                    const int dimt,
+                                    const REAL3_T point,
+                                    const REAL3_T *__restrict__ sphere_vertices,
+                                    const int2 *__restrict__ sphere_edges,
+                                    const int num_edges,
+                                    REAL3_T *__restrict__ dirs) {
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+	
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+	const int n32dimt = ((dimt+31)/32)*32;
+
+	extern __shared__ REAL_T __sh[];
+        REAL_T *__pmf_data_sh = __sh + tidy*n32dimt;
 
-		for(int j = 0; j < n; j++) {
-			if ((j%ncol)==0) printf("\n");
-			printf("%10.8f ", arr[j]);
-		}
-		printf("\n");
-	}
+        // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
+        __syncwarp(WMASK);
+        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, -1, pmf, point, __pmf_data_sh);
+        __syncwarp(WMASK);
+        if (rv != 0) {
+                return 0;
+        }
+
+        // for i in range(_len):
+        //     if pmf[i] > max_pmf:
+        //         max_pmf = pmf[i]
+        // absolute_pmf_threshold = pmf_threshold * max_pmf
+        const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, __pmf_data_sh, REAL_MIN);
+        __syncwarp(WMASK);
+
+        // for i in range(_len):
+        //     if pmf[i] < absolute_pmf_threshold:
+        //         pmf[i] = 0.0
+        #pragma unroll
+        for(int i = tidx; i < dimt; i += BDIM_X) {
+                if (__pmf_data_sh[i] < absolpmf_thresh) {
+                        __pmf_data_sh[i] = 0.0;
+                }
+        }
+        __syncwarp(WMASK);
+
+        if (IS_START) {
+                int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*n32dimt) + tidy*n32dimt;
+                return peak_directions_d<BDIM_X,
+                                         BDIM_Y>(__pmf_data_sh,
+                                                 dirs,
+                                                 sphere_vertices,
+                                                 sphere_edges,
+                                                 num_edges,
+                                                 dimt,
+                                                 __shInd,
+                                                 relative_peak_thres,
+                                                 min_separation_angle);
+        } else {
+                REAL_T __tmp;
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printArray("__pmf_data_sh initial", 8, dimt, __pmf_data_sh);
+                                printf("absolpmf_thresh %10.8f\n", absolpmf_thresh);
+                                printf("--->            dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
+                                printf("--->            point %10.8f, %10.8f, %10.8f\n", point.x, point.y, point.z);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 15) {
+                                printf("absolpmf_thresh %10.8f l15\n", absolpmf_thresh);
+                                printf("--->            dir %10.8f, %10.8f, %10.8f l15\n", dir.x, dir.y, dir.z);
+                                printf("--->            point %10.8f, %10.8f, %10.8f l15\n", point.x, point.y, point.z);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 31) {
+                                printArray("__pmf_data_sh initial l31", 8, dimt, __pmf_data_sh);
+                                printf("absolpmf_thresh %10.8f l31\n", absolpmf_thresh);
+                                printf("--->            dir %10.8f, %10.8f, %10.8f l31\n", dir.x, dir.y, dir.z);
+                                printf("--->            point %10.8f, %10.8f, %10.8f l31\n", point.x, point.y, point.z);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+
+                // // These should not be relevant
+                // if norm(&direction[0]) == 0:
+                //     return 1
+                // normalize(&direction[0])
+
+                // for i in range(_len):
+                //         cos_sim = self.vertices[i][0] * direction[0] \
+                //                 + self.vertices[i][1] * direction[1] \
+                //                 + self.vertices[i][2] * direction[2]
+                //         if cos_sim < 0:
+                //                 cos_sim = cos_sim * -1
+                //         if cos_sim < self.cos_similarity:
+                //                 pmf[i] = 0
+                const REAL_T cos_similarity = COS(max_angle);
+
+                #pragma unroll
+                for(int i = tidx; i < dimt; i += BDIM_X) {
+                        const REAL_T dot = dir.x*sphere_vertices[i].x+
+                                           dir.y*sphere_vertices[i].y+
+                                           dir.z*sphere_vertices[i].z;
+
+                        if (FABS(dot) < cos_similarity) {
+                                __pmf_data_sh[i] = 0.0;
+                        }
+                }
+                __syncwarp(WMASK);
+
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printArray("__pmf_data_sh after filtering", 8, dimt, __pmf_data_sh);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+
+                // cumsum(pmf, pmf, _len)
+                prefix_sum_sh_d<BDIM_X>(__pmf_data_sh, dimt);
+
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printArray("__pmf_data_sh after cumsum", 8, dimt, __pmf_data_sh);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+
+                // last_cdf = pmf[_len - 1]
+                // if last_cdf == 0:
+                //         return 1
+                REAL_T last_cdf = __pmf_data_sh[dimt - 1];
+                if (last_cdf == 0) {
+                        return 0;
+                }
+
+                // idx = where_to_insert(pmf, random() * last_cdf, _len)
+                if (tidx == 0) {
+                        __tmp = curand_uniform(st) * last_cdf;
+                }
+                REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
+// Both these implementations work
+#if 1
+                int low = 0;
+                int high = dimt - 1;
+                while ((high - low) >= BDIM_X) {
+                        const int mid = (low + high) / 2;
+                        if (__pmf_data_sh[mid] < selected_cdf) {
+                                low = mid;
+                        } else {
+                                high = mid;
+                        }
+                }
+                const bool __ballot = (low+tidx <= high) ? (selected_cdf < __pmf_data_sh[low+tidx]) : 0;
+                const int __msk = __ballot_sync(WMASK, __ballot);
+                const int indProb = low + __ffs(__msk) - 1;
+#else
+                int indProb = dimt - 1;
+                for (int ii = 0; ii < dimt; ii+=BDIM_X) {
+                        int __is_greater = 0;
+                        if (ii+tidx < dimt) {
+                                __is_greater = selected_cdf < __pmf_data_sh[ii+tidx];
+                        }
+                        const int __msk = __ballot_sync(WMASK, __is_greater);
+                        if (__msk != 0) {
+                                indProb = ii + __ffs(__msk) - 1;
+                                break;
+                        }
+                }
+#endif
+
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printf("last_cdf %10.8f\n", last_cdf);
+                                printf("selected_cdf %10.8f\n", selected_cdf);
+                                printf("indProb %i out of %i\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+
+                // newdir = self.vertices[idx]
+                // if (direction[0] * newdir[0]
+                //     + direction[1] * newdir[1]
+                //     + direction[2] * newdir[2] > 0):
+                //     copy_point(&newdir[0], &direction[0])
+                // else:
+                //     newdir[0] = newdir[0] * -1
+                //     newdir[1] = newdir[1] * -1
+                //     newdir[2] = newdir[2] * -1
+                //     copy_point(&newdir[0], &direction[0])
+                if (tidx == 0) {
+                        if ((dir.x * sphere_vertices[indProb].x +
+                             dir.y * sphere_vertices[indProb].y +
+                             dir.z * sphere_vertices[indProb].z) > 0) {
+                                *dirs = MAKE_REAL3(sphere_vertices[indProb].x,
+                                                   sphere_vertices[indProb].y,
+                                                   sphere_vertices[indProb].z);
+                        } else {
+                                *dirs = MAKE_REAL3(-sphere_vertices[indProb].x,
+                                                   -sphere_vertices[indProb].y,
+                                                   -sphere_vertices[indProb].z);
+                        }
+                        // printf("direction addr write %p, slid %i\n", dirs, blockIdx.x*blockDim.y+threadIdx.y);
+                }
+
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printf("last_cdf %10.8f\n", last_cdf);
+                                printf("selected_cdf %10.8f\n", selected_cdf);
+                                printf("indProb %i out of %i\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 15) {
+                                printf("last_cdf %10.8f l15\n", last_cdf);
+                                printf("selected_cdf %10.8f l15\n", selected_cdf);
+                                printf("indProb %i out of %i l15\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 31) {
+                                printf("last_cdf %10.8f l31\n", last_cdf);
+                                printf("selected_cdf %10.8f l31\n", selected_cdf);
+                                printf("indProb %i out of %i l31\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+                return 1;
+        }
 }
 
 template<int BDIM_X,
          int BDIM_Y,
          int NATTEMPTS,
+         ModelType MODEL_T,
          typename REAL_T,
          typename REAL3_T>
-__device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
-                               const REAL_T max_angle,
-			       const REAL_T min_signal,
-			       REAL3_T dir,
-                               const int dimx,
-                               const int dimy,
-                               const int dimz,
-                               const int dimt,
-                               const REAL_T *__restrict__ dataf,
-                               const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask)
-                                                                 // but not clear if it will never be needed so
-                                                                 // we'll keep it here for now...
-                               const REAL3_T point,
-                               const REAL_T *__restrict__ H, 
-                               const REAL_T *__restrict__ R,
-                               // model unused
-                               // max_angle, pmf_threshold from global defines
-                               // b0s_mask already passed
-                               // min_signal from global defines
-			       const int delta_nr,
-                               const REAL_T *__restrict__ delta_b,
-                               const REAL_T *__restrict__ delta_q, // fit_matrix
-			       const int samplm_nr,
-                               const REAL_T *__restrict__ sampling_matrix,
-                               const REAL3_T *__restrict__ sphere_vertices,
-                               const int2 *__restrict__ sphere_edges,
-                               const int num_edges,
-                                     REAL3_T *__restrict__ dirs) {
+__device__ int get_direction_boot_d(
+                                curandStatePhilox4_32_10_t *st,
+                                const REAL_T max_angle,
+                                const REAL_T min_signal,
+                                const REAL_T relative_peak_thres,
+                                const REAL_T min_separation_angle,
+                                REAL3_T dir,
+                                const int dimx,
+                                const int dimy,
+                                const int dimz,
+                                const int dimt,
+                                const REAL_T *__restrict__ dataf,
+                                const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask)
+                                                                  // but not clear if it will never be needed so
+                                                                  // we'll keep it here for now...
+                                const REAL3_T point,
+                                const REAL_T *__restrict__ H, 
+                                const REAL_T *__restrict__ R,
+                                // model unused
+                                // max_angle, pmf_threshold from global defines
+                                // b0s_mask already passed
+                                // min_signal from global defines
+                                const int delta_nr,
+                                const REAL_T *__restrict__ delta_b,
+                                const REAL_T *__restrict__ delta_q, // fit_matrix
+                                const int samplm_nr,
+                                const REAL_T *__restrict__ sampling_matrix,
+                                const REAL3_T *__restrict__ sphere_vertices,
+                                const int2 *__restrict__ sphere_edges,
+                                const int num_edges,
+                                REAL3_T *__restrict__ dirs) {
 
         const int tidx = threadIdx.x;
         const int tidy = threadIdx.y;
@@ -869,7 +1021,7 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
         #pragma unroll
         for(int i = 0; i < NATTEMPTS; i++) {
 
-                const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, dataf, point, __vox_data_sh);
+                const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh);
 
 		const int nmsk = maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
 
@@ -908,6 +1060,7 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
 //                                        }
 #endif
 					__h_sh[j+tidx] += __r_sh[srcPermInd];
+					//__h_sh[j+tidx] += __r_sh[j+tidx];
                                 }
                         }
 			__syncwarp(WMASK);
@@ -949,16 +1102,8 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
 			maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
 			__syncwarp(WMASK);
 
-                        ndotp_log_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
-                        ndotp_d    <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
-
-                        __syncwarp(WMASK);
+                        fit_model_coef<BDIM_X, MODEL_T>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
 
-                        #pragma unroll
-                        for(int j = tidx; j < delta_nr; j += BDIM_X) {
-				__r_sh[j] -= __h_sh[j];
-                        }
-                        __syncwarp(WMASK);
                         // __r_sh[tidy] <- python 'coef'
 
                         ndotp_d<BDIM_X>(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh);
@@ -1015,7 +1160,9 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
                                                            sphere_edges,
                                                            num_edges,
 							   samplm_nr,
-							   reinterpret_cast<int *>(__r_sh)); // reuse __r_sh as shInd in func which is large enough
+							   reinterpret_cast<int *>(__r_sh), // reuse __r_sh as shInd in func which is large enough
+							   relative_peak_thres,
+							   min_separation_angle);
                 if (NATTEMPTS == 1) { // init=True...
                         return ndir; // and dirs;
                 } else { // init=False...
@@ -1060,7 +1207,7 @@ __device__ int check_point_d(const REAL_T tc_threshold,
 
         __shared__ REAL_T __shInterpOut[BDIM_Y];
 
-        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, 1, metric_map, point, __shInterpOut+tidy);
+        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy);
         __syncwarp(WMASK);
 #if 0
         if (threadIdx.y == 1 && threadIdx.x == 0) {
@@ -1076,6 +1223,7 @@ __device__ int check_point_d(const REAL_T tc_threshold,
 
 template<int BDIM_X,
          int BDIM_Y,
+         ModelType MODEL_T,
          typename REAL_T,
          typename REAL3_T>
 __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
@@ -1083,8 +1231,11 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
 			 const REAL_T min_signal,
 			 const REAL_T tc_threshold,
 			 const REAL_T step_size,
+			 const REAL_T relative_peak_thres,
+			 const REAL_T min_separation_angle,
                          REAL3_T seed,
                          REAL3_T first_step,
+                         REAL_T* ptt_frame,
                          REAL3_T voxel_size,
                          const int dimx,
                          const int dimy,
@@ -1110,12 +1261,11 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                          const REAL3_T *__restrict__ sphere_vertices,
                          const int2 *__restrict__ sphere_edges,
                          const int num_edges,
-                               REAL3_T *__restrict__ __shDir,
-                               int *__restrict__ nsteps,
-                               REAL3_T *__restrict__ streamline) {
+                         int *__restrict__ nsteps,
+                         REAL3_T *__restrict__ streamline) {
 
         const int tidx = threadIdx.x;
-        //const int tidy = threadIdx.y;
+        const int tidy = threadIdx.y;
 
         const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
@@ -1124,6 +1274,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
 
         REAL3_T point = seed;
         REAL3_T direction = first_step;
+        __shared__ REAL3_T __sh_new_dir[BDIM_Y];
 
         if (tidx == 0) {
                 streamline[0] = point;
@@ -1135,34 +1286,77 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
         }
         __syncwarp(WMASK);
 
+        int step_frac;
+        if (MODEL_T == PTT) {
+                step_frac = STEP_FRAC; 
+        } else {
+                step_frac = 1; // STEP_FRAC could be useful in other models
+        }
+
         int i;
-        for(i = 1; i < MAX_SLINE_LEN; i++) {
-
-		// call get_direction_d() with NATTEMPTS=5
-                int ndir = get_direction_d<BDIM_X,
-                                           BDIM_Y,
-                                           5>(st,
-					      max_angle,
-					      min_signal,
-                                              direction,
-                                              dimx, dimy, dimz, dimt, dataf,
-                                              b0s_mask /* !dwi_mask */,
-                                              point,
-                                              H, R,
-                                              // model unused
-                                              // max_angle, pmf_threshold from global defines
-                                              // b0s_mask already passed
-                                              // min_signal from global defines
-					      delta_nr,
-                                              delta_b, delta_q, // fit_matrix
-					      samplm_nr,
-                                              sampling_matrix,
-                                              sphere_vertices,
-                                              sphere_edges,
-                                              num_edges,
-                                              __shDir);
+        for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) {
+                int ndir;
+                if (MODEL_T == PROB) {
+                        ndir = get_direction_prob_d<BDIM_X,
+                                                    BDIM_Y,
+                                                    0>(
+                                                        st,
+                                                        dataf,
+                                                        max_angle,
+                                                        relative_peak_thres,
+                                                        min_separation_angle,
+                                                        direction,
+                                                        dimx, dimy, dimz, dimt,
+                                                        point,
+                                                        sphere_vertices,
+                                                        sphere_edges,
+                                                        num_edges,
+                                                        __sh_new_dir + tidy);
+                } else if (MODEL_T == PTT) {
+                        ndir = get_direction_ptt_d<BDIM_X,
+                                                   BDIM_Y,
+                                                   0>(
+                                                        st,
+                                                        dataf,
+                                                        max_angle,
+                                                        step_size,
+                                                        direction,
+                                                        ptt_frame,
+                                                        dimx, dimy, dimz, dimt,
+                                                        point,
+                                                        sphere_vertices,
+                                                        __sh_new_dir + tidy);
+                } else {
+                        // call get_direction_boot_d() with NATTEMPTS=5
+                        ndir = get_direction_boot_d<BDIM_X,
+                                                    BDIM_Y,
+                                                    5,
+                                                    MODEL_T>(
+                                                        st,
+                                                        max_angle,
+                                                        min_signal,
+                                                        relative_peak_thres,
+                                                        min_separation_angle,
+                                                        direction,
+                                                        dimx, dimy, dimz, dimt, dataf,
+                                                        b0s_mask /* !dwi_mask */,
+                                                        point,
+                                                        H, R,
+                                                        // model unused
+                                                        // max_angle, pmf_threshold from global defines
+                                                        // b0s_mask already passed
+                                                        // min_signal from global defines
+                                                        delta_nr,
+                                                        delta_b, delta_q, // fit_matrix
+                                                        samplm_nr,
+                                                        sampling_matrix,
+                                                        sphere_vertices,
+                                                        sphere_edges,
+                                                        num_edges,
+                                                        __sh_new_dir + tidy);
+                }
                 __syncwarp(WMASK);
-                direction = __shDir[0];
+                direction = __sh_new_dir[tidy];
                 __syncwarp(WMASK);
 
                 if (ndir == 0) {
@@ -1177,12 +1371,12 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                 //point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
                 //point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
                 //point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
-                point.x += (direction.x / voxel_size.x) * step_size;
-                point.y += (direction.y / voxel_size.y) * step_size;
-                point.z += (direction.z / voxel_size.z) * step_size;
+                point.x += (direction.x / voxel_size.x) * (step_size / step_frac);
+                point.y += (direction.y / voxel_size.y) * (step_size / step_frac);
+                point.z += (direction.z / voxel_size.z) * (step_size / step_frac);
 
-                if (tidx == 0) {
-                        streamline[i] = point;
+                if ((tidx == 0) && ((i % step_frac) == 0)){
+                        streamline[i/step_frac] = point;
 #if 0
                         if (threadIdx.y == 1) {
                                 printf("streamline[%d]: %f, %f, %f\n", i, point.x, point.y, point.z);
@@ -1193,13 +1387,50 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
 
                 tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);
 
+#if 0
+                __syncwarp(WMASK);
+                if (tidx == 0) {
+                        printf("step_size %f\n", step_size);
+                        printf("direction %f, %f, %f\n", direction.x, direction.y, direction.z);
+                        printf("direction addr read %p, slid %i\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
+                        printf("voxel_size %f, %f, %f\n", voxel_size.x, voxel_size.y, voxel_size.z);
+                        printf("point %f, %f, %f\n", point.x, point.y, point.z);
+                        printf("tc %i\n", tissue_class);
+                }
+                __syncwarp(WMASK);
+                if (tidx == 15) {
+                        printf("step_size %f l15\n", step_size);
+                        printf("direction %f, %f, %f l15\n", direction.x, direction.y, direction.z);
+                        printf("direction addr read %p, slid %i l15\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
+                        printf("voxel_size %f, %f, %f l15\n", voxel_size.x, voxel_size.y, voxel_size.z);
+                        printf("point %f, %f, %f l15\n", point.x, point.y, point.z);
+                        printf("tc %i l15\n", tissue_class);
+                }
+                __syncwarp(WMASK);
+                if (tidx == 31) {
+                        printf("step_size %f l31\n", step_size);
+                        printf("direction %f, %f, %f l31\n", direction.x, direction.y, direction.z);
+                        printf("direction addr read %p, slid %i l31\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
+                        printf("voxel_size %f, %f, %f l31\n", voxel_size.x, voxel_size.y, voxel_size.z);
+                        printf("point %f, %f, %f l31\n", point.x, point.y, point.z);
+                        printf("tc %i l31\n", tissue_class);
+                }
+                __syncwarp(WMASK);
+#endif
+
                 if (tissue_class == ENDPOINT ||
                     tissue_class == INVALIDPOINT ||
                     tissue_class == OUTSIDEIMAGE) {
                         break;
                 }
         }
-        nsteps[0] = i;
+        nsteps[0] = i/step_frac;
+        if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){
+                nsteps[0]++;
+                if (tidx == 0) {
+                        streamline[nsteps[0]] = point;
+                }
+        }
 
         return tissue_class;
 }
@@ -1208,10 +1439,13 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__global__ void getNumStreamlines_k(const REAL_T max_angle,
-				    const REAL_T min_signal,
-				    const long long rndSeed,
-                                    const int rndOffset,
+__global__ void getNumStreamlinesBoot_k(
+                                    const ModelType model_type,
+                                    const REAL_T max_angle,
+		                    const REAL_T min_signal,
+		                    const REAL_T relative_peak_thres,
+	                            const REAL_T min_separation_angle,
+		                    const long long rndSeed,
                                     const int nseed,
                                     const REAL3_T *__restrict__ seeds,
                                     const int dimx,
@@ -1221,20 +1455,21 @@ __global__ void getNumStreamlines_k(const REAL_T max_angle,
                                     const REAL_T *__restrict__ dataf,
                                     const REAL_T *__restrict__ H,
                                     const REAL_T *__restrict__ R,
-				    const int delta_nr,
+		                    const int delta_nr,
                                     const REAL_T *__restrict__ delta_b,
                                     const REAL_T *__restrict__ delta_q,
                                     const int  *__restrict__ b0s_mask, // change to int
-				    const int samplm_nr,
+		                    const int samplm_nr,
                                     const REAL_T *__restrict__ sampling_matrix,
                                     const REAL3_T *__restrict__ sphere_vertices,
                                     const int2 *__restrict__ sphere_edges,
                                     const int num_edges,
-                                          REAL3_T *__restrict__ shDir0,
-                                          int *slineOutOff) {
+                                    REAL3_T *__restrict__ shDir0,
+                                    int *slineOutOff) {
 
         const int tidx = threadIdx.x;
         const int slid = blockIdx.x*blockDim.y + threadIdx.y;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
 
         if (slid >= nseed) {
                 return;
@@ -1245,10 +1480,11 @@ __global__ void getNumStreamlines_k(const REAL_T max_angle,
 
         REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr;
 
-	const int hr_side = dimt-1;
+	// const int hr_side = dimt-1;
 
         curandStatePhilox4_32_10_t st;
-        curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
+        //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
+        curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
                                                                                    // elements of the same sequence
         // python:
         //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle,
@@ -1259,38 +1495,126 @@ __global__ void getNumStreamlines_k(const REAL_T max_angle,
 	//	printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z);
 	//}
 
-        int ndir = get_direction_d<BDIM_X,
-                                   BDIM_Y,
-                                   1>(&st,
-				      max_angle,
-				      min_signal,
-                                      MAKE_REAL3(0,0,0),
-                                      dimx, dimy, dimz, dimt, dataf,
-                                      b0s_mask /* !dwi_mask */,
-                                      seed,
-                                      H, R,
-                                      // model unused
-                                      // max_angle, pmf_threshold from global defines
-                                      // b0s_mask already passed
-                                      // min_signal from global defines
-				      delta_nr,
-                                      delta_b, delta_q, // fit_matrix
-				      samplm_nr,
-                                      sampling_matrix,
-                                      sphere_vertices,
-                                      sphere_edges,
-                                      num_edges,
-                                      __shDir);
+        int ndir;
+        switch(model_type) {
+            case OPDT:
+                ndir = get_direction_boot_d<BDIM_X,
+                                            BDIM_Y,
+                                            1,
+                                            OPDT>(
+                                                &st,
+                                                max_angle,
+                                                min_signal,
+                                                relative_peak_thres,
+                                                min_separation_angle,
+                                                MAKE_REAL3(0,0,0),
+                                                dimx, dimy, dimz, dimt, dataf,
+                                                b0s_mask /* !dwi_mask */,
+                                                seed,
+                                                H, R,
+                                                // model unused
+                                                // max_angle, pmf_threshold from global defines
+                                                // b0s_mask already passed
+                                                // min_signal from global defines
+                                                delta_nr,
+                                                delta_b, delta_q, // fit_matrix
+                                                samplm_nr,
+                                                sampling_matrix,
+                                                sphere_vertices,
+                                                sphere_edges,
+                                                num_edges,
+                                                __shDir);
+                break;
+            case CSA:
+                ndir = get_direction_boot_d<BDIM_X,
+                                            BDIM_Y,
+                                            1,
+                                            CSA>(
+                                                &st,
+                                                max_angle,
+                                                min_signal,
+                                                relative_peak_thres,
+                                                min_separation_angle,
+                                                MAKE_REAL3(0,0,0),
+                                                dimx, dimy, dimz, dimt, dataf,
+                                                b0s_mask /* !dwi_mask */,
+                                                seed,
+                                                H, R,
+                                                // model unused
+                                                // max_angle, pmf_threshold from global defines
+                                                // b0s_mask already passed
+                                                // min_signal from global defines
+                                                delta_nr,
+                                                delta_b, delta_q, // fit_matrix
+                                                samplm_nr,
+                                                sampling_matrix,
+                                                sphere_vertices,
+                                                sphere_edges,
+                                                num_edges,
+                                                __shDir);
+                break;
+            default:
+                printf("FATAL: Invalid Model Type.\n");
+                break;
+        }
+
         if (tidx == 0) {
                 slineOutOff[slid] = ndir;
+        }
 
-		//if (!tidx && !threadIdx.y && !blockIdx.x) {
-		//	printf("ndir: %d\n", ndir);
-		//	for(int i = 0; i < ndir; i++) {
-		//		printf("%f %f %f\n", __shDir[i].x, __shDir[i].y, __shDir[i].z);	
-		//	}
-		//}
+        return;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__global__ void getNumStreamlinesProb_k(const REAL_T max_angle,
+				        const REAL_T relative_peak_thres,
+				        const REAL_T min_separation_angle,
+				        const long long rndSeed,
+                                        const int nseed,
+                                        const REAL3_T *__restrict__ seeds,
+                                        const int dimx,
+                                        const int dimy,
+                                        const int dimz,
+                                        const int dimt,
+                                        const REAL_T *__restrict__ dataf,
+                                        const REAL3_T *__restrict__ sphere_vertices,
+                                        const int2 *__restrict__ sphere_edges,
+                                        const int num_edges,
+                                        REAL3_T *__restrict__ shDir0,
+                                        int *slineOutOff) {
+
+        const int tidx = threadIdx.x;
+        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
+
+        if (slid >= nseed) {
+                return;
+        }
 
+        REAL3_T *__restrict__ __shDir = shDir0+slid*dimt;
+        curandStatePhilox4_32_10_t st;
+        curand_init(rndSeed, gid, 0, &st);
+
+        int ndir = get_direction_prob_d<BDIM_X,
+                                        BDIM_Y,
+                                        1>(
+                                                &st,
+                                                dataf,
+                                                max_angle,
+                                                relative_peak_thres,
+                                                min_separation_angle,
+                                                MAKE_REAL3(0,0,0),
+                                                dimx, dimy, dimz, dimt,
+                                                seeds[slid],
+                                                sphere_vertices,
+                                                sphere_edges,
+                                                num_edges,
+                                                __shDir);
+        if (tidx == 0) {
+                slineOutOff[slid] = ndir;
         }
 
         return;
@@ -1298,12 +1622,16 @@ __global__ void getNumStreamlines_k(const REAL_T max_angle,
 
 template<int BDIM_X,
          int BDIM_Y,
+         ModelType MODEL_T,
          typename REAL_T,
          typename REAL3_T>
-__global__ void genStreamlinesMerge_k(const REAL_T max_angle,
+__global__ void genStreamlinesMerge_k(
+				      const REAL_T max_angle,
 				      const REAL_T min_signal,
 				      const REAL_T tc_threshold,
 				      const REAL_T step_size,
+				      const REAL_T relative_peak_thres,
+				      const REAL_T min_separation_angle,
 				      const long long rndSeed,
                                       const int rndOffset,
                                       const int nseed,
@@ -1327,7 +1655,6 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
                                       const int num_edges,
                                       const int    *__restrict__ slineOutOff,
                                             REAL3_T *__restrict__ shDir0,
-                                            REAL3_T *__restrict__ shDir1,
                                             int     *__restrict__ slineSeed,
                                             int     *__restrict__ slineLen,
                                             REAL3_T *__restrict__ sline) {
@@ -1340,10 +1667,15 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
         const int lid = (tidy*BDIM_X + tidx) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
-	const int hr_side = dimt-1;
+        __shared__ REAL_T frame_sh[((MODEL_T == PTT) ? BDIM_Y*18 : 1)]; // Only used by PTT, TODO: way to remove this in other cases
+        REAL_T* __ptt_frame = frame_sh + tidy*18;
+	// const int hr_side = dimt-1;
 
         curandStatePhilox4_32_10_t st;
-        curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X)
+        // const int gbid = blockIdx.y*gridDim.x + blockIdx.x;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
+        //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X)
+        curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
                                                                                  // elements of the same sequence
         if (slid >= nseed) {
                 return;
@@ -1366,7 +1698,6 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
         int slineOff = slineOutOff[slid];
 
         for(int i = 0; i < ndir; i++) {
-
                 REAL3_T first_step = shDir0[slid*samplm_nr + i];
 
 		REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2;
@@ -1379,67 +1710,97 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
                         printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z);
                 }
 #endif
+
+                if (MODEL_T == PTT) {
+                        if (!init_frame_ptt_d<BDIM_X, BDIM_Y>(
+                                &st,
+                                dataf,
+                                max_angle,
+                                step_size,
+                                first_step,
+                                dimx, dimy, dimz, dimt,
+                                seed,
+                                sphere_vertices,
+                                __ptt_frame
+                        )) { // this fails rarely
+                                if (tidx == 0) {
+                                        slineLen[slineOff] = 1;
+                                        currSline[0] = seed;
+                                }
+                                __syncwarp(WMASK);
+                                slineOff += 1;
+                                continue;
+                        }
+                }
+
+
                 int stepsB;
                 const int tissue_classB = tracker_d<BDIM_X,
-                                                    BDIM_Y>(&st,
-						            max_angle,
-						            min_signal,
-							    tc_threshold,
-							    step_size,
-                                                            seed,
-                                                            MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
-                                                            MAKE_REAL3(1, 1, 1),
-                                                            dimx, dimy, dimz, dimt, dataf,
-                                                            b0s_mask,
-                                                            H, R,
-                                                            metric_map,
-							    delta_nr,
-                                                            delta_b, delta_q, //fit_matrix
-							    samplm_nr,
-                                                            sampling_matrix,
-                                                            sphere_vertices,
-                                                            sphere_edges,
-                                                            num_edges,
-                                                            shDir1 + slid*samplm_nr,
-                                                            &stepsB,
-                                                            currSline);
+                                                    BDIM_Y,
+                                                    MODEL_T>(&st,
+		                		             max_angle,
+			        		             min_signal,
+			        			     tc_threshold,
+	                        			     step_size,
+	                        			     relative_peak_thres,
+	                        			     min_separation_angle,
+                                                             seed,
+                                                             MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
+                                                             __ptt_frame,
+                                                             MAKE_REAL3(1, 1, 1),
+                                                             dimx, dimy, dimz, dimt, dataf,
+                                                             b0s_mask,
+                                                             H, R,
+                                                             metric_map,
+			                                     delta_nr,
+                                                             delta_b, delta_q, //fit_matrix
+		                			     samplm_nr,
+                                                             sampling_matrix,
+                                                             sphere_vertices,
+                                                             sphere_edges,
+                                                             num_edges,
+                                                             &stepsB,
+                                                             currSline);
                 //if (tidx == 0) {
                 //        slineLenB[slineOff] = stepsB;
                 //}
 
                 // reverse backward sline
-                for(int i = 0; i < stepsB/2; i += BDIM_X) {
-                        if (i+tidx < stepsB/2) {
-                                const REAL3_T __p = currSline[i+tidx];
-                                currSline[i+tidx] = currSline[stepsB-1 - (i+tidx)];
-                                currSline[stepsB-1 - (i+tidx)] = __p;
+                for(int j = 0; j < stepsB/2; j += BDIM_X) {
+                        if (j+tidx < stepsB/2) {
+                                const REAL3_T __p = currSline[j+tidx];
+                                currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)];
+                                currSline[stepsB-1 - (j+tidx)] = __p;
                         }
                 }
 
                 int stepsF;
                 const int tissue_classF = tracker_d<BDIM_X,
-                                                    BDIM_Y>(&st,
-						            max_angle,
-						            min_signal,
-							    tc_threshold,
-							    step_size,
-                                                            seed,
-                                                            first_step,
-                                                            MAKE_REAL3(1, 1, 1),
-                                                            dimx, dimy, dimz, dimt, dataf,
-                                                            b0s_mask,
-                                                            H, R,
-                                                            metric_map,
-							    delta_nr,
-                                                            delta_b, delta_q, //fit_matrix
-							    samplm_nr,
-                                                            sampling_matrix,
-                                                            sphere_vertices,
-                                                            sphere_edges,
-                                                            num_edges,
-                                                            shDir1 + slid*samplm_nr,
-                                                            &stepsF,
-                                                            currSline + stepsB-1);
+                                                    BDIM_Y,
+                                                    MODEL_T>(&st,
+     	                    			             max_angle,
+			        		             min_signal,
+		        				     tc_threshold,
+	                				     step_size,
+			    				     relative_peak_thres,
+			            			     min_separation_angle,
+                                                             seed,
+                                                             first_step,
+                                                             __ptt_frame + 9,
+                                                             MAKE_REAL3(1, 1, 1),
+                                                             dimx, dimy, dimz, dimt, dataf,
+                                                             b0s_mask,
+                                                             H, R,
+                                                             metric_map,
+			        			     delta_nr,
+                                                             delta_b, delta_q, //fit_matrix
+			        			     samplm_nr,
+                                                             sampling_matrix,
+                                                             sphere_vertices,
+                                                             sphere_edges,
+                                                             num_edges,
+                                                             &stepsF,
+                                                             currSline + stepsB-1);
                 if (tidx == 0) {
                         slineLen[slineOff] = stepsB-1+stepsF;
                 }
@@ -1465,7 +1826,8 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
         return;
 }
 
-void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+                                    const REAL relative_peak_thresh, const REAL min_separation_angle,
                                     const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
                                     const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
@@ -1483,7 +1845,6 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
 
   std::vector<int*> slinesOffs_d(ngpus, nullptr);
   std::vector<REAL3*> shDirTemp0_d(ngpus, nullptr);
-  std::vector<REAL3*> shDirTemp1_d(ngpus, nullptr);
 
   //#pragma omp parallel for
   for (int n = 0; n < ngpus; ++n) {
@@ -1494,16 +1855,11 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
 
     CHECK_CUDA(cudaMalloc(&slinesOffs_d[n], sizeof(*slinesOffs_d[n])*(nseeds_gpu+1)));
     CHECK_CUDA(cudaMalloc(&shDirTemp0_d[n], sizeof(*shDirTemp0_d[n])*samplm_nr*grid.x*block.y));
-    CHECK_CUDA(cudaMalloc(&shDirTemp1_d[n], sizeof(*shDirTemp1_d[n])*samplm_nr*grid.x*block.y));
   }
 
   int n32dimt = ((dimt+31)/32)*32;
 
-  size_t shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_d
-		     sizeof(int)*samplm_nr;						        // for peak_directions_d	
-
-  //printf("shSizeGNS: %zu\n", shSizeGNS);
-
+  size_t shSizeGNS;
   //#pragma omp parallel for
   for (int n = 0; n < ngpus; ++n) {
     CHECK_CUDA(cudaSetDevice(n));
@@ -1513,32 +1869,60 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
     dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL));
 
     // Precompute number of streamlines before allocating memory
-    getNumStreamlines_k<THR_X_SL,
-                        THR_X_BL/THR_X_SL>
-                        <<<grid, block, shSizeGNS>>>(max_angle,
-						     min_signal,
-						     rng_seed,
-						     rng_offset + n*nseeds_per_gpu,
-						     nseeds_gpu,
-						     reinterpret_cast<const REAL3 *>(seeds_d[n]),
-						     dimx,
-						     dimy,
-						     dimz,
-						     dimt,
-						     dataf_d[n],
-						     H_d[n],
-						     R_d[n],
-						     delta_nr,
-						     delta_b_d[n],
-						     delta_q_d[n],
-						     b0s_mask_d[n],
-						     samplm_nr,
-						     sampling_matrix_d[n],
-						     reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
-						     reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-						     nedges,
-						     shDirTemp0_d[n],
-						     slinesOffs_d[n]);
+    if (!((model_type == PTT) || (model_type == PROB))) {
+        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d
+                    sizeof(int)*samplm_nr;						      // for peak_directions_d	
+        getNumStreamlinesBoot_k<THR_X_SL,
+                                THR_X_BL/THR_X_SL>
+                                <<<grid, block, shSizeGNS>>>(
+                                        model_type,
+                                        max_angle,
+                                        min_signal,
+                                        relative_peak_thresh,
+                                        min_separation_angle,
+                                        rng_seed,
+                                        nseeds_gpu,
+                                        reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                                        dimx,
+                                        dimy,
+                                        dimz,
+                                        dimt,
+                                        dataf_d[n],
+                                        H_d[n],
+                                        R_d[n],
+                                        delta_nr,
+                                        delta_b_d[n],
+                                        delta_q_d[n],
+                                        b0s_mask_d[n],
+                                        samplm_nr,
+                                        sampling_matrix_d[n],
+                                        reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
+                                        reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                                        nedges,
+                                        shDirTemp0_d[n],
+                                        slinesOffs_d[n]);
+    } else {
+        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt;
+        getNumStreamlinesProb_k<THR_X_SL,
+                                THR_X_BL/THR_X_SL>
+                                <<<grid, block, shSizeGNS>>>(
+                                        max_angle,
+                                        relative_peak_thresh,
+                                        min_separation_angle,
+                                        rng_seed,
+                                        nseeds_gpu,
+                                        reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                                        dimx,
+                                        dimy,
+                                        dimz,
+                                        dimt,
+                                        dataf_d[n],
+                                        reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
+                                        reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                                        nedges,
+                                        shDirTemp0_d[n],
+                                        slinesOffs_d[n]);
+    }
   }
 
   std::vector<int> slinesOffs_h;
@@ -1575,15 +1959,19 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
     CHECK_CUDA(cudaMemset(slineSeed_d[n], -1, sizeof(*slineSeed_d[n])*nSlines_h[n]));
 
     // Allocate/reallocate output arrays if necessary
-    if (nSlines_h[n] > nSlines_old_h[n]) {
+    if (nSlines_h[n] > EXCESS_ALLOC_FACT*nSlines_old_h[n]) {
       if(slines_h[n]) cudaFreeHost(slines_h[n]);
       if(slinesLen_h[n]) cudaFreeHost(slinesLen_h[n]);
       slines_h[n] = nullptr;
       slinesLen_h[n] = nullptr;
     }
 
-    if (!slines_h[n]) CHECK_CUDA(cudaMallocHost(&slines_h[n], 2*3*MAX_SLINE_LEN*nSlines_h[n]*sizeof(*slines_h[n])));
-    if (!slinesLen_h[n]) CHECK_CUDA(cudaMallocHost(&slinesLen_h[n], nSlines_h[n]*sizeof(*slinesLen_h[n])));
+#ifdef DEBUG
+    printf("buffer size %zu\n", sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n]);
+#endif
+
+    if (!slines_h[n]) CHECK_CUDA(cudaMallocHost(&slines_h[n], sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n]));
+    if (!slinesLen_h[n]) CHECK_CUDA(cudaMallocHost(&slinesLen_h[n], sizeof(*slinesLen_h[n])*EXCESS_ALLOC_FACT*nSlines_h[n]));
   }
 
   //if (nSlines_h) {
@@ -1619,40 +2007,58 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
     std::cerr << "GPU " << n << ": ";
     std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; 
 #endif
+
     //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
-    genStreamlinesMerge_k<THR_X_SL,
-                          THR_X_BL/THR_X_SL>
-                          <<<grid, block, shSizeGNS, streams[n]>>>(max_angle,
-								   min_signal,
-								   tc_threshold,
-								   step_size,
-								   rng_seed,
-								   rng_offset + n*nseeds_per_gpu,
-								   nseeds_gpu,
-								   reinterpret_cast<const REAL3 *>(seeds_d[n]),
-								   dimx,
-								   dimy,
-								   dimz,
-								   dimt,
-								   dataf_d[n],
-								   H_d[n],
-								   R_d[n],
-								   delta_nr,
-								   delta_b_d[n],
-								   delta_q_d[n],
-								   b0s_mask_d[n],
-								   metric_map_d[n],
-								   samplm_nr,
-								   sampling_matrix_d[n],
-								   reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
-								   reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-								   nedges,
-								   slinesOffs_d[n],
-								   shDirTemp0_d[n],
-								   shDirTemp1_d[n],
-								   slineSeed_d[n],
-								   slineLen_d[n],
-								   sline_d[n]);
+    switch(model_type) {
+        case OPDT:
+            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, OPDT> <<<grid, block, shSizeGNS, streams[n]>>>(
+                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
+                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
+                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
+                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+            break;
+
+        case CSA:
+            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, CSA> <<<grid, block, shSizeGNS, streams[n]>>>(
+                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
+                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
+                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
+                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+            break;
+
+        case PROB:
+            // Shared memory requirements are smaller for probabilistic for main run
+            // than for preliminary run
+            shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt;
+            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, PROB> <<<grid, block, shSizeGNS, streams[n]>>>(
+                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
+                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
+                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
+                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+            break;
+
+        case PTT:
+            shSizeGNS = 0; // PTT uses exclusively static shared memory
+            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, PTT> <<<grid, block, shSizeGNS, streams[n]>>>(
+                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
+                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
+                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
+                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+            break;
+
+        default:
+            printf("FATAL: Invalid Model Type.\n");
+            break;
+    }
+
     CHECK_ERROR("genStreamlinesMerge_k");
   }
 
@@ -1680,7 +2086,6 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
     CHECK_CUDA(cudaFree(slineSeed_d[n]));
     CHECK_CUDA(cudaFree(slinesOffs_d[n]));
     CHECK_CUDA(cudaFree(shDirTemp0_d[n]));
-    CHECK_CUDA(cudaFree(shDirTemp1_d[n]));
     CHECK_CUDA(cudaFree(slineLen_d[n]));
     CHECK_CUDA(cudaFree(sline_d[n]));
   }
diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h
index 96a60e0..14105ce 100644
--- a/cuslines/generate_streamlines_cuda.h
+++ b/cuslines/generate_streamlines_cuda.h
@@ -33,14 +33,15 @@
 
 #include "globals.h"
 
-void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
-				    const int nseeds, const std::vector<REAL*> &seeds_d,
+void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+                                    const REAL relative_peak_thresh, const REAL min_separation_angle,
+                                    const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
                                     const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
-			            const int delta_nr,
+                                    const int delta_nr,
                                     const std::vector<REAL*> &delta_b_d, const std::vector<REAL*> &delta_q_d,
                                     const std::vector<int*> &b0s_mask_d, const std::vector<REAL*> &metric_map_d,
-			            const int samplm_nr,
+                                    const int samplm_nr,
                                     const std::vector<REAL*> &sampling_matrix_d,
                                     const std::vector<REAL*> &sphere_vertices_d, const std::vector<int*> &sphere_edges_d, const int nedges,
                                     std::vector<REAL*> &slines_h, std::vector<int*> &slinesLen_h, std::vector<int> &nSlines_h,
diff --git a/cuslines/globals.h b/cuslines/globals.h
index 45432c2..25c2f29 100644
--- a/cuslines/globals.h
+++ b/cuslines/globals.h
@@ -43,6 +43,7 @@
 #define REAL_MAX	(FLT_MAX)
 #define REAL_MIN	(-REAL_MAX)
 #define COS		__cosf
+#define SIN		__sinf
 #define FABS		fabsf
 #define SQRT		sqrtf
 #define RSQRT		rsqrtf
@@ -60,6 +61,7 @@
 #define REAL_MAX	(DBL_MAX)
 #define REAL_MIN	(-REAL_MAX)
 #define COS		cos
+#define SIN		sin
 #define FABS		fabs
 #define SQRT		sqrt
 #define RSQRT		rsqrt
@@ -70,16 +72,28 @@
 #define MAX_SLINE_LEN	(501)
 #define PMF_THRESHOLD_P	((REAL)0.1)
 
-//#define TC_THRESHOLD_P	((REAL)0.1)
-//#define STEP_SIZE_P	((REAL)0.5)  // only for TRK generation
-//#define MAX_ANGLE_P	((REAL)1.0471975511965976) // 60 deg in radians
-//#define MIN_SIGNAL_P	((REAL)1.0)
+#define THR_X_BL (64)
+#define THR_X_SL (32)
 
 #define MAX_SLINES_PER_SEED (10)
 
 #define MIN(x,y) (((x)<(y))?(x):(y))
 #define MAX(x,y) (((x)>(y))?(x):(y))
+#define POW2(n) (1 << (n))
 
 #define DIV_UP(a,b) (((a)+((b)-1))/(b))
 
+#define EXCESS_ALLOC_FACT 2
+
+#if 0
+  #define DEBUG
+#endif
+
+enum ModelType {
+  OPDT = 0,
+  CSA = 1,
+  PROB = 2,
+  PTT = 3,
+};
+
 #endif
diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
new file mode 100644
index 0000000..b36e747
--- /dev/null
+++ b/cuslines/ptt.cu
@@ -0,0 +1,476 @@
+template<typename REAL_T>
+__device__ void norm3_d(REAL_T *num, int fail_ind) {
+    const REAL_T scale = SQRT(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]);
+
+    if (scale != 0) {
+        num[0] /= scale;
+        num[1] /= scale;
+        num[2] /= scale;
+    } else {
+        num[fail_ind] = 1.0; // this can happen randomly during propogation, though is exceedingly rare
+    }
+}
+
+template<typename REAL_T>
+__device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) {
+    dest[0] = src1[1] * src2[2] - src1[2] * src2[1];
+    dest[1] = src1[2] * src2[0] - src1[0] * src2[2];
+    dest[2] = src1[0] * src2[1] - src1[1] * src2[0];
+
+    norm3_d(dest, fail_ind);
+}
+
+template<typename REAL_T, typename REAL3_T>
+__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf,
+                            const int dimx, const int dimy, const int dimz, const int dimt,
+                            const REAL3_T *__restrict__ odf_sphere_vertices) {
+    int closest_odf_idx = 0;
+    REAL_T __max_cos = 0;
+    for (int ii = 0; ii < dimt; ii++) {
+        REAL_T cos_sim = FABS(
+            odf_sphere_vertices[ii].x * frame[0] \
+            + odf_sphere_vertices[ii].y * frame[1] \
+            + odf_sphere_vertices[ii].z * frame[2]);
+        if (cos_sim > __max_cos) {
+            __max_cos = cos_sim;
+            closest_odf_idx = ii;
+        }
+    }
+
+    const int rv = trilinear_interp_d<THR_X_SL>(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos);
+
+#if 0
+    printf("inerpolated %f at %f, %f, %f, %i\n", __max_cos, pos.x, pos.y, pos.z, closest_odf_idx);
+#endif
+
+    if (rv != 0) {
+        return -1;
+    } else {
+        return __max_cos;
+    }
+}
+
+template<typename REAL_T>
+__device__ void prepare_propagator_d(REAL_T k1, REAL_T k2, REAL_T arclength,
+                                     REAL_T *propagator) {
+    if ((FABS(k1) < K_SMALL) && (FABS(k2) < K_SMALL)) {
+        propagator[0] = arclength;
+        propagator[1] = 0;
+        propagator[2] = 0;
+        propagator[3] = 1;
+        propagator[4] = 0;
+        propagator[5] = 0;
+        propagator[6] = 0;
+        propagator[7] = 0;
+        propagator[8] = 1;
+    } else {
+        if (FABS(k1) < K_SMALL) {
+            k1 = K_SMALL;
+        }
+        if (FABS(k2) < K_SMALL) {
+            k2 = K_SMALL;
+        }
+        const REAL_T k     = SQRT(k1*k1+k2*k2);
+        const REAL_T sinkt = SIN(k*arclength);
+        const REAL_T coskt = COS(k*arclength);
+        const REAL_T kk    = 1/(k*k);
+
+        propagator[0] = sinkt/k;
+        propagator[1] = k1*(1-coskt)*kk;
+        propagator[2] = k2*(1-coskt)*kk;
+        propagator[3] = coskt;
+        propagator[4] = k1*sinkt/k;
+        propagator[5] = k2*sinkt/k;
+        propagator[6] = -propagator[5];
+        propagator[7] = k1*k2*(coskt-1)*kk;
+        propagator[8] = (k1*k1+k2*k2*coskt)*kk;
+    }
+}
+
+template<bool IS_INIT, typename REAL_T>
+__device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) {
+    if (IS_INIT) {
+        for (int ii = 0; ii < 3; ii++) { // tangent
+            probing_frame[ii] = frame[ii];
+        }
+        if ((probing_frame[0] != 0) && (probing_frame[1] != 0)) { // norm
+            probing_frame[3] = -probing_frame[1];
+            probing_frame[4] = probing_frame[0];
+            probing_frame[5] = 0;
+        } else {
+            probing_frame[3] = 0;
+            probing_frame[4] = -probing_frame[2];
+            probing_frame[5] = 0;
+        }
+
+        norm3_d(probing_frame, 0); // tangent
+        norm3_d(probing_frame + 3, 1); // norm
+        crossnorm3_d(probing_frame + 2*3, probing_frame, probing_frame + 3, 2); // binorm
+    } else {
+        for (int ii = 0; ii < 9; ii++) {
+            probing_frame[ii] =  frame[ii];
+        }
+    }
+}
+
+template<typename REAL_T>
+__device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) {
+    REAL_T __tmp[3];
+
+    for (int ii = 0; ii < 3; ii++) {
+        direc[ii]       = propagator[0]*frame[ii] + propagator[1]*frame[3+ii] + propagator[2]*frame[6+ii];
+        __tmp[ii]       = propagator[3]*frame[ii] + propagator[4]*frame[3+ii] + propagator[5]*frame[6+ii];
+        frame[2*3 + ii] = propagator[6]*frame[ii] + propagator[7]*frame[3+ii] + propagator[8]*frame[6+ii];
+    }
+
+#if 1
+    norm3_d(__tmp, 0); // normalize tangent
+    crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calc normal
+    crossnorm3_d(frame + 2*3, __tmp, frame + 3, 2); // calculate binorm from tangent, norm
+#else
+    norm3_d(__tmp, 0); // normalize tangent
+    norm3_d(frame + 2*3, 2); // normalize binorm
+    crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calculate normal from binorm, tangent
+#endif
+
+    for (int ii = 0; ii < 3; ii++) {
+        frame[ii] = __tmp[ii];
+    }
+}
+
+template<typename REAL_T, typename REAL3_T>
+__device__ REAL_T calculate_data_support_d(REAL_T support,
+                                           const REAL3_T pos, const REAL_T *__restrict__ pmf,
+                                           const int dimx, const int dimy, const int dimz, const int dimt,
+                                           const REAL_T probe_step_size,
+                                           const REAL3_T *__restrict__ odf_sphere_vertices,
+                                           REAL_T k1, REAL_T k2,
+                                           REAL_T* probing_frame) {
+    REAL_T probing_prop[9];
+    REAL_T direc[3];
+    REAL3_T probing_pos;
+    REAL_T fod_amp;
+
+    prepare_propagator_d(k1, k2, probe_step_size, probing_prop);
+    probing_pos.x = pos.x;
+    probing_pos.y = pos.y;
+    probing_pos.z = pos.z;
+
+    for (int ii = 0; ii < PROBE_QUALITY; ii++) { // we spend about 2/3 of our time in this loop when doing PTT
+        propogate_frame_d(probing_prop, probing_frame, direc);
+
+        probing_pos.x += direc[0];
+        probing_pos.y += direc[1];
+        probing_pos.z += direc[2];
+
+        fod_amp = interp4_d(probing_pos, probing_frame, pmf,
+                            dimx, dimy, dimz, dimt,
+                            odf_sphere_vertices);
+        if (!ALLOW_WEAK_LINK && (fod_amp < PMF_THRESHOLD_P)) {
+            return 0;
+        }
+        support += fod_amp;
+    }
+    return support;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         bool IS_INIT,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int get_direction_ptt_d(
+    curandStatePhilox4_32_10_t *st,
+    const REAL_T *__restrict__ pmf,
+    const REAL_T max_angle,
+    const REAL_T step_size,
+    REAL3_T dir,
+    REAL_T *__frame_sh,
+    const int dimx, const int dimy, const int dimz, const int dimt,
+    REAL3_T pos,
+    const REAL3_T *__restrict__ odf_sphere_vertices,
+    REAL3_T *__restrict__ dirs) {
+    // Aydogan DB, Shi Y. Parallel Transport Tractography. IEEE Trans
+    // Med Imaging. 2021 Feb;40(2):635-647. doi: 10.1109/TMI.2020.3034038.
+    // Epub 2021 Feb 2. PMID: 33104507; PMCID: PMC7931442.
+    // https://github.com/nibrary/nibrary/blob/main/src/dMRI/tractography/algorithms/ptt
+    // Assumes probe count 1, data_support_exponent 1 for now
+    // Implemented with new CDF sampling strategy
+    // And using initial directions from voxel-wise peaks as in DIPY
+
+    const int tidx = threadIdx.x;
+    const int tidy = threadIdx.y;
+
+    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+	REAL_T __shared__ face_cdf_sh[BDIM_Y*DISC_FACE_CNT];
+    REAL_T __shared__ vert_pdf_sh[BDIM_Y*DISC_VERT_CNT];
+    REAL_T __shared__ first_val_sh[BDIM_Y];
+
+    REAL_T *__face_cdf_sh = face_cdf_sh + tidy*DISC_FACE_CNT;
+    REAL_T *__vert_pdf_sh = vert_pdf_sh + tidy*DISC_VERT_CNT;
+    REAL_T *__first_val_sh = first_val_sh + tidy;
+
+    const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more
+    const REAL_T probe_step_size = ((step_size / 2) / (PROBE_QUALITY - 1));
+
+    REAL_T __tmp;
+
+    __syncwarp(WMASK);
+    if (IS_INIT) {
+        if (tidx==0) {
+            __frame_sh[0] = dir.x;
+            __frame_sh[1] = dir.y;
+            __frame_sh[2] = dir.z;
+        }
+    }
+    if (tidx==0) {
+        *__first_val_sh = interp4_d(pos, __frame_sh, pmf,
+                                    dimx, dimy, dimz, dimt,
+                                    odf_sphere_vertices);
+    }
+    __syncwarp(WMASK);
+
+    // Calculate __vert_pdf_sh
+    REAL_T probing_frame[9];
+    REAL_T k1_probe, k2_probe;
+    bool support_found = 0;
+    for (int ii = tidx; ii < DISC_VERT_CNT; ii += BDIM_X) {
+        k1_probe = DISC_VERT[ii*2] * max_curvature;
+        k2_probe = DISC_VERT[ii*2+1] * max_curvature;
+
+        get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
+
+        const REAL_T this_support = calculate_data_support_d(
+            *__first_val_sh,
+            pos, pmf, dimx, dimy, dimz, dimt,
+            probe_step_size,
+            odf_sphere_vertices,
+            k1_probe, k2_probe,
+            probing_frame);
+
+#if 0
+        if (threadIdx.y == 1 && ii == 0) { 
+            printf("    k1_probe: %f, k2_probe %f, support %f for id: %i\n", k1_probe, k2_probe, this_support, tidx);
+        }
+#endif
+
+        if (this_support < NORM_MIN_SUPPORT) {
+            __vert_pdf_sh[ii] = 0;
+        } else {
+            __vert_pdf_sh[ii] = this_support;
+            support_found = 1;
+        }
+    }
+    const int __msk = __ballot_sync(WMASK, support_found);
+    if (__msk == 0) {
+        return 0;
+    }
+
+#if 0
+    __syncwarp(WMASK);
+    if (threadIdx.y == 1 && threadIdx.x == 0) {
+        printArrayAlways("VERT PDF", 8, DISC_VERT_CNT, __vert_pdf_sh);
+    }
+    __syncwarp(WMASK);
+#endif
+
+    // Initialize __face_cdf_sh
+    for (int ii = tidx; ii < DISC_FACE_CNT; ii+=BDIM_X) { 
+        __face_cdf_sh[ii] = 0;
+    }
+    __syncwarp(WMASK);
+
+    // Move vert to face
+    for (int ii = tidx; ii < DISC_FACE_CNT; ii+=BDIM_X) {
+        bool all_verts_valid = 1;
+        for (int jj = 0; jj < 3; jj++) {
+            REAL_T vert_val = __vert_pdf_sh[DISC_FACE[ii*3 + jj]];
+            if (vert_val == 0) {
+                all_verts_valid = IS_INIT; // On init, even go with faces that are not fully supported
+            }
+            __face_cdf_sh[ii] += vert_val;
+        }
+        if (!all_verts_valid) {
+            __face_cdf_sh[ii] = 0;
+        }
+    }
+    __syncwarp(WMASK);
+
+#if 0
+    __syncwarp(WMASK);
+    if (threadIdx.y == 1 && threadIdx.x == 0) {
+        printArrayAlways("Face PDF", 8, DISC_FACE_CNT, __face_cdf_sh);
+    }
+    __syncwarp(WMASK);
+#endif
+
+    // Prefix sum __face_cdf_sh and return 0 if all 0
+    prefix_sum_sh_d<BDIM_X>(__face_cdf_sh, DISC_FACE_CNT);
+    REAL_T last_cdf = __face_cdf_sh[DISC_FACE_CNT - 1];
+
+    if (last_cdf == 0) {
+        return 0;
+    }
+
+#if 0
+    __syncwarp(WMASK);
+    if (threadIdx.y == 1 && threadIdx.x == 0) {
+        printArrayAlways("Face CDF", 8, DISC_FACE_CNT, __face_cdf_sh);
+    }
+    __syncwarp(WMASK);
+#endif
+
+    // Sample random valid faces randomly
+    REAL_T r1, r2;
+    for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING / BDIM_X; ii++) {
+        r1 = curand_uniform(st);
+        r2 = curand_uniform(st);
+		if (r1 + r2 > 1) {
+			r1 = 1 - r1;
+			r2 = 1 - r2;
+		}
+
+        __tmp = curand_uniform(st) * last_cdf;
+		int jj;
+		for (jj = 0; jj < DISC_FACE_CNT; jj++) {
+			if (__face_cdf_sh[jj] >= __tmp)
+				break;
+		}
+
+        const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2];
+        const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2];
+        const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2];
+
+        const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1];
+        const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1];
+        const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1];
+
+        k1_probe = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0);
+        k2_probe = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0);
+
+        get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
+
+        const REAL_T this_support = calculate_data_support_d(*__first_val_sh,
+                                                             pos, pmf, dimx, dimy, dimz, dimt,
+                                                             probe_step_size,
+                                                             odf_sphere_vertices,
+                                                             k1_probe, k2_probe,
+                                                             probing_frame);
+
+
+        __syncwarp(WMASK);
+        int winning_lane = -1; // -1 indicates nobody won
+        int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT);
+        if (__msk != 0) {
+            REAL_T group_max_support = this_support;
+            #pragma unroll
+            for(int j = 1; j < PROBABILISTIC_GROUP_SZ; j *= 2) {
+                __tmp = __shfl_xor_sync(WMASK, group_max_support, j, BDIM_X);
+                group_max_support = MAX(group_max_support, __tmp);
+            }
+
+            __msk &= __ballot_sync(WMASK, this_support == group_max_support);
+            winning_lane = __ffs(__msk) - 1;
+        }
+        if (winning_lane != -1) {
+            if (tidx == winning_lane) {
+#if 0
+                if (threadIdx.y == 1) {
+                    printf("winning k1 %f, k2 %f, cdf %f, cdf_idx %i", k1_probe, k2_probe, __tmp, jj);
+                }
+#endif
+                if (IS_INIT) {
+                    dirs[0] = dir;
+                } else {
+                    REAL_T __prop[9];
+                    REAL_T __dir[3];
+                    prepare_propagator_d(k1_probe, k2_probe, step_size/STEP_FRAC, __prop);
+                    propogate_frame_d(__prop, probing_frame, __dir);
+                    norm3_d(__dir, 0); // this will be scaled by the generic stepping code
+                    dirs[0] = (REAL3_T) {__dir[0], __dir[1], __dir[2]};
+                }
+
+                for (int jj = 0; jj < 9; jj++) {
+                    __frame_sh[jj] = probing_frame[jj];
+                }
+            }
+            __syncwarp(WMASK);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ bool init_frame_ptt_d(
+    curandStatePhilox4_32_10_t *st,
+    const REAL_T *__restrict__ pmf,
+    const REAL_T max_angle,
+    const REAL_T step_size,
+    REAL3_T first_step,
+    const int dimx, const int dimy, const int dimz, const int dimt,
+    REAL3_T seed,
+    const REAL3_T *__restrict__ sphere_vertices,
+    REAL_T* __frame) {
+    const int tidx = threadIdx.x;
+
+    const int lid = (threadIdx.y*BDIM_X + tidx) % 32;
+    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+    bool init_norm_success;
+    REAL3_T tmp;
+
+    // Here we probabilistic find a good intial normal for this initial direction 
+    init_norm_success = (bool) get_direction_ptt_d<BDIM_X, BDIM_Y, 1>(
+        st,
+        pmf,
+        max_angle,
+        step_size,
+        MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
+        __frame,
+        dimx, dimy, dimz, dimt,
+        seed,
+        sphere_vertices,
+        &tmp);
+    __syncwarp(WMASK);
+
+    if (!init_norm_success) {
+        // try the other direction
+        init_norm_success = (bool) get_direction_ptt_d<BDIM_X, BDIM_Y, 1>(
+            st,
+            pmf,
+            max_angle,
+            step_size,
+            MAKE_REAL3(first_step.x, first_step.y, first_step.z),
+            __frame,
+            dimx, dimy, dimz, dimt,
+            seed,
+            sphere_vertices,
+            &tmp);
+        __syncwarp(WMASK);
+
+        if (!init_norm_success) { // This is rare
+            return false;
+        } else {
+            if (tidx == 0) {
+                for (int ii = 0; ii < 9; ii++) {
+                    __frame[ii] = -__frame[ii];
+                }
+            }
+            __syncwarp(WMASK);
+        }
+    }
+    if (tidx == 0) {
+        for (int ii = 0; ii < 9; ii++) {
+            __frame[9+ii] = -__frame[ii]; // save flipped frame for second run
+        }
+    }
+    __syncwarp(WMASK);
+    return true;
+}
diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh
new file mode 100644
index 0000000..d8986b5
--- /dev/null
+++ b/cuslines/ptt.cuh
@@ -0,0 +1,52 @@
+#ifndef __PTT_CUH__
+#define __PTT_CUH__
+
+#include "disc.h"
+#include "globals.h"
+
+#define STEP_FRAC 20 // divides output step size (usually 0.5) into this many internal steps
+#define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length
+#define PROBE_QUALITY 4
+#define SAMPLING_QUALITY 4 // can be 2-7
+#define PROBABILISTIC_BIAS 1 // 1 looks good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic.
+#define ALLOW_WEAK_LINK 1
+#define TRIES_PER_REJECTION_SAMPLING 1024
+#define DEFAULT_PTT_MINDATASUPPORT 0.05
+#define K_SMALL 0.0001
+
+#define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY)
+#define PROBABILISTIC_GROUP_SZ POW2(PROBABILISTIC_BIAS)
+
+#if SAMPLING_QUALITY == 2
+#define DISC_VERT_CNT DISC_2_VERT_CNT
+#define DISC_FACE_CNT DISC_2_FACE_CNT
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_2_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_2_FACE;
+#elif SAMPLING_QUALITY == 3
+#define DISC_VERT_CNT DISC_3_VERT_CNT
+#define DISC_FACE_CNT DISC_3_FACE_CNT
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_3_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_3_FACE;
+#elif SAMPLING_QUALITY == 4
+#define DISC_VERT_CNT DISC_4_VERT_CNT
+#define DISC_FACE_CNT DISC_4_FACE_CNT
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_4_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_4_FACE;
+#elif SAMPLING_QUALITY == 5
+#define DISC_VERT_CNT DISC_5_VERT_CNT
+#define DISC_FACE_CNT DISC_5_FACE_CNT
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_5_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_5_FACE;
+#elif SAMPLING_QUALITY == 6
+#define DISC_VERT_CNT DISC_6_VERT_CNT
+#define DISC_FACE_CNT DISC_6_FACE_CNT
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_6_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_6_FACE;
+#elif SAMPLING_QUALITY == 7
+#define DISC_VERT_CNT DISC_7_VERT_CNT
+#define DISC_FACE_CNT DISC_7_FACE_CNT
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_7_VERT; // TODO: check if removing __constant__ helps or hurts
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_7_FACE;
+#endif
+
+#endif
\ No newline at end of file
diff --git a/cuslines/utils.cu b/cuslines/utils.cu
new file mode 100644
index 0000000..c7fe47f
--- /dev/null
+++ b/cuslines/utils.cu
@@ -0,0 +1,143 @@
+
+template<int BDIM_X, typename REAL_T>
+__device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) {
+    const int tidx = threadIdx.x;
+
+    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+#if 0
+    // for debugging
+    __syncwarp(WMASK);
+    if (tidx == 0) {
+        for (int j = 1; j < __len; j++) {
+            num_sh[j] += num_sh[j-1];
+        }
+    }
+    __syncwarp(WMASK);
+#else
+    for (int j = 0; j < __len; j += BDIM_X) {
+        if ((tidx == 0) && (j != 0)) {
+            num_sh[j] += num_sh[j-1];
+        }
+        __syncwarp(WMASK);
+
+        REAL_T __t_pmf;
+        if (j+tidx < __len) {
+            __t_pmf = num_sh[j+tidx];
+        }
+        for (int i = 1; i < BDIM_X; i*=2) {
+            REAL_T __tmp = __shfl_up_sync(WMASK, __t_pmf, i, BDIM_X);
+            if ((tidx >= i) && (j+tidx < __len)) {
+                    __t_pmf += __tmp;
+            }
+        }
+        if (j+tidx < __len) {
+            num_sh[j+tidx] = __t_pmf;
+        }
+        __syncwarp(WMASK);
+    }
+#endif
+}
+
+template<typename REAL_T>
+__device__ void printArrayAlways(const char *name, int ncol, int n, REAL_T *arr) {
+        printf("%s:\n", name);
+
+        for(int j = 0; j < n; j++) {
+                if ((j%ncol)==0) printf("\n");
+                printf("%10.8f ", arr[j]);
+        }
+        printf("\n");
+}
+
+template<typename REAL_T>
+__device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) {
+	if (!threadIdx.x && !threadIdx.y && !blockIdx.x) {
+		printArrayAlways(name, ncol, n, arr);
+	}
+}
+
+template<typename REAL_T>
+__device__ REAL_T interpolation_helper_d(const REAL_T* dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) {
+    REAL_T __tmp = 0;
+    #pragma unroll
+    for (int i = 0; i < 2; i++) {
+        #pragma unroll
+        for (int j = 0; j < 2; j++) {
+            #pragma unroll
+            for (int k = 0; k < 2; k++) {
+                __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] *
+                         dataf[coo[0][i] * dimy * dimz * dimt +
+                               coo[1][j] * dimz * dimt +
+                               coo[2][k] * dimt +
+                               t];
+            }
+        }
+    }
+    return __tmp;
+}
+
+template<int BDIM_X,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int trilinear_interp_d(const int dimx,
+                                  const int dimy,
+                                  const int dimz,
+                                  const int dimt,
+                                  int dimt_idx, // If -1, get all
+                                  const REAL_T *__restrict__ dataf,
+                                  const REAL3_T point,
+                                  REAL_T *__restrict__ __vox_data) {
+        const REAL_T HALF = static_cast<REAL_T>(0.5);
+
+        // all thr compute the same here
+        if (point.x < -HALF || point.x+HALF >= dimx ||
+            point.y < -HALF || point.y+HALF >= dimy ||
+               point.z < -HALF || point.z+HALF >= dimz) {
+                return -1;
+        }
+
+        long long  coo[3][2];
+        REAL_T wgh[3][2]; // could use just one...
+
+        const REAL_T ONE  = static_cast<REAL_T>(1.0);
+
+        const REAL3_T fl = MAKE_REAL3(FLOOR(point.x),
+                                      FLOOR(point.y),
+                                      FLOOR(point.z));
+
+        wgh[0][1] = point.x - fl.x; 
+        wgh[0][0] = ONE-wgh[0][1]; 
+        coo[0][0] = MAX(0, fl.x);
+        coo[0][1] = MIN(dimx-1, coo[0][0]+1);
+
+        wgh[1][1] = point.y - fl.y; 
+        wgh[1][0] = ONE-wgh[1][1]; 
+        coo[1][0] = MAX(0, fl.y);
+        coo[1][1] = MIN(dimy-1, coo[1][0]+1);
+
+        wgh[2][1] = point.z - fl.z; 
+        wgh[2][0] = ONE-wgh[2][1]; 
+        coo[2][0] = MAX(0, fl.z);
+        coo[2][1] = MIN(dimz-1, coo[2][0]+1);
+
+        if (dimt_idx == -1) {
+                for (int t = threadIdx.x; t < dimt; t += BDIM_X) {
+                        __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t);
+                }
+        } else {
+                *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx);
+        }
+
+        /*
+        __syncwarp(WMASK);
+        if (tidx == 0 && threadIdx.y == 0) {
+                printf("point: %f, %f, %f\n", point.x, point.y, point.z);
+                for(int i = 0; i < dimt; i++) {
+                        printf("__vox_data[%d]: %f\n", i, __vox_data[i]);
+                }
+        }
+        */
+        return 0;
+}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b34f9a0
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,14 @@
+[build-system]
+requires = ["scikit-build-core", "pybind11"]
+build-backend = "scikit_build_core.build"
+
+[project]
+name = "cuslines"
+version = "1.0.0"
+description="GPU-accelerated tractography package"
+readme = "README.md"
+requires-python = ">=3.7"
+
+[tool.scikit-build]
+cmake.build-type = "Release"
+cmake.args = ["-DCMAKE_CUDA_ARCHITECTURES=52-real;60-real;61-real;70-real;72-real;75-real;80-real;86-real;87-real;90"] 
diff --git a/run_dipy_cpu.py b/run_dipy_cpu.py
deleted file mode 100644
index 9b32d98..0000000
--- a/run_dipy_cpu.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-import random
-import time
-
-import numpy as np
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
-from dipy.tracking.local_tracking import LocalTracking
-from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
-from dipy.reconst import shm
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-import cuslines.cuslines as cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("nifti_file", help="path to the ep2multiband sequence nifti file")
-parser.add_argument("bvals", help="path to the bvals")
-parser.add_argument("bvecs", help="path to the bvecs")
-parser.add_argument("mask_nifti", help="path to the mask file")
-parser.add_argument("roi_nifti", help="path to the ROI file")
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
-args = parser.parse_args()
-
-img = get_img(args.nifti_file)
-voxel_order = "".join(aff2axcodes(img.affine))
-gtab = get_gtab(args.bvals, args.bvecs)
-roi = get_img(args.roi_nifti)
-mask = get_img(args.mask_nifti)
-data = img.get_data()
-roi_data = roi.get_data()
-mask = mask.get_data()
-
-tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-print('Fitting Tensor')
-tenfit = tenmodel.fit(data, mask)
-print('Computing anisotropy measures (FA,MD,RGB)')
-FA = tenfit.fa
-FA[np.isnan(FA)] = 0
-
-# Setup tissue_classifier args
-tissue_classifier = ThresholdStoppingCriterion(FA, 0.1)
-metric_map = np.asarray(FA, 'float64')
-
-# Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
-
-# Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
-
-print('streamline gen')
-global_chunk_size = args.chunk_size
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
-
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-for idx in range(int(nchunks)):
-  # Main streamline computation
-  ts = time.time()
-  streamlines = [s for s in streamline_generator]
-  te = time.time()
-  streamline_time += (te-ts)
-  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-                                                                    te-ts))
-
-  # Save tracklines file
-  if args.output_prefix:
-    fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-    ts = time.time()
-    #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-    sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
-    save_tractogram(sft, fname)
-    te = time.time()
-    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-    io_time += (te-ts)
-
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_cpu_hardi.py b/run_dipy_cpu_hardi.py
deleted file mode 100644
index dabbc63..0000000
--- a/run_dipy_cpu_hardi.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-import random
-import time
-
-import numpy as np
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
-from dipy.tracking.local_tracking import LocalTracking
-from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
-from dipy.reconst import shm
-from dipy.data import get_fnames
-from dipy.data import read_stanford_pve_maps
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-import cuslines.cuslines as cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep")
-parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
-args = parser.parse_args()
-
-# Get Stanford HARDI data
-hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
-csf, gm, wm = read_stanford_pve_maps()
-wm_data = wm.get_fdata()
-
-img = get_img(hardi_nifti_fname)
-voxel_order = "".join(aff2axcodes(img.affine))
-
-gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
-#roi = get_img(hardi_nifti_fname)
-
-data = img.get_fdata()
-roi_data = (wm_data > 0.5)
-mask = roi_data
-
-tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-print('Fitting Tensor')
-tenfit = tenmodel.fit(data, mask)
-print('Computing anisotropy measures (FA,MD,RGB)')
-FA = tenfit.fa
-FA[np.isnan(FA)] = 0
-
-# Setup tissue_classifier args
-tissue_classifier = ThresholdStoppingCriterion(FA, 0.1)
-metric_map = np.asarray(FA, 'float64')
-
-# Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
-seed_mask = seed_mask[0:args.nseeds]
-
-# Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
-
-
-print('streamline gen')
-global_chunk_size = args.chunk_size
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-for idx in range(int(nchunks)):
-  # Main streamline computation
-  ts = time.time()
-  streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=.5)
-  streamlines = [s for s in streamline_generator]
-  te = time.time()
-  streamline_time += (te-ts)
-  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-                                                                    te-ts))
-
-  # Save tracklines file
-  if args.output_prefix:
-    fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-    ts = time.time()
-    #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-    #save_tractogram(fname, streamlines)
-    sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
-    save_tractogram(sft, fname)
-    te = time.time()
-    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-
-    io_time += (te-ts)
-
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_gpu.py b/run_dipy_gpu.py
deleted file mode 100644
index ce109b9..0000000
--- a/run_dipy_gpu.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import os
-import argparse
-import random
-import time
-from tqdm import trange, tqdm
-
-import numpy as np
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-#from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
-#from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
-from dipy.reconst import shm
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-import cuslines.cuslines as cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("nifti_file", help="path to the ep2multiband sequence nifti file")
-parser.add_argument("bvals", help="path to the bvals")
-parser.add_argument("bvecs", help="path to the bvecs")
-parser.add_argument("mask_nifti", help="path to the mask file")
-parser.add_argument("--fa_numpy", default=None, help="path to the FA numpy file")
-parser.add_argument("--roi_nifti", default=None, help="path to the ROI file")
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
-parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
-parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
-parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
-parser.add_argument("--max-angle", type=float, default=1.0471975511965976, help="default: 60 deg (in rad)")
-parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
-parser.add_argument("--tc-threshold", type=float, default=0.1, help="default: 0.1")
-parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
-args = parser.parse_args()
-
-img = get_img(args.nifti_file)
-voxel_order = "".join(aff2axcodes(img.affine))
-gtab = get_gtab(args.bvals, args.bvecs)
-mask = get_img(args.mask_nifti)
-data = img.get_fdata()
-
-# resample mask if necessary
-if mask.shape != data.shape:
-    from dipy.align.imaffine import AffineMap
-    identity = np.eye(4)
-    affine_map = AffineMap(identity,
-                           img.shape[:3], img.affine,
-                           mask.shape[:3], mask.affine)
-    mask = affine_map.transform(mask.get_fdata())
-    #mask = np.round(mask)
-else:
-    mask = mask.get_fdata()
-
-# load or compute and save FA file
-if (args.fa_numpy is not None) and os.path.isfile(args.fa_numpy):
-    FA = np.load(args.fa_numpy, allow_pickle=True)
-else:
-    # Fit
-    tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-    print('Fitting Tensor')
-    tenfit = tenmodel.fit(data, mask)
-    print('Computing anisotropy measures (FA,MD,RGB)')
-    FA = tenfit.fa
-    FA[np.isnan(FA)] = 0
-
-    if args.fa_numpy is not None:
-        np.save(args.fa_numpy, FA)
-
-# Setup tissue_classifier args
-#tissue_classifier = ThresholdTissueClassifier(FA, 0.1)
-metric_map = np.asarray(FA, 'float64')
-
-# resample roi if necessary
-if args.roi_nifti is not None:
-    roi = get_img(args.roi_nifti)
-    roi_data = roi.get_fdata()
-else:
-    roi_data = np.zeros(data.shape[:3])
-    roi_data[ FA > 0.1 ] = 1.
-
-# Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
-
-# Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
-b0s_mask = gtab.b0s_mask
-dwi_mask = ~b0s_mask
-
-# get fit_matrix from model
-fit_matrix = model._fit_matrix
-delta_b, delta_q = fit_matrix
-
-# setup sampling matrix
-sphere = small_sphere
-theta = sphere.theta
-phi = sphere.phi
-sampling_matrix, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
-
-## from BootPmfGen __init__
-# setup H and R matrices
-# TODO: figure out how to get H, R matrices from direction getter object
-x, y, z = model.gtab.gradients[dwi_mask].T
-r, theta, phi = shm.cart2sphere(x, y, z)
-B, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
-H = shm.hat(B)
-R = shm.lcr_matrix(H)
-
-# create floating point copy of data
-dataf = np.asarray(data, dtype=float)
-
-print('streamline gen')
-global_chunk_size = args.chunk_size * args.ngpus
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
-gpu_tracker = cuslines.GPUTracker(args.max_angle,
-                                  args.min_signal,
-                                  args.tc_threshold,
-                                  args.step_size,
-                                  dataf, H, R, delta_b, delta_q,
-                                  b0s_mask.astype(np.int32), metric_map, sampling_matrix,
-                                  sphere.vertices, sphere.edges.astype(np.int32),
-                                  ngpus=args.ngpus, rng_seed=0)
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-
-# debug start val to fast forward
-with tqdm(total=seed_mask.shape[0]) as pbar:
-    for idx in range(int(nchunks)):
-        # Main streamline computation
-        ts = time.time()
-        streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
-        te = time.time()
-        streamline_time += (te-ts)
-        pbar.update(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0])
-        #print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-        #                                                                  seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-        #                                                                  te-ts))
-
-        # Save tracklines file
-        if args.output_prefix:
-            if args.use_fast_write:
-                prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
-                ts = time.time()
-                gpu_tracker.dump_streamlines(prefix, voxel_order, roi_data.shape, img.header.get_zooms(), img.affine)
-                te = time.time()
-                print("Saved streamlines to {}_*.trk, time {} s".format(prefix, te-ts))
-            else:
-                fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-                ts = time.time()
-                #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-                sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
-                save_tractogram(sft, fname)
-                te = time.time()
-                print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-            io_time += (te-ts)
-
-pbar.close()
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py
deleted file mode 100644
index 9386631..0000000
--- a/run_dipy_gpu_hardi.py
+++ /dev/null
@@ -1,205 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-import random
-import time
-
-import numpy as np
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-#from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
-#from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
-from dipy.reconst import shm
-from dipy.data import get_fnames
-from dipy.data import read_stanford_pve_maps
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-import cuslines.cuslines as cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
-parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
-parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
-parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
-parser.add_argument("--max-angle", type=float, default=1.0471975511965976, help="default: 60 deg (in rad)")
-parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
-parser.add_argument("--tc-threshold", type=float, default=0.1, help="default: 0.1")
-parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
-args = parser.parse_args()
-
-# Get Stanford HARDI data
-hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
-
-print(hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname)
-
-csf, gm, wm = read_stanford_pve_maps()
-wm_data = wm.get_fdata()
-
-img = get_img(hardi_nifti_fname)
-voxel_order = "".join(aff2axcodes(img.affine))
-
-gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
-#roi = get_img(hardi_nifti_fname)
-
-data = img.get_fdata()
-roi_data = (wm_data > 0.5)
-mask = roi_data
-
-tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-print('Fitting Tensor')
-tenfit = tenmodel.fit(data, mask)
-print('Computing anisotropy measures (FA,MD,RGB)')
-FA = tenfit.fa
-FA[np.isnan(FA)] = 0
-
-# Setup tissue_classifier args
-#tissue_classifier = ThresholdTissueClassifier(FA, 0.1)
-metric_map = np.asarray(FA, 'float64')
-
-# Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
-seed_mask = seed_mask[0:args.nseeds]
-
-# Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
-b0s_mask = gtab.b0s_mask
-dwi_mask = ~b0s_mask
-
-# get fit_matrix from model
-fit_matrix = model._fit_matrix
-delta_b, delta_q = fit_matrix
-
-# setup sampling matrix
-sphere = small_sphere
-theta = sphere.theta
-phi = sphere.phi
-sampling_matrix, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
-
-## from BootPmfGen __init__
-# setup H and R matrices
-# TODO: figure out how to get H, R matrices from direction getter object
-x, y, z = model.gtab.gradients[dwi_mask].T
-r, theta, phi = shm.cart2sphere(x, y, z)
-B, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
-H = shm.hat(B)
-R = shm.lcr_matrix(H)
-
-# create floating point copy of data
-dataf = np.asarray(data, dtype=float)
-
-print('streamline gen')
-global_chunk_size = args.chunk_size * args.ngpus
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
-
-gpu_tracker = cuslines.GPUTracker(args.max_angle,
-                                  args.min_signal,
-                                  args.tc_threshold,
-                                  args.step_size,
-                                  dataf, H, R, delta_b, delta_q,
-                                  b0s_mask.astype(np.int32), metric_map, sampling_matrix,
-                                  sphere.vertices, sphere.edges.astype(np.int32),
-                                  ngpus=args.ngpus, rng_seed=0)
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-for idx in range(int(nchunks)):
-  # Main streamline computation
-  ts = time.time()
-  streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
-  te = time.time()
-  streamline_time += (te-ts)
-  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-                                                                    te-ts))
-
-  # Save tracklines file
-  if args.output_prefix:
-    if args.use_fast_write:
-      prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
-      ts = time.time()
-      #gpu_tracker.dump_streamlines(prefix, voxel_order, roi.shape, roi.header.get_zooms(), img.affine)
-      gpu_tracker.dump_streamlines(prefix, voxel_order, wm.shape, wm.header.get_zooms(), img.affine)
-      te = time.time()
-      print("Saved streamlines to {}_*.trk, time {} s".format(prefix, te-ts))
-    else:
-      fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-      ts = time.time()
-      #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-      #save_tractogram(fname, streamlines)
-      sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
-      save_tractogram(sft, fname)
-      te = time.time()
-      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-    io_time += (te-ts)
-
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py
new file mode 100644
index 0000000..e627978
--- /dev/null
+++ b/run_gpu_streamlines.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import random
+import time
+import zipfile
+
+import numpy as np
+import numpy.linalg as npl
+
+import dipy.reconst.dti as dti
+from dipy.io import read_bvals_bvecs
+from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
+from dipy.io.streamline import save_tractogram
+from dipy.tracking import utils
+from dipy.core.gradients import gradient_table, unique_bvals_magnitude
+from dipy.data import default_sphere
+from dipy.direction import (BootDirectionGetter, ProbabilisticDirectionGetter, PTTDirectionGetter)
+from dipy.reconst.shm import OpdtModel, CsaOdfModel
+from dipy.reconst.csdeconv import ConstrainedSphericalDeconvModel, auto_response_ssst
+from dipy.tracking.local_tracking import LocalTracking
+from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
+from dipy.reconst import shm
+from dipy.data import get_fnames
+from dipy.data import read_stanford_pve_maps
+
+import nibabel as nib
+from nibabel.orientations import aff2axcodes
+
+from trx.trx_file_memmap import TrxFile, zip_from_folder
+
+# Import custom module
+import cuslines
+
+t0 = time.time()
+
+# set seed to get deterministic streamlines
+np.random.seed(0)
+random.seed(0)
+
+#Get Gradient values
+def get_gtab(fbval, fbvec):
+    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
+    gtab = gradient_table(bvals, bvecs)
+    return gtab
+
+def get_img(ep2_seq):
+    img = nib.load(ep2_seq)
+    return img
+
+print("parsing arguments")
+parser = argparse.ArgumentParser()
+parser.add_argument("nifti_file", nargs='?', default='hardi', help="path to the DWI nifti file")
+parser.add_argument("bvals", nargs='?', default='hardi', help="path to the bvals")
+parser.add_argument("bvecs", nargs='?', default='hardi', help="path to the bvecs")
+parser.add_argument("mask_nifti", nargs='?', default='hardi', help="path to the mask file")
+parser.add_argument("roi_nifti", nargs='?', default='hardi', help="path to the ROI file")
+parser.add_argument("--device", type=str, default ='gpu', choices=['cpu', 'gpu'], help="Whether to use cpu or gpu")
+parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
+parser.add_argument("--chunk-size", type=int, default=100000, help="how many seeds to process per sweep, per GPU")
+parser.add_argument("--nseeds", type=int, default=100000, help="how many seeds to process in total")
+parser.add_argument("--ngpus", type=int, default=1, help="number of GPUs to use if using gpu")
+parser.add_argument("--write-method", type=str, default="fast", help="Can be trx, fast, or standard")
+parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
+parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
+parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
+parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
+parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
+parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
+parser.add_argument("--min-separation-angle",type=float,default=45,help="min separation angle (in degrees)")
+parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
+parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csa', 'csd'], help="model to use")
+parser.add_argument("--dg", type=str, default="boot", choices=['boot', 'prob', 'ptt'], help="direction getting scheme to use")
+
+args = parser.parse_args()
+
+if args.device == "cpu" and args.write_method != "standard":
+  print("WARNING: only standard write method is implemented for cpu testing.")
+  write_method = "standard"
+else:
+  write_method = args.write_method
+
+if 'hardi' in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]:
+  if not all(arg == 'hardi' for arg in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]):
+    raise ValueError("If any of the arguments is 'hardi', all must be 'hardi'")
+  # Get Stanford HARDI data
+  hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
+  csf, gm, wm = read_stanford_pve_maps()
+  wm_data = wm.get_fdata()
+
+  img = get_img(hardi_nifti_fname)
+  voxel_order = "".join(aff2axcodes(img.affine))
+
+  gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
+
+  data = img.get_fdata()
+  roi_data = (wm_data > 0.5)
+  mask = roi_data
+else:
+  img = get_img(args.nifti_file)
+  voxel_order = "".join(aff2axcodes(img.affine))
+  gtab = get_gtab(args.bvals, args.bvecs)
+  roi = get_img(args.roi_nifti)
+  mask = get_img(args.mask_nifti)
+  data = img.get_fdata()
+  roi_data = roi.get_fdata()
+  mask = mask.get_fdata()
+
+tenmodel = dti.TensorModel(gtab, fit_method='WLS')
+print('Fitting Tensor')
+tenfit = tenmodel.fit(data, mask)
+print('Computing anisotropy measures (FA,MD,RGB)')
+FA = tenfit.fa
+FA[np.isnan(FA)] = 0
+
+# Setup tissue_classifier args
+tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold)
+metric_map = np.asarray(FA, 'float64')
+
+# Create seeds for ROI
+# seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
+# seed_mask = seed_mask[0:args.nseeds]
+seed_mask = np.asarray(utils.random_seeds_from_mask(
+  roi_data, seeds_count=args.nseeds,
+  seed_count_per_voxel=False,
+  affine=np.eye(4)))
+
+# Setup model
+sphere = default_sphere
+if args.model == "opdt":
+  model_type = cuslines.ModelType.OPDT
+  print("Running OPDT model...")
+  model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+  fit_matrix = model._fit_matrix
+  delta_b, delta_q = fit_matrix
+elif args.model == "csa":
+  model_type = cuslines.ModelType.CSA
+  print("Running CSA model...")
+  model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+  fit_matrix = model._fit_matrix
+  # Unlike OPDT, CSA has a single matrix used for fit_matrix. Populating delta_b and delta_q with necessary values for
+  # now.
+  delta_b = fit_matrix
+  delta_q = fit_matrix
+else:
+  print("Running CSD model...")
+  unique_bvals = unique_bvals_magnitude(gtab.bvals)
+  if len(unique_bvals[unique_bvals > 0]) > 1:
+    low_shell_idx = gtab.bvals <= unique_bvals[unique_bvals > 0][0]
+    response_gtab = gradient_table( # reinit as single shell for this CSD
+      gtab.bvals[low_shell_idx],
+      gtab.bvecs[low_shell_idx])
+    data = data[..., low_shell_idx]
+  else:
+    response_gtab = gtab
+  response, _ = auto_response_ssst(
+    response_gtab,
+    data,
+    roi_radii=10,
+    fa_thr=0.7)
+  model = ConstrainedSphericalDeconvModel(gtab, response, sh_order=args.sh_order)
+  # TODO: we shouldnt have to do this, also for CSA, but we populate delta_b, delta_q.
+  # we need to name change delta_b/delta_q and make it possible for them to be None, or something like this
+  delta_b = model._X
+  delta_q = model.B_reg
+
+if args.dg != "boot":
+  if args.dg == "prob":
+    model_type = cuslines.ModelType.PROB
+    dg = ProbabilisticDirectionGetter
+  else:
+    model_type = cuslines.ModelType.PTT
+    dg = PTTDirectionGetter
+  fit = model.fit(data, mask=(metric_map >= args.fa_threshold))
+  data = fit.odf(sphere).clip(min=0)
+else:
+  dg = BootDirectionGetter
+
+global_chunk_size = args.chunk_size
+
+# Setup direction getter args
+if args.device == "cpu":
+  if args.dg != "boot":
+    dg = dg.from_pmf(data, max_angle=args.max_angle, sphere=sphere, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
+  else:
+    dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
+else:
+  # Setup direction getter args
+  b0s_mask = gtab.b0s_mask
+  dwi_mask = ~b0s_mask
+
+  # setup sampling matrix
+  theta = sphere.theta
+  phi = sphere.phi
+  sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
+
+  ## from BootPmfGen __init__
+  # setup H and R matrices
+  # TODO: figure out how to get H, R matrices from direction getter object
+  x, y, z = model.gtab.gradients[dwi_mask].T
+  r, theta, phi = shm.cart2sphere(x, y, z)
+  B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
+  H = shm.hat(B)
+  R = shm.lcr_matrix(H)
+
+  # create floating point copy of data
+  dataf = np.asarray(data, dtype=np.float64)
+
+  gpu_tracker = cuslines.GPUTracker(model_type,
+                                    args.max_angle * np.pi/180,
+                                    args.min_signal,
+                                    args.fa_threshold,
+                                    args.step_size,
+                                    args.relative_peak_threshold,
+                                    args.min_separation_angle * np.pi/180,
+                                    dataf.astype(np.float64), H.astype(np.float64), R.astype(np.float64), delta_b.astype(np.float64), delta_q.astype(np.float64),
+                                    b0s_mask.astype(np.int32), metric_map.astype(np.float64), sampling_matrix.astype(np.float64),
+                                    sphere.vertices.astype(np.float64), sphere.edges.astype(np.int32),
+                                    ngpus=args.ngpus, rng_seed=0)
+
+print('streamline gen')
+nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
+
+t1 = time.time()
+streamline_time = 0
+io_time = 0
+
+if args.output_prefix and write_method == "trx":
+  # Will resize by a factor of 2 if these are exceeded
+  sl_len_guess = 100
+  sl_per_seed_guess = 3
+  n_sls_guess = sl_per_seed_guess*len(seed_mask)
+
+  # trx files use memory mapping
+  trx_file = TrxFile(
+    reference=hardi_nifti_fname,
+    nb_streamlines=n_sls_guess,
+    nb_vertices=n_sls_guess*sl_len_guess)
+  offsets_idx = 0
+  sls_data_idx = 0
+
+for idx in range(int(nchunks)):
+  # Main streamline computation
+  ts = time.time()
+  if args.device == "cpu":
+    streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size)
+    streamlines = [s for s in streamline_generator]
+  else:
+    streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
+  te = time.time()
+  streamline_time += (te-ts)
+  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
+                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
+                                                                    te-ts))
+
+  # Save tracklines file
+  if args.output_prefix:
+    ts = time.time()
+    if write_method == "standard":
+      fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
+      sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
+      save_tractogram(sft, fname)
+      te = time.time()
+      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+    elif write_method == "trx":
+      tractogram = nib.streamlines.Tractogram(streamlines, affine_to_rasmm=img.affine)
+      tractogram.to_world()
+      sls = tractogram.streamlines
+
+      new_offsets_idx = offsets_idx + len(sls._offsets)
+      new_sls_data_idx = sls_data_idx + len(sls._data)
+
+      if new_offsets_idx > trx_file.header["NB_STREAMLINES"]\
+          or new_sls_data_idx > trx_file.header["NB_VERTICES"]:
+        print("TRX resizing...")
+        trx_file.resize(nb_streamlines=new_offsets_idx*2, nb_vertices=new_sls_data_idx*2)
+
+      # TRX uses memmaps here
+      trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data
+      trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = offsets_idx + sls._offsets
+      trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = sls._lengths
+
+      offsets_idx = new_offsets_idx
+      sls_data_idx = new_sls_data_idx
+      
+      te = time.time()
+      print("Streamlines to TRX format, time {} s".format(te-ts))
+    else:
+      fname = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
+      gpu_tracker.dump_streamlines(fname, voxel_order, wm.shape, wm.header.get_zooms(), img.affine)
+      te = time.time()
+      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+
+    io_time += (te-ts)
+
+if args.output_prefix and write_method == "trx":
+  ts = time.time()
+  fname = "{}.trx".format(args.output_prefix)
+  trx_file.resize()
+  zip_from_folder(
+    trx_file._uncompressed_folder_handle.name,
+    fname,
+    zipfile.ZIP_STORED)
+  trx_file.close()
+  te = time.time()
+  print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+  io_time += (te-ts)
+
+t2 = time.time()
+
+print("Completed processing {} seeds.".format(seed_mask.shape[0]))
+print("Initialization time: {} sec".format(t1-t0))
+print("Streamline generation total time: {} sec".format(t2-t1))
+print("\tStreamline processing: {} sec".format(streamline_time))
+if args.output_prefix:
+  print("\tFile writing: {} sec".format(io_time))