diff --git a/src/alpaka/test/alpaka/atomtest.cc b/src/alpaka/test/alpaka/atomtest.cc new file mode 100644 index 000000000..c3e9e23ec --- /dev/null +++ b/src/alpaka/test/alpaka/atomtest.cc @@ -0,0 +1,208 @@ +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" + +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +template +struct shared_block { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data *vec, T elements) const { + + auto threadIdxLocal(alpaka::getIdx(acc)[0u]); + auto blockIdxInGrid(alpaka::getIdx(acc)[0u]); + Data b = 1.0; + Data c = -1.0; + + auto& s = alpaka::declareSharedVar(acc); + + if (threadIdxLocal == 0) { + s = 0; + } + + syncBlockThreads(acc); + + for (T index: cms::alpakatools::elements_with_stride(acc, elements)) { + for (int i = 0; i < 200000; i++) { + alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{}); + } + alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{}); + } + + syncBlockThreads(acc); + + if (threadIdxLocal == 0) { + vec[blockIdxInGrid] = s; + } + } +}; + +template +struct global_block { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data *vec, T elements) const { + + auto blockIdxInGrid(alpaka::getIdx(acc)[0u]); + Data b = 1.0; + Data c = -1.0; + + for (T index: cms::alpakatools::elements_with_stride(acc, elements)) { + for (int i = 0; i < 200000; i++) { + alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{}); + alpaka::atomicAdd(acc, &vec[blockIdxInGrid], c, alpaka::hierarchy::Grids{}); + } + alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{}); + } + } +}; + +template +struct global_grid { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data *vec, T elements) const { + + Data b = 1.0; + Data c = -1.0; + + for (T index: cms::alpakatools::elements_with_stride(acc, elements)) { + for (int i = 0; i < 200000; i++) { + 
alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{}); //alpaka::hierarchy::Blocks/Threads/Grids + alpaka::atomicAdd(acc, &vec[0], c, alpaka::hierarchy::Grids{}); + } + alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{}); + } + } +}; + +template +struct shared_grid { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data *vec, T elements) const { + + auto threadIdxLocal(alpaka::getIdx(acc)[0u]); + Data b = 1.0; + Data c = -1.0; + + auto& s = alpaka::declareSharedVar(acc); + + if (threadIdxLocal == 0) { + s = 0; + } + + syncBlockThreads(acc); + + for (T index: cms::alpakatools::elements_with_stride(acc, elements)) { + for (int i = 0; i < 200000; i++) { + alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{}); //alpaka::hierarchy::Blocks/Threads/Grids + alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{}); + } + alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{}); + } + + syncBlockThreads(acc); + + if (threadIdxLocal == 0) { + alpaka::atomicAdd(acc, &vec[0], s, alpaka::hierarchy::Grids{}); + } + } +}; + + +int main(void) { + + using Dim = alpaka::DimInt<1u>; + using Data = float; + const Idx num_items = 1<<15; + Idx nThreadsInit = 256; + Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit; + + const DevAcc1 device_1(alpaka::getDevByIdx(0u)); + alpaka::Queue queue_1_0(device_1); + alpaka::Queue queue_1_1(device_1); + + const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(nThreadsInit)); + const Vec1 blocksPerGrid1(Vec1::all(nBlocksInit)); + auto workDivMultiBlockInit1 = + cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1); + + + using DevHost = alpaka::DevCpu; + auto const devHost = alpaka::getDevByIdx(0u); + + using BufHost = alpaka::Buf; + BufHost bufHostA(alpaka::allocBuf(devHost, num_items)); + BufHost res(alpaka::allocBuf(devHost, num_items)); + + Data* const pBufHostA(alpaka::getPtrNative(bufHostA)); + Data* const res_ptr(alpaka::getPtrNative(res)); + + for (Idx i = 
0; i < num_items; i++) { + pBufHostA[i] = 0.0; + } + + using BufAcc = alpaka::Buf; + BufAcc order(alpaka::allocBuf(device_1, num_items)); + + + printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]); + + // Run on shared memory + alpaka::memcpy(queue_1_0, order, bufHostA, num_items); + auto beginT = std::chrono::high_resolution_clock::now(); + alpaka::enqueue(queue_1_0, alpaka::createTaskKernel(workDivMultiBlockInit1, + shared_block(), alpaka::getPtrNative(order), num_items)); + alpaka::wait(queue_1_0); + auto endT = std::chrono::high_resolution_clock::now(); + std::cout << "Shared Block: " << std::chrono::duration(endT - beginT).count() << " s" + << std::endl; + alpaka::memcpy(queue_1_0, res, order, num_items); + for (Idx i = 0; i < nBlocksInit; i++) + { + if (res_ptr[i] != (Data) nThreadsInit) std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data) num_items << std::endl; + } + + // Run on global memory + alpaka::memcpy(queue_1_0, order, bufHostA, num_items); + beginT = std::chrono::high_resolution_clock::now(); + alpaka::enqueue(queue_1_0, alpaka::createTaskKernel(workDivMultiBlockInit1, + global_block(), alpaka::getPtrNative(order), num_items)); + alpaka::wait(queue_1_0); + endT = std::chrono::high_resolution_clock::now(); + std::cout << "Global Block: " << std::chrono::duration(endT - beginT).count() << " s" + << std::endl; + alpaka::memcpy(queue_1_0, res, order, num_items); + for (Idx i = 0; i < nBlocksInit; i++) + { + if (res_ptr[i] != (Data) nThreadsInit) std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data) num_items << std::endl; + } + + // Run on Shared memory + alpaka::memcpy(queue_1_0, order, bufHostA, num_items); + beginT = std::chrono::high_resolution_clock::now(); + alpaka::enqueue(queue_1_0, alpaka::createTaskKernel(workDivMultiBlockInit1, + shared_grid(), alpaka::getPtrNative(order), num_items)); + alpaka::wait(queue_1_0); + endT = std::chrono::high_resolution_clock::now(); + 
std::cout << "Shared Grid: " << std::chrono::duration(endT - beginT).count() << " s" + << std::endl; + alpaka::memcpy(queue_1_0, res, order, num_items); + if (res_ptr[0] != (Data) num_items) std::cout << "[0]: " << res_ptr[0] << " != " << (Data) num_items << std::endl + << std::endl; + + // Run on Global memory + alpaka::memcpy(queue_1_0, order, bufHostA, num_items); + beginT = std::chrono::high_resolution_clock::now(); + alpaka::enqueue(queue_1_0, alpaka::createTaskKernel(workDivMultiBlockInit1, + global_grid(), alpaka::getPtrNative(order), num_items)); + alpaka::wait(queue_1_0); + endT = std::chrono::high_resolution_clock::now(); + std::cout << "Global Grid: " << std::chrono::duration(endT - beginT).count() << " s" + << std::endl; + alpaka::memcpy(queue_1_0, res, order, num_items); + if (res_ptr[0] != (Data) num_items) std::cout << "[0]: " << res_ptr[0] << " != " << (Data) num_items << std::endl; + + return 0; +} diff --git a/src/alpaka/test/alpaka/barrier_fence.cc b/src/alpaka/test/alpaka/barrier_fence.cc new file mode 100644 index 000000000..62ff1286c --- /dev/null +++ b/src/alpaka/test/alpaka/barrier_fence.cc @@ -0,0 +1,121 @@ +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" +#include "AlpakaCore/threadfence.h" + +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +template +struct global_fence { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data *vec, T elements) const { + + auto blockIdxLocal(alpaka::getIdx(acc)[0u]); + int no_blocks = 128; + + for (int i = 0; i < no_blocks*no_blocks*10; i++) { + if (i%no_blocks == (int) blockIdxLocal) { + if (i%no_blocks > 0) { + vec[blockIdxLocal] = vec[blockIdxLocal - 1] + 1; + } + } + cms::alpakatools::threadfence(acc); + } + } +}; + +template +struct shared_fence { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data *vec, T elements) const { + + auto threadIdxLocal(alpaka::getIdx(acc)[0u]); + auto blockIdxLocal(alpaka::getIdx(acc)[0u]); + 
+ auto& s = alpaka::declareSharedVar(acc); + + for (int i = 0; i < 256*256*10; i++) { + if (i%256 == (int) threadIdxLocal && threadIdxLocal > 0) { + s[threadIdxLocal] = s[threadIdxLocal-1] + 1; + } + cms::alpakatools::threadfence(acc); + } + + if (threadIdxLocal == 0) { + vec[blockIdxLocal] = s[127] + s[129]; + } + } +}; + + +int main(void) { + + using Dim = alpaka::DimInt<1u>; + using Data = float; + const Idx num_items = 1<<15; + Idx nThreadsInit = 256; + Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit; + + const DevAcc1 device_1(alpaka::getDevByIdx(0u)); + alpaka::Queue queue_1_0(device_1); + alpaka::Queue queue_1_1(device_1); + + const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(nThreadsInit)); + const Vec1 blocksPerGrid1(Vec1::all(nBlocksInit)); + auto workDivMultiBlockInit1 = + cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1); + + + using DevHost = alpaka::DevCpu; + auto const devHost = alpaka::getDevByIdx(0u); + + using BufHost = alpaka::Buf; + BufHost bufHostA(alpaka::allocBuf(devHost, num_items)); + BufHost res(alpaka::allocBuf(devHost, num_items)); + + Data* const pBufHostA(alpaka::getPtrNative(bufHostA)); + Data* const res_ptr(alpaka::getPtrNative(res)); + + for (Idx i = 0; i < num_items; i++) { + pBufHostA[i] = 0.0; + } + + using BufAcc = alpaka::Buf; + BufAcc order(alpaka::allocBuf(device_1, num_items)); + + printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]); + + // Run on shared memory + alpaka::memcpy(queue_1_0, order, bufHostA, num_items); + auto beginT = std::chrono::high_resolution_clock::now(); + alpaka::enqueue(queue_1_0, alpaka::createTaskKernel(workDivMultiBlockInit1, + shared_fence(), alpaka::getPtrNative(order), num_items)); + alpaka::wait(queue_1_0); + auto endT = std::chrono::high_resolution_clock::now(); + std::cout << "Shared time: " << std::chrono::duration(endT - beginT).count() << " s" + << std::endl; + 
alpaka::memcpy(queue_1_0, res, order, num_items); + for (int i = 0; i < 128; i++) { + if (res_ptr[i] != 256.0) printf("Error1: d[%d] != r (%f, %d)\n", i, res_ptr[i], i); + } + + // Run on global memory + alpaka::memcpy(queue_1_0, order, bufHostA, num_items); + beginT = std::chrono::high_resolution_clock::now(); + alpaka::enqueue(queue_1_0, alpaka::createTaskKernel(workDivMultiBlockInit1, + global_fence(), alpaka::getPtrNative(order), num_items)); + alpaka::wait(queue_1_0); + endT = std::chrono::high_resolution_clock::now(); + std::cout << "Global time: " << std::chrono::duration(endT - beginT).count() << " s" + << std::endl; + alpaka::memcpy(queue_1_0, res, order, num_items); + for (int i = 0; i < 128; i++) + { + if (res_ptr[i] != Data (i)) printf("Error1: d[%d] != r (%f, %d)\n", i, res_ptr[i], i); + } + + + return 0; +} diff --git a/src/alpaka/test/alpaka/barrier_sync.cc b/src/alpaka/test/alpaka/barrier_sync.cc new file mode 100644 index 000000000..6f0a654b7 --- /dev/null +++ b/src/alpaka/test/alpaka/barrier_sync.cc @@ -0,0 +1,65 @@ +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaWorkDivHelper.h" + +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +template +struct check_sync { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data *vec, T elements) const { + + int n = (int) elements; + + auto threadIdxLocal(alpaka::getIdx(acc)[0u]); + for (int i = 0; i < n*n; i++) { + if (i%n == (int) threadIdxLocal) { + for (int j = i; j < 10000; j++) { + if (j % 2 == 0) { + // Do random stuff + int sum = 0; + for (int k = 0; k < 1000; k++) + sum += k; + } + } + } + syncBlockThreads(acc); + } + } +}; + + +int main(void) { + + using Dim = alpaka::DimInt<1u>; + using Data = float; + const Idx num_items = 1<<10; + Idx nThreadsInit = 1024; + Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit; + + const DevAcc1 device_1(alpaka::getDevByIdx(0u)); + alpaka::Queue queue_1_0(device_1); + alpaka::Queue queue_1_1(device_1); + + 
const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(nThreadsInit)); + const Vec1 blocksPerGrid1(Vec1::all(nBlocksInit)); + auto workDivMultiBlockInit1 = + cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1); + + using BufAcc = alpaka::Buf; + BufAcc order(alpaka::allocBuf(device_1, num_items)); + + printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]); + + // Run function + auto beginT = std::chrono::high_resolution_clock::now(); + alpaka::enqueue(queue_1_0, alpaka::createTaskKernel(workDivMultiBlockInit1, + check_sync(), alpaka::getPtrNative(order), nThreadsInit)); + alpaka::wait(queue_1_0); + auto endT = std::chrono::high_resolution_clock::now(); + std::cout << "Time: " << std::chrono::duration(endT - beginT).count() << " s" << std::endl; + + return 0; +} diff --git a/src/cuda/test/atomtest.cu b/src/cuda/test/atomtest.cu new file mode 100644 index 000000000..808098c8a --- /dev/null +++ b/src/cuda/test/atomtest.cu @@ -0,0 +1,139 @@ +#include +#include +#include + +#include +#include +#include + +using Data = float; + +__global__ void shared_block(Data *d, int n) +{ + __shared__ Data s; + int block = blockIdx.x; + if (threadIdx.x == 0) { + s = 0.0; + } + __syncthreads(); + + for (int i = 0; i < 200000; i++) { + atomicAdd(&s, 1.0); + atomicAdd(&s, -1.0); + } + atomicAdd(&s, 1.0); + + __syncthreads(); + + if (threadIdx.x == 0) { + d[block] = s; + } +} + +__global__ void global_block(Data *d, int n) +{ + int block = blockIdx.x; + for (int i = 0; i < 200000; i++) { + atomicAdd(&d[block], 1.0); + atomicAdd(&d[block], -1.0); + } + atomicAdd(&d[block], 1.0); +} + +__global__ void shared_grid(Data *d, int n) +{ + __shared__ Data var; + if (threadIdx.x == 0) { + var=0.0; + } + + __syncthreads(); + + for (int i = 0; i < 200000; i++) { + atomicAdd(&var, 1.0); + atomicAdd(&var, -1.0); + } + atomicAdd(&var, 1.0); + + __syncthreads(); + + if (threadIdx.x == 0) { + atomicAdd(&d[0], var); + } +} 
__global__ void global_grid(Data *d, int n)
{
  // Worst-case contention: every thread in the grid hammers d[0].
  for (int i = 0; i < 200000; i++) {
    atomicAdd(&d[0], 1.0f);  // BUG FIX: float literals, accumulator is float
    atomicAdd(&d[0], -1.0f);
  }
  atomicAdd(&d[0], 1.0f);
}

int main(void)
{
  const int n = 1 << 15;

  const int thr = 256;
  const int blocks = (n + thr - 1) / thr;  // ceil-div
  Data order[n], backup[n];

  printf("Threads/block:%d blocks/grid:%d\n", thr, blocks);

  for (int i = 0; i < n; i++)
    order[i] = 0.0f;

  Data *d_d;
  cudaMalloc(&d_d, n * sizeof(Data));
  struct timeval t1, t2;

  // --- static shared memory, per-block result -----------------------------
  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
  gettimeofday(&t1, 0);
  shared_block<<<blocks, thr>>>(d_d, n);
  cudaDeviceSynchronize();
  gettimeofday(&t2, 0);
  double time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
  printf("Shared block: %f s \n", time);
  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
  for (int i = 0; i < blocks; i++) {
    if (backup[i] != (Data)thr) printf("Error: d[%d] != r (%f, %d)\n", i, backup[i], thr);
  }

  // --- global memory, per-block result ------------------------------------
  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
  gettimeofday(&t1, 0);
  global_block<<<blocks, thr>>>(d_d, n);
  cudaDeviceSynchronize();
  gettimeofday(&t2, 0);
  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
  printf("Global block: %f s \n", time);
  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
  for (int i = 0; i < blocks; i++) {
    if (backup[i] != (Data)thr) printf("Error: d[%d] !=r (%f, %d)\n", i, backup[i], thr);
  }

  // --- shared memory, single grid-wide result -----------------------------
  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
  gettimeofday(&t1, 0);
  shared_grid<<<blocks, thr>>>(d_d, n);
  cudaDeviceSynchronize();
  gettimeofday(&t2, 0);
  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
  printf("Shared grid: %f s \n", time);
  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
  if (backup[0] != (Data)n) printf("Error: d !=r (%f, %d)\n", backup[0], n);

  // --- global memory, single grid-wide result -----------------------------
  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
  gettimeofday(&t1, 0);
  global_grid<<<blocks, thr>>>(d_d, n);
  cudaDeviceSynchronize();
  gettimeofday(&t2, 0);
  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
  printf("Global grid: %f s \n", time);
  cudaMemcpy(backup, d_d, 1 * sizeof(Data), cudaMemcpyDeviceToHost);
  if (backup[0] != (Data)n) printf("Error: d !=r (%f, %d)\n", backup[0], n);

  cudaFree(d_d);  // BUG FIX: release the device allocation
  return 0;
}

// ============================================================================
// src/cuda/test/barrier_fence.cu
//
// CUDA reference for the alpaka barrier_fence benchmark: serialized chain of
// increments published with __threadfence(), through global and shared memory.
// ============================================================================
#include <cstdio>
#include <sys/time.h>

#include <cuda_runtime.h>

using Data = float;

__global__ void global_thread(Data *d, int n)
{
  // On iteration i, block (i % 128) sets d[b] = d[b-1] + 1.
  // Expected after the run: d[b] == b (d was zeroed by the host).
  const int no_blocks = 128;

  for (int i = 0; i < no_blocks * no_blocks * 10; i++) {
    if (i % no_blocks == (int)blockIdx.x && i % no_blocks > 0) {
      d[blockIdx.x] = d[blockIdx.x - 1] + 1;
    }
    __threadfence();
  }
}

__global__ void shared_fence(Data *d, int n)
{
  // Same chain through a block-shared array; after completion s[t] == t,
  // so s[127] + s[129] == 256, which the host checks.
  __shared__ Data s[256];
  int block = blockIdx.x;
  const int no_threads = 256;

  // BUG FIX: shared memory is uninitialised; the chain implicitly relied on
  // s[0] happening to be zero.  Initialise explicitly and barrier once.
  s[threadIdx.x] = 0.0f;
  __syncthreads();

  for (int i = 0; i < no_threads * no_threads * 10; i++) {
    if (i % no_threads == (int)threadIdx.x && threadIdx.x > 0) {
      s[threadIdx.x] = s[threadIdx.x - 1] + 1;
    }
    __threadfence();
  }

  // BUG FIX: __threadfence() is not a barrier — synchronise before thread 0
  // reads slots written by other threads.
  __syncthreads();

  if (threadIdx.x == 0) {
    d[block] = s[127] + s[129];  // 127 + 129 == 256 when the chain completed
  }
}

int main(void)
{
  const int n = 1 << 15;

  const int thr = 256;
  const int blocks = (n + thr - 1) / thr;
  Data order[n], backup[n];

  printf("Threads/block:%d blocks/grid:%d\n", thr, blocks);

  for (int i = 0; i < n; i++)
    order[i] = 0.0f;

  Data *d_d;
  cudaMalloc(&d_d, n * sizeof(Data));
  struct timeval t1, t2;

  // --- chain through shared memory ----------------------------------------
  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
  gettimeofday(&t1, 0);
  shared_fence<<<blocks, thr>>>(d_d, n);
  cudaDeviceSynchronize();
  gettimeofday(&t2, 0);
  auto time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
  printf("Shared time: %f\n", time);
  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 128; i++) {
    // BUG FIX: the message printed n/128; the expected value is 256.
    if (backup[i] != 256.0f) printf("Error1: d[%d] != r (%f, %d)\n", i, backup[i], 256);
  }

  // --- chain through global memory ----------------------------------------
  cudaMemcpy(d_d, order, n * sizeof(Data), cudaMemcpyHostToDevice);
  gettimeofday(&t1, 0);
  global_thread<<<blocks, thr>>>(d_d, n);
  cudaDeviceSynchronize();
  gettimeofday(&t2, 0);
  time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
  printf("Global time: %f\n", time);
  cudaMemcpy(backup, d_d, n * sizeof(Data), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 128; i++) {
    if (backup[i] != Data(i)) printf("Error1: d[%d] != r (%f, %d)\n", i, backup[i], i);
  }

  cudaFree(d_d);  // BUG FIX: release the device allocation
  return 0;
}

// ============================================================================
// src/cuda/test/barrier_sync.cu
//
// Measures __syncthreads() overhead under divergence: one thread at a time
// does busy work while the rest of the block waits at the barrier.
// ============================================================================
#include <cstdio>
#include <sys/time.h>

#include <cuda_runtime.h>

using Data = float;

__device__ void rand_func() {
  // Dummy busy work.
  int sum = 0;
  for (int i = 0; i < 1000; i++)
    sum += i;
  (void)sum;
}

__device__ void iterate(int id) {
  for (int j = id; j < 10000; j++) {
    if (j % 2 == 0) {
      rand_func();
    }
  }
}

__global__ void check_sync(Data *d, int no_threads)
{
  for (int i = 0; i < no_threads * no_threads; i++) {
    if ((i % no_threads) == (int)threadIdx.x) {
      iterate(threadIdx.x);
    }
    __syncthreads();  // reached by every thread — no divergent barrier
  }
}

int main(void)
{
  const int n = 1 << 10;

  const int thr = 1024;
  const int blocks = (n + thr - 1) / thr;

  printf("Threads/block:%d blocks/grid:%d\n", thr, blocks);

  Data *d_d;
  cudaMalloc(&d_d, n * sizeof(Data));
  struct timeval t1, t2;

  gettimeofday(&t1, 0);
  check_sync<<<blocks, thr>>>(d_d, n);
  cudaDeviceSynchronize();
  gettimeofday(&t2, 0);
  auto time = (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0 / 1000.0;
  printf("Time: %f s\n", time);

  cudaFree(d_d);  // BUG FIX: release the device allocation
  return 0;
}