[alpaka/cuda] tests for atomic and barriers
1 parent 1fabee1 · commit bed0543
Showing 6 changed files with 677 additions and 0 deletions.
@@ -0,0 +1,208 @@
#include <chrono>
#include <cstdio>
#include <iostream>
#include <string>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"

using namespace ALPAKA_ACCELERATOR_NAMESPACE;

template <typename T, typename Data>
struct shared_block {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
    Data b = 1.0;
    Data c = -1.0;

    auto& s = alpaka::declareSharedVar<Data, __COUNTER__>(acc);

    if (threadIdxLocal == 0) {
      s = 0;
    }

    syncBlockThreads(acc);

    for (T index : cms::alpakatools::elements_with_stride<T, T_Acc>(acc, elements)) {
      for (int i = 0; i < 200000; i++) {
        alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
        alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{});
      }
      alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
    }

    syncBlockThreads(acc);

    if (threadIdxLocal == 0) {
      vec[blockIdxInGrid] = s;
    }
  }
};
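// Expected outcome (with the launch configuration used in main() below, one element per
// thread): the 200000 paired +1/-1 atomicAdds cancel and the trailing atomicAdd leaves a
// net +1 per element, so each block's shared counter should end up equal to the number
// of threads per block.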

template <typename T, typename Data>
struct global_block {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
    Data b = 1.0;
    Data c = -1.0;

    for (T index : cms::alpakatools::elements_with_stride<T, T_Acc>(acc, elements)) {
      for (int i = 0; i < 200000; i++) {
        alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{});
        alpaka::atomicAdd(acc, &vec[blockIdxInGrid], c, alpaka::hierarchy::Grids{});
      }
      alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{});
    }
  }
};

template <typename T, typename Data>
struct global_grid {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    Data b = 1.0;
    Data c = -1.0;

    for (T index : cms::alpakatools::elements_with_stride<T, T_Acc>(acc, elements)) {
      for (int i = 0; i < 200000; i++) {
        alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{});  // alpaka::hierarchy::Blocks/Threads/Grids
        alpaka::atomicAdd(acc, &vec[0], c, alpaka::hierarchy::Grids{});
      }
      alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{});
    }
  }
};

template <typename T, typename Data>
struct shared_grid {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
    Data b = 1.0;
    Data c = -1.0;

    auto& s = alpaka::declareSharedVar<Data, __COUNTER__>(acc);

    if (threadIdxLocal == 0) {
      s = 0;
    }

    syncBlockThreads(acc);

    for (T index : cms::alpakatools::elements_with_stride<T, T_Acc>(acc, elements)) {
      for (int i = 0; i < 200000; i++) {
        alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});  // alpaka::hierarchy::Blocks/Threads/Grids
        alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{});
      }
      alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
    }

    syncBlockThreads(acc);

    if (threadIdxLocal == 0) {
      alpaka::atomicAdd(acc, &vec[0], s, alpaka::hierarchy::Grids{});
    }
  }
};
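// Expected outcome for the two grid-level kernels above: every element contributes a net
// +1 to vec[0], so vec[0] should end up equal to the total number of elements
// (num_items), whether the sum is accumulated directly with grid-wide atomics
// (global_grid) or first per block in shared memory and then merged with one grid-wide
// atomicAdd per block (shared_grid).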


int main(void) {
  using Dim = alpaka::DimInt<1u>;
  using Data = float;
  const Idx num_items = 1 << 15;
  Idx nThreadsInit = 256;
  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;

  const DevAcc1 device_1(alpaka::getDevByIdx<PltfAcc1>(0u));
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_0(device_1);
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_1(device_1);

  const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(nThreadsInit));
  const Vec1 blocksPerGrid1(Vec1::all(nBlocksInit));
  auto workDivMultiBlockInit1 =
      cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1);

  using DevHost = alpaka::DevCpu;
  auto const devHost = alpaka::getDevByIdx<DevHost>(0u);

  using BufHost = alpaka::Buf<DevHost, Data, Dim, Idx>;
  BufHost bufHostA(alpaka::allocBuf<Data, Idx>(devHost, num_items));
  BufHost res(alpaka::allocBuf<Data, Idx>(devHost, num_items));

  Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
  Data* const res_ptr(alpaka::getPtrNative(res));

  for (Idx i = 0; i < num_items; i++) {
    pBufHostA[i] = 0.0;
  }

  using BufAcc = alpaka::Buf<DevAcc1, Data, Dim, Idx>;
  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));

  printf("Threads/block:%d blocks/grid:%d\n",
         (int)threadsPerBlockOrElementsPerThread1[0u],
         (int)blocksPerGrid1[0u]);

  // shared_block: block-wide atomics on a per-block shared-memory counter
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  auto beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      shared_block<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  auto endT = std::chrono::high_resolution_clock::now();
  std::cout << "Shared Block: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  for (Idx i = 0; i < nBlocksInit; i++) {
    if (res_ptr[i] != (Data)nThreadsInit)
      std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data)nThreadsInit << std::endl;
  }

  // global_block: atomics on per-block counters in global memory
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      global_block<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  endT = std::chrono::high_resolution_clock::now();
  std::cout << "Global Block: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  for (Idx i = 0; i < nBlocksInit; i++) {
    if (res_ptr[i] != (Data)nThreadsInit)
      std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data)nThreadsInit << std::endl;
  }

  // shared_grid: per-block shared counters merged into vec[0] with grid-wide atomics
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      shared_grid<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  endT = std::chrono::high_resolution_clock::now();
  std::cout << "Shared Grid: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  if (res_ptr[0] != (Data)num_items)
    std::cout << "[0]: " << res_ptr[0] << " != " << (Data)num_items << std::endl;

  // global_grid: grid-wide atomics on a single counter in global memory
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      global_grid<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  endT = std::chrono::high_resolution_clock::now();
  std::cout << "Global Grid: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  if (res_ptr[0] != (Data)num_items)
    std::cout << "[0]: " << res_ptr[0] << " != " << (Data)num_items << std::endl;

  return 0;
}
@@ -0,0 +1,121 @@
#include <chrono>
#include <cstdio>
#include <iostream>
#include <string>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"
#include "AlpakaCore/threadfence.h"

using namespace ALPAKA_ACCELERATOR_NAMESPACE;

template <typename T, typename Data>
struct global_fence {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto blockIdxLocal(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
    int no_blocks = 128;

    for (int i = 0; i < no_blocks * no_blocks * 10; i++) {
      if (i % no_blocks == (int)blockIdxLocal) {
        if (i % no_blocks > 0) {
          vec[blockIdxLocal] = vec[blockIdxLocal - 1] + 1;
        }
      }
      cms::alpakatools::threadfence(acc);
    }
  }
};
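// Expected outcome (the buffer is zero-initialized on the host before launch): each pass
// lets one block extend the chain vec[b] = vec[b-1] + 1, and the threadfence is meant to
// make the previous slot's value visible before the next block reads it, so after enough
// passes vec[i] should equal i for the first no_blocks entries.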

template <typename T, typename Data>
struct shared_fence {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
    auto blockIdxLocal(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);

    auto& s = alpaka::declareSharedVar<Data[256], __COUNTER__>(acc);

    // Shared memory is not guaranteed to be zero-initialized, so clear it before
    // building the chain s[t] = s[t-1] + 1 (the kernel is launched with 256 threads).
    s[threadIdxLocal] = 0;
    syncBlockThreads(acc);

    for (int i = 0; i < 256 * 256 * 10; i++) {
      if (i % 256 == (int)threadIdxLocal && threadIdxLocal > 0) {
        s[threadIdxLocal] = s[threadIdxLocal - 1] + 1;
      }
      cms::alpakatools::threadfence(acc);
    }

    if (threadIdxLocal == 0) {
      vec[blockIdxLocal] = s[127] + s[129];
    }
  }
};
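// Expected outcome: once the in-block chain has fully propagated, s[t] == t for t > 0,
// so s[127] + s[129] == 256, which is the value the host checks for every block.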


int main(void) {
  using Dim = alpaka::DimInt<1u>;
  using Data = float;
  const Idx num_items = 1 << 15;
  Idx nThreadsInit = 256;
  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;

  const DevAcc1 device_1(alpaka::getDevByIdx<PltfAcc1>(0u));
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_0(device_1);
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_1(device_1);

  const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(nThreadsInit));
  const Vec1 blocksPerGrid1(Vec1::all(nBlocksInit));
  auto workDivMultiBlockInit1 =
      cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1);

  using DevHost = alpaka::DevCpu;
  auto const devHost = alpaka::getDevByIdx<DevHost>(0u);

  using BufHost = alpaka::Buf<DevHost, Data, Dim, Idx>;
  BufHost bufHostA(alpaka::allocBuf<Data, Idx>(devHost, num_items));
  BufHost res(alpaka::allocBuf<Data, Idx>(devHost, num_items));

  Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
  Data* const res_ptr(alpaka::getPtrNative(res));

  for (Idx i = 0; i < num_items; i++) {
    pBufHostA[i] = 0.0;
  }

  using BufAcc = alpaka::Buf<DevAcc1, Data, Dim, Idx>;
  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));

  printf("Threads/block:%d blocks/grid:%d\n",
         (int)threadsPerBlockOrElementsPerThread1[0u],
         (int)blocksPerGrid1[0u]);

  // shared_fence: propagate a chain through a shared-memory array
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  auto beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      shared_fence<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  auto endT = std::chrono::high_resolution_clock::now();
  std::cout << "Shared time: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  for (int i = 0; i < 128; i++) {
    if (res_ptr[i] != 256.0)
      printf("Error (shared_fence): res[%d] = %f, expected 256\n", i, res_ptr[i]);
  }

  // global_fence: propagate a chain through global memory, one block per slot
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      global_fence<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  endT = std::chrono::high_resolution_clock::now();
  std::cout << "Global time: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  for (int i = 0; i < 128; i++) {
    if (res_ptr[i] != Data(i))
      printf("Error (global_fence): res[%d] = %f, expected %d\n", i, res_ptr[i], i);
  }

  return 0;
}
@@ -0,0 +1,65 @@
#include <chrono>
#include <cstdio>
#include <iostream>
#include <string>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"

using namespace ALPAKA_ACCELERATOR_NAMESPACE;

template <typename T, typename Data>
struct check_sync {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    int n = (int)elements;

    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
    for (int i = 0; i < n * n; i++) {
      if (i % n == (int)threadIdxLocal) {
        for (int j = i; j < 10000; j++) {
          if (j % 2 == 0) {
            // Busy work for the selected thread; volatile keeps the compiler from
            // optimizing the loop away.
            volatile int sum = 0;
            for (int k = 0; k < 1000; k++)
              sum += k;
          }
        }
      }
      syncBlockThreads(acc);
    }
  }
};
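// Note: this kernel only stresses the block-wide barrier. On each outer iteration a
// single thread does the busy work while the other threads wait at syncBlockThreads;
// the host side only reports the elapsed time and does not verify a result.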


int main(void) {
  using Dim = alpaka::DimInt<1u>;
  using Data = float;
  const Idx num_items = 1 << 10;
  Idx nThreadsInit = 1024;
  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;

  const DevAcc1 device_1(alpaka::getDevByIdx<PltfAcc1>(0u));
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_0(device_1);
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_1(device_1);

  const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(nThreadsInit));
  const Vec1 blocksPerGrid1(Vec1::all(nBlocksInit));
  auto workDivMultiBlockInit1 =
      cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1);

  using BufAcc = alpaka::Buf<DevAcc1, Data, Dim, Idx>;
  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));

  printf("Threads/block:%d blocks/grid:%d\n",
         (int)threadsPerBlockOrElementsPerThread1[0u],
         (int)blocksPerGrid1[0u]);

  // Launch the barrier stress test and report the elapsed time
  auto beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      check_sync<Idx, Data>(), alpaka::getPtrNative(order), nThreadsInit));
  alpaka::wait(queue_1_0);
  auto endT = std::chrono::high_resolution_clock::now();
  std::cout << "Time: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;

  return 0;
}