[alpaka/cuda] tests for atomic and barriers
antoniopetre committed Sep 10, 2021
1 parent 1fabee1 commit bed0543
Showing 6 changed files with 677 additions and 0 deletions.
208 changes: 208 additions & 0 deletions src/alpaka/test/alpaka/atomtest.cc
@@ -0,0 +1,208 @@
#include <chrono>
#include <iostream>
#include <string>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"

using namespace ALPAKA_ACCELERATOR_NAMESPACE;

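// Each thread adds +1 and -1 to a block-shared counter 200000 times, then one
// net +1 per element, so each block's result should equal its thread count.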
template <typename T, typename Data>
struct shared_block {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
    Data b = 1.0;
    Data c = -1.0;

    auto& s = alpaka::declareSharedVar<Data, __COUNTER__>(acc);

    if (threadIdxLocal == 0) {
      s = 0;
    }

    syncBlockThreads(acc);

    for (T index : cms::alpakatools::elements_with_stride<T, T_Acc>(acc, elements)) {
      for (int i = 0; i < 200000; i++) {
        alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
        alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{});
      }
      alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
    }

    syncBlockThreads(acc);

    if (threadIdxLocal == 0) {
      vec[blockIdxInGrid] = s;
    }
  }
};

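// Same +1/-1 ping-pong, but on this block's slot in global memory, using
// grid-level atomic ordering.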
template <typename T, typename Data>
struct global_block {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
    Data b = 1.0;
    Data c = -1.0;

    for (T index : cms::alpakatools::elements_with_stride<T, T_Acc>(acc, elements)) {
      for (int i = 0; i < 200000; i++) {
        alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{});
        alpaka::atomicAdd(acc, &vec[blockIdxInGrid], c, alpaka::hierarchy::Grids{});
      }
      alpaka::atomicAdd(acc, &vec[blockIdxInGrid], b, alpaka::hierarchy::Grids{});
    }
  }
};

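// Every thread in the grid hammers the single counter vec[0]; the net result
// should equal the total number of elements.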
template <typename T, typename Data>
struct global_grid {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    Data b = 1.0;
    Data c = -1.0;

    for (T index : cms::alpakatools::elements_with_stride<T, T_Acc>(acc, elements)) {
      for (int i = 0; i < 200000; i++) {
        alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{});  // alpaka::hierarchy::Blocks/Threads/Grids
        alpaka::atomicAdd(acc, &vec[0], c, alpaka::hierarchy::Grids{});
      }
      alpaka::atomicAdd(acc, &vec[0], b, alpaka::hierarchy::Grids{});
    }
  }
};

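// Block-local accumulation in shared memory, followed by one grid-level
// atomic add of each block's partial sum into vec[0].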
template <typename T, typename Data>
struct shared_grid {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
    Data b = 1.0;
    Data c = -1.0;

    auto& s = alpaka::declareSharedVar<Data, __COUNTER__>(acc);

    if (threadIdxLocal == 0) {
      s = 0;
    }

    syncBlockThreads(acc);

    for (T index : cms::alpakatools::elements_with_stride<T, T_Acc>(acc, elements)) {
      for (int i = 0; i < 200000; i++) {
        alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});  // alpaka::hierarchy::Blocks/Threads/Grids
        alpaka::atomicAdd(acc, &s, c, alpaka::hierarchy::Blocks{});
      }
      alpaka::atomicAdd(acc, &s, b, alpaka::hierarchy::Blocks{});
    }

    syncBlockThreads(acc);

    if (threadIdxLocal == 0) {
      alpaka::atomicAdd(acc, &vec[0], s, alpaka::hierarchy::Grids{});
    }
  }
};


int main(void) {

  using Dim = alpaka::DimInt<1u>;
  using Data = float;
  const Idx num_items = 1 << 15;
  Idx nThreadsInit = 256;
  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
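  // 1 << 15 elements at 256 threads per block gives 128 blocks.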

  const DevAcc1 device_1(alpaka::getDevByIdx<PltfAcc1>(0u));
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_0(device_1);
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_1(device_1);

  const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(nThreadsInit));
  const Vec1 blocksPerGrid1(Vec1::all(nBlocksInit));
  auto workDivMultiBlockInit1 =
      cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1);


  using DevHost = alpaka::DevCpu;
  auto const devHost = alpaka::getDevByIdx<DevHost>(0u);

  using BufHost = alpaka::Buf<DevHost, Data, Dim, Idx>;
  BufHost bufHostA(alpaka::allocBuf<Data, Idx>(devHost, num_items));
  BufHost res(alpaka::allocBuf<Data, Idx>(devHost, num_items));

  Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
  Data* const res_ptr(alpaka::getPtrNative(res));

  for (Idx i = 0; i < num_items; i++) {
    pBufHostA[i] = 0.0;
  }

  using BufAcc = alpaka::Buf<DevAcc1, Data, Dim, Idx>;
  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));


printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);

  // Run shared_block: block-level atomics on shared memory
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  auto beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      shared_block<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  auto endT = std::chrono::high_resolution_clock::now();
  std::cout << "Shared Block: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  for (Idx i = 0; i < nBlocksInit; i++) {
    if (res_ptr[i] != (Data) nThreadsInit)
      std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data) nThreadsInit << std::endl;
  }

  // Run global_block: grid-level atomics on per-block global-memory slots
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      global_block<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  endT = std::chrono::high_resolution_clock::now();
  std::cout << "Global Block: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  for (Idx i = 0; i < nBlocksInit; i++) {
    if (res_ptr[i] != (Data) nThreadsInit)
      std::cout << "[" << i << "]: " << res_ptr[i] << " != " << (Data) nThreadsInit << std::endl;
  }

  // Run shared_grid: block-local sums in shared memory, combined into vec[0]
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      shared_grid<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  endT = std::chrono::high_resolution_clock::now();
  std::cout << "Shared Grid: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  if (res_ptr[0] != (Data) num_items)
    std::cout << "[0]: " << res_ptr[0] << " != " << (Data) num_items << std::endl;

  // Run global_grid: grid-level atomics on the single counter vec[0]
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      global_grid<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  endT = std::chrono::high_resolution_clock::now();
  std::cout << "Global Grid: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  if (res_ptr[0] != (Data) num_items)
    std::cout << "[0]: " << res_ptr[0] << " != " << (Data) num_items << std::endl;

  return 0;
}
121 changes: 121 additions & 0 deletions src/alpaka/test/alpaka/barrier_fence.cc
@@ -0,0 +1,121 @@
#include <chrono>
#include <iostream>
#include <string>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"
#include "AlpakaCore/threadfence.h"

using namespace ALPAKA_ACCELERATOR_NAMESPACE;

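// Blocks take turns (round-robin on the loop index) extending a running count
// in global memory, with a grid-wide fence after every iteration, so block i
// should end up holding the value i.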
template <typename T, typename Data>
struct global_fence {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto blockIdxLocal(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
    int no_blocks = 128;

    for (int i = 0; i < no_blocks * no_blocks * 10; i++) {
      if (i % no_blocks == (int) blockIdxLocal) {
        if (i % no_blocks > 0) {
          vec[blockIdxLocal] = vec[blockIdxLocal - 1] + 1;
        }
      }
      cms::alpakatools::threadfence(acc);
    }
  }
};

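// Threads in a block take turns extending a running count in shared memory,
// with a fence after every iteration; thread 0 then publishes s[127] + s[129].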
template <typename T, typename Data>
struct shared_fence {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
    auto blockIdxLocal(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);

    auto& s = alpaka::declareSharedVar<Data[256], __COUNTER__>(acc);
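    // Note: s[0] is never written below, so the expected result
    // (127 + 129 = 256) assumes the shared array starts zeroed,
    // which alpaka does not guarantee.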

    for (int i = 0; i < 256 * 256 * 10; i++) {
      if (i % 256 == (int) threadIdxLocal && threadIdxLocal > 0) {
        s[threadIdxLocal] = s[threadIdxLocal - 1] + 1;
      }
      cms::alpakatools::threadfence(acc);
    }

    if (threadIdxLocal == 0) {
      vec[blockIdxLocal] = s[127] + s[129];
    }
  }
};


int main(void) {

  using Dim = alpaka::DimInt<1u>;
  using Data = float;
  const Idx num_items = 1 << 15;
  Idx nThreadsInit = 256;
  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
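  // 1 << 15 elements at 256 threads per block gives the 128 blocks that
  // global_fence hard-codes as no_blocks.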

  const DevAcc1 device_1(alpaka::getDevByIdx<PltfAcc1>(0u));
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_0(device_1);
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_1(device_1);

  const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(nThreadsInit));
  const Vec1 blocksPerGrid1(Vec1::all(nBlocksInit));
  auto workDivMultiBlockInit1 =
      cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1);


  using DevHost = alpaka::DevCpu;
  auto const devHost = alpaka::getDevByIdx<DevHost>(0u);

  using BufHost = alpaka::Buf<DevHost, Data, Dim, Idx>;
  BufHost bufHostA(alpaka::allocBuf<Data, Idx>(devHost, num_items));
  BufHost res(alpaka::allocBuf<Data, Idx>(devHost, num_items));

  Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
  Data* const res_ptr(alpaka::getPtrNative(res));

  for (Idx i = 0; i < num_items; i++) {
    pBufHostA[i] = 0.0;
  }

  using BufAcc = alpaka::Buf<DevAcc1, Data, Dim, Idx>;
  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));

printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);

  // Run shared_fence: fences over block-shared memory
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  auto beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      shared_fence<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  auto endT = std::chrono::high_resolution_clock::now();
  std::cout << "Shared time: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  for (int i = 0; i < 128; i++) {
    if (res_ptr[i] != 256.0)
      printf("shared_fence error: res[%d] = %f, expected 256\n", i, res_ptr[i]);
  }

  // Run global_fence: fences over global memory
  alpaka::memcpy(queue_1_0, order, bufHostA, num_items);
  beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      global_fence<Idx, Data>(), alpaka::getPtrNative(order), num_items));
  alpaka::wait(queue_1_0);
  endT = std::chrono::high_resolution_clock::now();
  std::cout << "Global time: " << std::chrono::duration<double>(endT - beginT).count() << " s"
            << std::endl;
  alpaka::memcpy(queue_1_0, res, order, num_items);
  for (int i = 0; i < 128; i++) {
    if (res_ptr[i] != Data(i))
      printf("global_fence error: res[%d] = %f, expected %d\n", i, res_ptr[i], i);
  }


  return 0;
}
65 changes: 65 additions & 0 deletions src/alpaka/test/alpaka/barrier_sync.cc
@@ -0,0 +1,65 @@
#include <chrono>
#include <iostream>
#include <string>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaWorkDivHelper.h"

using namespace ALPAKA_ACCELERATOR_NAMESPACE;

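// Stress test for block-level barriers: every thread reaches syncBlockThreads
// the same number of times, but with uneven amounts of busy work in between.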
template <typename T, typename Data>
struct check_sync {
  template <typename T_Acc>
  ALPAKA_FN_ACC void operator()(const T_Acc& acc, Data* vec, T elements) const {
    int n = (int) elements;

    auto threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
    for (int i = 0; i < n * n; i++) {
      if (i % n == (int) threadIdxLocal) {
        for (int j = i; j < 10000; j++) {
          if (j % 2 == 0) {
            // throwaway arithmetic to keep one thread busy before the barrier
            int sum = 0;
            for (int k = 0; k < 1000; k++)
              sum += k;
          }
        }
      }
      syncBlockThreads(acc);
    }
  }
};


int main(void) {

  using Dim = alpaka::DimInt<1u>;
  using Data = float;
  const Idx num_items = 1 << 10;
  Idx nThreadsInit = 1024;
  Idx nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit;
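  // 1 << 10 elements at 1024 threads per block gives a single block.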

  const DevAcc1 device_1(alpaka::getDevByIdx<PltfAcc1>(0u));
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_0(device_1);
  alpaka::Queue<DevAcc1, alpaka::Blocking> queue_1_1(device_1);

  const Vec1 threadsPerBlockOrElementsPerThread1(Vec1::all(nThreadsInit));
  const Vec1 blocksPerGrid1(Vec1::all(nBlocksInit));
  auto workDivMultiBlockInit1 =
      cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1);

  using BufAcc = alpaka::Buf<DevAcc1, Data, Dim, Idx>;
  BufAcc order(alpaka::allocBuf<Data, Idx>(device_1, num_items));

printf("Threads/block:%d blocks/grid:%d\n", threadsPerBlockOrElementsPerThread1[0u], blocksPerGrid1[0u]);

  // Run the barrier stress test
  auto beginT = std::chrono::high_resolution_clock::now();
  alpaka::enqueue(queue_1_0, alpaka::createTaskKernel<Acc1>(workDivMultiBlockInit1,
      check_sync<Idx, Data>(), alpaka::getPtrNative(order), nThreadsInit));
  alpaka::wait(queue_1_0);
  auto endT = std::chrono::high_resolution_clock::now();
  std::cout << "Time: " << std::chrono::duration<double>(endT - beginT).count() << " s" << std::endl;

  return 0;
}