forked from makortel/pixel-standalone
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyzer_kokkos.cc
71 lines (55 loc) · 2.48 KB
/
analyzer_kokkos.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include <chrono>
#include <cstring>

#include <Kokkos_Core.hpp>

#include "analyzer_kokkos.h"
#include "input.h"
#include "output.h"
#include "rawtodigi_kokkos.h"
namespace {
  // Number of iterations of the benchmark loop in kokkos::analyze().
  constexpr int NLOOPS{100};
}  // namespace
// Pick the Kokkos execution space according to which DIGI_KOKKOS_* macro is
// defined. The branches are tested in order, so the first defined macro wins;
// if none of them is defined, KokkosExecSpace is left undeclared.
#if defined(DIGI_KOKKOS_SERIAL)
using KokkosExecSpace = Kokkos::Serial;
#elif defined(DIGI_KOKKOS_OPENMP)
using KokkosExecSpace = Kokkos::OpenMP;
#elif defined(DIGI_KOKKOS_CUDA)
using KokkosExecSpace = Kokkos::Cuda;
#endif
namespace kokkos {

  /// Initializes the Kokkos runtime, forwarding the command-line arguments to it.
  void initialize(int& argc, char** argv) { Kokkos::initialize(argc, argv); }

  /**
   * Runs the raw-to-digi kernel NLOOPS times on KokkosExecSpace and reports the
   * mean wall-clock time of one iteration.
   *
   * Each iteration resets `output`, stages `input` into a Kokkos view, launches
   * kokkos::rawtodigi once per input word, and copies the result back into
   * `output`. The timed region covers the host-to-view copies, the kernel and
   * the copy back; view allocation and the host-side memcpy calls are not timed.
   *
   * @param input      data to convert; `input.wordCounter` sets the number of
   *                   kernel invocations.
   * @param output     overwritten on every iteration; holds the result of the
   *                   last iteration on return.
   * @param totaltime  receives the mean duration, in microseconds, of the
   *                   iterations after the first one (the first is treated as
   *                   warm-up and excluded).
   */
  void analyze(Input const& input, Output& output, double& totaltime) {
    // The first iteration is not counted, so the mean is taken over NLOOPS - 1
    // samples; that divisor must not be zero.
    static_assert(NLOOPS > 1, "analyze() needs at least one timed iteration after the warm-up");

    totaltime = 0.;

    // Loop-invariant; copied into a local so the kernel lambda captures the
    // value rather than a reference to `input`.
    const auto wordCounter = input.wordCounter;

    for (int iLoop = 0; iLoop < NLOOPS; ++iLoop) {
      output = Output();

      // Rather non-idiomatic use of Kokkos::View: each view holds one whole struct.
      Kokkos::View<Input, KokkosExecSpace> input_d{"input_d"};
      Kokkos::View<Input, KokkosExecSpace>::HostMirror input_h = Kokkos::create_mirror_view(input_d);
      std::memcpy(input_h.data(), &input, sizeof(Input));

      Kokkos::View<Output, KokkosExecSpace> output_d{"output_d"};
      Kokkos::View<Output, KokkosExecSpace>::HostMirror output_h = Kokkos::create_mirror_view(output_d);
      // Set up the error container in the host mirror, backed by the err_d
      // member of the execution-space copy. Only the address of err_d is taken
      // here; output_d is not dereferenced on the host.
      output_h.data()->err.construct(pixelgpudetails::MAX_FED_WORDS, output_d.data()->err_d);

      // TODO: could unmanaged views avoid the memcpy above and below?
      //Kokkos::View<const Input, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > input_d{&input};
      //Kokkos::View<Output, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > output_d{&output};

      auto start = std::chrono::high_resolution_clock::now();

      Kokkos::deep_copy(input_d, input_h);
      // The err setup above was written to the host mirror only. When the
      // mirror is a separate allocation from output_d, it has to be copied over
      // before the kernel reads output_d.
      // NOTE(review): assumes err.construct() leaves the struct in a state that
      // is valid to copy bytewise to the execution space - confirm.
      Kokkos::deep_copy(output_d, output_h);

      Kokkos::parallel_for(
          Kokkos::RangePolicy<KokkosExecSpace>(0, wordCounter), KOKKOS_LAMBDA(const size_t iWord) {
            kokkos::rawtodigi(input_d.data(), output_d.data(), wordCounter, true, true, false, iWord);
          });
      // parallel_for may return before the kernel has finished, so wait for it
      // before copying the results back.
      Kokkos::fence();

      Kokkos::deep_copy(output_h, output_d);
      // Conservative barrier so the clock is read only after all work is done.
      Kokkos::fence();

      auto stop = std::chrono::high_resolution_clock::now();

      // Point each copy's error container at the err_d array of that same
      // copy: first the host mirror, then (after the bytewise copy) `output`.
      output_h.data()->err.set_data(output_h.data()->err_d);
      std::memcpy(&output, output_h.data(), sizeof(Output));
      output.err.set_data(output.err_d);

      auto diff = stop - start;
      auto time = std::chrono::duration_cast<std::chrono::microseconds>(diff).count();
      // Skip the first iteration: it is used as warm-up.
      if (iLoop != 0) {
        totaltime += time;
      }
    }

    // Only NLOOPS - 1 iterations contributed to the sum.
    totaltime /= NLOOPS - 1;
  }

  /// Shuts down the Kokkos runtime.
  void finalize() { Kokkos::finalize(); }

}  // namespace kokkos