forked from cms-patatrack/pixeltrack-standalone
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBrokenLineFitOnGPU.cu
85 lines (74 loc) · 4.7 KB
/
BrokenLineFitOnGPU.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#include "BrokenLineFitOnGPU.h"
#include "CUDACore/device_unique_ptr.h"
// Enqueues the broken-line fit kernels on `stream` for tuples with 3, 4 and 5 hits.
//
// The work is issued in chunks: the offset loop advances by maxNumberOfConcurrentFits_
// until it reaches maxNumberOfTuples. For every chunk and every hit multiplicity a
// kernelBLFastFit launch is followed by the matching kernelBLFit launch; both receive
// the same three scratch device buffers, which are allocated once at the top of this
// function.
//
// Parameters:
//   hv                 hits view, forwarded unchanged to kernelBLFastFit
//   hitsInFit          not read by this function; kept so the signature stays unchanged
//   maxNumberOfTuples  exclusive upper bound of the chunk offset loop
//   stream             CUDA stream passed to the scratch allocations and to every launch
//
// Each launch is followed by cudaCheck(cudaGetLastError()) so that a bad launch
// configuration is reported right away. This function itself issues no explicit
// stream or device synchronization.
void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
                                            uint32_t hitsInFit,
                                            uint32_t maxNumberOfTuples,
                                            cudaStream_t stream) {
  assert(tuples_d);
  // A zero chunk size would produce an empty grid and an offset loop that never advances.
  assert(maxNumberOfConcurrentFits_ > 0);

  auto blockSize = 64;
  auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;

  // Quads and pentas are launched on a quarter of the triplet grid. The integer division
  // truncates to 0 when numberOfBlocks < 4, and a grid of 0 blocks is an invalid launch
  // configuration, so keep at least one block.
  auto reducedNumberOfBlocks = numberOfBlocks / 4;
  if (reducedNumberOfBlocks == 0) {
    reducedNumberOfBlocks = 1;
  }

  // Fit internals
  //
  // The N=5 kernels are only launched when fit5as4_ is false; in that case size the hit
  // buffers for 5 hits per fit instead of 4. When fit5as4_ is true the sizes are exactly
  // the 4-hit sizes.
  // NOTE(review): assumes the kernels' per-fit storage grows with N the way
  // Rfit::Matrix3xNd<N> does, and that Rfit::Matrix6x4f holds 6 floats per hit of a
  // 4-hit fit - confirm against the kernel indexing.
  const bool needFiveHitStorage = !fit5as4_;
  const auto hitsBytesPerFit = needFiveHitStorage ? sizeof(Rfit::Matrix3xNd<5>) : sizeof(Rfit::Matrix3xNd<4>);
  const auto hitsGeBytesPerFit = needFiveHitStorage ? sizeof(Rfit::Matrix6x4f) / 4 * 5 : sizeof(Rfit::Matrix6x4f);

  auto hitsGPU_ =
      cms::cuda::make_device_unique<double[]>(maxNumberOfConcurrentFits_ * hitsBytesPerFit / sizeof(double), stream);
  auto hits_geGPU_ =
      cms::cuda::make_device_unique<float[]>(maxNumberOfConcurrentFits_ * hitsGeBytesPerFit / sizeof(float), stream);
  auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique<double[]>(
      maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream);

  for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
    // fit triplets
    kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(
        tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset);
    cudaCheck(cudaGetLastError());

    kernelBLFit<3><<<numberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_d,
                                                             bField_,
                                                             outputSoa_d,
                                                             hitsGPU_.get(),
                                                             hits_geGPU_.get(),
                                                             fast_fit_resultsGPU_.get(),
                                                             3,
                                                             offset);
    cudaCheck(cudaGetLastError());

    // fit quads
    kernelBLFastFit<4><<<reducedNumberOfBlocks, blockSize, 0, stream>>>(
        tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset);
    cudaCheck(cudaGetLastError());

    kernelBLFit<4><<<reducedNumberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_d,
                                                                    bField_,
                                                                    outputSoa_d,
                                                                    hitsGPU_.get(),
                                                                    hits_geGPU_.get(),
                                                                    fast_fit_resultsGPU_.get(),
                                                                    4,
                                                                    offset);
    cudaCheck(cudaGetLastError());

    if (fit5as4_) {
      // fit penta (only first 4): 4-hit kernels, run on the 5-hit multiplicity
      kernelBLFastFit<4><<<reducedNumberOfBlocks, blockSize, 0, stream>>>(
          tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
      cudaCheck(cudaGetLastError());

      kernelBLFit<4><<<reducedNumberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_d,
                                                                      bField_,
                                                                      outputSoa_d,
                                                                      hitsGPU_.get(),
                                                                      hits_geGPU_.get(),
                                                                      fast_fit_resultsGPU_.get(),
                                                                      5,
                                                                      offset);
      cudaCheck(cudaGetLastError());
    } else {
      // fit penta (all 5)
      kernelBLFastFit<5><<<reducedNumberOfBlocks, blockSize, 0, stream>>>(
          tuples_d, tupleMultiplicity_d, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset);
      cudaCheck(cudaGetLastError());

      kernelBLFit<5><<<reducedNumberOfBlocks, blockSize, 0, stream>>>(tupleMultiplicity_d,
                                                                      bField_,
                                                                      outputSoa_d,
                                                                      hitsGPU_.get(),
                                                                      hits_geGPU_.get(),
                                                                      fast_fit_resultsGPU_.get(),
                                                                      5,
                                                                      offset);
      cudaCheck(cudaGetLastError());
    }
  }  // loop on concurrent fits
}