-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkernel_skeleton_sm.cu
175 lines (146 loc) · 7.58 KB
/
kernel_skeleton_sm.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#include "kseq/kseq.h"
#include "common.h"
#define SAMPLE_MAX_LEN 200000
#define SIGNATURE_MAX_LEN 10000
// SAMPLE_SMEM_SIZE = SAMPLE_MAX_LEN / 2 + SIGNATURE_MAX_LEN
#define SAMPLE_SMEM_SIZE 110000
void check_cuda_errors()
{
cudaError_t rc;
rc = cudaGetLastError();
if (rc != cudaSuccess)
{
printf("Last CUDA error %s\n", cudaGetErrorString(rc));
}
}
__global__ void getMatches(const char* __restrict d_sample_seq, const char* __restrict d_sample_qual, const char* __restrict d_signature_seq,
double* __restrict match_scores, unsigned short* __restrict match_idx, int* __restrict match_count) {
// Half of sample so it will always fit in shared memory; A size of SIGNATURE_MAX_LEN needs to be copied twice
extern __shared__ char s_sample[];
const char *sample = &d_sample_seq[SAMPLE_MAX_LEN * blockIdx.x];
const char *signature = &d_signature_seq[SIGNATURE_MAX_LEN * threadIdx.x];
// Copy 1st half + SIGNATURE_MAX_LEN of sample into shared memory
int num_per_thread = (SAMPLE_SMEM_SIZE + blockDim.x - 1) / blockDim.x; // Ceiling
if (threadIdx.x == blockDim.x - 1) { // Last thread
memcpy(&s_sample[threadIdx.x * num_per_thread], &sample[threadIdx.x * num_per_thread],
SAMPLE_SMEM_SIZE - num_per_thread * threadIdx.x);
} else {
memcpy(&s_sample[threadIdx.x * num_per_thread], &sample[threadIdx.x * num_per_thread],
num_per_thread);
}
// if (threadIdx.x == 0) {
// memcpy(s_sample, sample, SAMPLE_SMEM_SIZE);
// }
__syncthreads();
for (int i = 0; i < (SAMPLE_MAX_LEN >> 1); ++i) {
if (!s_sample[i]) { // Sample ended; no match
return;
}
for (int j = 0; j < SIGNATURE_MAX_LEN; ++j) {
// TODO: FIX WHEN SIGNATURE == MAX LEN
if (!signature[j]) { // Signature ended; found match
const char *qual = &d_sample_qual[SAMPLE_MAX_LEN * blockIdx.x + i];
int tot = 0;
for (int s = 0; s < j; ++s) {
tot += qual[s] - 33;
}
int idx = atomicAdd(match_count, 1); // Each idx will be unique (no need synchronisation)
match_scores[idx] = (double)tot / j;
match_idx[idx << 1] = blockIdx.x; // Sample idx
match_idx[(idx << 1) + 1] = threadIdx.x; // Signature idx
return;
} else if (!s_sample[i + j] || s_sample[i + j] != 'N' && signature[j] != 'N' &&
s_sample[i + j] != signature[j]) { // No match
break;
}
}
}
// Copy 2nd half of sample into shared memory
num_per_thread = ((SAMPLE_MAX_LEN >> 1) + blockDim.x - 1) / blockDim.x; // Ceiling
if (threadIdx.x == blockDim.x - 1) { // Last thread
memcpy(&s_sample[threadIdx.x * num_per_thread], &sample[(SAMPLE_MAX_LEN >> 1) + threadIdx.x * num_per_thread],
(SAMPLE_MAX_LEN >> 1) - num_per_thread * threadIdx.x);
s_sample[(SAMPLE_MAX_LEN >> 1)] = 0; // Set next char to '\0' to indicate end
} else {
memcpy(&s_sample[threadIdx.x * num_per_thread], &sample[(SAMPLE_MAX_LEN >> 1) + threadIdx.x * num_per_thread],
num_per_thread);
}
__syncthreads();
for (int i = 0; i < (SAMPLE_MAX_LEN >> 1); ++i) {
if (!s_sample[i]) { // Sample ended; no match
return;
}
for (int j = 0; j < SIGNATURE_MAX_LEN; ++j) {
// TODO: FIX WHEN SIGNATURE == MAX LEN
if (!signature[j]) { // Signature ended; found match
const char *qual = &d_sample_qual[SAMPLE_MAX_LEN * blockIdx.x + i];
int tot = 0;
for (int s = 0; s < j; ++s) {
tot += qual[s] - 33;
}
int idx = atomicAdd(match_count, 1); // Each idx will be unique (no need synchronisation)
match_scores[idx] = (double)tot / j;
match_idx[idx << 1] = blockIdx.x; // Sample idx
match_idx[(idx << 1) + 1] = threadIdx.x; // Signature idx
return;
} else if (!s_sample[i + j] || s_sample[i + j] != 'N' && signature[j] != 'N' &&
s_sample[i + j] != signature[j]) { // No match
break;
}
}
}
}
void runMatcher(const std::vector<klibpp::KSeq>& samples, const std::vector<klibpp::KSeq>& signatures, std::vector<MatchResult>& matches) {
int sample_arr_len = samples.size() * SAMPLE_MAX_LEN;
int signature_arr_len = signatures.size() * SIGNATURE_MAX_LEN;
// calloc() to initialise all to '\0'; set to max possible length for each type.
char *h_sample_seq = (char*)calloc(sample_arr_len, sizeof(char));
char *h_sample_qual = (char*)calloc(sample_arr_len, sizeof(char));
char *h_signature_seq = (char*)calloc(signature_arr_len, sizeof(char));
// Device variables
char *d_sample_seq, *d_sample_qual, *d_signature_seq;
int *match_count; // 1 int storing the number of matches
// Pinned memory in host
double *match_scores; // [Score_1, Score2, ...]
unsigned short *match_idx; // [Samp_idx_1, Sig_idx_1, Samp_idx_2, Sig_idx_2, ...]
// Combine each string array into 1 for transfer
for (int i = 0; i < samples.size(); ++i) {
memcpy(&h_sample_seq[i * SAMPLE_MAX_LEN], &samples[i].seq[0], samples[i].seq.size());
memcpy(&h_sample_qual[i * SAMPLE_MAX_LEN], &samples[i].qual[0], samples[i].qual.size());
}
for (int i = 0; i < signatures.size(); ++i) {
memcpy(&h_signature_seq[i * SIGNATURE_MAX_LEN], &signatures[i].seq[0], signatures[i].seq.size());
}
// Use cudaMallocHost() to write straight to host, since only a small number of matches should be found
cudaMallocHost(&match_scores, sizeof(double) * samples.size() * signatures.size()); // Max possible number of matches
cudaMallocHost(&match_idx, sizeof(unsigned short) * 2 * samples.size() * signatures.size());
// Test out zero-copy memory
// cudaHostAlloc(&match_scores, sizeof(double) * samples.size() * signatures.size(), cudaHostAllocMapped); // Max possible number of matches
// cudaHostAlloc(&match_idx, sizeof(unsigned short) * 2 * samples.size() * signatures.size(), cudaHostAllocMapped);
cudaMalloc(&match_count, sizeof(int));
cudaMemset(match_count, 0, 1); // Initialise match_count to 0
cudaMalloc(&d_sample_seq, sample_arr_len);
cudaMalloc(&d_sample_qual, sample_arr_len);
cudaMalloc(&d_signature_seq, signature_arr_len);
cudaMemcpy(d_sample_seq, h_sample_seq, sample_arr_len, cudaMemcpyHostToDevice);
cudaMemcpy(d_sample_qual, h_sample_qual, sample_arr_len, cudaMemcpyHostToDevice);
cudaMemcpy(d_signature_seq, h_signature_seq, signature_arr_len, cudaMemcpyHostToDevice);
cudaFuncSetAttribute(getMatches, cudaFuncAttributeMaxDynamicSharedMemorySize, SAMPLE_SMEM_SIZE);
// 1 Sample per block; Each thread in block corresponds to 1 signature
getMatches<<<samples.size(), signatures.size(), SAMPLE_SMEM_SIZE>>>(d_sample_seq, d_sample_qual, d_signature_seq, match_scores, match_idx, match_count);
check_cuda_errors();
cudaDeviceSynchronize();
int h_match_count;
cudaMemcpy(&h_match_count, match_count, sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < h_match_count; ++i) {
matches.emplace_back(MatchResult(samples[match_idx[i << 1]].name,
signatures[match_idx[(i << 1) + 1]].name,
match_scores[i]));
}
cudaFree(d_sample_seq);
cudaFree(d_sample_qual);
cudaFree(d_signature_seq);
free(h_sample_seq);
free(h_sample_qual);
free(h_signature_seq);
}