Skip to content

Commit

Permalink
Launch config can now be specified by option name launch-config for k…
Browse files Browse the repository at this point in the history
…opiemtu, query and display device SMX count and warn if the launch config doesn't use a multiple of SMX count for block count, 4-way split of core phase 2 for devices that support the shuffle instruction (compute 3.0+)
  • Loading branch information
tsiv committed Jul 24, 2014
1 parent 86fba10 commit 96b2ced
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 18 deletions.
13 changes: 8 additions & 5 deletions cpu-miner.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ uint16_t opt_vote = 9999;
static int num_processors;
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
char *device_name[8]; // CB
int device_mpcount[8];
int device_bfactor[8];
int device_bsleep[8];
int device_config[8][2];
Expand Down Expand Up @@ -332,6 +333,7 @@ static struct option const options[] = {
{ "devices", 1, NULL, 'd' },
{ "diff", 1, NULL, 'f' },
{ "launch", 1, NULL, 'l' },
{ "launch-config", 1, NULL, 'l' },
{ "bfactor", 1, NULL, 1008 },
{ "bsleep", 1, NULL, 1009 },
{ 0, 0, 0, 0 }
Expand Down Expand Up @@ -1110,11 +1112,12 @@ static void *miner_thread(void *userdata)
affine_to_cpu(thr_id, thr_id % num_processors);
}

if( opt_algo == ALGO_CRYPTONIGHT ) {

applog(LOG_INFO, "GPU #%d: %s, using %d blocks of %d threads",
device_map[thr_id], device_name[thr_id], device_config[thr_id][0], device_config[thr_id][1]);
}
applog(LOG_INFO, "GPU #%d: %s (%d SMX), using %d blocks of %d threads",
device_map[thr_id], device_name[thr_id], device_mpcount[thr_id], device_config[thr_id][0], device_config[thr_id][1]);

if( device_config[thr_id][0] % device_mpcount[thr_id] )
applog(LOG_INFO, "GPU #%d: Warning: block count %d is not a multiple of SMX count %d.",
device_map[thr_id], device_config[thr_id][0], device_mpcount[thr_id]);

uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + (jsonrpc_2 ? 39 : 76));

Expand Down
2 changes: 2 additions & 0 deletions cryptonight/cryptonight.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ extern "C"
}

extern char *device_name[8];
extern int device_mpcount[8];
extern int device_map[8];
extern int device_config[8][2];

Expand Down Expand Up @@ -61,6 +62,7 @@ extern "C" void cuda_devicenames()
cudaGetDeviceProperties(&props, device_map[i]);

device_name[i] = strdup(props.name);
device_mpcount[i] = props.multiProcessorCount;
}
}

Expand Down
121 changes: 108 additions & 13 deletions cryptonight/cuda_cryptonight_core.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,34 +59,125 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state,
}
}

__global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int partidx, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
__global__ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
{
__shared__ uint32_t sharedMemory[1024];

cn_aes_gpu_init(sharedMemory);

__syncthreads();

int thread = (blockDim.x * blockIdx.x + threadIdx.x);

int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;

#if __CUDA_ARCH__ >= 300

int sub = threadIdx.x & 3;

if (thread < threads)
{
int i, j, k;
int batchsize = ITER >> (2+bfactor);
int start = partidx * batchsize;
int end = start + batchsize;
uint8_t *long_state = &d_long_state[MEMORY * thread];
struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
uint32_t a, b, c, x[4];
uint32_t *ls32;
uint32_t t1[4], t2[4], res;
uint64_t reshi, reslo;

a = ctx->a[sub];
b = ctx->b[sub];

for (i = start; i < end; ++i) {

//j = ((uint32_t *)a)[0] & 0x1FFFF0;
j = __shfl((int)a, 0, 4) & 0x1FFFF0;

//cn_aes_single_round(sharedMemory, &long_state[j], c, a);
ls32 = (uint32_t *)&long_state[j];
x[0] = ls32[sub];
x[1] = __shfl((int)x[0], sub+1, 4);
x[2] = __shfl((int)x[0], sub+2, 4);
x[3] = __shfl((int)x[0], sub+3, 4);
c = a ^
t_fn0(x[0] & 0xff) ^
t_fn1((x[1] >> 8) & 0xff) ^
t_fn2((x[2] >> 16) & 0xff) ^
t_fn3((x[3] >> 24) & 0xff);

//XOR_BLOCKS_DST(c, b, &long_state[j]);
ls32[sub] = c ^ b;

//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
ls32 = (uint32_t *)&long_state[__shfl((int)c, 0, 4) & 0x1FFFF0];
for( k = 0; k < 2; k++ ) t1[k] = __shfl((int)c, k, 4);
for( k = 0; k < 4; k++ ) t2[k] = __shfl((int)a, k, 4);
asm(
"mad.lo.u64 %0, %2, %3, %4;\n\t"
"mad.hi.u64 %1, %2, %3, %5;\n\t"
: "=l"(reslo), "=l"(reshi)
: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)ls32)[0]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
res = (sub & 2 ? reslo : reshi) >> (sub&1 ? 32 : 0);
a = ls32[sub] ^ res;
ls32[sub] = res;

//j = ((uint32_t *)a)[0] & 0x1FFFF0;
j = __shfl((int)a, 0, 4) & 0x1FFFF0;

//cn_aes_single_round(sharedMemory, &long_state[j], b, a);
ls32 = (uint32_t *)&long_state[j];
x[0] = ls32[sub];
x[1] = __shfl((int)x[0], sub+1, 4);
x[2] = __shfl((int)x[0], sub+2, 4);
x[3] = __shfl((int)x[0], sub+3, 4);
b = a ^
t_fn0(x[0] & 0xff) ^
t_fn1((x[1] >> 8) & 0xff) ^
t_fn2((x[2] >> 16) & 0xff) ^
t_fn3((x[3] >> 24) & 0xff);

//XOR_BLOCKS_DST(b, c, &long_state[j]);
ls32[sub] = c ^ b;

//MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
ls32 = (uint32_t *)&long_state[__shfl((int)b, 0, 4) & 0x1FFFF0];
for( k = 0; k < 2; k++ ) t1[k] = __shfl((int)b, k, 4);
for( k = 0; k < 4; k++ ) t2[k] = __shfl((int)a, k, 4);
asm(
"mad.lo.u64 %0, %2, %3, %4;\n\t"
"mad.hi.u64 %1, %2, %3, %5;\n\t"
: "=l"(reslo), "=l"(reshi)
: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)ls32)[0]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
res = (sub & 2 ? reslo : reshi) >> (sub&1 ? 32 : 0);
a = ls32[sub] ^ res;
ls32[sub] = res;
}

if( bfactor > 0 ) {

ctx->a[sub] = a;
ctx->b[sub] = b;
}
}

#else // __CUDA_ARCH__ < 300

// plain old single-thread-per-hash implementation, but the kernel is still launched with 4 threads per hash;
// have only one of the four "subthreads" actually do work
if (thread < threads && (threadIdx.x & 3) == 0)
{
int i, j;
int start = 0, end = ITER / 4;
int batchsize = ITER >> (2+bfactor);
int start = partidx * batchsize;
int end = start + batchsize;
uint8_t *long_state = &d_long_state[MEMORY * thread];
struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
uint32_t a[4], b[4], c[4];

MEMCPY8(a, ctx->a, 2);
MEMCPY8(b, ctx->b, 2);

if( partcount > 1 ) {

int batchsize = (ITER / 4) / partcount;
start = partidx * batchsize;
end = start + batchsize;
}

for (i = start; i < end; ++i) {

j = ((uint32_t *)a)[0] & 0x1FFFF0;
Expand All @@ -99,12 +190,14 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int part
MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
}

if( partcount > 1 ) {
if( bfactor > 0 ) {

MEMCPY8(ctx->a, a, 2);
MEMCPY8(ctx->b, b, 2);
}
}

#endif // __CUDA_ARCH__ >= 300
}

__global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
Expand Down Expand Up @@ -148,6 +241,7 @@ __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state,
}
}


__host__ void cryptonight_core_cpu_init(int thr_id, int threads)
{
cn_aes_cpu_init();
Expand All @@ -157,6 +251,7 @@ __host__ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uin
{
dim3 grid(blocks);
dim3 block(threads);
dim3 block4(threads << 2);
dim3 block8(threads << 3);

size_t shared_size = 1024;
Expand All @@ -167,7 +262,7 @@ __host__ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uin
if( partcount > 1 ) usleep(device_bsleep[thr_id]);

for( i = 0; i < partcount; i++ ) {
cryptonight_core_gpu_phase2<<<grid, block, shared_size>>>(blocks*threads, partcount, i, d_long_state, d_ctx);
cryptonight_core_gpu_phase2<<<grid, block4, shared_size>>>(blocks*threads, device_bfactor[thr_id], i, d_long_state, d_ctx);
cudaDeviceSynchronize();
if( partcount > 1 ) usleep(device_bsleep[thr_id]);
}
Expand Down

0 comments on commit 96b2ced

Please sign in to comment.