From 96b2cedd2206311231bbda7e32709584b20e6ade Mon Sep 17 00:00:00 2001
From: tsiv <118de909ebbe6649@jarru.org>
Date: Thu, 24 Jul 2014 04:25:54 +0300
Subject: [PATCH] Launch config can now be specified by option name
 launch-config for kopiemtu, query and display device SMX count and warn if
 the launch config doesn't use a multiple of SMX count for block count,
 4-way split of core phase 2 for devices that support the shuffle
 instruction (compute 3.0+)

---
 cpu-miner.c                          |  13 +--
 cryptonight/cryptonight.cu           |   2 +
 cryptonight/cuda_cryptonight_core.cu | 121 ++++++++++++++++++++++++---
 3 files changed, 118 insertions(+), 18 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index 0c85acc..10117f3 100755
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -179,6 +179,7 @@ uint16_t opt_vote = 9999;
 static int num_processors;
 int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 char *device_name[8]; // CB
+int device_mpcount[8];
 int device_bfactor[8];
 int device_bsleep[8];
 int device_config[8][2];
@@ -332,6 +333,7 @@ static struct option const options[] = {
 	{ "devices", 1, NULL, 'd' },
 	{ "diff", 1, NULL, 'f' },
 	{ "launch", 1, NULL, 'l' },
+	{ "launch-config", 1, NULL, 'l' },
 	{ "bfactor", 1, NULL, 1008 },
 	{ "bsleep", 1, NULL, 1009 },
 	{ 0, 0, 0, 0 }
@@ -1110,11 +1112,12 @@ static void *miner_thread(void *userdata)
 		affine_to_cpu(thr_id, thr_id % num_processors);
 	}
 
-	if( opt_algo == ALGO_CRYPTONIGHT ) {
-
-		applog(LOG_INFO, "GPU #%d: %s, using %d blocks of %d threads",
-			device_map[thr_id], device_name[thr_id], device_config[thr_id][0], device_config[thr_id][1]);
-	}
+	applog(LOG_INFO, "GPU #%d: %s (%d SMX), using %d blocks of %d threads",
+		device_map[thr_id], device_name[thr_id], device_mpcount[thr_id], device_config[thr_id][0], device_config[thr_id][1]);
+
+	if( device_config[thr_id][0] % device_mpcount[thr_id] )
+		applog(LOG_INFO, "GPU #%d: Warning: block count %d is not a multiple of SMX count %d.",
+			device_map[thr_id], device_config[thr_id][0], device_mpcount[thr_id]);
 
 	uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + (jsonrpc_2 ? 39 : 76));
 
diff --git a/cryptonight/cryptonight.cu b/cryptonight/cryptonight.cu
index 33a1af5..f96b928 100755
--- a/cryptonight/cryptonight.cu
+++ b/cryptonight/cryptonight.cu
@@ -13,6 +13,7 @@ extern "C"
 }
 
 extern char *device_name[8];
+extern int device_mpcount[8];
 extern int device_map[8];
 extern int device_config[8][2];
 
@@ -61,6 +62,7 @@ extern "C" void cuda_devicenames()
 		cudaGetDeviceProperties(&props, device_map[i]);
 
 		device_name[i] = strdup(props.name);
+		device_mpcount[i] = props.multiProcessorCount;
 	}
 }
 
diff --git a/cryptonight/cuda_cryptonight_core.cu b/cryptonight/cuda_cryptonight_core.cu
index 1e9cf10..bf3d91a 100755
--- a/cryptonight/cuda_cryptonight_core.cu
+++ b/cryptonight/cuda_cryptonight_core.cu
@@ -59,7 +59,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state,
 	}
 }
 
-__global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int partidx, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
+__global__ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
@@ -67,12 +67,110 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int part
 
 	__syncthreads();
 
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
+
+#if __CUDA_ARCH__ >= 300
+
+	int sub = threadIdx.x & 3;
+
 	if (thread < threads)
 	{
+		int i, j, k;
+		int batchsize = ITER >> (2+bfactor);
+		int start = partidx * batchsize;
+		int end = start + batchsize;
+		uint8_t *long_state = &d_long_state[MEMORY * thread];
+		struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
+		uint32_t a, b, c, x[4];
+		uint32_t *ls32;
+		uint32_t t1[4], t2[4], res;
+		uint64_t reshi, reslo;
+
+		a = ctx->a[sub];
+		b = ctx->b[sub];
+
+		for (i = start; i < end; ++i) {
+
+			//j = ((uint32_t *)a)[0] & 0x1FFFF0;
+			j = __shfl((int)a, 0, 4) & 0x1FFFF0;
+
+			//cn_aes_single_round(sharedMemory, &long_state[j], c, a);
+			ls32 = (uint32_t *)&long_state[j];
+			x[0] = ls32[sub];
+			x[1] = __shfl((int)x[0], sub+1, 4);
+			x[2] = __shfl((int)x[0], sub+2, 4);
+			x[3] = __shfl((int)x[0], sub+3, 4);
+			c = a ^
+				t_fn0(x[0] & 0xff) ^
+				t_fn1((x[1] >> 8) & 0xff) ^
+				t_fn2((x[2] >> 16) & 0xff) ^
+				t_fn3((x[3] >> 24) & 0xff);
+
+			//XOR_BLOCKS_DST(c, b, &long_state[j]);
+			ls32[sub] = c ^ b;
+
+			//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
+			ls32 = (uint32_t *)&long_state[__shfl((int)c, 0, 4) & 0x1FFFF0];
+			for( k = 0; k < 2; k++ ) t1[k] = __shfl((int)c, k, 4);
+			for( k = 0; k < 4; k++ ) t2[k] = __shfl((int)a, k, 4);
+			asm(
+				"mad.lo.u64 %0, %2, %3, %4;\n\t"
+				"mad.hi.u64 %1, %2, %3, %5;\n\t"
+				: "=l"(reslo), "=l"(reshi)
+				: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)ls32)[0]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
+			res = (sub & 2 ? reslo : reshi) >> (sub&1 ? 32 : 0);
+			a = ls32[sub] ^ res;
+			ls32[sub] = res;
+
+			//j = ((uint32_t *)a)[0] & 0x1FFFF0;
+			j = __shfl((int)a, 0, 4) & 0x1FFFF0;
+
+			//cn_aes_single_round(sharedMemory, &long_state[j], b, a);
+			ls32 = (uint32_t *)&long_state[j];
+			x[0] = ls32[sub];
+			x[1] = __shfl((int)x[0], sub+1, 4);
+			x[2] = __shfl((int)x[0], sub+2, 4);
+			x[3] = __shfl((int)x[0], sub+3, 4);
+			b = a ^
+				t_fn0(x[0] & 0xff) ^
+				t_fn1((x[1] >> 8) & 0xff) ^
+				t_fn2((x[2] >> 16) & 0xff) ^
+				t_fn3((x[3] >> 24) & 0xff);
+
+			//XOR_BLOCKS_DST(b, c, &long_state[j]);
+			ls32[sub] = c ^ b;
+
+			//MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
+			ls32 = (uint32_t *)&long_state[__shfl((int)b, 0, 4) & 0x1FFFF0];
+			for( k = 0; k < 2; k++ ) t1[k] = __shfl((int)b, k, 4);
+			for( k = 0; k < 4; k++ ) t2[k] = __shfl((int)a, k, 4);
+			asm(
+				"mad.lo.u64 %0, %2, %3, %4;\n\t"
+				"mad.hi.u64 %1, %2, %3, %5;\n\t"
+				: "=l"(reslo), "=l"(reshi)
+				: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)ls32)[0]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
+			res = (sub & 2 ? reslo : reshi) >> (sub&1 ? 32 : 0);
+			a = ls32[sub] ^ res;
+			ls32[sub] = res;
+		}
+
+		if( bfactor > 0 ) {
+
+			ctx->a[sub] = a;
+			ctx->b[sub] = b;
+		}
+	}
+
+#else // __CUDA_ARCH__ < 300
+
+	// plain old single thread per hash implementation, but the kernel is still launched with 4 threads per hash
+	// have only one of the four "subthreads" actually do work
+	if (thread < threads && (threadIdx.x & 3) == 0)
+	{
 		int i, j;
-		int start = 0, end = ITER / 4;
+		int batchsize = ITER >> (2+bfactor);
+		int start = partidx * batchsize;
+		int end = start + batchsize;
 		uint8_t *long_state = &d_long_state[MEMORY * thread];
 		struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
 		uint32_t a[4], b[4], c[4];
@@ -80,13 +178,6 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int part
 		MEMCPY8(a, ctx->a, 2);
 		MEMCPY8(b, ctx->b, 2);
 
-		if( partcount > 1 ) {
-
-			int batchsize = (ITER / 4) / partcount;
-			start = partidx * batchsize;
-			end = start + batchsize;
-		}
-
 		for (i = start; i < end; ++i) {
 
 			j = ((uint32_t *)a)[0] & 0x1FFFF0;
@@ -99,12 +190,14 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int part
 			MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
 		}
 
-		if( partcount > 1 ) {
+		if( bfactor > 0 ) {
 
 			MEMCPY8(ctx->a, a, 2);
 			MEMCPY8(ctx->b, b, 2);
 		}
 	}
+
+#endif // __CUDA_ARCH__ >= 300
 }
 
 __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
@@ -148,6 +241,7 @@ __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state,
 	}
 }
 
+
 __host__ void cryptonight_core_cpu_init(int thr_id, int threads)
 {
 	cn_aes_cpu_init();
@@ -157,6 +251,7 @@ __host__ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uin
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
+	dim3 block4(threads << 2);
 	dim3 block8(threads << 3);
 
 	size_t shared_size = 1024;
@@ -167,7 +262,7 @@ __host__ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uin
 	if( partcount > 1 ) usleep(device_bsleep[thr_id]);
 
 	for( i = 0; i < partcount; i++ ) {
-		cryptonight_core_gpu_phase2<<<grid, block, shared_size>>>(blocks*threads, partcount, i, d_long_state, d_ctx);
+		cryptonight_core_gpu_phase2<<<grid, block4, shared_size>>>(blocks*threads, device_bfactor[thr_id], i, d_long_state, d_ctx);
 		cudaDeviceSynchronize();
 		if( partcount > 1 ) usleep(device_bsleep[thr_id]);
 	}
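
A few notes on the techniques the patch relies on, with small standalone sketches. All file and symbol names in the sketches are mine, not part of the tree.

First, the option-table change: with the flag member left NULL, getopt_long() returns the val member for a matched long option, so giving "launch-config" the same 'l' as "launch" routes both spellings through the existing case 'l': parser in cpu-miner.c with no further changes. A minimal sketch:

/* alias_demo.c -- minimal sketch (not from the patch) of why one extra
 * option-table entry is the whole change: getopt_long() returns 'l' for
 * both long names, so one handler serves both. */
#include <stdio.h>
#include <getopt.h>

static struct option const opts[] = {
	{ "launch",        1, NULL, 'l' },  /* existing name */
	{ "launch-config", 1, NULL, 'l' },  /* new alias, same handler */
	{ 0, 0, 0, 0 }
};

int main(int argc, char **argv)
{
	int c;
	while ((c = getopt_long(argc, argv, "l:", opts, NULL)) != -1)
		if (c == 'l')
			printf("launch config: %s\n", optarg);
	return 0;
}

Running ./alias_demo --launch=8x40 and ./alias_demo --launch-config=8x40 prints the same thing, for whatever blocks-x-threads string the miner already accepts.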
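
On the new SMX warning: the blocks of a grid are distributed over the multiprocessors, so a block count that is not a multiple of multiProcessorCount leaves some SMXs short of work in the last wave of blocks. That is a simplification (with enough occupancy several blocks can share one SMX), but the multiple-of-SMX rule of thumb is the reason for the warning. The property the patch stores in device_mpcount[] can be queried standalone (sketch of mine):

/* smx_query.cu -- prints the same multiProcessorCount that the patched
 * cuda_devicenames() stores into device_mpcount[]. */
#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
	cudaDeviceProp props;
	cudaGetDeviceProperties(&props, 0);  /* device 0 for brevity */
	printf("%s: %d SMX\n", props.name, props.multiProcessorCount);
	return 0;
}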
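
The heart of the phase 2 rewrite: each hash is now worked on by four consecutive threads, each holding one 32-bit column of the 16-byte AES state in a register, and the width-4 __shfl() lets a lane read its neighbours' columns without touching shared or global memory. The source-lane argument of __shfl() wraps modulo the width, which is what makes sub+1..sub+3 legal. A self-contained sketch of just that gather (names are mine; compile with nvcc -arch=sm_30 on a pre-CUDA-9 toolkit, where the intrinsic is still spelled __shfl rather than __shfl_sync):

// shfl_demo.cu -- sketch (mine, not from the patch) of the width-4
// shuffle gather used by the new phase 2 kernel.
#include <cstdio>

__global__ void gather4(const int *in, int *out)
{
	int sub = threadIdx.x & 3;        // lane's position in its 4-lane group
	int x0 = in[threadIdx.x];         // this lane's column of the state
	// read the other three columns in rotated order, exactly like the
	// x[1] = __shfl((int)x[0], sub+1, 4); lines in the patch
	int x1 = __shfl(x0, sub + 1, 4);
	int x2 = __shfl(x0, sub + 2, 4);
	int x3 = __shfl(x0, sub + 3, 4);
	out[4 * threadIdx.x + 0] = x0;
	out[4 * threadIdx.x + 1] = x1;
	out[4 * threadIdx.x + 2] = x2;
	out[4 * threadIdx.x + 3] = x3;
}

int main()
{
	int h_in[4] = { 10, 11, 12, 13 }, h_out[16], *d_in, *d_out;
	cudaMalloc(&d_in, sizeof(h_in));
	cudaMalloc(&d_out, sizeof(h_out));
	cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
	gather4<<<1, 4>>>(d_in, d_out);   // one 4-lane group
	cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
	for (int lane = 0; lane < 4; lane++)
		printf("lane %d gathers %d %d %d %d\n", lane, h_out[4*lane],
		       h_out[4*lane+1], h_out[4*lane+2], h_out[4*lane+3]);
	cudaFree(d_in);
	cudaFree(d_out);
	return 0;
}

Lane 0 gathers 10 11 12 13, lane 1 gathers 11 12 13 10, and so on; each lane sees the full state while only ever owning one register's worth of it.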
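
Finally, the inline PTX: the mad.lo.u64/mad.hi.u64 pair performs CryptoNight's 64x64->128-bit multiply with a 64-bit addend (the two halves of t2) applied to each half of the product independently, so no carry crosses the middle, matching the CPU MUL_SUM_XOR_DST macro. Each lane then keeps its own 32-bit quarter of the 128-bit result via the reslo/reshi selection. A host-side reference of mine, using GCC/Clang's unsigned __int128:

/* mad128_ref.c -- host reference (not from the patch) for the two mad
 * instructions plus the per-lane slice selection. */
#include <stdio.h>
#include <stdint.h>

static void mad128(uint64_t m1, uint64_t m2, uint64_t add_hi, uint64_t add_lo,
                   uint64_t *reshi, uint64_t *reslo)
{
	unsigned __int128 p = (unsigned __int128)m1 * m2;
	*reslo = (uint64_t)p + add_lo;          /* mad.lo.u64 */
	*reshi = (uint64_t)(p >> 64) + add_hi;  /* mad.hi.u64, no carry in */
}

/* each lane keeps one 32-bit quarter of the 128-bit result, exactly as
 * res = (sub & 2 ? reslo : reshi) >> (sub&1 ? 32 : 0); in the kernel */
static uint32_t lane_slice(int sub, uint64_t reshi, uint64_t reslo)
{
	return (uint32_t)((sub & 2 ? reslo : reshi) >> (sub & 1 ? 32 : 0));
}

int main()
{
	uint64_t hi, lo;
	mad128(0xdeadbeefcafebabeULL, 0x0123456789abcdefULL, 7, 9, &hi, &lo);
	for (int sub = 0; sub < 4; sub++)
		printf("lane %d keeps 0x%08x\n", sub, lane_slice(sub, hi, lo));
	return 0;
}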