diff --git a/README.md b/README.md index 934ccdd..d5b181c 100755 --- a/README.md +++ b/README.md @@ -3,6 +3,18 @@ ccminer-cryptonight A modification of Christian Buchner's & Christian H.'s ccminer project by tsiv for Cryptonight mining. +July 5th 2014 +------------- + +Massive improvement to interactivity on Windows, should also further help with TDR issues. +Introducing the --bfactor and --bsleep command line parameters allows for control over +execution of the biggest resource hog of the algorithm. Use bfactor to determine how +many parts the kernel is split into and bsleep to insert a short delay between the kernel +launches. The defaults are no splitting / no sleep for Linux and split into 64 (bfactor 6) +parts / sleep 100 microseconds between launches for Windows. These defaults seem to work +wonders on my 750 Ti on Windows 7, once again you may want to tweak according to your +environment. + June 30th 2014 -------------- diff --git a/README.txt b/README.txt index b254fb0..0197c42 100755 --- a/README.txt +++ b/README.txt @@ -61,6 +61,15 @@ most of their command line interface and options. value between devices, you can just enter a single value and it will be used for all devices. (default: 8x40) + --bfactor=X Enables running the Cryptonight kernel in smaller pieces.\n\ + The kernel will be run in 2^X parts according to bfactor,\n\ + with a small pause between parts, specified by --bsleep.\n\ + This is a per-device setting like the launch config.\n\ + (default: 0 (no splitting) on Linux, 6 (64 parts) on Windows)\n\ + --bsleep=X Insert a delay of X microseconds between kernel launches.\n\ + Use in combination with --bfactor to mitigate the lag\n\ + when running on your primary GPU.\n\ + This is a per-device setting like the launch config.\n\ -f, --diff Divide difficulty by this factor (std is 1) -o, --url=URL URL of mining server (default: " DEF_RPC_URL ") -O, --userpass=U:P username:password pair for mining server diff --git a/cpu-miner.c b/cpu-miner.c index 687609e..0c85acc 100755 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -179,7 +179,16 @@ uint16_t opt_vote = 9999; static int num_processors; int device_map[8] = {0,1,2,3,4,5,6,7}; // CB char *device_name[8]; // CB +int device_bfactor[8]; +int device_bsleep[8]; int device_config[8][2]; +#ifdef WIN32 +static int default_bfactor = 6; +static int default_bsleep = 100; +#else +static int default_bfactor = 0; +static int default_bsleep = 0; +#endif static char *rpc_url; static char *rpc_userpass; static char *rpc_user, *rpc_pass; @@ -239,6 +248,15 @@ Options:\n\ the remaining devices. If you don't need to vary the\n\ value between devices, you can just enter a single value\n\ and it will be used for all devices. (default: 8x40)\n\ + --bfactor=X Enables running the Cryptonight kernel in smaller pieces.\n\ + The kernel will be run in 2^X parts according to bfactor,\n\ + with a small pause between parts, specified by --bsleep.\n\ + This is a per-device setting like the launch config.\n\ + (default: 0 (no splitting) on Linux, 6 (64 parts) on Windows)\n\ + --bsleep=X Insert a delay of X microseconds between kernel launches.\n\ + Use in combination with --bfactor to mitigate the lag\n\ + when running on your primary GPU.\n\ + This is a per-device setting like the launch config.\n\ -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ @@ -314,6 +332,8 @@ static struct option const options[] = { { "devices", 1, NULL, 'd' }, { "diff", 1, NULL, 'f' }, { "launch", 1, NULL, 'l' }, + { "bfactor", 1, NULL, 1008 }, + { "bsleep", 1, NULL, 1009 }, { 0, 0, 0, 0 } }; @@ -1732,6 +1752,42 @@ static void parse_arg (int key, char *arg) } } break; + case 1008: + { + p = strtok(arg, ","); + if( p == NULL ) show_usage_and_exit(1); + int last; + i = 0; + while( p != NULL && i < 8 ) { + device_bfactor[i++] = last = atoi(p); + if( last < 0 || last > 10 ) { + applog(LOG_ERR, "Valid range for --bfactor is 0-10"); + exit(1); + } + p = strtok(NULL, ","); + } + while (i < 8) { + device_bfactor[i++] = last; + } + } + break; + case 1009: + p = strtok(arg, ","); + if( p == NULL ) show_usage_and_exit(1); + int last; + i = 0; + while( p != NULL && i < 8 ) { + device_bsleep[i++] = last = atoi(p); + if( last < 0 || last > 1000000 ) { + applog(LOG_ERR, "Valid range for --bsleep is 0-1000000"); + exit(1); + } + p = strtok(NULL, ","); + } + while (i < 8) { + device_bsleep[i++] = last; + } + break; case 'V': show_version_and_exit(); @@ -1862,6 +1918,8 @@ int main(int argc, char *argv[]) for(i = 0; i < 8; i++) { device_config[i][0] = opt_cn_blocks; device_config[i][1] = opt_cn_threads; + device_bfactor[i] = default_bfactor; + device_bsleep[i] = default_bsleep; } /* parse command line */ diff --git a/cryptonight.h b/cryptonight.h index bcb6683..3977d74 100755 --- a/cryptonight.h +++ b/cryptonight.h @@ -122,6 +122,7 @@ struct cryptonight_gpu_ctx { uint32_t b[4]; uint32_t key1[40]; uint32_t key2[40]; + uint32_t text[32]; }; void hash_permutation(union hash_state *state); diff --git a/cryptonight/cuda_cryptonight_core.cu b/cryptonight/cuda_cryptonight_core.cu index 1a8a587..1e9cf10 100755 --- a/cryptonight/cuda_cryptonight_core.cu +++ b/cryptonight/cuda_cryptonight_core.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include "cuda.h" #include "cuda_runtime.h" #include "cryptonight.h" @@ -9,31 +10,15 @@ #include #endif -#include "cuda_cryptonight_aes.cu" +extern int device_bfactor[8]; +extern int device_bsleep[8]; -#define hi_dword(x) (x >> 32) -#define lo_dword(x) (x & 0xFFFFFFFF) +#include "cuda_cryptonight_aes.cu" __device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) { - uint64_t a = hi_dword(multiplier); - uint64_t b = lo_dword(multiplier); - uint64_t c = hi_dword(multiplicand); - uint64_t d = lo_dword(multiplicand); - - uint64_t ac = a * c; - uint64_t ad = a * d; - uint64_t bc = b * c; - uint64_t bd = b * d; - - uint64_t adbc = ad + bc; - uint64_t adbc_carry = adbc < ad ? 1 : 0; - - uint64_t product_lo = bd + (adbc << 32); - uint64_t product_lo_carry = product_lo < bd ? 1 : 0; - *product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry; - - return product_lo; + *product_hi = __umul64hi(multiplier, multiplicand); + return(multiplier * multiplicand); } __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx) @@ -50,6 +35,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state, if (thread < threads) { int i, j; + int start = 0, end = MEMORY; uint8_t *long_state = &d_long_state[MEMORY * thread]; uint32_t *ls32; struct cryptonight_gpu_ctx *ctx = &d_ctx[thread]; @@ -61,7 +47,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state, for( i = 0; i < 4; i++ ) text[i] = state[i]; - for (i = 0; i < MEMORY; i += INIT_SIZE_BYTE) { + for (i = start; i < end; i += INIT_SIZE_BYTE) { ls32 = (uint32_t *)&long_state[i]; @@ -73,7 +59,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state, } } -__global__ void cryptonight_core_gpu_phase2(int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx) +__global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int partidx, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx) { __shared__ uint32_t sharedMemory[1024]; @@ -86,6 +72,7 @@ __global__ void cryptonight_core_gpu_phase2(int threads, uint8_t *d_long_state, if (thread < threads) { int i, j; + int start = 0, end = ITER / 4; uint8_t *long_state = &d_long_state[MEMORY * thread]; struct cryptonight_gpu_ctx *ctx = &d_ctx[thread]; uint32_t a[4], b[4], c[4]; @@ -93,16 +80,29 @@ __global__ void cryptonight_core_gpu_phase2(int threads, uint8_t *d_long_state, MEMCPY8(a, ctx->a, 2); MEMCPY8(b, ctx->b, 2); - for (i = 0; i < ITER / 4; ++i) { + if( partcount > 1 ) { + + int batchsize = (ITER / 4) / partcount; + start = partidx * batchsize; + end = start + batchsize; + } + + for (i = start; i < end; ++i) { - j = E2I(a) * AES_BLOCK_SIZE; + j = ((uint32_t *)a)[0] & 0x1FFFF0; cn_aes_single_round(sharedMemory, &long_state[j], c, a); XOR_BLOCKS_DST(c, b, &long_state[j]); - MUL_SUM_XOR_DST(c, a, &long_state[E2I(c) * AES_BLOCK_SIZE]); - j = E2I(a) * AES_BLOCK_SIZE; + MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]); + j = ((uint32_t *)a)[0] & 0x1FFFF0; cn_aes_single_round(sharedMemory, &long_state[j], b, a); XOR_BLOCKS_DST(b, c, &long_state[j]); - MUL_SUM_XOR_DST(b, a, &long_state[E2I(b) * AES_BLOCK_SIZE]); + MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]); + } + + if( partcount > 1 ) { + + MEMCPY8(ctx->a, a, 2); + MEMCPY8(ctx->b, b, 2); } } } @@ -121,6 +121,7 @@ __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state, if (thread < threads) { int i, j; + int start = 0, end = MEMORY; uint8_t *long_state = &d_long_state[MEMORY * thread]; uint32_t *ls32; struct cryptonight_gpu_ctx *ctx = &d_ctx[thread]; @@ -132,7 +133,7 @@ __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state, for( i = 0; i < 4; i++ ) text[i] = state[i]; - for (i = 0; i < MEMORY; i += INIT_SIZE_BYTE) { + for (i = start; i < end; i += INIT_SIZE_BYTE) { ls32 = (uint32_t *)&long_state[i]; @@ -159,12 +160,17 @@ __host__ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uin dim3 block8(threads << 3); size_t shared_size = 1024; + int i, partcount = 1 << device_bfactor[thr_id]; cryptonight_core_gpu_phase1<<>>(blocks*threads, d_long_state, d_ctx); cudaDeviceSynchronize(); + if( partcount > 1 ) usleep(device_bsleep[thr_id]); - cryptonight_core_gpu_phase2<<>>(blocks*threads, d_long_state, d_ctx); - cudaDeviceSynchronize(); + for( i = 0; i < partcount; i++ ) { + cryptonight_core_gpu_phase2<<>>(blocks*threads, partcount, i, d_long_state, d_ctx); + cudaDeviceSynchronize(); + if( partcount > 1 ) usleep(device_bsleep[thr_id]); + } cryptonight_core_gpu_phase3<<>>(blocks*threads, d_long_state, d_ctx); cudaDeviceSynchronize();