From 96b2cedd2206311231bbda7e32709584b20e6ade Mon Sep 17 00:00:00 2001
From: tsiv <118de909ebbe6649@jarru.org>
Date: Thu, 24 Jul 2014 04:25:54 +0300
Subject: [PATCH] Launch config can now be specified by option name
 launch-config for kopiemtu, query and display device SMX count and warn if
 the launch config doesn't use a multiple of SMX count for block count,
 4-way split of core phase 2 for devices that support the shuffle
 instruction (compute 3.0+)

---
 cpu-miner.c                          |  13 +--
 cryptonight/cryptonight.cu           |   2 +
 cryptonight/cuda_cryptonight_core.cu | 121 ++++++++++++++++++++++++---
 3 files changed, 118 insertions(+), 18 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index 0c85acc..10117f3 100755
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -179,6 +179,7 @@ uint16_t opt_vote = 9999;
 static int num_processors;
 int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 char *device_name[8]; // CB
+int device_mpcount[8];
 int device_bfactor[8];
 int device_bsleep[8];
 int device_config[8][2];
@@ -332,6 +333,7 @@ static struct option const options[] = {
 	{ "devices", 1, NULL, 'd' },
 	{ "diff", 1, NULL, 'f' },
 	{ "launch", 1, NULL, 'l' },
+	{ "launch-config", 1, NULL, 'l' },
 	{ "bfactor", 1, NULL, 1008 },
 	{ "bsleep", 1, NULL, 1009 },
 	{ 0, 0, 0, 0 }
@@ -1110,11 +1112,12 @@ static void *miner_thread(void *userdata)
 		affine_to_cpu(thr_id, thr_id % num_processors);
 	}
 
-	if( opt_algo == ALGO_CRYPTONIGHT ) {
-
-		applog(LOG_INFO, "GPU #%d: %s, using %d blocks of %d threads",
-			device_map[thr_id], device_name[thr_id], device_config[thr_id][0], device_config[thr_id][1]);
-	}
+	applog(LOG_INFO, "GPU #%d: %s (%d SMX), using %d blocks of %d threads",
+		device_map[thr_id], device_name[thr_id], device_mpcount[thr_id], device_config[thr_id][0], device_config[thr_id][1]);
+
+	if( device_config[thr_id][0] % device_mpcount[thr_id] )
+		applog(LOG_INFO, "GPU #%d: Warning: block count %d is not a multiple of SMX count %d.",
+			device_map[thr_id], device_config[thr_id][0], device_mpcount[thr_id]);
 
 	uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + (jsonrpc_2 ? 39 : 76));
 
diff --git a/cryptonight/cryptonight.cu b/cryptonight/cryptonight.cu
index 33a1af5..f96b928 100755
--- a/cryptonight/cryptonight.cu
+++ b/cryptonight/cryptonight.cu
@@ -13,6 +13,7 @@ extern "C"
 }
 
 extern char *device_name[8];
+extern int device_mpcount[8];
 extern int device_map[8];
 extern int device_config[8][2];
 
@@ -61,6 +62,7 @@ extern "C" void cuda_devicenames()
 		cudaGetDeviceProperties(&props, device_map[i]);
 
 		device_name[i] = strdup(props.name);
+		device_mpcount[i] = props.multiProcessorCount;
 	}
 }
 
diff --git a/cryptonight/cuda_cryptonight_core.cu b/cryptonight/cuda_cryptonight_core.cu
index 1e9cf10..bf3d91a 100755
--- a/cryptonight/cuda_cryptonight_core.cu
+++ b/cryptonight/cuda_cryptonight_core.cu
@@ -59,7 +59,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state,
 	}
 }
 
-__global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int partidx, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
+__global__ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
@@ -67,12 +67,110 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int part
 
 	__syncthreads();
 
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
+
+#if __CUDA_ARCH__ >= 300
+
+	int sub = threadIdx.x & 3;
+
 	if (thread < threads)
 	{
+		int i, j, k;
+		int batchsize = ITER >> (2+bfactor);
+		int start = partidx * batchsize;
+		int end = start + batchsize;
+		uint8_t *long_state = &d_long_state[MEMORY * thread];
+		struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
+		uint32_t a, b, c, x[4];
+		uint32_t *ls32;
+		uint32_t t1[4], t2[4], res;
+		uint64_t reshi, reslo;
+
+		a = ctx->a[sub];
+		b = ctx->b[sub];
+
+		for (i = start; i < end; ++i) {
+
+			//j = ((uint32_t *)a)[0] & 0x1FFFF0;
+			j = __shfl((int)a, 0, 4) & 0x1FFFF0;
+
+			//cn_aes_single_round(sharedMemory, &long_state[j], c, a);
+			ls32 = (uint32_t *)&long_state[j];
+			x[0] = ls32[sub];
+			x[1] = __shfl((int)x[0], sub+1, 4);
+			x[2] = __shfl((int)x[0], sub+2, 4);
+			x[3] = __shfl((int)x[0], sub+3, 4);
+			c = a ^
+				t_fn0(x[0] & 0xff) ^
+				t_fn1((x[1] >> 8) & 0xff) ^
+				t_fn2((x[2] >> 16) & 0xff) ^
+				t_fn3((x[3] >> 24) & 0xff);
+
+			//XOR_BLOCKS_DST(c, b, &long_state[j]);
+			ls32[sub] = c ^ b;
+
+			//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
+			ls32 = (uint32_t *)&long_state[__shfl((int)c, 0, 4) & 0x1FFFF0];
+			for( k = 0; k < 2; k++ ) t1[k] = __shfl((int)c, k, 4);
+			for( k = 0; k < 4; k++ ) t2[k] = __shfl((int)a, k, 4);
+			asm(
+				"mad.lo.u64 %0, %2, %3, %4;\n\t"
+				"mad.hi.u64 %1, %2, %3, %5;\n\t"
+				: "=l"(reslo), "=l"(reshi)
+				: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)ls32)[0]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
+			res = (sub & 2 ? reslo : reshi) >> (sub&1 ? 32 : 0);
+			a = ls32[sub] ^ res;
+			ls32[sub] = res;
+
+			//j = ((uint32_t *)a)[0] & 0x1FFFF0;
+			j = __shfl((int)a, 0, 4) & 0x1FFFF0;
+
+			//cn_aes_single_round(sharedMemory, &long_state[j], b, a);
+			ls32 = (uint32_t *)&long_state[j];
+			x[0] = ls32[sub];
+			x[1] = __shfl((int)x[0], sub+1, 4);
+			x[2] = __shfl((int)x[0], sub+2, 4);
+			x[3] = __shfl((int)x[0], sub+3, 4);
+			b = a ^
+				t_fn0(x[0] & 0xff) ^
+				t_fn1((x[1] >> 8) & 0xff) ^
+				t_fn2((x[2] >> 16) & 0xff) ^
+				t_fn3((x[3] >> 24) & 0xff);
+
+			//XOR_BLOCKS_DST(b, c, &long_state[j]);
+			ls32[sub] = c ^ b;
+
+			//MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
+			ls32 = (uint32_t *)&long_state[__shfl((int)b, 0, 4) & 0x1FFFF0];
+			for( k = 0; k < 2; k++ ) t1[k] = __shfl((int)b, k, 4);
+			for( k = 0; k < 4; k++ ) t2[k] = __shfl((int)a, k, 4);
+			asm(
+				"mad.lo.u64 %0, %2, %3, %4;\n\t"
+				"mad.hi.u64 %1, %2, %3, %5;\n\t"
+				: "=l"(reslo), "=l"(reshi)
+				: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)ls32)[0]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
+			res = (sub & 2 ? reslo : reshi) >> (sub&1 ? 32 : 0);
+			a = ls32[sub] ^ res;
+			ls32[sub] = res;
+		}
+
+		if( bfactor > 0 ) {
+
+			ctx->a[sub] = a;
+			ctx->b[sub] = b;
+		}
+	}
+
+#else // __CUDA_ARCH__ < 300
+
+	// plain old single thread per hash implementation, but the kernel is still launched with 4 threads per hash
+	// have only one of the four "subthreads" actually do work
+	if (thread < threads && (threadIdx.x & 3) == 0)
+	{
 		int i, j;
-		int start = 0, end = ITER / 4;
+		int batchsize = ITER >> (2+bfactor);
+		int start = partidx * batchsize;
+		int end = start + batchsize;
 		uint8_t *long_state = &d_long_state[MEMORY * thread];
 		struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
 		uint32_t a[4], b[4], c[4];
@@ -80,13 +178,6 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int part
 		MEMCPY8(a, ctx->a, 2);
 		MEMCPY8(b, ctx->b, 2);
 
-		if( partcount > 1 ) {
-
-			int batchsize = (ITER / 4) / partcount;
-			start = partidx * batchsize;
-			end = start + batchsize;
-		}
-
 		for (i = start; i < end; ++i) {
 
 			j = ((uint32_t *)a)[0] & 0x1FFFF0;
@@ -99,12 +190,14 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int part
 			MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
 		}
 
-		if( partcount > 1 ) {
+		if( bfactor > 0 ) {
 
 			MEMCPY8(ctx->a, a, 2);
 			MEMCPY8(ctx->b, b, 2);
 		}
 	}
+
+#endif // __CUDA_ARCH__ >= 300
 }
 
 __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
@@ -148,6 +241,7 @@ __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state,
 	}
 }
 
+
 __host__ void cryptonight_core_cpu_init(int thr_id, int threads)
 {
 	cn_aes_cpu_init();
@@ -157,6 +251,7 @@ __host__ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uin
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
+	dim3 block4(threads << 2);
 	dim3 block8(threads << 3);
 
 	size_t shared_size = 1024;
@@ -167,7 +262,7 @@ __host__ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uin
 	if( partcount > 1 ) usleep(device_bsleep[thr_id]);
 
 	for( i = 0; i < partcount; i++ ) {
-		cryptonight_core_gpu_phase2<<<grid, block, shared_size>>>(blocks*threads, partcount, i, d_long_state, d_ctx);
+		cryptonight_core_gpu_phase2<<<grid, block4, shared_size>>>(blocks*threads, device_bfactor[thr_id], i, d_long_state, d_ctx);
 		cudaDeviceSynchronize();
 		if( partcount > 1 ) usleep(device_bsleep[thr_id]);
 	}
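
A few notes on the techniques the patch relies on, with small standalone sketches. All file and symbol names in the sketches are mine, not part of the tree.

First, the option-table change: with the flag member left NULL, getopt_long() returns the val member for a matched long option, so giving "launch-config" the same 'l' as "launch" routes both spellings through the existing case 'l': parser in cpu-miner.c with no further changes. A minimal sketch:

/* alias_demo.c -- minimal sketch (not from the patch) of why one extra
 * option-table entry is the whole change: getopt_long() returns 'l' for
 * both long names, so one handler serves both. */
#include <stdio.h>
#include <getopt.h>

static struct option const opts[] = {
	{ "launch",        1, NULL, 'l' },  /* existing name */
	{ "launch-config", 1, NULL, 'l' },  /* new alias, same handler */
	{ 0, 0, 0, 0 }
};

int main(int argc, char **argv)
{
	int c;
	while ((c = getopt_long(argc, argv, "l:", opts, NULL)) != -1)
		if (c == 'l')
			printf("launch config: %s\n", optarg);
	return 0;
}

Running ./alias_demo --launch=8x40 and ./alias_demo --launch-config=8x40 prints the same thing, for whatever blocks-x-threads string the miner already accepts.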
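
On the new SMX warning: the blocks of a grid are distributed over the multiprocessors, so a block count that is not a multiple of multiProcessorCount leaves some SMXs short of work in the last wave of blocks. That is a simplification (with enough occupancy several blocks can share one SMX), but the multiple-of-SMX rule of thumb is the reason for the warning. The property the patch stores in device_mpcount[] can be queried standalone (sketch of mine):

/* smx_query.cu -- prints the same multiProcessorCount that the patched
 * cuda_devicenames() stores into device_mpcount[]. */
#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
	cudaDeviceProp props;
	cudaGetDeviceProperties(&props, 0);  /* device 0 for brevity */
	printf("%s: %d SMX\n", props.name, props.multiProcessorCount);
	return 0;
}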
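
The heart of the phase 2 rewrite: each hash is now worked on by four consecutive threads, each holding one 32-bit column of the 16-byte AES state in a register, and the width-4 __shfl() lets a lane read its neighbours' columns without touching shared or global memory. The source-lane argument of __shfl() wraps modulo the width, which is what makes sub+1..sub+3 legal. A self-contained sketch of just that gather (names are mine; compile with nvcc -arch=sm_30 on a pre-CUDA-9 toolkit, where the intrinsic is still spelled __shfl rather than __shfl_sync):

// shfl_demo.cu -- sketch (mine, not from the patch) of the width-4
// shuffle gather used by the new phase 2 kernel.
#include <cstdio>

__global__ void gather4(const int *in, int *out)
{
	int sub = threadIdx.x & 3;        // lane's position in its 4-lane group
	int x0 = in[threadIdx.x];         // this lane's column of the state
	// read the other three columns in rotated order, exactly like the
	// x[1] = __shfl((int)x[0], sub+1, 4); lines in the patch
	int x1 = __shfl(x0, sub + 1, 4);
	int x2 = __shfl(x0, sub + 2, 4);
	int x3 = __shfl(x0, sub + 3, 4);
	out[4 * threadIdx.x + 0] = x0;
	out[4 * threadIdx.x + 1] = x1;
	out[4 * threadIdx.x + 2] = x2;
	out[4 * threadIdx.x + 3] = x3;
}

int main()
{
	int h_in[4] = { 10, 11, 12, 13 }, h_out[16], *d_in, *d_out;
	cudaMalloc(&d_in, sizeof(h_in));
	cudaMalloc(&d_out, sizeof(h_out));
	cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
	gather4<<<1, 4>>>(d_in, d_out);   // one 4-lane group
	cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
	for (int lane = 0; lane < 4; lane++)
		printf("lane %d gathers %d %d %d %d\n", lane, h_out[4*lane],
		       h_out[4*lane+1], h_out[4*lane+2], h_out[4*lane+3]);
	cudaFree(d_in);
	cudaFree(d_out);
	return 0;
}

Lane 0 gathers 10 11 12 13, lane 1 gathers 11 12 13 10, and so on; each lane sees the full state while only ever owning one register's worth of it.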
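
Finally, the inline PTX: the mad.lo.u64/mad.hi.u64 pair performs CryptoNight's 64x64->128-bit multiply with a 64-bit addend (the two halves of t2) applied to each half of the product independently, so no carry crosses the middle, matching the CPU MUL_SUM_XOR_DST macro. Each lane then keeps its own 32-bit quarter of the 128-bit result via the reslo/reshi selection. A host-side reference of mine, using GCC/Clang's unsigned __int128:

/* mad128_ref.c -- host reference (not from the patch) for the two mad
 * instructions plus the per-lane slice selection. */
#include <stdio.h>
#include <stdint.h>

static void mad128(uint64_t m1, uint64_t m2, uint64_t add_hi, uint64_t add_lo,
                   uint64_t *reshi, uint64_t *reslo)
{
	unsigned __int128 p = (unsigned __int128)m1 * m2;
	*reslo = (uint64_t)p + add_lo;          /* mad.lo.u64 */
	*reshi = (uint64_t)(p >> 64) + add_hi;  /* mad.hi.u64, no carry in */
}

/* each lane keeps one 32-bit quarter of the 128-bit result, exactly as
 * res = (sub & 2 ? reslo : reshi) >> (sub&1 ? 32 : 0); in the kernel */
static uint32_t lane_slice(int sub, uint64_t reshi, uint64_t reslo)
{
	return (uint32_t)((sub & 2 ? reslo : reshi) >> (sub & 1 ? 32 : 0));
}

int main()
{
	uint64_t hi, lo;
	mad128(0xdeadbeefcafebabeULL, 0x0123456789abcdefULL, 7, 9, &hi, &lo);
	for (int sub = 0; sub < 4; sub++)
		printf("lane %d keeps 0x%08x\n", sub, lane_slice(sub, hi, lo));
	return 0;
}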