Skip to content

Commit

Permalink
Split the single cryptonight kernel into smaller pieces.
Browse files Browse the repository at this point in the history
  • Loading branch information
tsiv committed Jun 28, 2014
1 parent dd83f59 commit 364e4c1
Show file tree
Hide file tree
Showing 10 changed files with 516 additions and 366 deletions.
8 changes: 4 additions & 4 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
x11/x11.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
cryptonight/cryptonight.cu cryptonight/cuda_cryptonight.cu \
cryptonight/cryptonight.cu cryptonight/cuda_cryptonight_core.cu cryptonight/cuda_cryptonight_extra.cu \
cryptonight.c \
crypto/oaes_lib.c \
crypto/c_keccak.c \
Expand All @@ -54,12 +54,12 @@ ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME

#NVCC_GENCODE = -gencode=arch=compute_35,code=\"sm_35,compute_35\"
NVCC_GENCODE = -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\"
NVCC_GENCODE = -gencode=arch=compute_35,code=\"sm_35,compute_35\"
#NVCC_GENCODE = -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\"

# we're now targeting all major compute architectures within one binary.
.cu.o:
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" $(NVCC_GENCODE) --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" $(NVCC_GENCODE) --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<

# Shavite compiles faster with 128 regs
x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
Expand Down
8 changes: 7 additions & 1 deletion ccminer.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,13 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="cryptonight\cuda_cryptonight.cu">
<CudaCompile Include="cryptonight\cuda_cryptonight_extra.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="cryptonight\cuda_cryptonight_core.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
Expand Down
2 changes: 1 addition & 1 deletion cryptonight.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@

#include "cpuminer-config.h"
#include "miner.h"
#include "cryptonight.h"
#include "crypto/oaes_lib.h"
#include "crypto/c_keccak.h"
#include "crypto/c_groestl.h"
#include "crypto/c_blake256.h"
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
#include "cryptonight.h"

struct cryptonight_ctx {
uint8_t long_state[MEMORY];
Expand Down
103 changes: 91 additions & 12 deletions cryptonight.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@

#define MEMORY (1 << 21) /* 2 MiB */
#define ITER (1 << 20)
#define MEMORY (1 << 21) // 2 MiB / 2097152 B
#define ITER (1 << 20) // 1048576
#define AES_BLOCK_SIZE 16
#define AES_KEY_SIZE 32
#define INIT_SIZE_BLK 8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B

#define AES_RKEY_LEN 4
#define AES_COL_LEN 4
Expand All @@ -21,6 +21,88 @@
#define hi_dword(x) (x >> 32)
#define lo_dword(x) (x & 0xFFFFFFFF)

#define C32(x) ((uint32_t)(x ## U))
#define T32(x) ((x) & C32(0xFFFFFFFF))

#ifndef ROTL64
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#define ROTL64(x, n) (cuda_ROTL64(x, n))
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#endif

#ifndef ROTL32
#if __CUDA_ARCH__ < 350
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
#endif

#ifndef ROTR32
#if __CUDA_ARCH__ < 350
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#else
#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
#endif
#endif

#define MEMSET8(dst,what,cnt) { \
int i_memset8; \
uint64_t *out_memset8 = (uint64_t *)(dst); \
for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \
out_memset8[i_memset8] = (what); }

#define MEMSET4(dst,what,cnt) { \
int i_memset4; \
uint32_t *out_memset4 = (uint32_t *)(dst); \
for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \
out_memset4[i_memset4] = (what); }

#define MEMCPY8(dst,src,cnt) { \
int i_memcpy8; \
uint64_t *in_memcpy8 = (uint64_t *)(src); \
uint64_t *out_memcpy8 = (uint64_t *)(dst); \
for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \
out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; }

#define MEMCPY4(dst,src,cnt) { \
int i_memcpy4; \
uint32_t *in_memcpy4 = (uint32_t *)(src); \
uint32_t *out_memcpy4 = (uint32_t *)(dst); \
for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \
out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }

#define XOR_BLOCKS(a,b) { \
((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \
((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; }

#define XOR_BLOCKS_DST(x,y,z) { \
((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \
((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }

#define MUL_SUM_XOR_DST(a,c,dst) { \
uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
hi += ((uint64_t *)c)[0]; \
((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
((uint64_t *)dst)[0] = hi; \
((uint64_t *)dst)[1] = lo; }

#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))

union hash_state {
uint8_t b[200];
uint64_t w[25];
Expand All @@ -34,15 +116,12 @@ union cn_slow_hash_state {
};
};

union cn_gpu_hash_state {
union {
uint8_t b[200];
uint64_t w[25];
} hs;
struct {
uint8_t k[64];
uint8_t init[INIT_SIZE_BYTE];
};
struct cryptonight_gpu_ctx {
uint32_t state[50];
uint32_t a[4];
uint32_t b[4];
uint32_t key1[40];
uint32_t key2[40];
};

void hash_permutation(union hash_state *state);
Expand Down
27 changes: 17 additions & 10 deletions cryptonight/cryptonight.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,17 @@ extern int device_map[8];
extern int device_config[8][2];

static uint8_t *d_long_state[8];
static union cn_gpu_hash_state *d_hash_state[8];
static struct cryptonight_gpu_ctx *d_ctx[8];

extern bool opt_benchmark;

extern void cryptonight_cpu_init(int thr_id, int threads);
extern void cryptonight_cpu_setInput(int thr_id, void *data, void *pTargetIn);
extern void cryptonight_cpu_hash(int thr_id, int blocks, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state);
extern void cryptonight_core_cpu_init(int thr_id, int threads);
extern void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx);

extern void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
extern void cryptonight_extra_cpu_init(int thr_id);
extern void cryptonight_extra_cpu_prepare(int thr_id, int threads, uint32_t startNonce, struct cryptonight_gpu_ctx *d_ctx);
extern void cryptonight_extra_cpu_final(int thr_id, int threads, uint32_t startNonce, uint32_t *nonce, struct cryptonight_gpu_ctx *d_ctx);

extern "C" void cryptonight_hash(void* output, const void* input, size_t len);

Expand Down Expand Up @@ -49,23 +53,26 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata,
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
if( cudaMalloc(&d_long_state[thr_id], alloc) != cudaSuccess ) {
applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for the long state variable", thr_id);
applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for long state", thr_id);
exit(1);
}
if( cudaMalloc(&d_hash_state[thr_id], sizeof(union cn_gpu_hash_state) * throughput) != cudaSuccess ) {
applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for the hash state variable", thr_id);
if( cudaMalloc(&d_ctx[thr_id], sizeof(struct cryptonight_gpu_ctx) * throughput) != cudaSuccess ) {
applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for hash context", thr_id);
exit(1);
}
cryptonight_cpu_init(thr_id, throughput);
cryptonight_core_cpu_init(thr_id, throughput);
cryptonight_extra_cpu_init(thr_id);
init[thr_id] = true;
}

cryptonight_cpu_setInput(thr_id, (void *)pdata, (void*)ptarget);
cryptonight_extra_cpu_setData(thr_id, (const void *)pdata, (const void *)ptarget);

do {
uint32_t foundNonce = 0xFFFFFFFF;

cryptonight_cpu_hash(thr_id, cn_blocks, cn_threads, nonce, &foundNonce, d_long_state[thr_id], d_hash_state[thr_id]);
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx[thr_id]);
cryptonight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx[thr_id]);
cryptonight_extra_cpu_final(thr_id, throughput, nonce, &foundNonce, d_ctx[thr_id]);

if (foundNonce < 0xffffffff)
{
Expand Down
Loading

0 comments on commit 364e4c1

Please sign in to comment.