From 0c0e7fa952a1bdf47130fb8bf400664626127979 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 10 Sep 2018 16:49:59 +0200 Subject: [PATCH] unify cpu cryptonight implementations xmr-stak has several implementations for multi hash per thread. This results in 3 independent implementations. Each time the algorithm must be changed the possibility to introduce errors is very large. - unify the different cryptonight CPU implementations - simplify the function selection array to find the specialized cryptonight implementation - add an intermediate pointer to access the large state (similar to the old multi hash implementation) As a side effect this change increases the speed of the single and multi hash algorithm. --- xmrstak/backend/amd/minethd.cpp | 2 +- xmrstak/backend/amd/minethd.hpp | 2 +- .../backend/cpu/crypto/cryptonight_aesni.h | 929 +++++------------- xmrstak/backend/cpu/minethd.cpp | 478 ++------- xmrstak/backend/cpu/minethd.hpp | 7 +- xmrstak/backend/nvidia/minethd.cpp | 2 +- xmrstak/backend/nvidia/minethd.hpp | 2 +- 7 files changed, 364 insertions(+), 1058 deletions(-) diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index f7b47249e..d6051ffcd 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -252,7 +252,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = results[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp index 3142117c5..04c2ff8ad 100644 --- a/xmrstak/backend/amd/minethd.hpp +++ b/xmrstak/backend/amd/minethd.hpp @@ -24,7 +24,7 @@ class minethd : public iBackend static bool init_gpus(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, 
cryptonight_ctx*); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg); diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 9f70bcfa7..83ba3d3ce 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -467,712 +467,325 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } -template -void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0) -{ - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); - - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) - { - memset(output, 0, 32); - return; - } - - keccak((const uint8_t *)input, len, ctx0->hash_state, 200); - - uint64_t monero_const; - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - { - monero_const = *reinterpret_cast(reinterpret_cast(input) + 35); - monero_const ^= *(reinterpret_cast(ctx0->hash_state) + 24); +#define CN_INIT_SINGLE \ + if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ + { \ + memset(output, 0, 32 * N); \ + return; \ } - // Optim - 99% time boundary - cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state); - - uint8_t* l0 = ctx0->long_state; - uint64_t* h0 = (uint64_t*)ctx0->hash_state; - - uint64_t al0 = h0[0] ^ h0[4]; - uint64_t ah0 = h0[1] ^ h0[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], 
h0[2] ^ h0[6]); - - uint64_t idx0 = h0[0] ^ h0[4]; - - // Optim - 90% time boundary - for(size_t i = 0; i < ITERATIONS; i++) - { - __m128i cx; - cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]); - - if (ALGO == cryptonight_bittube2) - { - cx = aes_round_bittube2(cx, _mm_set_epi64x(ah0, al0)); - } - else - { - if(SOFT_AES) - cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); - else - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); - } - - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - else - _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - - idx0 = _mm_cvtsi128_si64(cx); - - if(PREFETCH) - _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0); - bx0 = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*)&l0[idx0 & MASK])[0]; - ch = ((uint64_t*)&l0[idx0 & MASK])[1]; - - lo = _umul128(idx0, cl, &hi); - - al0 += hi; - ((uint64_t*)&l0[idx0 & MASK])[0] = al0; - al0 ^= cl; - if(PREFETCH) - _mm_prefetch((const char*)&l0[al0 & MASK], _MM_HINT_T0); - ah0 += lo; - - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const ^ ((uint64_t*)&l0[idx0 & MASK])[0]; - else - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const; - } - else - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0; - ah0 ^= ch; - - idx0 = al0; +#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ + keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ + uint64_t monero_const; \ + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == 
cryptonight_masari || ALGO == cryptonight_bittube2) \ + { \ + monero_const = *reinterpret_cast(reinterpret_cast(input) + len * n + 35); \ + monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \ + } \ + /* Optim - 99% time boundary */ \ + cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \ + \ + __m128i ax0; \ + uint64_t idx0; \ + __m128i bx0; \ + uint8_t* l0 = ctx[n]->long_state; \ + { \ + uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ + idx0 = h0[0] ^ h0[4]; \ + ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ + bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ + } \ + __m128i *ptr0 - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) - { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = d ^ q; - } - else if(ALGO == cryptonight_haven) - { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = (~d) ^ q; - } +#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ + __m128i cx; \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + cx = _mm_load_si128(ptr0); \ + if (ALGO == cryptonight_bittube2) \ + { \ + cx = aes_round_bittube2(cx, ax0); \ + } \ + else \ + { \ + if(SOFT_AES) \ + cx = soft_aesenc(cx, ax0); \ + else \ + cx = _mm_aesenc_si128(cx, ax0); \ } - // Optim - 90% time boundary - cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state); - - // Optim - 99% time boundary - - keccakf((uint64_t*)ctx0->hash_state, 24); - extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output); -} - -// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon -// to fit temporary vars for two contexts. 
Function will read len*2 from input and write 64 bytes to output -// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons) -template -void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) -{ - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); - - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) - { - memset(output, 0, 64); - return; - } +#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + cryptonight_monero_tweak((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \ + else \ + _mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \ + idx0 = _mm_cvtsi128_si64(cx); \ + \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ + bx0 = cx; \ + +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ + uint64_t lo, cl, ch; \ + uint64_t al0 = _mm_cvtsi128_si64(ax0); \ + uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ + cl = ((uint64_t*)ptr0)[0]; \ + ch = ((uint64_t*)ptr0)[1]; \ + \ + { \ + uint64_t hi; \ + lo = _umul128(idx0, cl, &hi); \ + ah0 += lo; \ + al0 += hi; \ + } \ + ((uint64_t*)ptr0)[0] = al0; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0) + - keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); - keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200); +#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ + if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == 
cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + { \ + if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \ + else \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \ + } \ + else \ + ((uint64_t*)ptr0)[1] = ah0; \ + al0 ^= cl; \ + ah0 ^= ch; \ + ax0 = _mm_set_epi64x(ah0, al0); \ + idx0 = al0; - uint64_t monero_const_0, monero_const_1; - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - { - monero_const_0 = *reinterpret_cast(reinterpret_cast(input) + 35); - monero_const_0 ^= *(reinterpret_cast(ctx[0]->hash_state) + 24); - monero_const_1 = *reinterpret_cast(reinterpret_cast(input) + len + 35); - monero_const_1 ^= *(reinterpret_cast(ctx[1]->hash_state) + 24); +#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ + if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ + { \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = d ^ q; \ + } \ + else if(ALGO == cryptonight_haven) \ + { \ + ptr0 = (__m128i *)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = (~d) ^ q; \ } - // Optim - 99% time boundary - cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); - cn_explode_scratchpad((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; +#define CN_FINALIZE(n) \ + /* Optim - 90% time boundary */ \ + cn_implode_scratchpad((__m128i*)ctx[n]->long_state, 
(__m128i*)ctx[n]->hash_state); \ + /* Optim - 99% time boundary */ \ + keccakf((uint64_t*)ctx[n]->hash_state, 24); \ + extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n) - uint64_t axl0 = h0[0] ^ h0[4]; - uint64_t axh0 = h0[1] ^ h0[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - uint64_t axl1 = h1[0] ^ h1[4]; - uint64_t axh1 = h1[1] ^ h1[5]; - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - - uint64_t idx0 = h0[0] ^ h0[4]; - uint64_t idx1 = h1[0] ^ h1[4]; +//! defer the evaluation of a macro +#ifndef _MSC_VER +# define CN_DEFER(...) __VA_ARGS__ +#else +# define CN_EMPTY(...) +# define CN_DEFER(...) __VA_ARGS__ CN_EMPTY() +#endif - // Optim - 90% time boundary - for (size_t i = 0; i < ITERATIONS; i++) +//! execute the macro f with the passed arguments +#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__) + +/** append n to all arguments and keep n as first argument + * + * @param n number which is appended to the arguments (except the first argument n) + * + * @code{.cpp} + * CN_ENUM_2(1, foo, bar) + * // is transformed to + * 1, foo1, bar1 + * @endcode + */ +#define CN_ENUM_0(n, ...) 
n +#define CN_ENUM_1(n, x1) n, x1 ## n +#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n +#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n +#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n +#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n +#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n +#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n +#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n +#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n +#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n +#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n +#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n + +/** repeat a macro call multiple times + * + * @param n number of arguments following f + * @param f name of the macro which should be executed + * @param ... n parameters whose names get a unique number appended + * + * @code{.cpp} + * REPEAT_2(2, f, foo, bar) + * // is transformed to + * f(0, foo0, bar0); f(1, foo1, bar1) + * @endcode + */ +#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)) +#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)) +#define REPEAT_3(n, f, ...) 
CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)) +#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)) +#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__)) + +template< size_t N> +struct Cryptonight_hash; + +template< > +struct Cryptonight_hash<1> +{ + static constexpr size_t N = 1; + + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - __m128i cx; - cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]); - - if (ALGO == cryptonight_bittube2) - { - cx = aes_round_bittube2(cx, _mm_set_epi64x(axh0, axl0)); - } - else - { - if(SOFT_AES) - cx = soft_aesenc(cx, _mm_set_epi64x(axh0, axl0)); - else - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0)); - } - - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - else - _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); - - idx0 = _mm_cvtsi128_si64(cx); - bx0 = cx; - - if(PREFETCH) - _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0); - - cx = _mm_load_si128((__m128i *)&l1[idx1 & MASK]); - - if (ALGO == cryptonight_bittube2) - { - cx = aes_round_bittube2(cx, _mm_set_epi64x(axh1, axl1)); - } - else - { - if(SOFT_AES) - cx = soft_aesenc(cx, _mm_set_epi64x(axh1, axl1)); - else - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh1, axl1)); - } - - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == 
cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) - cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx)); - else - _mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx)); - - idx1 = _mm_cvtsi128_si64(cx); - bx1 = cx; - - if(PREFETCH) - _mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0); - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*)&l0[idx0 & MASK])[0]; - ch = ((uint64_t*)&l0[idx0 & MASK])[1]; - - lo = _umul128(idx0, cl, &hi); - - axl0 += hi; - axh0 += lo; - ((uint64_t*)&l0[idx0 & MASK])[0] = axl0; - - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) - ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0 ^ ((uint64_t*)&l0[idx0 & MASK])[0]; - else - ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0; - } else - ((uint64_t*)&l0[idx0 & MASK])[1] = axh0; - - axh0 ^= ch; - axl0 ^= cl; - idx0 = axl0; + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) - { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); + CN_INIT_SINGLE; + REPEAT_1(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = d ^ q; - } - else if(ALGO == cryptonight_haven) + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) { - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = (~d) ^ q; + REPEAT_1(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, 
cx); + REPEAT_1(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } - if(PREFETCH) - _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0); - - cl = ((uint64_t*)&l1[idx1 & MASK])[0]; - ch = ((uint64_t*)&l1[idx1 & MASK])[1]; - - lo = _umul128(idx1, cl, &hi); - - axl1 += hi; - axh1 += lo; - ((uint64_t*)&l1[idx1 & MASK])[0] = axl1; + REPEAT_1(0, CN_FINALIZE); + } +}; - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) { - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) - ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1 ^ ((uint64_t*)&l1[idx1 & MASK])[0]; - else - ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1; - } else - ((uint64_t*)&l1[idx1 & MASK])[1] = axh1; +template< > +struct Cryptonight_hash<2> +{ + static constexpr size_t N = 2; - axh1 ^= ch; - axl1 ^= cl; - idx1 = axl1; + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) - { - int64_t n = ((int64_t*)&l1[idx1 & MASK])[0]; - int32_t d = ((int32_t*)&l1[idx1 & MASK])[2]; - int64_t q = n / (d | 0x5); + CN_INIT_SINGLE; + REPEAT_2(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = d ^ q; - } - else if(ALGO == cryptonight_haven) + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) { - int64_t n = ((int64_t*)&l1[idx1 & MASK])[0]; - int32_t d = ((int32_t*)&l1[idx1 & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = (~d) ^ q; + REPEAT_2(7, CN_STEP1, monero_const, l0, ax0, bx0, 
idx0, ptr0, cx); + REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_2(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } - if(PREFETCH) - _mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0); + REPEAT_2(0, CN_FINALIZE); } +}; - // Optim - 90% time boundary - cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); - cn_implode_scratchpad((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state); - - // Optim - 99% time boundary - - keccakf((uint64_t*)ctx[0]->hash_state, 24); - extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); - keccakf((uint64_t*)ctx[1]->hash_state, 24); - extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32); -} - -#define CN_STEP1(a, b, c, l, ptr, idx) \ - ptr = (__m128i *)&l[idx & MASK]; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ - c = _mm_load_si128(ptr); - -#define CN_STEP2(a, b, c, l, ptr, idx) \ - if (ALGO == cryptonight_bittube2) \ - { \ - c = aes_round_bittube2(c, a); \ - } \ - else \ - { \ - if(SOFT_AES) \ - c = soft_aesenc(c, a); \ - else \ - c = _mm_aesenc_si128(c, a); \ - } \ - b = _mm_xor_si128(b, c); \ - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - cryptonight_monero_tweak((uint64_t*)ptr, b); \ - else \ - _mm_store_si128(ptr, b);\ - -#define CN_STEP3(a, b, c, l, ptr, idx) \ - idx = _mm_cvtsi128_si64(c); \ - ptr = (__m128i *)&l[idx & MASK]; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr, _MM_HINT_T0); \ - b = _mm_load_si128(ptr); - -#define CN_STEP4(a, b, c, l, mc, ptr, idx) \ - lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \ - a = _mm_add_epi64(a, _mm_set_epi64x(lo, 
hi)); \ - if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - { \ - _mm_store_si128(ptr, _mm_xor_si128(a, mc)); \ - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ - ((uint64_t*)ptr)[1] ^= ((uint64_t*)ptr)[0];\ - } \ - else \ - _mm_store_si128(ptr, a);\ - a = _mm_xor_si128(a, b); \ - idx = _mm_cvtsi128_si64(a); \ - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ - { \ - int64_t n = ((int64_t*)&l[idx & MASK])[0]; \ - int32_t d = ((int32_t*)&l[idx & MASK])[2]; \ - int64_t q = n / (d | 0x5); \ - ((int64_t*)&l[idx & MASK])[0] = n ^ q; \ - idx = d ^ q; \ - } \ - else if(ALGO == cryptonight_haven) \ - { \ - int64_t n = ((int64_t*)&l[idx & MASK])[0]; \ - int32_t d = ((int32_t*)&l[idx & MASK])[2]; \ - int64_t q = n / (d | 0x5); \ - ((int64_t*)&l[idx & MASK])[0] = n ^ q; \ - idx = (~d) ^ q; \ - } - -#define CONST_INIT(ctx, n) \ - __m128i mc##n = _mm_set_epi64x(*reinterpret_cast(reinterpret_cast(input) + n * len + 35) ^ \ - *(reinterpret_cast((ctx)->hash_state) + 24), 0); - -// This lovelier creation will do 3 cn hashes at a time. 
-template -void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +template< > +struct Cryptonight_hash<3> { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 3; - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - memset(output, 0, 32 * 3); - return; - } + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - for (size_t i = 0; i < 3; i++) - { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); - } + CN_INIT_SINGLE; + REPEAT_3(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - CONST_INIT(ctx[0], 0); - CONST_INIT(ctx[1], 1); - CONST_INIT(ctx[2], 2); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; - uint8_t* l2 = ctx[2]->long_state; - uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; - - __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i cx0 = _mm_set_epi64x(0, 0); - __m128i cx1 = _mm_set_epi64x(0, 0); - __m128i cx2 = _mm_set_epi64x(0, 0); - - uint64_t idx0, idx1, idx2; - idx0 = _mm_cvtsi128_si64(ax0); - idx1 = _mm_cvtsi128_si64(ax1); - 
idx2 = _mm_cvtsi128_si64(ax2); - - for (size_t i = 0; i < ITERATIONS/2; i++) - { - uint64_t hi, lo; - __m128i *ptr0, *ptr1, *ptr2; - - // EVEN ROUND - CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); - - CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); - - CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); - - CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); - - // ODD ROUND - CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); - - CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); - - CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); - - CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); - } + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) + { + REPEAT_3(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_3(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); + } - for (size_t i = 0; i < 3; i++) - { - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); - keccakf((uint64_t*)ctx[i]->hash_state, 24); - extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + REPEAT_3(0, CN_FINALIZE); } -} +}; 
-// This even lovelier creation will do 4 cn hashes at a time. -template -void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +template< > +struct Cryptonight_hash<4> { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 4; - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - memset(output, 0, 32 * 4); - return; - } + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - for (size_t i = 0; i < 4; i++) - { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); - } + CN_INIT_SINGLE; + REPEAT_4(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - CONST_INIT(ctx[0], 0); - CONST_INIT(ctx[1], 1); - CONST_INIT(ctx[2], 2); - CONST_INIT(ctx[3], 3); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; - uint8_t* l2 = ctx[2]->long_state; - uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; - uint8_t* l3 = ctx[3]->long_state; - uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; - - __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i ax3 = _mm_set_epi64x(h3[1] ^ 
h3[5], h3[0] ^ h3[4]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - __m128i cx0 = _mm_set_epi64x(0, 0); - __m128i cx1 = _mm_set_epi64x(0, 0); - __m128i cx2 = _mm_set_epi64x(0, 0); - __m128i cx3 = _mm_set_epi64x(0, 0); - - uint64_t idx0, idx1, idx2, idx3; - idx0 = _mm_cvtsi128_si64(ax0); - idx1 = _mm_cvtsi128_si64(ax1); - idx2 = _mm_cvtsi128_si64(ax2); - idx3 = _mm_cvtsi128_si64(ax3); - - for (size_t i = 0; i < ITERATIONS/2; i++) - { - uint64_t hi, lo; - __m128i *ptr0, *ptr1, *ptr2, *ptr3; - - // EVEN ROUND - CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); - - CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); - - CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); - - CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3); - - // ODD ROUND - CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); - - CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); - - CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); - - CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3); - } + // Optim - 90% 
time boundary + for(size_t i = 0; i < ITERATIONS; i++) + { + REPEAT_4(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_4(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); + } - for (size_t i = 0; i < 4; i++) - { - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); - keccakf((uint64_t*)ctx[i]->hash_state, 24); - extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + REPEAT_4(0, CN_FINALIZE); } -} +}; -// This most lovely creation will do 5 cn hashes at a time. -template -void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +template< > +struct Cryptonight_hash<5> { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + static constexpr size_t N = 5; - if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) { - memset(output, 0, 32 * 5); - return; - } + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); - for (size_t i = 0; i < 5; i++) - { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); - } + CN_INIT_SINGLE; + REPEAT_5(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); - CONST_INIT(ctx[0], 0); - CONST_INIT(ctx[1], 1); - CONST_INIT(ctx[2], 2); - CONST_INIT(ctx[3], 3); - 
CONST_INIT(ctx[4], 4); - - uint8_t* l0 = ctx[0]->long_state; - uint64_t* h0 = (uint64_t*)ctx[0]->hash_state; - uint8_t* l1 = ctx[1]->long_state; - uint64_t* h1 = (uint64_t*)ctx[1]->hash_state; - uint8_t* l2 = ctx[2]->long_state; - uint64_t* h2 = (uint64_t*)ctx[2]->hash_state; - uint8_t* l3 = ctx[3]->long_state; - uint64_t* h3 = (uint64_t*)ctx[3]->hash_state; - uint8_t* l4 = ctx[4]->long_state; - uint64_t* h4 = (uint64_t*)ctx[4]->hash_state; - - __m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - __m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]); - __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); - __m128i cx0 = _mm_set_epi64x(0, 0); - __m128i cx1 = _mm_set_epi64x(0, 0); - __m128i cx2 = _mm_set_epi64x(0, 0); - __m128i cx3 = _mm_set_epi64x(0, 0); - __m128i cx4 = _mm_set_epi64x(0, 0); - - uint64_t idx0, idx1, idx2, idx3, idx4; - idx0 = _mm_cvtsi128_si64(ax0); - idx1 = _mm_cvtsi128_si64(ax1); - idx2 = _mm_cvtsi128_si64(ax2); - idx3 = _mm_cvtsi128_si64(ax3); - idx4 = _mm_cvtsi128_si64(ax4); - - for (size_t i = 0; i < ITERATIONS/2; i++) - { - uint64_t hi, lo; - __m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4; - - // EVEN ROUND - CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3); - CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4); - - CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3); - 
CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4); - - CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0); - CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1); - CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2); - CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3); - CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4); - - CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3); - CN_STEP4(ax4, bx4, cx4, l4, mc4, ptr4, idx4); - - // ODD ROUND - CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3); - CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4); - - CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3); - CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4); - - CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0); - CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1); - CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2); - CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3); - CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4); - - CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0); - CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1); - CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2); - CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3); - CN_STEP4(ax4, cx4, bx4, l4, mc4, ptr4, idx4); - } + // Optim - 90% time boundary + for(size_t i = 0; i < ITERATIONS; i++) + { + REPEAT_5(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_5(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); + } - for (size_t i = 0; i < 5; i++) - { - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); - 
keccakf((uint64_t*)ctx[i]->hash_state, 24); - extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); + REPEAT_5(0, CN_FINALIZE); } -} +}; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index a8452ebb1..93ce218a3 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -234,7 +234,7 @@ bool minethd::self_test() unsigned char out[32 * MAX_N]; cn_hash_fun hashf; - cn_hash_fun_multi hashf_multi; + cn_hash_fun hashf_multi; xmrstak_algo algo = xmrstak_algo::invalid_algo; @@ -248,37 +248,37 @@ bool minethd::self_test() if(algo == cryptonight) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx[0]); + hashf("This is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx[0]); + hashf("This is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - 
hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; - hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" 
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; - hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); + hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" @@ -289,7 +289,7 @@ bool minethd::self_test() else if(algo == cryptonight_lite) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite); @@ -298,71 +298,71 @@ bool minethd::self_test() else if(algo == cryptonight_monero) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; hashf = 
func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; } else if(algo == cryptonight_aeon) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; } else if(algo == cryptonight_ipbc) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && 
memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; } else if(algo == cryptonight_stellite) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; } else if(algo == cryptonight_masari) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; } else if(algo == cryptonight_heavy) { hashf = 
func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; } else if(algo == cryptonight_haven) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven); - hashf("This is a test This is a test This is a test", 44, out, ctx[0]); + hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; } else if(algo == cryptonight_bittube2) @@ -372,13 +372,13 @@ bool minethd::self_test() hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2); - 
hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]); + hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx); bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; - hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]); + hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx); bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; - hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]); + 
hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx); bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; } @@ -438,8 +438,10 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work return pvThreads; } -minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +template +minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) { + static_assert(N >= 1, "number of threads must be >= 1" ); // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary @@ -483,46 +485,55 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr } static const cn_hash_fun func_table[] = { - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash, - cryptonight_hash + Cryptonight_hash::template hash, + Cryptonight_hash::template 
hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; @@ -532,333 +543,14 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr return func_table[ algv << 2 | digit.to_ulong() ]; } -void minethd::work_main() +minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) { - if(affinity >= 0) //-1 means no affinity - bindMemoryToNUMANode(affinity); - - order_fix.set_value(); - std::unique_lock lck(thd_aff_set); - lck.release(); - std::this_thread::yield(); - - cryptonight_ctx* ctx; - uint64_t iCount = 0; - uint64_t* piHashVal; - uint32_t* piNonce; - job_result result; - - // start with root algorithm and 
switch later if fork version is reached - auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); - ctx = minethd_alloc_ctx(); - - piHashVal = (uint64_t*)(result.bResult + 24); - piNonce = (uint32_t*)(oWork.bWorkBlob + 39); - result.iThreadId = iThreadNo; - - uint8_t version = 0; - size_t lastPoolId = 0; - - while (bQuit == 0) - { - if (oWork.bStall) - { - /* We are stalled here because the executor didn't find a job for us yet, - * either because of network latency, or a socket problem. Since we are - * raison d'etre of this software it us sensible to just wait until we have something - */ - - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - - globalStates::inst().consume_work(oWork, iJobNo); - continue; - } - - size_t nonce_ctr = 0; - constexpr size_t nonce_chunk = 4096; // Needs to be a power of 2 - - assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); - memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID)); - - if(oWork.bNiceHash) - result.iNonce = *piNonce; - - uint8_t new_version = oWork.getVersion(); - if(new_version != version || oWork.iPoolId != lastPoolId) - { - coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId); - if(new_version >= coinDesc.GetMiningForkVersion()) - { - miner_algo = coinDesc.GetMiningAlgo(); - hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); - } - else - { - miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); - } - result.algorithm = miner_algo; - lastPoolId = oWork.iPoolId; - version = new_version; - } - - while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) - { - if ((iCount++ & 
0xF) == 0) //Store stats every 16 hashes - { - uint64_t iStamp = get_timestamp_ms(); - iHashCount.store(iCount, std::memory_order_relaxed); - iTimestamp.store(iStamp, std::memory_order_relaxed); - } - - if((nonce_ctr++ & (nonce_chunk-1)) == 0) - { - globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk); - // check if the job is still valid, there is a small posibility that the job is switched - if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo) - break; - } - - *piNonce = result.iNonce; - - hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx); - - if (*piHashVal < oWork.iTarget) - executor::inst()->push_event(ex_event(result, oWork.iPoolId)); - result.iNonce++; - - std::this_thread::yield(); - } - - globalStates::inst().consume_work(oWork, iJobNo); - } - - cryptonight_free_ctx(ctx); + return func_multi_selector<1>(bHaveAes, bNoPrefetch, algo); } -minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +void minethd::work_main() { - // We have two independent flag bits in the functions - // therefore we will build a binary digit and select the - // function as a two digit binary - - uint8_t algv; - switch(algo) - { - case cryptonight: - algv = 2; - break; - case cryptonight_lite: - algv = 1; - break; - case cryptonight_monero: - algv = 0; - break; - case cryptonight_heavy: - algv = 3; - break; - case cryptonight_aeon: - algv = 4; - break; - case cryptonight_ipbc: - algv = 5; - break; - case cryptonight_stellite: - algv = 6; - break; - case cryptonight_masari: - algv = 7; - break; - case cryptonight_haven: - algv = 8; - break; - case cryptonight_bittube2: - algv = 9; - break; - default: - algv = 2; - break; - } - - static const cn_hash_fun_multi func_table[] = { - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - 
cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, 
- cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash, - - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - 
cryptonight_penta_hash, - cryptonight_penta_hash, - cryptonight_penta_hash - }; - - std::bitset<2> digit; - digit.set(0, !bHaveAes); - digit.set(1, !bNoPrefetch); - - return func_table[algv << 4 | (N-2) << 2 | digit.to_ulong()]; + multiway_work_main<1u>(); } void minethd::double_work_main() @@ -926,7 +618,7 @@ void minethd::multiway_work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun_multi hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); uint8_t version = 0; size_t lastPoolId = 0; @@ -961,12 +653,12 @@ void minethd::multiway_work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); } lastPoolId = oWork.iPoolId; version = new_version; diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 2d40ce314..26478542c 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -22,7 +22,7 @@ class minethd : public iBackend static std::vector thread_starter(uint32_t threadOffset, miner_work& pWork); static bool self_test(); - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); static cn_hash_fun func_selector(bool 
bHaveAes, bool bNoPrefetch, xmrstak_algo algo); static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id); @@ -30,8 +30,9 @@ class minethd : public iBackend static cryptonight_ctx* minethd_alloc_ctx(); private: - typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**); - static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); + + template + static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity); diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 88a1acc32..486a990e3 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -300,7 +300,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = foundNonce[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp index d4ae03864..389356842 100644 --- a/xmrstak/backend/nvidia/minethd.hpp +++ b/xmrstak/backend/nvidia/minethd.hpp @@ -28,7 +28,7 @@ class minethd : public iBackend static bool self_test(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); void start_mining();