From d3cdd5f18c21e05ac18022704d45515dcccb78a8 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Thu, 10 May 2018 19:15:42 +0200 Subject: [PATCH] Unify memory areas m_memPoints1, m_memPoints2 and m_memInverse into a single slightly larger memory area. Reduces on-device memory usage from ~2.47 GiB to ~1 GiB. Speeds seem unaffected. --- Dispatcher.cpp | 58 ++++++++++++++++++++++++++------------------- Dispatcher.hpp | 9 ++++--- constants.hpp | 29 ++++++++++++----------- profanity.cl | 64 ++++++++++++++++++++++++++++++-------------------- 4 files changed, 93 insertions(+), 67 deletions(-) diff --git a/Dispatcher.cpp b/Dispatcher.cpp index e8ef08b..d639445 100755 --- a/Dispatcher.cpp +++ b/Dispatcher.cpp @@ -78,7 +78,7 @@ cl_command_queue Dispatcher::Device::createQueue(cl_context & clContext, cl_devi cl_kernel Dispatcher::Device::createKernel(cl_program & clProgram, const std::string s) { cl_kernel ret = clCreateKernel(clProgram, s.c_str(), NULL); - return ret == NULL ? throw std::runtime_error("failed to create kernel") : ret; + return ret == NULL ? throw std::runtime_error("failed to create kernel \"" + s + "\"") : ret; } Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index) : @@ -92,13 +92,14 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr m_kernelInversePre(createKernel(clProgram, "profanity_inverse_pre")), m_kernelInverse(createKernel(clProgram, "profanity_inverse_multiple")), m_kernelInversePost(createKernel(clProgram, "profanity_inverse_post")), + m_kernelPass(createKernel(clProgram, "profanity_pass")), m_kernelEnd(createKernel(clProgram, "profanity_end")), m_memPrecomp(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, sizeof(g_precomp), g_precomp), - m_memPoints1(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, PROFANITY_SIZE, true), - m_memPoints2(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, PROFANITY_SIZE, true), - m_memInverse(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, PROFANITY_SIZE, true), + m_memPoints(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, PROFANITY_MEM_SIZE, true), m_memPass(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, 1, true), m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, 40), + m_memPointOffset(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, 1, true), + m_memPointNextOffset(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, 1, true), m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20), m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20), m_speed(PROFANITY_SPEEDSAMPLES) @@ -158,37 +159,45 @@ void Dispatcher::init(Device & d) { // Kernel arguments - profanity_begin d.m_memPrecomp.setKernelArg(d.m_kernelBegin, 0); - d.m_memPoints1.setKernelArg(d.m_kernelBegin, 1); - d.m_memPass.setKernelArg(d.m_kernelBegin, 2); - d.m_memResult.setKernelArg(d.m_kernelBegin, 3); + d.m_memPoints.setKernelArg(d.m_kernelBegin, 1); + d.m_memPointOffset.setKernelArg(d.m_kernelBegin, 2); + d.m_memPointNextOffset.setKernelArg(d.m_kernelBegin, 3); + d.m_memPass.setKernelArg(d.m_kernelBegin, 4); + d.m_memResult.setKernelArg(d.m_kernelBegin, 5); /* seed set in dispatch() */ // Kernel arguments - profanity_inverse_pre d.m_memPrecomp.setKernelArg(d.m_kernelInversePre, 0); - d.m_memPoints1.setKernelArg(d.m_kernelInversePre, 1); - d.m_memPoints2.setKernelArg(d.m_kernelInversePre, 2); - d.m_memInverse.setKernelArg(d.m_kernelInversePre, 3); + d.m_memPoints.setKernelArg(d.m_kernelInversePre, 1); + d.m_memPointOffset.setKernelArg(d.m_kernelInversePre, 2); + d.m_memPointNextOffset.setKernelArg(d.m_kernelInversePre, 3); d.m_memPass.setKernelArg(d.m_kernelInversePre, 4); // Kernel arguments - profanity_inverse - d.m_memInverse.setKernelArg(d.m_kernelInverse, 0); - d.m_memPass.setKernelArg(d.m_kernelInverse, 1); + d.m_memPoints.setKernelArg(d.m_kernelInverse, 0); + d.m_memPointNextOffset.setKernelArg(d.m_kernelInverse, 1); // Kernel arguments - profanity_inverse_post d.m_memPrecomp.setKernelArg(d.m_kernelInversePost, 0); - d.m_memPoints1.setKernelArg(d.m_kernelInversePost, 1); - d.m_memPoints2.setKernelArg(d.m_kernelInversePost, 2); - d.m_memInverse.setKernelArg(d.m_kernelInversePost, 3); + d.m_memPoints.setKernelArg(d.m_kernelInversePost, 1); + d.m_memPointOffset.setKernelArg(d.m_kernelInversePost, 2); + d.m_memPointNextOffset.setKernelArg(d.m_kernelInversePost, 3); d.m_memPass.setKernelArg(d.m_kernelInversePost, 4); - // Kernel arguments - profanity_end - d.m_memPoints1.setKernelArg(d.m_kernelEnd, 0); - d.m_memResult.setKernelArg(d.m_kernelEnd, 1); - d.m_memData1.setKernelArg(d.m_kernelEnd, 2); - d.m_memData2.setKernelArg(d.m_kernelEnd, 3); + // Kernel arguments - profanity_pass + d.m_memPass.setKernelArg(d.m_kernelPass, 0); + d.m_memPointOffset.setKernelArg(d.m_kernelPass, 1); + d.m_memPointNextOffset.setKernelArg(d.m_kernelPass, 2); - CLMemory::setKernelArg(d.m_kernelEnd, 4, d.m_clScoreMax); - CLMemory::setKernelArg(d.m_kernelEnd, 5, m_mode.mode); + // Kernel arguments - profanity_end + d.m_memPoints.setKernelArg(d.m_kernelEnd, 0); + d.m_memPointOffset.setKernelArg(d.m_kernelEnd, 1); + d.m_memResult.setKernelArg(d.m_kernelEnd, 2); + d.m_memData1.setKernelArg(d.m_kernelEnd, 3); + d.m_memData2.setKernelArg(d.m_kernelEnd, 4); + + CLMemory::setKernelArg(d.m_kernelEnd, 5, d.m_clScoreMax); + CLMemory::setKernelArg(d.m_kernelEnd, 6, m_mode.mode); } void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal) { @@ -225,7 +234,7 @@ void Dispatcher::enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t wo void Dispatcher::dispatch(Device & d) { // Write new seed randomizeSeed(d); - CLMemory::setKernelArg(d.m_kernelBegin, 4, d.m_clSeed); + CLMemory::setKernelArg(d.m_kernelBegin, 6, d.m_clSeed); enqueueKernelDevice(d, d.m_kernelBegin, 1); @@ -233,6 +242,7 @@ void Dispatcher::dispatch(Device & d) { enqueueKernelDevice(d, d.m_kernelInversePre, g_worksizes[i]); enqueueKernelDevice(d, d.m_kernelInverse, g_worksizes[i] / 255); enqueueKernelDevice(d, d.m_kernelInversePost, g_worksizes[i]); + enqueueKernelDevice(d, d.m_kernelPass, 1); } enqueueKernelDevice(d, d.m_kernelEnd, g_worksizes[PROFANITY_PASSES]); @@ -251,7 +261,7 @@ void Dispatcher::handleResult(Device & d) { if (r.found > 0 && r.foundScore >= d.m_clScoreMax) { d.m_clScoreMax = r.foundScore; - CLMemory::setKernelArg(d.m_kernelEnd, 4, d.m_clScoreMax); + CLMemory::setKernelArg(d.m_kernelEnd, 5, d.m_clScoreMax); std::lock_guard lock(m_mutex); if (r.foundScore >= m_clScoreMax) { diff --git a/Dispatcher.hpp b/Dispatcher.hpp index 15e4ebd..352ac6e 100755 --- a/Dispatcher.hpp +++ b/Dispatcher.hpp @@ -45,16 +45,19 @@ class Dispatcher { cl_kernel m_kernelInversePre; cl_kernel m_kernelInverse; cl_kernel m_kernelInversePost; + cl_kernel m_kernelPass; cl_kernel m_kernelEnd; CLMemory m_memPrecomp; - CLMemory m_memPoints1; - CLMemory m_memPoints2; - CLMemory m_memInverse; + CLMemory m_memPoints; CLMemory m_memPass; CLMemory m_memResult; + // Offsets into points array for current and next pass + CLMemory m_memPointOffset; + CLMemory m_memPointNextOffset; + // Data parameters used in some modes CLMemory m_memData1; CLMemory m_memData2; diff --git a/constants.hpp b/constants.hpp index 305b577..cae3574 100755 --- a/constants.hpp +++ b/constants.hpp @@ -1,15 +1,16 @@ -#ifndef HPP_CONSTANTS -#define HPP_CONSTANTS - -const size_t g_worksizes[] = { - 1, - 255, - 255 * 255, - 255 * 255 * 255 -}; - -#define PROFANITY_PASSES 3 -#define PROFANITY_SIZE (g_worksizes[PROFANITY_PASSES]) -#define PROFANITY_DEBUG true - +#ifndef HPP_CONSTANTS +#define HPP_CONSTANTS + +const size_t g_worksizes[] = { + 1, + 255, + 255 * 255, + 255 * 255 * 255 +}; + +#define PROFANITY_PASSES 3 +#define PROFANITY_SIZE (255 * 255 * 255) +#define PROFANITY_MEM_SIZE (1 + 255 + 255 * 255 + 255 * 255 * 255) +#define PROFANITY_DEBUG true + #endif /* HPP_CONSTANTS */ \ No newline at end of file diff --git a/profanity.cl b/profanity.cl index 1ba6019..3d4cc87 100755 --- a/profanity.cl +++ b/profanity.cl @@ -315,10 +315,13 @@ void profanity_begin_seed(__global const point * const precomp, point * const p, } } -__kernel void profanity_begin(__global const point * const precomp, __global point * const pPoints1, __global uchar * const pPass, __global result * const pResult, const ulong4 seed) { +__kernel void profanity_begin(__global const point * const precomp, __global point * const pPoints, __global uint * const pPointOffset, __global uint * const pPointNextOffset, __global uchar * const pPass, __global result * const pResult, const ulong4 seed) { const size_t id = get_global_id(0); if( id == 0 ) { + *pPointOffset = 0; + *pPointNextOffset = 1; + point p; point o; bool bIsFirst = true; @@ -328,7 +331,7 @@ __kernel void profanity_begin(__global const point * const precomp, __global poi profanity_begin_seed(precomp, &p, &bIsFirst, 8, 8 * 255 * 2, seed.z); profanity_begin_seed(precomp, &p, &bIsFirst, 8 - PROFANITY_PASSES, 8 * 255 * 3, seed.w); - pPoints1[0] = p; + pPoints[*pPointOffset] = p; *pPass = 8 - PROFANITY_PASSES; for( uchar i = 0; i < 40; ++i ) { pResult[i].found = 0; @@ -336,29 +339,31 @@ __kernel void profanity_begin(__global const point * const precomp, __global poi } } -__kernel void profanity_inverse_pre(__global const point * const precomp, __global const point * const pPoints1, __global point * const pPoints2, __global mp_number * const pInverse, __global uchar * pPass ) { +__kernel void profanity_inverse_pre(__global const point * const precomp, __global point * const pPoints, __global const uint * const pPointOffset, __global const uint * const pPointNextOffset, __global uchar * pPass ) { const size_t id = get_global_id(0); - point s = pPoints1[id / 255]; + point s = pPoints[*pPointOffset + id / 255]; point o = precomp[8 * 255 * 3 + (*pPass) * 255 + id % 255]; + mp_number deltaX; - mp_mod_sub( &deltaX, &o.x, &s.x); - pInverse[id] = deltaX; - pPoints2[id / 255] = s; // Multiple overwrites + + // Temporarily save number to invert in X-coordinate of next point. Saves quite some memory. + pPoints[*pPointNextOffset + id].x = deltaX; } -__kernel void profanity_inverse_multiple(__global mp_number * const pInverse, __global uchar * pPass ) { +__kernel void profanity_inverse_multiple(__global point * const pPoints, __global const uint * const pPointNextOffset) { const size_t id = get_global_id(0) * 255; mp_number inv; mp_number copy; // Optimize this later mp_number buffer[255]; mp_number mont_rrr = { { 0x3795f671, 0x002bb1e3, 0x00000b73, 0x1, 0, 0, 0, 0 } }; + __global point * const pInverse = pPoints + *pPointNextOffset; - buffer[0] = pInverse[id]; + buffer[0] = pInverse[id].x; for( uchar i = 1; i < 255; ++i ) { - copy = pInverse[id + i]; + copy = pInverse[id + i].x; mp_mul_mont( &buffer[i], &buffer[i-1], © ); } @@ -371,29 +376,24 @@ __kernel void profanity_inverse_multiple(__global mp_number * const pInverse, __ mp_mul_mont(&inv, &inv, &mont_rrr); for( uchar i = 255 - 1; i > 0; --i ) { - copy = pInverse[id+i]; + copy = pInverse[id+i].x; mp_mul_mont( ©, ©, &inv); mp_mul_mont( &buffer[i], &buffer[i-1], &inv); - pInverse[id+i] = buffer[i]; + pInverse[id+i].x = buffer[i]; inv = copy; } - pInverse[id] = inv; - - // We increase the pass counter here where it's not used. (*pPass - 1) used in profanity_inverse_post - if( id == 0 ) { - *pPass += 1; - } + pInverse[id].x = inv; } -__kernel void profanity_inverse_post(__global const point * const precomp, __global point * const pPoints1, __global point * const pPoints2, __global const mp_number * const pInverse, __global uchar * pPass ) { +__kernel void profanity_inverse_post(__global const point * const precomp, __global point * const pPoints, __global const uint * const pPointOffset, __global const uint * const pPointNextOffset, __global uchar * pPass ) { const size_t id = get_global_id(0); - point s = pPoints2[id / 255]; - point o = precomp[8 * 255 * 3 + (*pPass - 1) * 255 + id % 255]; + point s = pPoints[*pPointOffset + id / 255]; + point o = precomp[8 * 255 * 3 + *pPass * 255 + id % 255]; - mp_number tmp = pInverse[id]; + mp_number tmp = pPoints[*pPointNextOffset + id].x; // Inverse was saved to X-coordinate of next point (intermediary storage) mp_number newX; mp_number newY; @@ -409,12 +409,24 @@ __kernel void profanity_inverse_post(__global const point * const precomp, __glo mp_mul_mont( &newY, &newY, &tmp ); mp_mod_sub( &newY, &newY, &s.y ); - pPoints1[id].x = newX; - pPoints1[id].y = newY; + pPoints[*pPointNextOffset + id].x = newX; + pPoints[*pPointNextOffset + id].y = newY; +} + +__kernel void profanity_pass(__global uchar * const pPass, __global uint * const pPointOffset, __global uint * const pPointNextOffset) { + ++*pPass; + *pPointOffset = *pPointNextOffset; + uint newPower = 1; + for( uint i = 8 - PROFANITY_PASSES; i < *pPass; ++i ) { + newPower *= 255; + } + + *pPointNextOffset += newPower; } __kernel void profanity_end( - __global point * const pPoints1, + __global point * const pPoints, + __global const uint * const pPointOffset, __global result * const pResult, __constant const uchar * const data1, __constant const uchar * const data2, @@ -423,7 +435,7 @@ __kernel void profanity_end( { const size_t id = get_global_id(0); ethhash h = { { 0 } }; // This doesn't work for some reason, we zero-initialize below. - point self = pPoints1[id]; + point self = pPoints[*pPointOffset + id]; uchar i; // De-montgomerize by multiplying with one.