Skip to content
This repository has been archived by the owner on Sep 15, 2022. It is now read-only.

Commit

Permalink
Unify memory areas m_memPoints1, m_memPoints2 and m_memInverse into a…
Browse files Browse the repository at this point in the history
… single slightly larger memory area. Reduces on-device memory usage from ~2.47 GiB to ~1 GiB. Speeds seem unaffected.
  • Loading branch information
Johan Gustafsson committed May 10, 2018
1 parent e7c7b56 commit d3cdd5f
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 67 deletions.
58 changes: 34 additions & 24 deletions Dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ cl_command_queue Dispatcher::Device::createQueue(cl_context & clContext, cl_devi

cl_kernel Dispatcher::Device::createKernel(cl_program & clProgram, const std::string s) {
cl_kernel ret = clCreateKernel(clProgram, s.c_str(), NULL);
return ret == NULL ? throw std::runtime_error("failed to create kernel") : ret;
return ret == NULL ? throw std::runtime_error("failed to create kernel \"" + s + "\"") : ret;
}

Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_program & clProgram, cl_device_id clDeviceId, const size_t worksizeLocal, const size_t index) :
Expand All @@ -92,13 +92,14 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr
m_kernelInversePre(createKernel(clProgram, "profanity_inverse_pre")),
m_kernelInverse(createKernel(clProgram, "profanity_inverse_multiple")),
m_kernelInversePost(createKernel(clProgram, "profanity_inverse_post")),
m_kernelPass(createKernel(clProgram, "profanity_pass")),
m_kernelEnd(createKernel(clProgram, "profanity_end")),
m_memPrecomp(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, sizeof(g_precomp), g_precomp),
m_memPoints1(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, PROFANITY_SIZE, true),
m_memPoints2(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, PROFANITY_SIZE, true),
m_memInverse(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, PROFANITY_SIZE, true),
m_memPoints(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, PROFANITY_MEM_SIZE, true),
m_memPass(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, 1, true),
m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, 40),
m_memPointOffset(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, 1, true),
m_memPointNextOffset(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, 1, true),
m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
m_speed(PROFANITY_SPEEDSAMPLES)
Expand Down Expand Up @@ -158,37 +159,45 @@ void Dispatcher::init(Device & d) {

// Kernel arguments - profanity_begin
d.m_memPrecomp.setKernelArg(d.m_kernelBegin, 0);
d.m_memPoints1.setKernelArg(d.m_kernelBegin, 1);
d.m_memPass.setKernelArg(d.m_kernelBegin, 2);
d.m_memResult.setKernelArg(d.m_kernelBegin, 3);
d.m_memPoints.setKernelArg(d.m_kernelBegin, 1);
d.m_memPointOffset.setKernelArg(d.m_kernelBegin, 2);
d.m_memPointNextOffset.setKernelArg(d.m_kernelBegin, 3);
d.m_memPass.setKernelArg(d.m_kernelBegin, 4);
d.m_memResult.setKernelArg(d.m_kernelBegin, 5);
/* seed set in dispatch() */

// Kernel arguments - profanity_inverse_pre
d.m_memPrecomp.setKernelArg(d.m_kernelInversePre, 0);
d.m_memPoints1.setKernelArg(d.m_kernelInversePre, 1);
d.m_memPoints2.setKernelArg(d.m_kernelInversePre, 2);
d.m_memInverse.setKernelArg(d.m_kernelInversePre, 3);
d.m_memPoints.setKernelArg(d.m_kernelInversePre, 1);
d.m_memPointOffset.setKernelArg(d.m_kernelInversePre, 2);
d.m_memPointNextOffset.setKernelArg(d.m_kernelInversePre, 3);
d.m_memPass.setKernelArg(d.m_kernelInversePre, 4);

// Kernel arguments - profanity_inverse
d.m_memInverse.setKernelArg(d.m_kernelInverse, 0);
d.m_memPass.setKernelArg(d.m_kernelInverse, 1);
d.m_memPoints.setKernelArg(d.m_kernelInverse, 0);
d.m_memPointNextOffset.setKernelArg(d.m_kernelInverse, 1);

// Kernel arguments - profanity_inverse_post
d.m_memPrecomp.setKernelArg(d.m_kernelInversePost, 0);
d.m_memPoints1.setKernelArg(d.m_kernelInversePost, 1);
d.m_memPoints2.setKernelArg(d.m_kernelInversePost, 2);
d.m_memInverse.setKernelArg(d.m_kernelInversePost, 3);
d.m_memPoints.setKernelArg(d.m_kernelInversePost, 1);
d.m_memPointOffset.setKernelArg(d.m_kernelInversePost, 2);
d.m_memPointNextOffset.setKernelArg(d.m_kernelInversePost, 3);
d.m_memPass.setKernelArg(d.m_kernelInversePost, 4);

// Kernel arguments - profanity_end
d.m_memPoints1.setKernelArg(d.m_kernelEnd, 0);
d.m_memResult.setKernelArg(d.m_kernelEnd, 1);
d.m_memData1.setKernelArg(d.m_kernelEnd, 2);
d.m_memData2.setKernelArg(d.m_kernelEnd, 3);
// Kernel arguments - profanity_pass
d.m_memPass.setKernelArg(d.m_kernelPass, 0);
d.m_memPointOffset.setKernelArg(d.m_kernelPass, 1);
d.m_memPointNextOffset.setKernelArg(d.m_kernelPass, 2);

CLMemory<cl_uchar>::setKernelArg(d.m_kernelEnd, 4, d.m_clScoreMax);
CLMemory<cl_uchar>::setKernelArg(d.m_kernelEnd, 5, m_mode.mode);
// Kernel arguments - profanity_end
d.m_memPoints.setKernelArg(d.m_kernelEnd, 0);
d.m_memPointOffset.setKernelArg(d.m_kernelEnd, 1);
d.m_memResult.setKernelArg(d.m_kernelEnd, 2);
d.m_memData1.setKernelArg(d.m_kernelEnd, 3);
d.m_memData2.setKernelArg(d.m_kernelEnd, 4);

CLMemory<cl_uchar>::setKernelArg(d.m_kernelEnd, 5, d.m_clScoreMax);
CLMemory<cl_uchar>::setKernelArg(d.m_kernelEnd, 6, m_mode.mode);
}

void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal) {
Expand Down Expand Up @@ -225,14 +234,15 @@ void Dispatcher::enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t wo
void Dispatcher::dispatch(Device & d) {
// Write new seed
randomizeSeed(d);
CLMemory<cl_ulong4>::setKernelArg(d.m_kernelBegin, 4, d.m_clSeed);
CLMemory<cl_ulong4>::setKernelArg(d.m_kernelBegin, 6, d.m_clSeed);

enqueueKernelDevice(d, d.m_kernelBegin, 1);

for (auto i = 1; i < PROFANITY_PASSES + 1; ++i) {
enqueueKernelDevice(d, d.m_kernelInversePre, g_worksizes[i]);
enqueueKernelDevice(d, d.m_kernelInverse, g_worksizes[i] / 255);
enqueueKernelDevice(d, d.m_kernelInversePost, g_worksizes[i]);
enqueueKernelDevice(d, d.m_kernelPass, 1);
}

enqueueKernelDevice(d, d.m_kernelEnd, g_worksizes[PROFANITY_PASSES]);
Expand All @@ -251,7 +261,7 @@ void Dispatcher::handleResult(Device & d) {

if (r.found > 0 && r.foundScore >= d.m_clScoreMax) {
d.m_clScoreMax = r.foundScore;
CLMemory<cl_uchar>::setKernelArg(d.m_kernelEnd, 4, d.m_clScoreMax);
CLMemory<cl_uchar>::setKernelArg(d.m_kernelEnd, 5, d.m_clScoreMax);

std::lock_guard<std::mutex> lock(m_mutex);
if (r.foundScore >= m_clScoreMax) {
Expand Down
9 changes: 6 additions & 3 deletions Dispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,19 @@ class Dispatcher {
cl_kernel m_kernelInversePre;
cl_kernel m_kernelInverse;
cl_kernel m_kernelInversePost;
cl_kernel m_kernelPass;
cl_kernel m_kernelEnd;

CLMemory<point> m_memPrecomp;
CLMemory<point> m_memPoints1;
CLMemory<point> m_memPoints2;
CLMemory<bignum> m_memInverse;
CLMemory<point> m_memPoints;
CLMemory<cl_uchar> m_memPass;

CLMemory<result> m_memResult;

// Offsets into points array for current and next pass
CLMemory<cl_uint> m_memPointOffset;
CLMemory<cl_uint> m_memPointNextOffset;

// Data parameters used in some modes
CLMemory<cl_uchar> m_memData1;
CLMemory<cl_uchar> m_memData2;
Expand Down
29 changes: 15 additions & 14 deletions constants.hpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#ifndef HPP_CONSTANTS
#define HPP_CONSTANTS

const size_t g_worksizes[] = {
1,
255,
255 * 255,
255 * 255 * 255
};

#define PROFANITY_PASSES 3
#define PROFANITY_SIZE (g_worksizes[PROFANITY_PASSES])
#define PROFANITY_DEBUG true

#ifndef HPP_CONSTANTS
#define HPP_CONSTANTS

const size_t g_worksizes[] = {
1,
255,
255 * 255,
255 * 255 * 255
};

#define PROFANITY_PASSES 3
#define PROFANITY_SIZE (255 * 255 * 255)
#define PROFANITY_MEM_SIZE (1 + 255 + 255 * 255 + 255 * 255 * 255)
#define PROFANITY_DEBUG true

#endif /* HPP_CONSTANTS */
64 changes: 38 additions & 26 deletions profanity.cl
Original file line number Diff line number Diff line change
Expand Up @@ -315,10 +315,13 @@ void profanity_begin_seed(__global const point * const precomp, point * const p,
}
}

__kernel void profanity_begin(__global const point * const precomp, __global point * const pPoints1, __global uchar * const pPass, __global result * const pResult, const ulong4 seed) {
__kernel void profanity_begin(__global const point * const precomp, __global point * const pPoints, __global uint * const pPointOffset, __global uint * const pPointNextOffset, __global uchar * const pPass, __global result * const pResult, const ulong4 seed) {
const size_t id = get_global_id(0);

if( id == 0 ) {
*pPointOffset = 0;
*pPointNextOffset = 1;

point p;
point o;
bool bIsFirst = true;
Expand All @@ -328,37 +331,39 @@ __kernel void profanity_begin(__global const point * const precomp, __global poi
profanity_begin_seed(precomp, &p, &bIsFirst, 8, 8 * 255 * 2, seed.z);
profanity_begin_seed(precomp, &p, &bIsFirst, 8 - PROFANITY_PASSES, 8 * 255 * 3, seed.w);

pPoints1[0] = p;
pPoints[*pPointOffset] = p;
*pPass = 8 - PROFANITY_PASSES;
for( uchar i = 0; i < 40; ++i ) {
pResult[i].found = 0;
}
}
}

__kernel void profanity_inverse_pre(__global const point * const precomp, __global const point * const pPoints1, __global point * const pPoints2, __global mp_number * const pInverse, __global uchar * pPass ) {
__kernel void profanity_inverse_pre(__global const point * const precomp, __global point * const pPoints, __global const uint * const pPointOffset, __global const uint * const pPointNextOffset, __global uchar * pPass ) {
const size_t id = get_global_id(0);

point s = pPoints1[id / 255];
point s = pPoints[*pPointOffset + id / 255];
point o = precomp[8 * 255 * 3 + (*pPass) * 255 + id % 255];

mp_number deltaX;

mp_mod_sub( &deltaX, &o.x, &s.x);
pInverse[id] = deltaX;
pPoints2[id / 255] = s; // Multiple overwrites

// Temporarily save number to invert in X-coordinate of next point. Saves quite some memory.
pPoints[*pPointNextOffset + id].x = deltaX;
}

__kernel void profanity_inverse_multiple(__global mp_number * const pInverse, __global uchar * pPass ) {
__kernel void profanity_inverse_multiple(__global point * const pPoints, __global const uint * const pPointNextOffset) {
const size_t id = get_global_id(0) * 255;

mp_number inv;
mp_number copy; // Optimize this later
mp_number buffer[255];
mp_number mont_rrr = { { 0x3795f671, 0x002bb1e3, 0x00000b73, 0x1, 0, 0, 0, 0 } };
__global point * const pInverse = pPoints + *pPointNextOffset;

buffer[0] = pInverse[id];
buffer[0] = pInverse[id].x;
for( uchar i = 1; i < 255; ++i ) {
copy = pInverse[id + i];
copy = pInverse[id + i].x;
mp_mul_mont( &buffer[i], &buffer[i-1], &copy );
}

Expand All @@ -371,29 +376,24 @@ __kernel void profanity_inverse_multiple(__global mp_number * const pInverse, __
mp_mul_mont(&inv, &inv, &mont_rrr);

for( uchar i = 255 - 1; i > 0; --i ) {
copy = pInverse[id+i];
copy = pInverse[id+i].x;
mp_mul_mont( &copy, &copy, &inv);

mp_mul_mont( &buffer[i], &buffer[i-1], &inv);
pInverse[id+i] = buffer[i];
pInverse[id+i].x = buffer[i];
inv = copy;
}

pInverse[id] = inv;

// We increase the pass counter here where it's not used. (*pPass - 1) used in profanity_inverse_post
if( id == 0 ) {
*pPass += 1;
}
pInverse[id].x = inv;
}

__kernel void profanity_inverse_post(__global const point * const precomp, __global point * const pPoints1, __global point * const pPoints2, __global const mp_number * const pInverse, __global uchar * pPass ) {
__kernel void profanity_inverse_post(__global const point * const precomp, __global point * const pPoints, __global const uint * const pPointOffset, __global const uint * const pPointNextOffset, __global uchar * pPass ) {
const size_t id = get_global_id(0);

point s = pPoints2[id / 255];
point o = precomp[8 * 255 * 3 + (*pPass - 1) * 255 + id % 255];
point s = pPoints[*pPointOffset + id / 255];
point o = precomp[8 * 255 * 3 + *pPass * 255 + id % 255];

mp_number tmp = pInverse[id];
mp_number tmp = pPoints[*pPointNextOffset + id].x; // Inverse was saved to X-coordinate of next point (intermediary storage)
mp_number newX;
mp_number newY;

Expand All @@ -409,12 +409,24 @@ __kernel void profanity_inverse_post(__global const point * const precomp, __glo
mp_mul_mont( &newY, &newY, &tmp );
mp_mod_sub( &newY, &newY, &s.y );

pPoints1[id].x = newX;
pPoints1[id].y = newY;
pPoints[*pPointNextOffset + id].x = newX;
pPoints[*pPointNextOffset + id].y = newY;
}

__kernel void profanity_pass(__global uchar * const pPass, __global uint * const pPointOffset, __global uint * const pPointNextOffset) {
++*pPass;
*pPointOffset = *pPointNextOffset;
uint newPower = 1;
for( uint i = 8 - PROFANITY_PASSES; i < *pPass; ++i ) {
newPower *= 255;
}

*pPointNextOffset += newPower;
}

__kernel void profanity_end(
__global point * const pPoints1,
__global point * const pPoints,
__global const uint * const pPointOffset,
__global result * const pResult,
__constant const uchar * const data1,
__constant const uchar * const data2,
Expand All @@ -423,7 +435,7 @@ __kernel void profanity_end(
{
const size_t id = get_global_id(0);
ethhash h = { { 0 } }; // This doesn't work for some reason, we zero-initialize below.
point self = pPoints1[id];
point self = pPoints[*pPointOffset + id];
uchar i;

// De-montgomerize by multiplying with one.
Expand Down

0 comments on commit d3cdd5f

Please sign in to comment.