Skip to content

Commit

Permalink
Optimise CPU induced velocity implementation
Browse files Browse the repository at this point in the history
Benchmarks of ind_vel CPU based.

64      0.2773181       14770.042056396607
128     0.0025487       6.428375250127516e6
256     0.008490701     7.718561753617281e6
512     0.0332823       7.876378735844578e6
1023    0.075158499     1.3924293512035144e7
1024    0.0695779       1.5070532453552062e7
2048    0.1895569       2.212688643884765e7
4096    0.653736601     2.5663571497047022e7
8192    2.5616574       2.6197439204789836e7
16384   10.022485799    2.678332116237903e7
32768   39.904468701    2.6907809048791878e7
BASE REFERENCE
==============================================================================

Take 1/4pi outside summation of induced velocities.
64      0.279353999     14662.399731746815
128     0.001186701     1.3806342119876867e7
256     0.0045381       1.4441286000749214e7
512     0.0155286       1.6881367283592854e7
1023    0.068377601     1.5305143565946398e7
1024    0.0337222       3.109453119903209e7
2048    0.134259        3.1240393567656547e7
4096    0.533717299     3.143464907627062e7
8192    2.251759101     2.9802861225340284e7
16384   8.614089501     3.1162371364824757e7
32768   34.0320941      3.155085963399472e7
GOOD!
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

den = powf(..., -1).
64      0.2782128       14722.54331935842
128     0.001603299     1.0218929844027845e7
256     0.0066348       9.877614999698559e6
512     0.0240207       1.0913253985104514e7
1023    0.060004901     1.744072538341493e7
1024    0.0725094       1.4461242266519926e7
2048    0.131886499     3.180237576857659e7
4096    0.546977801     3.0672572030030154e7
8192    2.125348801     3.1575459034500383e7
16384   8.4960887       3.1595180497585908e7
32768   33.8701437      3.1701720356149536e7
Slight improvement.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

radd = bsv_V3f_abs(rad) (Probably already done by compiler)
64      0.2744757       14922.996826312858
128     0.0017165       9.545004369356249e6
256     0.0068147       9.61685767531953e6
512     0.018308999     1.4317768000315037e7
1023    0.058209899     1.797854004178911e7
1024    0.0683  1.535250366032211e7
2048    0.1734336       2.418391822576479e7
4096    0.532896199     3.148308438206744e7
8192    2.129666299     3.1511445728145972e7
16384   8.7540114       3.0664279920860056e7
32768   33.9050131      3.166911691887829e7
No change. Keep anyway -neater.

Take 1/regularisation_radius outside summation
64      0.293100099     13974.7479239166
128     0.001287        1.273038073038073e7
256     0.0051691       1.267841597183262e7
512     0.0138748       1.889353360048433e7
1023    0.057763599     1.8117447979652375e7
1024    0.056297101     1.8625754814621802e7
2048    0.1636168       2.5634922575187877e7
4096    0.498578799     3.365007905199756e7
8192    2.012370501     3.3348165244248927e7
16384   7.9936686       3.358100884992906e7
32768   31.9621156      3.359420375790143e7
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  • Loading branch information
hjabird committed May 30, 2019
1 parent 8a580bd commit c890dce
Showing 1 changed file with 30 additions and 13 deletions.
43 changes: 30 additions & 13 deletions src/Particle.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,27 +33,43 @@ SOFTWARE.
# include "ocl_particle.h"
#endif

CVTX_EXPORT bsv_V3f cvtx_Particle_ind_vel(
/* The induced velocity for a particle excluding the constant
coefficient 1 / 4pi */
inline bsv_V3f particle_ind_vel_inner(
const cvtx_Particle * self,
const bsv_V3f mes_point,
const cvtx_VortFunc * kernel,
float regularisation_radius)
float recip_reg_rad)
{
bsv_V3f rad, num, ret;
if(bsv_V3f_isequal(self->coord, mes_point)){
if (bsv_V3f_isequal(self->coord, mes_point)) {
ret = bsv_V3f_zero();
} else {
float cor, den, rho;
}
else {
float cor, den, rho, radd;
rad = bsv_V3f_minus(mes_point, self->coord);
rho = fabsf(bsv_V3f_abs(rad) / regularisation_radius);
cor = - kernel->g_fn(rho) / ((float)4. * (float)acos(-1));
den = powf(bsv_V3f_abs(rad), 3);
radd = bsv_V3f_abs(rad);
rho = radd * recip_reg_rad; /* Assume positive. */
cor = -kernel->g_fn(rho);
den = powf(radd, -3);
num = bsv_V3f_cross(rad, self->vorticity);
ret = bsv_V3f_mult(num, cor / den);
ret = bsv_V3f_mult(num, cor * den);
}
return ret;
}

CVTX_EXPORT bsv_V3f cvtx_Particle_ind_vel(
const cvtx_Particle * self,
const bsv_V3f mes_point,
const cvtx_VortFunc * kernel,
float regularisation_radius)
{
bsv_V3f ret;
ret = particle_ind_vel_inner(self, mes_point, kernel,
1.f/fabsf(regularisation_radius));
return bsv_V3f_mult(ret, 1.f / (4.f * acosf(-1.f)));
}

CVTX_EXPORT bsv_V3f cvtx_Particle_ind_dvort(
const cvtx_Particle * self,
const cvtx_Particle * induced_particle,
Expand Down Expand Up @@ -126,19 +142,20 @@ CVTX_EXPORT bsv_V3f cvtx_ParticleArr_ind_vel(
const cvtx_VortFunc *kernel,
float regularisation_radius)
{
bsv_V3f vel;
double rx = 0, ry = 0, rz = 0;
long i;
float recip_reg_rad = 1.f / fabsf(regularisation_radius);
assert(num_particles >= 0);
#pragma omp parallel for reduction(+:rx, ry, rz)
for (i = 0; i < num_particles; ++i) {
vel = cvtx_Particle_ind_vel(array_start[i],
mes_point, kernel, regularisation_radius);
bsv_V3f vel = particle_ind_vel_inner(array_start[i],
mes_point, kernel, recip_reg_rad);
rx += vel.x[0];
ry += vel.x[1];
rz += vel.x[2];
}
bsv_V3f ret = {(float)rx, (float)ry, (float)rz};
return ret;
return bsv_V3f_mult(ret, 1.f / (4.f * acosf(-1.f)));
}

CVTX_EXPORT bsv_V3f cvtx_ParticleArr_ind_dvort(
Expand Down

0 comments on commit c890dce

Please sign in to comment.