diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp
index f4a6cbcc..6880d680 100644
--- a/include/mcl/bn.hpp
+++ b/include/mcl/bn.hpp
@@ -47,7 +47,7 @@ namespace mcl { namespace msm {
 bool initMsm(const mcl::CurveParam& cp, const msm::Func *func);
-void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n);
+void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n, size_t b);
 void mulEachAVX512(Unit *_x, const Unit *_y, size_t n);
 } // mcl::msm
diff --git a/include/mcl/ec.hpp b/include/mcl/ec.hpp
index 5e3559a3..bd0290a3 100644
--- a/include/mcl/ec.hpp
+++ b/include/mcl/ec.hpp
@@ -243,30 +243,45 @@ void normalizeVecT(Eout& Q, Ein& P, size_t n, size_t N = 256)
 }
 /*
-	split x to (a, b) such that x = a + b L where 0 <= a, b <= L, 0 <= x <= r-1 = L^2+L
-	if adj is true, then 0 <= a < L, 0 <= b <= L+1
+	split x in [0, r-1] to (a, b) such that x = a + b L, 0 <= a < L, 0 <= b <= L+1
 	a[] : 128 bit
 	b[] : 128 bit
 	x[] : 256 bit
 */
 inline void optimizedSplitRawForBLS12_381(Unit *a, Unit *b, const Unit *x)
 {
-	const bool adj = false;
 	/*
 		z = -0xd201000000010000
 		L = z^2-1 = 0xac45a4010001a40200000000ffffffff
 		r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001
 		s=255
-		v = (1<<s)//L
+		q = (1<<s)//L
 	*/
+#if 1
+	Unit xH[n];
+	mcl::bint::shrT<n>(xH, x+n-1, mcl::UnitBitSize-1); // >>127
+	Unit t[n*2];
+	mcl::bint::mulT<n>(t, xH, q);
+	mcl::bint::copyT<n>(b, t+n); // (xH * q)/H
+	mcl::bint::mulT<n>(t, b, L); // bL
+	mcl::bint::subT<n*2>(t, x, t); // x - bL
+	Unit d = mcl::bint::subT<n>(a, t, L);
+	if (t[n] - d == 0) {
+		mcl::bint::addT<n>(b, b, one);
+	} else {
+		mcl::bint::copyT<n>(a, t);
+	}
+#else
+	const bool adj = false;
 	Unit t[n*3]; // n = 128 bit
-	// t[n*3] = x[n*2] * v[n]
-	mcl::bint::mulNM(t, x, n*2, v, n);
+	// t[n*3] = x[n*2] * q[n]
+	mcl::bint::mulNM(t, x, n*2, q, n);
 	// b[n] = t[n*3]>>255
 	mcl::bint::shrT<n+1>(t, t+n*2-1, mcl::UnitBitSize-1); // >>255
 	mcl::bint::copyT<n>(b, t);
@@ -283,6 +298,7 @@ inline void optimizedSplitRawForBLS12_381(Unit *a, Unit *b, const Unit *x)
 		mcl::bint::clearT<n>(a);
 	}
 }
+#endif
 }
 } // mcl::ec::local
@@ -571,21 +587,26 @@ void addJacobi(E& R, const E& P, const E& Q)
 /*
 	accept P == Q
-	https://github.com/apache/incubator-milagro-crypto-c/blob/fa0a45a3/src/ecp.c.in#L767-L976
+	https://eprint.iacr.org/2015/1060
 	(x, y, z) is zero <=> x = 0, y = 1, z = 0
 */
 // (b=4) 12M+27A
 // (generic) 14M+19A
+// Q.z = 1 if mixed
 template<class E>
-void addCTProj(E& R, const E& P, const E& Q)
+void addCTProj(E& R, const E& P, const E& Q, bool mixed = false)
 {
 	typedef typename E::Fp F;
 	assert(E::a_ == 0);
 	F t0, t1, t2, t3, t4, x3, y3;
 	F::mul(t0, P.x, Q.x);
 	F::mul(t1, P.y, Q.y);
-	F::mul(t2, P.z, Q.z);
+	if (mixed) {
+		t2 = P.z;
+	} else {
+		F::mul(t2, P.z, Q.z);
+	}
 	F::add(t3, P.x, P.y);
 	F::add(t4, Q.x, Q.y);
 	F::mul(t3, t3, t4);
@@ -967,12 +988,13 @@ inline size_t ilog2(size_t n)
 	return cybozu::bsr(n) + 1;
 }
-inline size_t costMulVec(size_t n, size_t x)
+// The number of ADDs for n elements with bucket size b
+inline size_t glvCost(size_t n, size_t b)
 {
-	return (n + (size_t(1)<<(x+1))-1)/x;
+	return (n + (size_t(1)<<(b+1))-1)/b;
 }
-// calculate approximate value such that argmin { x : (n + 2^(x+1)-1)/x }
-inline size_t argminForMulVec0(size_t n)
+// approximate value of argmin { b : glvCost(n, b) }
+inline size_t estimateBucketSize(size_t n)
 {
 	if (n <= 16) return 2;
 	size_t log2n = ilog2(n);
@@ -980,24 +1002,38 @@ inline size_t argminForMulVec0(size_t n)
 }
 /*
-	First, get approximate value x and compute costMulVec of x-1 and x+1,
+	First, get an approximate value x and compute glvCost of x-1 and x+1,
 	and return the minimum value.
 */
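The rename from costMulVec to glvCost makes the model explicit: for n points and bucket size b, one window pass costs about n bucket updates plus ~2^(b+1) additions to fold the buckets, normalized per bit by the b bits a window consumes. A minimal standalone sketch (plain C++, no mcl headers; glvCostSketch mirrors the formula above and is not the library symbol) that locates the minimum for n = 1024 at b = 7, in agreement with estimateBucketSize:

```cpp
// Standalone sketch of the bucket-cost model: (n + 2^(b+1) - 1) / b is the
// per-bit cost of one window pass (n bucket updates + ~2^(b+1) folding adds).
#include <stdio.h>
#include <stddef.h>

static size_t glvCostSketch(size_t n, size_t b)
{
	return (n + (size_t(1) << (b + 1)) - 1) / b;
}

int main()
{
	// For n = 1024 the scan prints its minimum at b = 7.
	for (size_t b = 2; b <= 12; b++) {
		printf("b=%2zd cost=%zd\n", b, glvCostSketch(1024, b));
	}
	return 0;
}
```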
-inline size_t argminForMulVec(size_t n)
+inline size_t glvGetTheoreticBucketSize(size_t n)
 {
-	size_t x = argminForMulVec0(n);
-#if 1
-	size_t vm1 = x > 1 ? costMulVec(n, x-1) : n;
-	size_t v0 = costMulVec(n, x);
-	size_t vp1 = costMulVec(n, x+1);
+	size_t x = estimateBucketSize(n);
+	size_t vm1 = x > 1 ? glvCost(n, x-1) : n;
+	size_t v0 = glvCost(n, x);
+	size_t vp1 = glvCost(n, x+1);
 	if (vm1 <= v0) return x-1;
 	if (vp1 < v0) return x+1;
-#endif
 	return x;
 }
+// return a heuristic bucket size which is faster than glvGetTheoreticBucketSize
+inline size_t glvGetBucketSize(size_t n)
+{
+	if (n <= 2) return 2;
+	size_t log2n = mcl::ec::ilog2(n);
+	const size_t tblMin = 8;
+	if (log2n < tblMin) return 3;
+	// n >= 2^tblMin
+	static const size_t tbl[] = {
+		3, 4, 5, 5, 8, 8, 9, 10, 11, 12, 13, 13, 13, 16, 16, 16, 18, 19, 19, 19, 19, 19
+	};
+	if (log2n >= CYBOZU_NUM_OF_ARRAY(tbl)) return 19;
+	size_t ret = tbl[log2n - tblMin];
+	return ret;
+}
 #ifndef MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC
-	// use (1 << argminForMulVec(n)) * sizeof(G) bytes stack + alpha
+	// use (1 << glvGetBucketSize(n)) * sizeof(G) bytes stack + alpha
 	// about 18KiB (G1) or 36KiB (G2) for n = 1024
 	// you can decrease this value but this algorithm is slow if n < 256
 	#define MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC 1024
@@ -1039,7 +1075,7 @@ void mulVecUpdateTable(G& win, G *tbl, size_t tblN, const G *xVec, const Unit *y
 	fast for n >= 256
 */
 template<class G>
-size_t mulVecCore(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next, size_t n, bool doNormalize = true)
+size_t mulVecCore(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next, size_t n, size_t b, bool doNormalize)
 {
 	if (n == 0) {
 		z.clear();
@@ -1050,15 +1086,15 @@ size_t mulVecCore(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next
 		return 1;
 	}
-	size_t c, tblN;
+	size_t tblN;
 	G *tbl = 0;
 #ifndef MCL_DONT_USE_MALLOC
 	G *tbl_ = 0; // malloc is used if tbl_ != 0
 	// if n is large then try to use malloc
 	if (n > MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC) {
-		c = argminForMulVec(n);
-		tblN = (1 << c) - 1;
+		if (b == 0) b = glvGetBucketSize(n);
+		tblN = (1 << b) - 1;
 		tbl_ = (G*)malloc(sizeof(G) * tblN);
 		if (tbl_) {
 			tbl = tbl_;
@@ -1068,25 +1104,25 @@ size_t mulVecCore(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next
 #endif
 	// n is small or malloc fails so use stack
 	if (n > MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC) n = MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC;
-	c = argminForMulVec(n);
-	tblN = (1 << c) - 1;
+	if (b == 0) b = glvGetBucketSize(n);
+	tblN = (1 << b) - 1;
 	tbl = (G*)CYBOZU_ALLOCA(sizeof(G) * tblN);
 	// keep tbl_ = 0
 #ifndef MCL_DONT_USE_MALLOC
main:
 #endif
 	const size_t maxBitSize = sizeof(Unit) * yUnitSize * 8;
-	const size_t winN = (maxBitSize + c-1) / c;
+	const size_t winN = (maxBitSize + b-1) / b;
 	// about 10% faster
 	if (doNormalize) G::normalizeVec(xVec, xVec, n);
-	mulVecUpdateTable(z, tbl, tblN, xVec, yVec, yUnitSize, next, c * (winN-1), n, true);
+	mulVecUpdateTable(z, tbl, tblN, xVec, yVec, yUnitSize, next, b * (winN-1), n, true);
 	for (size_t w = 1; w < winN; w++) {
-		for (size_t i = 0; i < c; i++) {
+		for (size_t i = 0; i < b; i++) {
 			G::dbl(z, z);
 		}
-		mulVecUpdateTable(z, tbl, tblN, xVec, yVec, yUnitSize, next, c * (winN-1-w), n, false);
+		mulVecUpdateTable(z, tbl, tblN, xVec, yVec, yUnitSize, next, b * (winN-1-w), n, false);
 	}
 #ifndef MCL_DONT_USE_MALLOC
 	if (tbl_) free(tbl_);
@@ -1094,23 +1130,23 @@ size_t mulVecCore(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next
 	return n;
 }
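At the API level, the b threaded through mulVecCore and mulVecLong surfaces as an optional last argument of EcT::mulVec (see further below). A usage sketch, assuming the BLS12-381 instantiation from <mcl/bls12_381.hpp>:

```cpp
// b = 0 keeps the heuristic glvGetBucketSize choice; a nonzero b forces
// (1 << b) - 1 table entries per window, e.g. to bound the working set.
#include <mcl/bls12_381.hpp>
using namespace mcl::bn;

void msmWithBucketSize(G1& out, G1* P, const Fr* x, size_t n)
{
	G1::mulVec(out, P, x, n);     // heuristic bucket size
	G1::mulVec(out, P, x, n, 12); // pin the bucket size to 12
}
```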
 template<class G>
-void mulVecLong(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next, size_t n, bool doNormalize = true)
+void mulVecLong(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next, size_t n, size_t b, bool doNormalize)
 {
-	size_t done = mulVecCore(z, xVec, yVec, yUnitSize, next, n, doNormalize);
+	size_t done = mulVecCore(z, xVec, yVec, yUnitSize, next, n, b, doNormalize);
 	if (done == n) return;
 	do {
 		xVec += done;
 		yVec += next * done;
 		n -= done;
 		G t;
-		done = mulVecCore(t, xVec, yVec, yUnitSize, next, n, doNormalize);
+		done = mulVecCore(t, xVec, yVec, yUnitSize, next, n, b, doNormalize);
 		z += t;
 	} while (done < n);
 }
 // for n >= 128
 template<class GLV, class G>
-bool mulVecGLVlarge(G& z, const G *xVec, const void *yVec, size_t n)
+bool mulVecGLVlarge(G& z, const G *xVec, const void *yVec, size_t n, size_t b)
 {
 	const int splitN = GLV::splitN;
 	assert(n > 0);
@@ -1147,7 +1183,7 @@ bool mulVecGLVlarge(G& z, const G *xVec, const void *yVec, size_t n)
 			assert(b); (void)b;
 		}
 	}
-	mulVecLong(z, tbl, yp, next, next, n * splitN, false);
+	mulVecLong(z, tbl, yp, next, next, n * splitN, b, false);
 	free(tbl);
 	return true;
 }
@@ -1345,7 +1381,7 @@ static void mulVecGLVsmall(G& z, const G *xVec, const void* yVec, size_t n)
 // return false if malloc fails or n is not in a target range
 template<class GLV, class G>
-bool mulVecGLVT(G& z, const G *xVec, const void *yVec, size_t n, bool constTime = false)
+bool mulVecGLVT(G& z, const G *xVec, const void *yVec, size_t n, bool constTime = false, size_t b = 0)
 {
 	if (n == 1 && constTime) {
 		local::mulGLV_CT<GLV>(z, xVec[0], yVec);
@@ -1356,7 +1392,7 @@ bool mulVecGLVT(G& z, const G *xVec, const void *yVec, size_t n, bool constTime
 		return true;
 	}
 	if (n >= 128) {
-		return mulVecGLVlarge<GLV>(z, xVec, yVec, n);
+		return mulVecGLVlarge<GLV>(z, xVec, yVec, n, b);
 	}
 	return false;
 }
@@ -1388,8 +1424,8 @@ class EcT : public fp::Serializable<EcT<Fp> > {
 	*/
 	static bool verifyOrder_;
 	static mpz_class order_;
-	static bool (*mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime);
-	static void (*mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n);
+	static bool (*mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime, size_t b);
+	static void (*mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n, size_t b);
 	static void (*mulEachOpti)(Unit *xVec, const Unit *yVec, size_t n);
 	static bool (*isValidOrderFast)(const EcT& x);
 	/* default constructor is undefined value */
@@ -1480,11 +1516,11 @@ class EcT : public fp::Serializable<EcT<Fp> > {
 	{
 		isValidOrderFast = f;
 	}
-	static void setMulVecGLV(bool f(EcT& z, const EcT *xVec, const void *yVec, size_t yn, bool constTime))
+	static void setMulVecGLV(bool f(EcT& z, const EcT *xVec, const void *yVec, size_t yn, bool constTime, size_t b))
 	{
 		mulVecGLV = f;
 	}
-	static void setMulVecOpti(void f(Unit* _z, Unit *_xVec, const Unit *_yVec, size_t yn))
+	static void setMulVecOpti(void f(Unit* _z, Unit *_xVec, const Unit *_yVec, size_t yn, size_t b))
 	{
 		mulVecOpti = f;
 	}
@@ -1603,7 +1639,7 @@ class EcT : public fp::Serializable<EcT<Fp> > {
 	static inline void mul(EcT& z, const EcT& x, const EcT::Fr& y, bool constTime = false)
 	{
 		if (mulVecGLV) {
-			mulVecGLV(z, &x, &y, 1, constTime);
+			mulVecGLV(z, &x, &y, 1, constTime, 0);
 			return;
 		}
 		fp::Block b;
@@ -2155,17 +2191,17 @@ class EcT : public fp::Serializable<EcT<Fp> > {
 		GLV : 7680, 15360, 30720
 		Long: 9779, 16322, 24533
 	*/
-	static inline void mulVec(EcT& z, EcT *xVec, const EcT::Fr *yVec, size_t n)
+	static inline void mulVec(EcT& z, EcT *xVec, const EcT::Fr *yVec, size_t n, size_t b = 0)
 	{
 		if (n == 0) {
 			z.clear();
 			return;
 		}
 		if (mulVecOpti && n >= 128)
 {
-			mulVecOpti((Unit*)&z, (Unit*)xVec, yVec[0].getUnit(), n);
+			mulVecOpti((Unit*)&z, (Unit*)xVec, yVec[0].getUnit(), n, b);
 			return;
 		}
-		if (mulVecGLV && mulVecGLV(z, xVec, yVec, n, false)) {
+		if (mulVecGLV && mulVecGLV(z, xVec, yVec, n, false, b)) {
 			return;
 		}
 		EcT r;
@@ -2284,8 +2320,8 @@ template<class Fp> int EcT<Fp>::specialB_;
 template<class Fp> int EcT<Fp>::ioMode_;
 template<class Fp> bool EcT<Fp>::verifyOrder_;
 template<class Fp> mpz_class EcT<Fp>::order_;
-template<class Fp> bool (*EcT<Fp>::mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime);
-template<class Fp> void (*EcT<Fp>::mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n);
+template<class Fp> bool (*EcT<Fp>::mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime, size_t b);
+template<class Fp> void (*EcT<Fp>::mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n, size_t b);
 template<class Fp> bool (*EcT<Fp>::isValidOrderFast)(const EcT& x);
 template<class Fp> int EcT<Fp>::mode_;
 template<class Fp> void (*EcT<Fp>::mulEachOpti)(Unit *xVec, const Unit *yVec, size_t n);
diff --git a/include/mcl/vint.hpp b/include/mcl/vint.hpp
index 347aaf01..e26e0b3a 100644
--- a/include/mcl/vint.hpp
+++ b/include/mcl/vint.hpp
@@ -696,9 +696,6 @@ class Vint {
 	// logical left shift (copy sign)
 	static void shl(Vint& y, const Vint& x, size_t shiftBit)
 	{
-if (shiftBit > MCL_MAX_BIT_SIZE*2) {
-	printf("shiftBit=%zd\n", shiftBit);
-}
 		assert(shiftBit <= MCL_MAX_BIT_SIZE * 2); // may be too big
 		size_t xn = x.size();
 		size_t yn = xn + (shiftBit + UnitBitSize - 1) / UnitBitSize;
diff --git a/misc/gather_test.cpp b/misc/gather_test.cpp
new file mode 100644
index 00000000..deefdd2f
--- /dev/null
+++ b/misc/gather_test.cpp
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <map>
+#include <vector>
+#include <cybozu/benchmark.hpp>
+#include "../src/avx512.hpp"
+#include <cybozu/xorshift.hpp>
+
+typedef std::map<size_t, double> Mem2Clk;
+
+void gatherBench(Mem2Clk& bench, uint64_t *base, size_t bit, size_t c, cybozu::XorShift& rg, int mode)
+{
+	cybozu::CpuClock clk;
+	clk.begin();
+	Vec v = vzero();
+	const size_t n = size_t(1) << bit;
+	const size_t mask = n - 1;
+
+	uint64_t tbl[8];
+	for (size_t i = 0; i < c; i++) {
+		for (size_t j = 0; j < 8; j++) {
+			tbl[j] = rg.get32() & mask;
+		}
+		Vec idx;
+		memcpy(&idx, tbl, sizeof(idx));
+		v = vpaddq(v, vpgatherqq(idx, base));
+		if (mode == 1) {
+			vpscatterqq(base, idx, v);
+		}
+	}
+	clk.end();
+	uint64_t s = 0;
+	memcpy(tbl, &v, sizeof(v));
+	for (size_t i = 0; i < 8; i++) s += tbl[i];
+
+	double ave = clk.getClock() / double(c);
+	char m[64];
+	if (n * 8 < 1024 * 1024) {
+		snprintf(m, sizeof(m), "%zd KiB", n * 8 / 1024);
+	} else {
+		size_t mem = n * 8 / 1024 / 1024;
+		snprintf(m, sizeof(m), "%zd MiB", mem);
+		bench[mem] = ave;
+	}
+	printf("%2zd %10s %8.2f clk %llx\n", bit, m, ave, (long long)s);
+}
+
+void printTable(const Mem2Clk& bench, bool csv)
+{
+	if (csv) {
+		printf("MiB");
+		for (auto i = bench.begin(); i != bench.end(); ++i) {
+			printf(",%zd", i->first);
+		}
+		printf("\n");
+		printf("cycle");
+		for (auto i = bench.begin(); i != bench.end(); ++i) {
+			printf(",%.2f", i->second);
+		}
+		printf("\n");
+	} else {
+		// markdown
+		printf("MiB");
+		for (auto i = bench.begin(); i != bench.end(); ++i) {
+			printf("|%zd", i->first);
+		}
+		printf("\n");
+		printf("-");
+		for (auto i = bench.begin(); i != bench.end(); ++i) {
+			printf("|-");
+		}
+		printf("\n");
+		printf("cycle");
+		for (auto i = bench.begin(); i != bench.end(); ++i) {
+			printf("|%.2f", i->second);
+		}
+		printf("\n");
+	}
+}
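For reference, the step that gatherBench times is the vector form of the following scalar loop; vpgatherqq issues the eight loads as one instruction, and the benchmark charts how its cost grows with the working-set size. A plain scalar sketch (not used by the test):

```cpp
#include <stdint.h>
#include <stddef.h>

// Scalar equivalent of one gather iteration: 8 independent random 64-bit
// loads accumulated per lane; vpgatherqq performs these loads in one op.
static inline void scalarGatherStep(uint64_t acc[8], const uint64_t* base,
                                    const uint64_t idx[8])
{
	for (size_t i = 0; i < 8; i++) {
		acc[i] += base[idx[i]];
	}
}
```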
+int main()
+{
+	const size_t maxB = 29;
+	const size_t n = size_t(1) << maxB;
+	std::vector<uint64_t> v(n);
+	cybozu::XorShift rg;
+	for (size_t i = 0; i < n; i++) {
+		v[i] = rg.get32();
+	}
+	for (int mode = 0; mode < 2; mode++) {
+		printf("%s\n", mode == 0 ? "RO" : "RW");
+		Mem2Clk bench;
+		for (size_t b = 8; b <= maxB; b++) {
+			gatherBench(bench, v.data(), b, 100000, rg, mode);
+		}
+		printTable(bench, true);
+		printTable(bench, false);
+	}
+}
diff --git a/misc/internal.md b/misc/internal.md
index 9f7fa891..de1b4734 100644
--- a/misc/internal.md
+++ b/misc/internal.md
@@ -27,28 +27,27 @@ G1 mulEach
 ### Definition of parameters
 ```python
-M = 1<<256
-H = 1<<128
+m = 128
+H = 1<<m
 z = -0xd201000000010000
 L = z*z-1
 r = L*L+L+1
 s = 255
 S = 1<<s
-v = S//L
+q = S//L
 def split(x):
-    b = (x * v) >> s
-    a = x - b * L
-    if adj:
-        if a >= L:
+    xH = x >> (m-1)  # x // (H/2)
+    b = (xH * q) >> m  # (xH * q) // H
+    a = x - b * L
+    if a >= L:
         a -= L
         b += 1
-    return (a, b)
+    return (a, b)
 ```
-variables|z|L|r|S|v
+variables|z|L|r|S|q
 -|-|-|-|-|-
 bit_length|64|128|255|255|128
@@ -56,27 +55,31 @@ bit_length|64|128|255|255|128
 - a + b L = x for (a, b) = split(x).
 ### Theorem
-0 <= a < 1.11 L < H and 0 <= b < L+1 for x in [0, r-1].
+0 <= a < L and 0 <= b <= L+1 for x in [0, r-1].
 ### Proof
-
-```
-Let r0 := L S % r, then S=v L + r0 and r0 in [0, L-1]. In fact, r0 ~ 0.11 L.
-Let r1 := x v % S, then x v = b S + r1 and r1 in [0, S-1].
 ```
+S = q * L + r0 where 0 <= r0 < L, r0 ~ 0.11 L
+H/2 ~ 0.74 L
+x = xH * (H/2) + xL where 0 <= xL < H/2, xH <= (r-1)/(H/2)
 ```
-b <= xv / S < (r-1) (S/L)/S = (r-1)/L = L+1.
 ```
+b = (xH * q) // H <= xH * q / H = xH * (H/2) * q / (H * (H/2)) = (x - xL) * q / S
+  <= x * (S//L) / S <= x / L <= (r-1) / L = L+1
+=> 0 <= x - b L = a
+
+xH * q = b * H + r1 where 0 <= r1 < H
+
+a H = (x - b L) * H = x * H - b * H * L = (xH * (H/2) + xL) * H - (xH * q - r1) * L
+    = xH * S + xL * H - xH * q * L + r1 * L
+    = xH * S + xL * H - xH * (S - r0) + r1 * L
+    = xL * H + xH * r0 + r1 * L
+
+a = xL + xH * r0 / H + r1 * L / H
+  <= H/2 + (r-1)/(H/2) * r0 / H + (H-1) * L / H
+  = H/2 + (r-1)/S * r0 + L
+  ~ 0.74 L + 0.11 L + L < 2 L
+
+Since a < 2 L before the final adjustment, the single conditional subtraction
+(a -= L, b += 1 when a >= L) gives 0 <= a < L.
 ```
-aS = (x - bL)S = xS - bSL = xS - (xv - r1)L = x(S - vL) + r1 L = r0 x + r1 L
-	<= r0 (r-1) + (S-1)L = S L + (r-1)r0 - L.
-a <= L + ((r-1)r0 - L)/S
-((r-1)r0 - L)/S ~ 0.10016 L < 0.11 L.
-```
-### Remark
-If adj is true, then a is in [0, L-1].
 ## window size
@@ -167,3 +170,46 @@ def naf(x, w=3):
 Consider applying `w=5` to `(a, b)=split(x)`. The max value of `a` is just below `L = 0b101...` of 128-bit length. `0b101` is less than `(1<<(w-1))-1`, so the negation and the CF operation are unnecessary for the top window.
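A self-contained check of this remark (plain C++, independent of the library): decompose a 64-bit stand-in whose leading bits are 0b1010..., like L, into signed w = 5 windows and confirm that the carry never spills into an extra top window:

```cpp
#include <assert.h>
#include <stdint.h>

// Split x into signed w-bit digits in [-(2^(w-1)-1), 2^(w-1)]:
// a digit d > 2^(w-1) becomes d - 2^w and carries 1 into the next digit.
static int signedDigits(uint64_t x, int w, int64_t* digit)
{
	const int64_t half = int64_t(1) << (w - 1);
	const int64_t full = int64_t(1) << w;
	int n = 0;
	int64_t carry = 0;
	while (x > 0 || carry) {
		int64_t d = int64_t(x & uint64_t(full - 1)) + carry;
		x >>= w;
		carry = 0;
		if (d > half) { d -= full; carry = 1; }
		digit[n++] = d;
	}
	return n;
}

int main()
{
	// 64-bit stand-in with the same leading bits 0b1010... as L.
	int64_t d[16];
	int n = signedDigits(0xac45a4010001a402ULL, 5, d);
	assert(n == 13); // ceil(64/5) windows; no carry past the top window
	return 0;
}
```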
+
+-----------------------------------------------------------------------------
+Vec
+gcc + mul
+vsqr::Vec 110.10 clk
+
+gcc + sqr
+vsqr::Vec 136.29 clk
+
+clang + mul
+vsqr::Vec 129.06 clk
+
+clang + sqr
+vsqr::Vec 102.91 clk
+
+VecA
+gcc + mul
+vsqr::VecA 269.07 clk
+
+gcc + sqr
+vsqr::VecA 243.14 clk
+
+clang + mul
+vsqr::VecA 183.07 clk
+
+clang + sqr
+vsqr::VecA 174.61 clk
+
+Vec
+
+compiler|gcc|clang
+-|-|-
+mul|110|129
+sqr|136|102
+
+VecA
+
+compiler|gcc|clang
+-|-|-
+mul|269|183
+sqr|243|174
+
+asm
+Vec::mul 108 clk < Vec::asm
+VecA::mul 181.90 clk
\ No newline at end of file
diff --git a/misc/mulvec_test.cpp b/misc/mulvec_test.cpp
index 463ae171..88737109 100644
--- a/misc/mulvec_test.cpp
+++ b/misc/mulvec_test.cpp
@@ -1,17 +1,17 @@
 #include <stdio.h>
 #include <mcl/ec.hpp>
-void put(int n, int x)
+void put(size_t n, size_t x)
 {
-	printf(" x=%d(%zd)", x, mcl::ec::costMulVec(n, x));
+	printf(" x=%zd(%zd)", x, mcl::ec::glvCost(n, x));
 }
-int getmin(int n)
+size_t getmin(size_t n)
 {
-	int min = 100000000;
-	int a = 0;
-	for (int x = 1; x < 30; x++) {
-		int v = mcl::ec::costMulVec(n, x);
+	size_t min = 100000000;
+	size_t a = 0;
+	for (size_t x = 1; x < 30; x++) {
+		size_t v = mcl::ec::glvCost(n, x);
 		if (v < min) {
 			a = x;
 			min = v;
@@ -20,32 +20,76 @@ int getmin(int n)
 	return a;
 }
-void disp(int n)
+void disp(size_t n)
 {
-	int x0 = getmin(n);
-	int x1 = mcl::ec::argminForMulVec(n);
-	printf("n=%d", n);
+	size_t x0 = getmin(n);
+	size_t x1 = mcl::ec::glvGetBucketSize(n);
+	printf("n=%zd", n);
 	put(n, x0);
 	put(n, x1);
-	printf(" diff=%d\n", x1-x0);
+	printf(" diff=%zd\n", x1-x0);
+}
+
+inline size_t ilog2(size_t n)
+{
+	if (n == 0) return 0;
+	return cybozu::bsr(n) + 1;
+}
+
+inline size_t glvCost(size_t n, size_t x)
+{
+	return (n + (size_t(1)<<(x+1))-1)/x;
+}
+// calculate approximate value such that argmin { x : (n + 2^(x+1)-1)/x }
+inline size_t estimateBucketSize(size_t n)
+{
+	if (n <= 16) return 2;
+	size_t log2n = ilog2(n);
+	return log2n - ilog2(log2n);
+}
+
+/*
+	First, get approximate value x and compute glvCost of x-1 and x+1,
+	and return the minimum value.
+*/
+inline size_t glvGetBucketSize(size_t n)
+{
+	size_t x = estimateBucketSize(n);
+#if 1
+	size_t vm1 = x > 1 ?
glvCost(n, x-1) : n; + size_t v0 = glvCost(n, x); + size_t vp1 = glvCost(n, x+1); + if (vm1 <= v0) return x-1; + if (vp1 < v0) return x+1; +#endif + return x; } int main() { - for (int i = 1; i < 16; i++) { + for (size_t i = 1; i < 16; i++) { disp(i); } - for (int i = 4; i < 30; i++) { - int n = 1 << i; + for (size_t i = 4; i < 30; i++) { + size_t n = size_t(1) << i; disp(n*0.9); disp(n); disp(n*1.1); } + for (size_t i = 5; i <= 27; i++) { + size_t n = size_t(1) << i; + size_t glvN = n/8*2; + printf("n=2^%zd=%zd v=%zd\n", i, n, getmin(glvN)); + } puts("all search"); - for (int i = 1; i < 100000000; i++) { - int x0 = getmin(i); - int x1 = mcl::ec::argminForMulVec(i); -// if (std::abs(x0-x1) > 1) printf("i=%d x0=%d x1=%d\n", i, x0, x1); - if (x0 != x1) printf("i=%d x0=%d x1=%d\n", i, x0, x1); + for (size_t i = 1; i < 200000000; i++) { + size_t x0 = getmin(i); + size_t x1 = glvGetBucketSize(i); +// if (std::abs(x0-x1) > 1) printf("i=%zd x0=%zd x1=%zd\n", i, x0, x1); + if (x0 != x1) printf("i=%zd x0=%zd x1=%zd\n", i, x0, x1); + } + for (size_t i = 1; i <= 100000000; i *= 10) { + size_t x = glvGetBucketSize(i); + printf("i=%zd x=%zd\n", i, x); } } diff --git a/readme.md b/readme.md index 1b89b8b4..c6e5816e 100644 --- a/readme.md +++ b/readme.md @@ -10,6 +10,7 @@ mcl is a library for pairing-based cryptography, which supports the optimal Ate pairing over BN curves and BLS12-381 curves. # News +- mulVec (resp. mulEach) with AVX-512 IFMA is 1.52 (resp. 3.26) times faster than without it. - Add {Fp,Fr,Fp2}::squareRoot. - Improve the performance of squareRoot. - Add batch inversion for Fr and Fp elements, and batch normalization for G1 and G2 points. diff --git a/sample/mt_test.cpp b/sample/mt_test.cpp index 47af39fe..415ec354 100644 --- a/sample/mt_test.cpp +++ b/sample/mt_test.cpp @@ -27,12 +27,14 @@ int main(int argc, char *argv[]) int bit; size_t cpuN; bool g1only; + bool msmOnly; int C; opt.appendOpt(&n, 100, "n", ": array size"); opt.appendOpt(&bit, 0, "b", ": set n to 1<= p ? s-p : t +# input : t[N] +# output : t[N] = t >= p ? t-p : t +# s : temporary regs # k : mask register # c : for CF # z : for zero @@ -87,6 +88,30 @@ def sub_p_if_possible(t, s, k, z, c, pp, vmask): for i in range(N): vpandq(t[i]|k, s[i], vmask) +# input : t[N] +# output : t[N] = t >= p ? 
t-p : t +# s : temporary regs +# k : mask register +# c : for CF +# z : for zero +# vp : [C_p] +def sub_p_if_possible2(t, s, k, z, c, vp, vmask): + S = 63 + N = 8 + assert(len(t) == N and len(s) == N) + # s = t-p + for i in range(N): + vpsubq(s[i], t[i], vp[i]) + if i > 0: + vpsubq(s[i], s[i], c) + vpsrlq(c, s[i], S) + + #vpxorq(z, z, z) + vpcmpeqq(k, c, z) # k = s>=0 + # t = select(k, t, s&mask) + for i in range(N): + vpandq(t[i]|k, s[i], vmask) + def gen_vadd(mont, vN=1): SUF = 'A' if vN == 2 else '' with FuncProc(MSM_PRE+'vadd'+SUF): @@ -347,16 +372,10 @@ def vmulUnit(z, px, y, N, H): vmulH(z[N], y, ptr(px+i*64)) # [H]:z[0:N] = z[0:N] + x[] * y -def vmulUnitAdd(z, px, y, N, H): +def vmulUnitAdd(z, px, y, N): for i in range(0, N): vmulL(z[i], y, ptr(px+i*64)) - if i > 0: - vpaddq(z[i], z[i], H) - if i < N-1: - vpxorq(H, H, H) - vmulH(H, y, ptr(px+i*64)) - else: - vmulH(z[N], y, ptr(px+i*64)) + vmulH(z[i+1], y, ptr(px+i*64)) def shift(v, s): vmovdqa64(s, v[0]) @@ -366,7 +385,7 @@ def shift(v, s): def gen_vmul(mont): with FuncProc(MSM_PRE+'vmul'): - with StackFrame(3, 1, useRCX=True, vNum=mont.N*2+4, vType=T_ZMM) as sf: + with StackFrame(3, 1, useRCX=True, vNum=mont.N*2+4+8, vType=T_ZMM) as sf: regs = list(reversed(sf.v)) W = mont.W N = mont.N @@ -381,10 +400,14 @@ def gen_vmul(mont): H = pops(regs, 1)[0] q = pops(regs, 1)[0] s = pops(regs, N) - s0 = s[0] # alias + vp = pops(regs, N) lpL = Label() - vmulUnitAddL = Label() + # vp = load(C_ap) + lea(rax, ptr(rip+C_ap)) + for i in range(N): + vmovdqa64(vp[i], ptr(rax+i*64)) +# vmulUnitAddL = Label() mov(rax, mont.mask) vpbroadcastq(vmask, rax) @@ -399,9 +422,11 @@ def gen_vmul(mont): vpxorq(q, q, q) vmulL(q, t[0], ptr_b(rp)) - lea(rax, ptr(rip+C_ap)) - - call(vmulUnitAddL) # t += p * q + #lea(rax, ptr(rip+C_ap)) + #call(vmulUnitAddL) # t += p * q + for i in range(0, N): + vmulL(t[i], q, vp[i]) + vmulH(t[i+1], q, vp[i]) # N-1 times loop mov(ecx, N-1) @@ -411,16 +436,21 @@ def gen_vmul(mont): mov(rax, px) vmovdqa64(q, ptr(py)) add(py, 64) - shift(t, s0) - call(vmulUnitAddL) # t += x * py[i] - vpsrlq(q, s0, W) + shift(t, H) + #call(vmulUnitAddL) # t += x * py[i] + vmulUnitAdd(t, rax, q, N) + vpsrlq(q, H, W) vpaddq(t[0], t[0], q) vpxorq(q, q, q) vmulL(q, t[0], ptr_b(rp)) - lea(rax, ptr(rip+C_ap)) - call(vmulUnitAddL) # t += p * q + #lea(rax, ptr(rip+C_ap)) + #call(vmulUnitAddL) # t += p * q + #vmulUnitAdd(t, rax, q, N) + for i in range(0, N): + vmulL(t[i], q, vp[i]) + vmulH(t[i+1], q, vp[i]) dec(ecx) jnz(lpL) @@ -430,19 +460,93 @@ def gen_vmul(mont): vpaddq(t[i], t[i], q) vpandq(t[i-1], t[i-1], vmask) - lea(rax, ptr(rip+C_p)) + #lea(rax, ptr(rip+C_p)) vpxorq(H, H, H) # zero - sub_p_if_possible(t[1:], s, k1, H, q, rax, vmask) + sub_p_if_possible2(t[1:], s, k1, H, q, vp, vmask) un(vmovdqa64)(ptr(pz), t[1:]) - sf.close() - # out of vmul - align(32) - L(vmulUnitAddL) - #set rax(= px) and q(= y) - vmulUnitAdd(t, rax, q, N, H) - ret() +def gen_vsqr(mont): + with FuncProc(MSM_PRE+'vsqr'): + with StackFrame(2, 1, useRCX=True, vNum=mont.N*3+4, vType=T_ZMM) as sf: + regs = list(reversed(sf.v)) + W = mont.W + N = mont.N + S = 63 + un = genUnrollFunc() + + pz = sf.p[0] + px = sf.p[1] + rp = sf.t[0] + + t = pops(regs, N*2) + q = pops(regs, 1)[0] + H = pops(regs, 1)[0] + vmask = pops(regs, 1)[0] + vx = pops(regs, N) + vrp = pops(regs, 1)[0] + + un(vmovdqa64)(vx, ptr(px)) + # square + vpxorq(t[0], t[0], t[0]) + vmulL(t[0], vx[0], vx[0]) + for i in range(1, N): + tL = t[i*2-1] + tH = t[i*2] + vpxorq(tL, tL, tL) + vpxorq(tH, tH, tH) + vmulL(tL, vx[i], vx[i-1]) + 
vmulH(tH, vx[i], vx[i-1]) + + for i in range(2, N): + for j in range(i, N): + vmulL(t[j*2-i ], vx[j], vx[j-i]) + vmulH(t[j*2-i+1], vx[j], vx[j-i]) + + for i in range(1, N*2-1): + vpaddq(t[i], t[i], t[i]) + + for i in range(1, N): + vmulH(t[i*2-1], vx[i-1], vx[i-1]) + vmulL(t[i*2], vx[i], vx[i]) + vpxorq(t[N*2-1], t[N*2-1], t[N*2-1]) + vmulH(t[N*2-1], vx[i], vx[i]) + + # reduction + # rename vx to vp + (vp, vx) = (vx, None) + + lea(rax, ptr(rip+C_ap)) + un(vmovdqa64)(vp, ptr(rax)) + + mov(rax, mont.mask) + vpbroadcastq(vmask, rax) + vpbroadcastq(vrp, ptr(rip+C_rp)) + + for i in range(N): + if i > 0: + vpsrlq(q, t[i-1], W) + vpaddq(t[i], t[i], q) + vpxorq(q, q, q) + vmulL(q, t[i], vrp) + + #call(vmulUnitAddL) # t += p * q + for j in range(0, N): + vmulL(t[i+j], q, vp[j]) + vmulH(t[i+j+1], q, vp[j]) + + for i in range(N, N*2): + vpsrlq(q, t[i-1], W) + vpaddq(t[i], t[i], q) + vpandq(t[i-1], t[i-1], vmask) + + s = t[0:N] + t = t[N:N*2] + + vpxorq(vrp, vrp, vrp) + sub_p_if_possible2(t, s, k1, vrp, q, vp, vmask) + + un(vmovdqa64)(ptr(pz), t) def vmulUnitA(z, px, y, N, H): un = genUnrollFunc() @@ -460,18 +564,12 @@ def vmulUnitA(z, px, y, N, H): un(vpxorq)(z[N], z[N], z[N]) un(vmulH)(z[N], y, ptr(px+i*64*vN)) -def vmulUnitAddA(z, px, y, N, H): +def vmulUnitAddA(z, px, y, N): un = genUnrollFunc() vN = 2 for i in range(0, N): un(vmulL)(z[i], y, ptr(px+i*64*vN)) - if i > 0: - un(vpaddq)(z[i], z[i], H) - if i < N-1: - un(vpxorq)(H, H, H) - un(vmulH)(H, y, ptr(px+i*64*vN)) - else: - un(vmulH)(z[N], y, ptr(px+i*64*vN)) + un(vmulH)(z[i+1], y, ptr(px+i*64*vN)) def shiftA(v, s): un = genUnrollFunc() @@ -500,11 +598,14 @@ def gen_vmulA(mont): vmask = pops(regs, 1)[0] H = pops(regs, vN) q = pops(regs, vN) - s = pops(regs, N) - s0 = s[0:vN] # alias + vp = pops(regs, N) lpL = Label() - vmulUnitAddAL = Label() + # vp = load(C_ap) + lea(rax, ptr(rip+C_ap)) + for i in range(N): + vmovdqa64(vp[i], ptr(rax+i*64)) +# vmulUnitAddAL = Label() mov(rax, mont.mask) vpbroadcastq(vmask, rax) @@ -520,9 +621,12 @@ def gen_vmulA(mont): un(vpxorq)(q, q, q) un(vmulL)(q, t[0], ptr_b(rp)) - lea(rax, ptr(rip+C_apA)) - - call(vmulUnitAddAL) # t += p * q + #lea(rax, ptr(rip+C_apA)) + #call(vmulUnitAddAL) # t += p * q + #vmulUnitAddA(t, rax, q, N) + for i in range(0, N): + un(vmulL)(t[i], q, vp[i]) + un(vmulH)(t[i+1], q, vp[i]) # N-1 times loop mov(i_, N-1) @@ -532,16 +636,21 @@ def gen_vmulA(mont): mov(rax, px) un(vmovdqa64)(q, ptr(py)) add(py, 64*vN) - shiftA(t, s0) - call(vmulUnitAddAL) # t += x * py[i] - un(vpsrlq)(q, s0, W) + shiftA(t, H) + #call(vmulUnitAddAL) # t += x * py[i] + vmulUnitAddA(t, rax, q, N) + un(vpsrlq)(q, H, W) un(vpaddq)(t[0], t[0], q) un(vpxorq)(q, q, q) un(vmulL)(q, t[0], ptr_b(rp)) - lea(rax, ptr(rip+C_apA)) - call(vmulUnitAddAL) # t += p * q + #lea(rax, ptr(rip+C_apA)) + #call(vmulUnitAddAL) # t += p * q + #vmulUnitAddA(t, rax, q, N) + for i in range(0, N): + un(vmulL)(t[i], q, vp[i]) + un(vmulH)(t[i+1], q, vp[i]) dec(i_) jnz(lpL) @@ -555,24 +664,21 @@ def gen_vmulA(mont): # [[a,b], [c,d], ...] 
-> [(a, c, ...), (b, d, ...)] t0, t1 = map(list, zip(*t)) t2 = [t0[1:], t1[1:]] - # s = t[1:] - p + # vp = t[1:] - p q0 = q[0] H0 = H[0] lea(rax, ptr(rip+C_p)) vpxorq(H0, H0, H0) - for j in range(vN): - sub_p_if_possible(t2[j], s, k1, H0, q0, rax, vmask) - un(vmovdqa64)(ptr(pz+j*64*N), torg[2+j*N:]) +# for j in range(vN): +# sub_p_if_possible(t2[j], vp, k1, H0, q0, rax, vmask) +# un(vmovdqa64)(ptr(pz+j*64*N), torg[2+j*N:]) + sub_p_if_possible2(t2[0], vp, k1, H0, q0, vp, vmask) + un(vmovdqa64)(ptr(pz), torg[2:]) + sub_p_if_possible(t2[1], vp, k1, H0, q0, rax, vmask) + un(vmovdqa64)(ptr(pz+64*N), torg[2+N:]) - sf.close() - # out of vmul - align(32) - L(vmulUnitAddAL) - #set rax(= px) and q(= y) - vmulUnitAddA(t, rax, q, N, H) - ret() def msm_data(mont): align(64) makeLabel(C_p) @@ -594,6 +700,7 @@ def msm_code(mont): gen_vadd(mont) gen_vsub(mont) gen_vmul(mont) + gen_vsqr(mont) gen_vadd(mont, 2) # a little faster #gen_vaddA(mont) diff --git a/src/msm_avx.cpp b/src/msm_avx.cpp index ac65e9f1..924bcdce 100644 --- a/src/msm_avx.cpp +++ b/src/msm_avx.cpp @@ -34,6 +34,7 @@ void mcl_c5_vsubPreA(VecA *, const VecA *, const VecA *); void mcl_c5_vadd(Vec *, const Vec *, const Vec *); void mcl_c5_vsub(Vec *, const Vec *, const Vec *); void mcl_c5_vmul(Vec *, const Vec *, const Vec *); +void mcl_c5_vsqr(Vec *, const Vec *); void mcl_c5_vaddA(VecA *, const VecA *, const VecA *); void mcl_c5_vsubA(VecA *, const VecA *, const VecA *); void mcl_c5_vmulA(VecA *, const VecA *, const VecA *); @@ -237,6 +238,20 @@ template inline V vmulUnitAdd(V *z, const U *x, const V& y) { V H; +#if 1 + V v = x[0]; + z[0] = vmulL(v, y, z[0]); + H = vmulH(v, y, z[1]); + for (size_t i = 1; i < N-1; i++) { + v = x[i]; + z[i] = vmulL(v, y, H); + H = vmulH(v, y, z[i+1]); + } + v = x[N-1]; + z[N-1] = vmulL(v, y, H); + H = vmulH(v, y); + return H; +#else z[0] = vmulL(x[0], y, z[0]); H = vmulH(x[0], y); for (size_t i = 1; i < N; i++) { @@ -244,6 +259,7 @@ inline V vmulUnitAdd(V *z, const U *x, const V& y) H = vmulH(x[i], y); } return H; +#endif } template @@ -339,6 +355,73 @@ inline void vmul(VecA *z, const VecA *x, const VecA *y) } #endif +template +inline void vsqr(V *z, const V *x) +{ +#if 0 + V t[N*2]; + vmulUnit(t, x, x[0]); + V q = vmulL(t[0], G::rp()); + t[N] = vpaddq(t[N], vmulUnitAdd(t, G::ap(), q)); + for (size_t i = 1; i < N; i++) { + t[N+i] = vmulUnitAdd(t+i, x, x[i]); + t[i] = vpaddq(t[i], vpsrlq(t[i-1], W)); + q = vmulL(t[i], G::rp()); + t[N+i] = vpaddq(t[N+i], vmulUnitAdd(t+i, G::ap(), q)); + } +#else + V t[N*2]; + t[0] = vmulL(x[0], x[0]); + for (size_t i = 1; i < N; i++) { + t[i*2-1] = vmulL(x[i], x[i-1]); + t[i*2 ] = vmulH(x[i], x[i-1]); + } + for (size_t i = 2; i < N; i++) { + for (size_t j = i; j < N; j++) { + t[j*2-i ] = vmulL(x[j], x[j-i], t[j*2-i ]); + t[j*2-i+1] = vmulH(x[j], x[j-i], t[j*2-i+1]); + } + } + for (size_t i = 1; i < N*2-1; i++) { + t[i] = vpaddq(t[i], t[i]); + } + for (size_t i = 1; i < N; i++) { + t[i*2-1] = vmulH(x[i-1], x[i-1], t[i*2-1]); + t[i*2] = vmulL(x[i], x[i], t[i*2]); + } + t[N*2-1] = vmulH(x[N-1], x[N-1]); + + for (size_t i = 0; i < N; i++) { + if (i > 0) t[i] = vpaddq(t[i], vpsrlq(t[i-1], W)); + V q = vmulL(t[i], G::rp()); + t[N+i] = vpaddq(t[N+i], vmulUnitAdd(t+i, G::ap(), q)); + } +#endif + for (size_t i = N; i < N*2; i++) { + t[i] = vpaddq(t[i], vpsrlq(t[i-1], W)); + t[i-1] = vpandq(t[i-1], G::mask()); + } + VM c = vsubPre(z, t+N, G::ap()); + uvselect(z, c, t+N, z); +} + +#if 1 +template<> +inline void vsqr(Vec *z, const Vec *x) +{ + mcl_c5_vsqr(z, x); +// mcl_c5_vmul(z, x, x); +} 
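Both this vsqr and the generated mcl_c5_vsqr follow the classic squaring schedule: each off-diagonal product x[i]*x[j] (i != j) is computed once and doubled, then the diagonal squares are added, which is why a dedicated sqr can undercut a general mul (see the clk tables in misc/internal.md). A plain-integer sketch of that schedule (16-bit limbs so the 64-bit column accumulators cannot overflow; not the SIMD code):

```cpp
#include <stdint.h>
#include <string.h>
#include <stddef.h>

// Squaring with the doubling trick: off-diagonal products once, double,
// then add diagonal squares. t[] holds per-column accumulators; carry
// propagation between columns is left out, just as the AVX-512 code defers
// reduction of its 52-bit lanes.
static void sqrColumns(uint64_t* t, const uint16_t* x, size_t N) // t[2N]
{
	memset(t, 0, sizeof(*t) * N * 2);
	for (size_t i = 0; i < N; i++) {
		for (size_t j = i + 1; j < N; j++) {
			t[i + j] += uint64_t(x[i]) * x[j]; // each cross term once
		}
	}
	for (size_t k = 0; k < N * 2; k++) {
		t[k] *= 2; // double all cross terms
	}
	for (size_t i = 0; i < N; i++) {
		t[i * 2] += uint64_t(x[i]) * x[i]; // diagonal terms, not doubled
	}
}
```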
+#endif
+
+#if 1
+template<>
+inline void vsqr(VecA *z, const VecA *x)
+{
+	mcl_c5_vmulA(z, x, x);
+}
+#endif
+
 template<class V>
 inline V getUnitAt(const V *x, size_t xN, size_t bitPos)
 {
@@ -523,7 +606,8 @@ struct FpMT {
 	}
 	static void sqr(T& z, const T& x)
 	{
-		mul(z, x, x);
+		vsqr(z.v, x.v);
+//		mul(z, x, x);
 	}
 	void toMont(T& x) const
 	{
@@ -652,12 +736,19 @@ struct FpM : FpMT<FpM> {
 #endif
 };
-template<class E, size_t n>
-inline void normalizeJacobiVec(E P[n])
+// set y = 1 if isProj
+template<class E>
+inline void normalizeJacobiVec(E *P, size_t n, bool isProj = false)
 {
-	assert(n >= 2);
 	typedef typename E::Fp F;
-	F tbl[n];
+	bool allocated = false;
+	F *tbl = 0;
+	if (sizeof(F) * n < 1024 * 1024) {
+		tbl = (F*)CYBOZU_ALLOCA(sizeof(F) * n);
+	} else {
+		tbl = (F*)Xbyak::AlignedMalloc(sizeof(F) * n, 64);
+		allocated = true;
+	}
 	tbl[0] = F::select(P[0].z.isZero(), F::one(), P[0].z);
 	for (size_t i = 1; i < n; i++) {
 		F t = F::select(P[i].z.isZero(), F::one(), P[i].z);
@@ -681,6 +772,10 @@ inline void normalizeJacobiVec(E *P, size_t n, bool isProj = false)
 		F::mul(rz2, rz2, rz);
 		F::mul(P[pos].y, P[pos].y, rz2); // yz^-3
 		z = F::select(zIsZero, z, F::one());
+		if (isProj) P[pos].y = F::select(zIsZero, F::one(), P[pos].y);
+	}
+	if (allocated) {
+		Xbyak::AlignedFree(tbl);
 	}
 }
@@ -795,7 +890,13 @@ struct EcMT {
 	static void add(T& z, const T& x, const T& y)
 	{
 		if (isProj) {
-			mcl::ec::addCTProj(z, x, y);
+			if (mixed) {
+				T t;
+				mcl::ec::addCTProj(t, x, y, mixed);
+				z = select(y.isZero(), x, t);
+			} else {
+				mcl::ec::addCTProj(z, x, y);
+			}
 		} else {
 			T t;
 			if (mixed) {
@@ -907,16 +1008,11 @@ struct EcMT {
 		v2 = t1.isEqualAll(t2);
 		return kandb(v1, v2);
 	}
-//#define SIGNED_TABLE // a little slower (32.1Mclk->32.4Mclk)
 	template<class V>
 	static void makeNAFtbl(V *idxTbl, VM *negTbl, const V a[2])
 	{
 		const Vec mask = vpbroadcastq((1<<w)-1);
 		const Vec H = vpbroadcastq(1<<(w-1));
 		const Vec Fu = vpbroadcastq(1<<w);
 		const Vec one = vpbroadcastq(1);
 		size_t pos = 0;
 		V CF = vzero<V>();
 		const size_t n = (128 + w-1) / w;
 		for (size_t i = 0; i < n; i++) {
 			V idx = getUnitAt(a, 2, pos);
 			idx = vpaddq(idx, CF);
-#ifdef SIGNED_TABLE
-			VM v = vpcmpgtq(idx, H);
-			idxTbl[i] = vselect(v, vpsubq(Fu, idx), idx); // idx >= H ? Fu - idx : idx;
-			CF = vpsrlq(idx, w);
-			CF = vpaddq(v, CF, one);
-#else
 			V masked = vpandq(idx, mask);
 			negTbl[i] = vpcmpgtq(masked, H);
 			idxTbl[i] = vselect(negTbl[i], vpsubq(Fu, masked), masked); // idx >= H ?
F - idx : idx; CF = vpsrlq(idx, w); CF = vpaddq(negTbl[i], CF, one); -#endif pos += w; } } template - static void mulGLV(T& Q, const T& P, const mcl::msm::FrA *y) + static void mulGLV(T *Q, const T *P, const mcl::msm::FrA *y, size_t n = 1) { const size_t m = sizeof(V)/8; const size_t w = 5; - const size_t halfN = (1<<(w-1))+1; // [0, 2^(w-1)] -#ifdef SIGNED_TABLE - const size_t tblN = 1<(tbl1, halfN, P); - if (!isProj && mixed) normalizeJacobiVec(tbl1+1); - for (size_t i = 0; i < halfN; i++) { - mulLambda(tbl2[i], tbl1[i]); - } -#ifdef SIGNED_TABLE - for (size_t i = halfN; i < tblN; i++) { - T::neg(tbl1[i], tbl1[tblN-i]); - T::neg(tbl2[i], tbl2[tblN-i]); - } -#endif - Unit *pa = (Unit*)a; - Unit *pb = (Unit*)b; - for (size_t i = 0; i < m; i++) { - Unit buf[4]; - g_func.fr->fromMont(buf, y[i].v); - Unit aa[2], bb[2]; - mcl::ec::local::optimizedSplitRawForBLS12_381(aa, bb, buf); - pa[i+m*0] = aa[0]; pa[i+m*1] = aa[1]; - pb[i+m*0] = bb[0]; pb[i+m*1] = bb[1]; + const size_t tblN = (1<<(w-1))+1; // [0, 2^(w-1)] + + T tbl1s[tblN*n]; + for (size_t k = 0; k < n; k++) { + makeTable(tbl1s + tblN*k, tblN, P[k]); } - const size_t bitLen = 128; - const size_t n = (bitLen + w-1)/w; - V aTbl[n], bTbl[n]; - VM aNegTbl[n], bNegTbl[n]; - makeNAFtbl(aTbl, aNegTbl, a); - makeNAFtbl(bTbl, bNegTbl, b); + if (!isProj && mixed) normalizeJacobiVec(tbl1s+1, tblN*n-1); - for (size_t i = 0; i < n; i++) { - if (i > 0) for (size_t k = 0; k < w; k++) T::template dbl(Q, Q); - const size_t pos = n-1-i; + for (size_t k = 0; k < n; k++) { + const T *tbl1 = &tbl1s[tblN*k]; + T tbl2[tblN]; + for (size_t i = 0; i < tblN; i++) { + mulLambda(tbl2[i], tbl1[i]); + } + V a[2], b[2]; + Unit *pa = (Unit*)a; + Unit *pb = (Unit*)b; + for (size_t i = 0; i < m; i++) { + Unit buf[4]; + g_func.fr->fromMont(buf, y[k*m+i].v); + Unit aa[2], bb[2]; + mcl::ec::local::optimizedSplitRawForBLS12_381(aa, bb, buf); + pa[i+m*0] = aa[0]; pa[i+m*1] = aa[1]; + pb[i+m*0] = bb[0]; pb[i+m*1] = bb[1]; + } + const size_t bitLen = 128; + const size_t nw = (bitLen + w-1)/w; + V aTbl[nw], bTbl[nw]; + VM aNegTbl[nw], bNegTbl[nw]; + makeNAFtbl(aTbl, aNegTbl, a); + makeNAFtbl(bTbl, bNegTbl, b); - T t; - V idx = bTbl[pos]; - t.gather(tbl2, idx); -#ifndef SIGNED_TABLE - t.y = F::select(bNegTbl[pos], t.y.neg(), t.y); -#endif - if (i == 0) { - Q = t; - } else { - add(Q, Q, t); + for (size_t i = 0; i < nw; i++) { + if (i > 0) for (size_t j = 0; j < w; j++) T::template dbl(Q[k], Q[k]); + const size_t pos = nw-1-i; + + T t; + V idx = bTbl[pos]; + t.gather(tbl2, idx); + t.y = F::select(bNegTbl[pos], t.y.neg(), t.y); + if (i == 0) { + Q[k] = t; + } else { + add(Q[k], Q[k], t); + } + idx = aTbl[pos]; + t.gather(tbl1, idx); + t.y = F::select(aNegTbl[pos], t.y.neg(), t.y); + add(Q[k], Q[k], t); } - idx = aTbl[pos]; - t.gather(tbl1, idx); -#ifndef SIGNED_TABLE - t.y = F::select(aNegTbl[pos], t.y.neg(), t.y); -#endif - add(Q, Q, t); } } }; @@ -1012,6 +1093,7 @@ struct EcM : EcMT { static const FpM &b3_; static const EcM &zeroProj_; static const EcM &zeroJacobi_; + template void setG1A(const mcl::msm::G1A v[M], bool JacobiToProj = true) { cvtFromG1Ax(x.v, v[0].v+0*6); @@ -1023,7 +1105,7 @@ struct EcM : EcMT { FpM::mul(z, z, g_m64to52u_); if (JacobiToProj) { - mcl::ec::JacobiToProj(*this, *this); + if (!isNormalized) mcl::ec::JacobiToProj(*this, *this); y = FpM::select(z.isZero(), FpM::one(), y); } } @@ -1115,9 +1197,10 @@ inline void reduceSum(mcl::msm::G1A& Q, const G& P) } } -template +template void mulVecUpdateTable(G& win, G *tbl, size_t tblN, const G *xVec, const V *yVec, 
size_t yn, size_t pos, size_t n, bool first) { + const bool isProj = true; const Vec m = vpbroadcastq(tblN-1); for (size_t i = 0; i < tblN; i++) { tbl[i].clear(); @@ -1127,7 +1210,7 @@ void mulVecUpdateTable(G& win, G *tbl, size_t tblN, const G *xVec, const V *yVec v = vpandq(v, m); G T; T.gather(tbl, v); - G::add(T, T, xVec[i]); + G::template add(T, T, xVec[i]); T.scatter(tbl, v); } G sum = tbl[tblN - 1]; @@ -1142,23 +1225,36 @@ void mulVecUpdateTable(G& win, G *tbl, size_t tblN, const G *xVec, const V *yVec } } +inline size_t glvGetBucketSizeAVX512(size_t n) +{ + size_t log2n = mcl::ec::ilog2(n); + const size_t tblMin = 6; + if (log2n < tblMin) return 2; + // n >= 2^tblMin + static const size_t tbl[] = { + 3, 4, 5, 5, 6, 7, 8, 8, 10, 10, 10, 10, 10, 13, 15, 15, 16, 16, 16, 16, 16 + }; + if (log2n >= CYBOZU_NUM_OF_ARRAY(tbl)) return 16; + size_t ret = tbl[log2n - tblMin]; + return ret; +} // xVec[n], yVec[n * maxBitSize/64] -template -inline void mulVecAVX512_inner(mcl::msm::G1A& P, const G *xVec, const V *yVec, size_t n, size_t maxBitSize) +template +inline void mulVecAVX512_inner(mcl::msm::G1A& P, const G *xVec, const V *yVec, size_t n, size_t maxBitSize, size_t b) { - size_t c = mcl::ec::argminForMulVec(n); - size_t tblN = size_t(1) << c; + if (b == 0) b = glvGetBucketSizeAVX512(n); + size_t tblN = size_t(1) << b; G *tbl = (G*)Xbyak::AlignedMalloc(sizeof(G) * tblN, 64); const size_t yn = maxBitSize / 64; - const size_t winN = (maxBitSize + c-1) / c; + const size_t winN = (maxBitSize + b-1) / b; G T; - mulVecUpdateTable(T, tbl, tblN, xVec, yVec, yn, c*(winN-1), n, true); + mulVecUpdateTable(T, tbl, tblN, xVec, yVec, yn, b*(winN-1), n, true); for (size_t w = 1; w < winN; w++) { - for (size_t i = 0; i < c; i++) { + for (size_t i = 0; i < b; i++) { G::dbl(T, T); } - mulVecUpdateTable(T, tbl, tblN, xVec, yVec, yn, c*(winN-1-w), n, false); + mulVecUpdateTable(T, tbl, tblN, xVec, yVec, yn, b*(winN-1-w), n, false); } reduceSum(P, T); Xbyak::AlignedFree(tbl); @@ -1249,9 +1345,10 @@ struct EcMA : EcMT { cvtFpMA2FpM(P[0].z, P[1].z, z); } + template void setG1A(const mcl::msm::G1A v[M*vN], bool JacobiToProj = true) { -#ifdef __clang__ // very slow on gcc, faster on clang +#if 1 assert(vN == 2); cvtFromG1Ax(x.v, v[0].v+0*6); @@ -1263,7 +1360,8 @@ struct EcMA : EcMT { FpMA::mul(z, z, g_m64to52u_); if (JacobiToProj) { - mcl::ec::JacobiToProj(*this, *this); + // Jacobi = Proj if normalized + if (!isNormalized) mcl::ec::JacobiToProj(*this, *this); y = FpMA::select(z.isZero(), FpMA::one(), y); } #else @@ -1276,7 +1374,7 @@ struct EcMA : EcMT { } void getG1A(mcl::msm::G1A v[M*vN], bool ProjToJacobi = true) const { -#ifdef __clang__ // very slow on gcc, faster on clang +#if 1 EcMA T = *this; if (ProjToJacobi) mcl::ec::ProjToJacobi(T, T); @@ -1301,40 +1399,66 @@ const FpMA& EcMA::b3_ = *(const FpMA*)g_b3A_; const EcMA& EcMA::zeroProj_ = *(const EcMA*)g_zeroProjA_; const EcMA& EcMA::zeroJacobi_ = *(const EcMA*)g_zeroJacobiA_; +#define USE_GLV + template -void mulVecAVX512T(Unit *_P, Unit *_x, const Unit *_y, size_t n) +void mulVecAVX512T(Unit *_P, Unit *_x, const Unit *_y, size_t n, size_t b = 0) { mcl::msm::G1A& P = *(mcl::msm::G1A*)_P; mcl::msm::G1A *x = (mcl::msm::G1A*)_x; const mcl::msm::FrA *y = (const mcl::msm::FrA*)_y; const size_t m = sizeof(V)/8; const size_t d = n/m; +#ifdef USE_GLV + const size_t e = 2; +#else + const size_t e = 1; +#endif const mcl::fp::Op *fr = g_func.fr; -// mcl::ec::normalizeVec(x, x, n); + const bool mixed = true; + if (mixed) { +// g_func.normalizeVecG1(x, x, n); + } - G 
*xVec = (G*)Xbyak::AlignedMalloc(sizeof(G) * d * 2, 64); + G *xVec = (G*)Xbyak::AlignedMalloc(sizeof(G) * d * e, 64); V *yVec = (V*)Xbyak::AlignedMalloc(sizeof(V) * d * 4, 64); for (size_t i = 0; i < d; i++) { - xVec[i*2].setG1A(x+i*m); - G::mulLambda(xVec[i*2+1], xVec[i*2]); + xVec[i].template setG1A(x+i*m); + } + normalizeJacobiVec(xVec, d, true); +#ifdef USE_GLV + for (size_t i = 0; i < d; i++) { + G::mulLambda(xVec[d+i], xVec[i]); } - Unit *py = (Unit*)yVec; + Unit *const py = (Unit*)yVec; + Unit *const py2 = py + d*m*2; for (size_t i = 0; i < d; i++) { for (size_t j = 0; j < m; j++) { Unit ya[4]; fr->fromMont(ya, y[i*m+j].v); Unit a[2], b[2]; mcl::ec::local::optimizedSplitRawForBLS12_381(a, b, ya); - py[j+0] = a[0]; - py[j+m] = a[1]; - py[j+m*2] = b[0]; - py[j+m*3] = b[1]; + py[i*m*2+j+0] = a[0]; + py[i*m*2+j+m] = a[1]; + py2[i*m*2+j+0] = b[0]; + py2[i*m*2+j+m] = b[1]; + } + } +#else + Unit *const py = (Unit*)yVec; + for (size_t i = 0; i < d; i++) { + for (size_t j = 0; j < m; j++) { + Unit ya[4]; + fr->fromMont(ya, y[i*m+j].v); + for (size_t k = 0; k < 4; k++) { + py[(i*4+k)*m+j] = ya[k]; + } } - py += m*4; } - mulVecAVX512_inner(P, xVec, yVec, d*2, 128); +#endif + mulVecAVX512_inner(P, xVec, yVec, d * e, 256 / e, b); Xbyak::AlignedFree(yVec); Xbyak::AlignedFree(xVec); @@ -1351,10 +1475,10 @@ void mulVecAVX512T(Unit *_P, Unit *_x, const Unit *_y, size_t n) namespace mcl { namespace msm { -void mulVecAVX512(Unit *P, Unit *x, const Unit *y, size_t n) +void mulVecAVX512(Unit *P, Unit *x, const Unit *y, size_t n, size_t b = 0) { - mulVecAVX512T(P, x, y, n); -// mulVecAVX512T(P, x, y, n); // slower + mulVecAVX512T(P, x, y, n, b); +// mulVecAVX512T(P, x, y, n, b); // slower } void mulEachAVX512(Unit *_x, const Unit *_y, size_t n) @@ -1365,22 +1489,30 @@ void mulEachAVX512(Unit *_x, const Unit *_y, size_t n) mcl::msm::G1A *x = (mcl::msm::G1A*)_x; const mcl::msm::FrA *y = (const mcl::msm::FrA*)_y; if (!isProj && mixed) g_func.normalizeVecG1(x, x, n); + const size_t u = 4; + const size_t q = n / u; #if 1 - // 30.6Mclk at n=1024 - for (size_t i = 0; i < n; i += 16) { - EcMA P; - P.setG1A(x+i, isProj); - EcMA::mulGLV(P, P, y+i); - P.getG1A(x+i, isProj); - } + typedef EcMA V; #else - for (size_t i = 0; i < n; i += 8) { - EcM P; + typedef EcM V; +#endif + const size_t m = sizeof(V)/(sizeof(FpM)*3)*8; + for (size_t i = 0; i < n; i += m*u) { + V P[u]; + for (size_t k = 0; k < u; k++) { + P[k].setG1A(x+i+k*m, isProj); + } + V::mulGLV(P, P, y+i, u); + for (size_t k = 0; k < u; k++) { + P[k].getG1A(x+i+k*m, isProj); + } + } + for (size_t i = q*m*u; i < n; i += m) { + V P; P.setG1A(x+i, isProj); - EcM::mulGLV(P, P, y+i); + V::mulGLV(&P, &P, y+i); P.getG1A(x+i, isProj); } -#endif } bool initMsm(const mcl::CurveParam& cp, const mcl::msm::Func *func) @@ -1411,20 +1543,11 @@ bool initMsm(const mcl::CurveParam& cp, const mcl::msm::Func *func) #include #include -using namespace mcl::bn; - -#if 0 -#include -int main(int argc, char *argv[]) -{ - Vec x[8], y[8]; - memset(x, argc, sizeof(x)); - memset(y, argc+1, sizeof(y)); - vsub(x, x, y); - mcl::bint::dump((const uint8_t*)x, sizeof(x)); -} -#else +#define CYBOZU_TEST_DISABLE_AUTO_RUN #include +#include + +using namespace mcl::bn; template inline void toArray(Unit x[N], const mpz_class& mx) @@ -1571,6 +1694,52 @@ CYBOZU_TEST_AUTO(init) g_mont.init(mcl::bn::Fp::getOp().mp); } +CYBOZU_TEST_AUTO(glvParam) +{ + const char *item[] = {"d", "exact b", "cost", "MiB", "heuristic b", "cost", "MiB" }; + const size_t itemN = CYBOZU_NUM_OF_ARRAY(item); + for (size_t i = 0; 
i < itemN; i++) { + if (i > 0) putchar('|'); + printf("%s", item[i]); + } + printf("\n"); + for (size_t i = 0; i < itemN; i++) { + if (i > 0) putchar('|'); + putchar('-'); + } + printf("\n"); + for (size_t d = 9; d < 28; d++) { + size_t n = (size_t(1) << d)/8*2; // /(#SIMD)*(GLV) + size_t b1 = mcl::ec::glvGetTheoreticBucketSize(n); + size_t cost1 = mcl::ec::glvCost(n, b1); + double mem1 = (8*8*8*3) * (size_t(1) << b1) / 1024.0 / 1024; + size_t b2 = glvGetBucketSizeAVX512(n); + size_t cost2 = mcl::ec::glvCost(n, b2); + double mem2 = (8*8*8*3) * (size_t(1) << b2) / 1024.0 / 1024; + printf("%zd|%zd|%zd|%.2f|%zd|%zd(%.2fx)|%.2f(%.2fx)\n", d, b1, cost1, mem1, b2, cost2, cost2/double(cost1), mem2, mem1/mem2); + } +} + +#if 0 +CYBOZU_TEST_AUTO(sqr) +{ + const size_t n = 8; + cybozu::XorShift rg; + Vec v[n]; + Vec z1[n], z2[n]; + for (size_t i = 0; i < n*8; i++) ((Unit*)v)[i] = rg.get32(); + for (size_t i = 0; i < n; i++) dump(v[i], "v"); + + vsqr(z1, v); + mcl_c5_vsqr(z2, v); + for (size_t i = 0; i < n*2; i++) { + printf("i=%zd eq=%s\n", i, memcmp(&z1[i], &z2[i], sizeof(Vec)) == 0 ? "o" : "x"); + dump(z1[i], "z1"); + dump(z2[i], "z2"); + } +} +#endif + void setParam(G1 *P, Fr *x, size_t n, cybozu::XorShift& rg) { for (size_t i = 0; i < n; i++) { @@ -1599,10 +1768,7 @@ CYBOZU_TEST_AUTO(cmp) EcM PM, QM; cybozu::XorShift rg; - for (size_t i = 0; i < n; i++) { - uint32_t v = rg.get32(); - hashAndMapToG1(P[i], &v, sizeof(v)); - } + setParam(P, 0, n, rg); PM.setG1A(PA); QM.setG1A(PA); v = PM.isEqualJacobiAll(QM); @@ -1698,6 +1864,20 @@ void asmTest(const mcl::bn::Fp x[8], const mcl::bn::Fp y[8]) dump(wm.v[i], "ng"); } } + // sqr + for (size_t i = 0; i < 8; i++) { + mcl::bn::Fp::sqr(z[i], x[i]); + } + mcl_c5_vsqr(zm.v, xm.v); + wm.setFpA((const mcl::msm::FpA*)z); + CYBOZU_TEST_ASSERT(zm == wm); + if (zm != wm) { + for (size_t i = 0; i < 8; i++) { + printf("i=%zd\n", i); + dump(zm.v[i], "ok"); + dump(wm.v[i], "ng"); + } + } } #if 1 @@ -1855,6 +2035,8 @@ CYBOZU_TEST_AUTO(vaddPre) CYBOZU_BENCH_C("vsub::VecA", C, vsub, za.v, za.v, xa.v); CYBOZU_BENCH_C("vmul::Vec", C/10, vmul, z[0].v, z[0].v, x[0].v); CYBOZU_BENCH_C("vmul::VecA", C/10, vmul, za.v, za.v, xa.v); + CYBOZU_BENCH_C("vsqr::Vec", C/10, vsqr, z[0].v, z[0].v); + CYBOZU_BENCH_C("vsqr::VecA", C/10, vsqr, za.v, za.v); CYBOZU_BENCH_C("FpM::inv", C/100, FpM::inv, z[0], z[0]); CYBOZU_BENCH_C("FpMA::inv", C/100, FpMA::inv, za, za); { @@ -1967,6 +2149,7 @@ CYBOZU_TEST_AUTO(op) } CYBOZU_BENCH_C("FpM::add", C, FpM::add, a[0], a[0], b[0]); CYBOZU_BENCH_C("FpM::mul", C, FpM::mul, a[0], a[0], b[0]); + CYBOZU_BENCH_C("FpM::sqr", C, FpM::sqr, a[0], a[0]); CYBOZU_BENCH_C("mulu", C, FpM::mul, a[0], a[0], g_m64to52u_); CYBOZU_BENCH_C("addn", C, lpN, FpM::add, a, a, b, n); CYBOZU_BENCH_C("muln", C, lpN, FpM::mul, a, a, b, n); @@ -2066,22 +2249,24 @@ CYBOZU_TEST_AUTO(opA) CYBOZU_TEST_AUTO(normalizeJacobiVec) { const bool isProj = false; - const size_t n = 64; - G1 P[n], Q[n], R[n]; - EcM PP[n/8]; - cybozu::XorShift rg; - setParam(P, 0, n, rg); - P[n/2].clear(); - P[n/3].clear(); - mcl::ec::normalizeVec(Q, P, n); - for (size_t i = 0; i < n/8; i++) { - PP[i].setG1A((mcl::msm::G1A*)&P[i*8], isProj); - } - normalizeJacobiVec(PP); - for (size_t i = 0; i < n/8; i++) { - PP[i].getG1A((mcl::msm::G1A*)&R[i*8], isProj); + const size_t N = 64; + G1 P[N], Q[N], R[N]; + EcM PP[N/8]; + for (size_t n = 8; n < N; n += 8) { + cybozu::XorShift rg; + setParam(P, 0, n, rg); + P[n/2].clear(); + P[n/3].clear(); + mcl::ec::normalizeVec(Q, P, n); + for (size_t i = 0; i < n/8; i++) { + 
PP[i].setG1A((mcl::msm::G1A*)&P[i*8], isProj); + } + normalizeJacobiVec(PP, n/8); + for (size_t i = 0; i < n/8; i++) { + PP[i].getG1A((mcl::msm::G1A*)&R[i*8], isProj); + } + CYBOZU_TEST_EQUAL_ARRAY(P, R, n); } - CYBOZU_TEST_EQUAL_ARRAY(P, R, n); const int C = 10000; CYBOZU_BENCH_C("EcM::setG1A:proj", C, PP[0].setG1A, (mcl::msm::G1A*)P, true); CYBOZU_BENCH_C("EcM::setG1A:jacobi", C, PP[0].setG1A, (mcl::msm::G1A*)P, false); @@ -2126,6 +2311,13 @@ CYBOZU_TEST_AUTO(mulEach_special) } } +void mulEachOrg(G1 *P, const Fr *x, size_t n) +{ + for (size_t i = 0; i < n; i++) { + G1::mul(P[i], P[i], x[i]); + } +} + CYBOZU_TEST_AUTO(mulEach) { const size_t n = 1024; @@ -2150,10 +2342,18 @@ CYBOZU_TEST_AUTO(mulEach) } } #ifdef NDEBUG + CYBOZU_BENCH_C("mulEachOrg", 100, mulEachOrg, Q, x, n); CYBOZU_BENCH_C("mulEach", 100, G1::mulEach, Q, x, n); #endif } +void copyMulVec(G1& R, const G1 *_P, const Fr *x, size_t n) +{ + G1 *P = (G1*)CYBOZU_ALLOCA(sizeof(G1) * n); + mcl::bint::copyN(P, _P, n); + mcl::msm::mulVecAVX512((Unit*)&R, (Unit*)P, (const Unit*)x, n); +} + CYBOZU_TEST_AUTO(mulVec) { const size_t n = 8203; @@ -2169,12 +2369,92 @@ CYBOZU_TEST_AUTO(mulVec) G1::mul(T, P[i], x[i]); Q += T; } + G1 P2[n]; + mcl::bint::copyN(P2, P, n); mcl::msm::mulVecAVX512((Unit*)&R, (Unit*)P, (const Unit*)x, n); // G1::mulVec(R, P, x, n); CYBOZU_TEST_EQUAL(Q, R); #ifdef NDEBUG + G1 R2; + CYBOZU_BENCH_C("mulVec(copy)", 30, copyMulVec, R2, P2, x, n); CYBOZU_BENCH_C("mulVec", 30, mcl::msm::mulVecAVX512, (Unit*)&R, (Unit*)P, (const Unit*)x, n); + CYBOZU_TEST_EQUAL(R, R2); #endif } -#endif + +void msmBench(int C, size_t db, size_t de, size_t b) +{ + initPairing(mcl::BLS12_381); + + printf("d = [%zd, %zd], b = %zd\n", db, de, b); + const size_t maxN = size_t(1) << de; + cybozu::XorShift rg; + std::vector Pvec(maxN); + std::vector xVec(maxN); + hashAndMapToG1(Pvec[0], "abc", 3); + for (size_t i = 1; i < maxN; i++) { + G1::add(Pvec[i], Pvec[i-1], Pvec[0]); + xVec[i].setByCSPRNG(rg); + } + G1 P1; + for (size_t d = db; d <= de; d++) { + size_t n = size_t(1) << d; + size_t b1 = glvGetBucketSizeAVX512(n/4); + printf("% 8zd % 2zd", n, b1); + CYBOZU_BENCH_C(" ", C, mcl::msm::mulVecAVX512, (Unit*)&P1, (Unit*)Pvec.data(), (const Unit*)xVec.data(), n, b1); + size_t b2 = b ? b : mcl::ec::glvGetTheoreticBucketSize(n/4); + printf("% 8zd % 2zd", n, size_t(b2)); + CYBOZU_BENCH_C(" ", C, mcl::msm::mulVecAVX512, (Unit*)&P1, (Unit*)Pvec.data(), (const Unit*)xVec.data(), n, b2); + } +} + +void showParams() +{ + puts("d|b|cost_b|theoretic|cost_t|cost_b/cost_t"); + for (size_t d = 7; d <= 27; d++) { + size_t n = size_t(1) << d; + size_t nn = n/8*2; // /#SIMD*GLV + size_t b1 = glvGetBucketSizeAVX512(nn); + size_t c1 = mcl::ec::glvCost(nn, b1); + size_t b2 = mcl::ec::glvGetTheoreticBucketSize(nn); + size_t c2 = mcl::ec::glvCost(nn, b2); + printf("%zd|%zd|%zd|%zd|%zd|%.2f\n", d, b1, c1, b2, c2, c1/double(c2)); + } +} + +int main(int argc, char *argv[]) +{ + cybozu::Option opt; + size_t db, de, d; + size_t b; + bool msm, show; + int C; + opt.appendOpt(&b, 0, "b", ": set bucket size"); + opt.appendOpt(&d, 9, "d", ": set n to 1< FpVec; void f(FpVec& zv, const FpVec& xv, const FpVec& yv)