diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 00000000..2f957d54 --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,32 @@ +# Contributing to mcl + +Thank you for considering contributing to the mcl project. This document provides guidelines on how to contribute. + +## Bug Reports and Feedback + +If you find a bug, have a feature request, or have questions, please open an issue. Include the following information: + +- Detailed description of the problem +- Steps to reproduce +- Expected behavior +- Actual behavior +- Environment details (OS, compiler version, etc.) + +## Creating Pull Requests + +If you want to add features or make fixes, follow these steps to create a pull request: + +1. Fork the repository +2. Create a new branch: `git checkout -b my-feature-branch` +3. Make your changes +4. Run tests and ensure all tests pass +5. Commit your changes: `git commit -am 'Add new feature'` +6. Push the branch: `git push origin my-feature-branch` +7. Create a pull request + +When creating a pull request, clearly describe the changes and include any related issue numbers. + +## License + +mcl is released under the BSD-3-Clause License. Any code contributions will be licensed under the same license. + diff --git a/CMakeLists.txt b/CMakeLists.txt index e7664c4f..6cd09109 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -283,8 +283,10 @@ endif() foreach(bit IN ITEMS 256 384 384_256) if (MCL_STATIC_LIB) add_library(mclbn${bit} STATIC src/bn_c${bit}.cpp) + target_link_libraries(mclbn${bit} PUBLIC mcl::mcl_st) else() add_library(mclbn${bit} SHARED src/bn_c${bit}.cpp) + target_link_libraries(mclbn${bit} PUBLIC mcl::mcl) endif() add_library(mcl::mclbn${bit} ALIAS mclbn${bit}) set_target_properties(mclbn${bit} PROPERTIES @@ -294,7 +296,6 @@ foreach(bit IN ITEMS 256 384 384_256) target_compile_options(mclbn${bit} PRIVATE ${MCL_COMPILE_OPTIONS}) target_compile_definitions(mclbn${bit} PUBLIC MCL_NO_AUTOLINK MCLBN_NO_AUTOLINK) - target_link_libraries(mclbn${bit} PUBLIC mcl::mcl) set_target_properties(mclbn${bit} PROPERTIES VERSION ${mcl_VERSION} SOVERSION ${mcl_VERSION_MAJOR}) diff --git a/Makefile b/Makefile index ad7cf292..e334dca7 100644 --- a/Makefile +++ b/Makefile @@ -443,6 +443,13 @@ bin/llvm_test64.exe: test/llvm_test.cpp src/base64.ll bin/llvm_test32.exe: test/llvm_test.cpp src/base32.ll $(CLANG) -o $@ -Ofast -DNDEBUG -Wall -Wextra -I ./include test/llvm_test.cpp src/base32.ll -m32 +$(OBJ_DIR)/$(MSM)_test.o: src/$(MSM).cpp + $(PRE)$(CXX) -c $< -o $@ $(CFLAGS) -mavx512f -mavx512ifma -std=c++11 $(CFLAGS_USER) -DMCL_MSM_TEST +MSM_TEST_OBJ=$(OBJ_DIR)/$(MSM)_test.o $(filter-out $(OBJ_DIR)/msm_avx.o,$(LIB_OBJ)) +$(EXE_DIR)/msm_test.exe: $(MSM_TEST_OBJ) + $(PRE)$(CXX) -o $@ $(LDFLAGS) $(MSM_TEST_OBJ) +-include $(OBJ_DIR)/msm_test.d + make_tbl: $(MAKE) ../bls/src/qcoeff-bn254.hpp diff --git a/api.md b/api.md index ac6e98cb..06222df0 100644 --- a/api.md +++ b/api.md @@ -61,7 +61,7 @@ r = |G1| = |G2| = |GT| curveType | b| r and p | ------------|--|------------------| BN254 | 2|r = 0x2523648240000001ba344d8000000007ff9f800000000010a10000000000000d
p = 0x2523648240000001ba344d80000000086121000000000013a700000000000013 | -BN_SNARK1|3|r = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47
p = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47| +BN_SNARK1|3|r = 0x30644e72e131a029b85045b68181585d2833e84879b9709143e1f593f0000001
p = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47| BLS12-381 | 4|r = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001
p = 0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab | BN381 | 2|r = 0x240026400f3d82b2e42de125b00158405b710818ac000007e0042f008e3e00000000001080046200000000000000000d
p = 0x240026400f3d82b2e42de125b00158405b710818ac00000840046200950400000000001380052e000000000000000013 | @@ -253,7 +253,7 @@ C++ T x = ; ``` -### Set `buf[0..bufSize-1]` to `x` with masking according to the following way. +### Set `bufSize` bytes `buf` to `x` with masking according to the following way. ``` int mclBnFp_setLittleEndian(mclBnFp *x, const void *buf, mclSize bufSize); int mclBnFr_setLittleEndian(mclBnFr *x, const void *buf, mclSize bufSize); @@ -270,7 +270,7 @@ T::setArrayMask(const uint8_t *buf, size_t n); - always return 0 -### Set (`buf[0..bufSize-1]` mod `p` or `r`) to `x`. +### Set `bufSize` bytes `buf` of mod `p` or `r` to `x`. ``` int mclBnFp_setLittleEndianMod(mclBnFp *x, const void *buf, mclSize bufSize); int mclBnFr_setLittleEndianMod(mclBnFr *x, const void *buf, mclSize bufSize); @@ -281,7 +281,7 @@ C++ T::setLittleEndianMod(const uint8_t *buf, mclSize bufSize); ``` -- return 0 if bufSize <= (sizeof(*x) * 8 * 2) else -1 +- return 0 if bufSize <= (sizeof(T) * 2) else -1 ### Get little-endian byte sequence `buf` corresponding to `x` ``` diff --git a/include/mcl/bn.h b/include/mcl/bn.h index 41ac9b06..8aa4b81d 100644 --- a/include/mcl/bn.h +++ b/include/mcl/bn.h @@ -485,6 +485,9 @@ MCLBN_DLL_API void mclBnG1_mulVec(mclBnG1 *z, mclBnG1 *x, const mclBnFr *y, mclS MCLBN_DLL_API void mclBnG2_mulVec(mclBnG2 *z, mclBnG2 *x, const mclBnFr *y, mclSize n); MCLBN_DLL_API void mclBnGT_powVec(mclBnGT *z, const mclBnGT *x, const mclBnFr *y, mclSize n); +// x[i] *= y[i] +MCLBN_DLL_API void mclBnG1_mulEach(mclBnG1 *x, const mclBnFr *y, mclSize n); + MCLBN_DLL_API void mclBn_pairing(mclBnGT *z, const mclBnG1 *x, const mclBnG2 *y); MCLBN_DLL_API void mclBn_finalExp(mclBnGT *y, const mclBnGT *x); MCLBN_DLL_API void mclBn_millerLoop(mclBnGT *z, const mclBnG1 *x, const mclBnG2 *y); diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp index 8e8fb9ca..480e434f 100644 --- a/include/mcl/bn.hpp +++ b/include/mcl/bn.hpp @@ -48,6 +48,7 @@ namespace msm { bool initMsm(const mcl::CurveParam& cp, const Param *param); void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n); +void mulEachAVX512(Unit *_x, const Unit *_y, size_t n); } // mcl::msm #endif @@ -692,9 +693,12 @@ struct GLV1 : mcl::GLV1T { const mpz_class& r = Fr::getOp().mp; B[0][0] = z * z - 1; // L v0 = (B[0][0] << rBitSize) / r; - if (curveType == BLS12_381.curveType && MCL_SIZEOF_UNIT == 8) { +#if MCL_SIZEOF_UNIT == 8 + if (curveType == BLS12_381.curveType) { optimizedSplit = optimizedSplitForBLS12_381; - } else { + } else +#endif + { optimizedSplit = splitForBLS12; } } else { @@ -723,38 +727,19 @@ struct GLV1 : mcl::GLV1T { b = (x * v0) >> rBitSize; a = x - b * B[0][0]; } +#if MCL_SIZEOF_UNIT == 8 static inline void optimizedSplitForBLS12_381(mpz_class u[2], const mpz_class& x) { - assert(sizeof(Unit) == 8); - /* - z = -0xd201000000010000 - L = z^2-1 = 0xac45a4010001a40200000000ffffffff - r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001 - s=255 - v = 0xbe35f678f00fd56eb1fb72917b67f718 - */ - mpz_class& a = u[0]; - mpz_class& b = u[1]; - static const uint64_t Lv[] = { 0x00000000ffffffff, 0xac45a4010001a402 }; - static const uint64_t vv[] = { 0xb1fb72917b67f718, 0xbe35f678f00fd56e }; static const size_t n = 128 / mcl::UnitBitSize; - Unit t[n*3]; - // n = 128 bit - // t[n*3] = x[n*2] * vv[n] - mcl::bint::mulNM(t, gmp::getUnit(x), n*2, (const Unit*)vv, n); - // t[n] <- t[n*3] - mcl::bint::shrT(t, t+n*2-1, mcl::UnitBitSize-1); // >>255 + Unit xa[n*2], a[2], b[2]; + mcl::gmp::getArray(xa, n*2, x); + ec::local::optimizedSplitRawForBLS12_381(a, b, xa); bool dummy; - gmp::setArray(&dummy, b, t, n); - Unit t2[n*2]; - // t2[n*2] = t[n] * Lv[n] - // Do not overlap I/O buffers on pre-Broadwell CPUs. - mcl::bint::mulT(t2, t, (const Unit*)Lv); - // t[n] = x[n*2] - t2[n*2] - mcl::bint::subT(t, gmp::getUnit(x), t2); - gmp::setArray(&dummy, a, t, n); + gmp::setArray(&dummy, u[0], a, n); + gmp::setArray(&dummy, u[1], b, n); (void)dummy; } +#endif }; /* @@ -2314,6 +2299,7 @@ inline void init(bool *pb, const mcl::CurveParam& cp = mcl::BN254, fp::Mode mode if (sizeof(Unit) == 8 && sizeof(Fp) == sizeof(mcl::msm::FpA) && sizeof(Fr) == sizeof(mcl::msm::FrA)) { if (mcl::msm::initMsm(cp, ¶)) { G1::setMulVecOpti(mcl::msm::mulVecAVX512); + G1::setMulEachOpti(mcl::msm::mulEachAVX512); } } #endif diff --git a/include/mcl/ec.hpp b/include/mcl/ec.hpp index 88e9092b..028153ff 100644 --- a/include/mcl/ec.hpp +++ b/include/mcl/ec.hpp @@ -242,10 +242,54 @@ void normalizeVecT(Eout& Q, Ein& P, size_t n, size_t N = 256) } } +#if MCL_SIZEOF_UNIT == 8 +/* + split x to (a, b) such that x = a + b L where 0 <= a, b <= L, 0 <= x <= r-1 = L^2+L + if adj is true, then 0 <= a < L, 0 <= b <= L+1 +*/ +inline void optimizedSplitRawForBLS12_381(Unit a[2], Unit b[2], const Unit x[4]) +{ + const bool adj = false; + assert(sizeof(Unit) == 8); + /* + z = -0xd201000000010000 + L = z^2-1 = 0xac45a4010001a40200000000ffffffff + r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001 + s=255 + v = (1<>255 + mcl::bint::shrT(t, t+n*2-1, mcl::UnitBitSize-1); // >>255 + b[0] = t[0]; + b[1] = t[1]; + Unit t2[n*2]; + // t2[n*2] = t[n] * L[n] + // Do not overlap I/O buffers on pre-Broadwell CPUs. + mcl::bint::mulT(t2, t, L); + // a[n] = x[n*2] - t2[n*2] + mcl::bint::subT(a, x, t2); + if (adj) { + if (mcl::bint::cmpEqT(a, L)) { + // if a == L then b = b + 1 and a = 0 + mcl::bint::addT(b, b, one); + mcl::bint::clearT(a); + } + } +} +#endif + } // mcl::ec::local // [X:Y:Z] as Proj = (X/Z, Y/Z) as Affine = [XZ:YZ^2:Z] as Jacobi -// Remark. convert P = [1:0:0] to Q = [0:0:0] +// Remark. convert P = [*:*:0] to Q = [0:0:0] template void ProjToJacobi(E& Q, const E& P) { @@ -257,7 +301,7 @@ void ProjToJacobi(E& Q, const E& P) } // [X:Y:Z] as Jacobi = (X/Z^2, Y/Z^3) as Affine = [XZ:Y:Z^3] as Proj -// Remark. convert P = [1:1:0] to Q = [0:1:0] +// Remark. convert P = [*:1:0] to Q = [0:1:0] template void JacobiToProj(E& Q, const E& P) { @@ -1316,6 +1360,7 @@ class EcT : public fp::Serializable > { static mpz_class order_; static bool (*mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime); static void (*mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n); + static void (*mulEachOpti)(Unit *xVec, const Unit *yVec, size_t n); static bool (*isValidOrderFast)(const EcT& x); /* default constructor is undefined value */ EcT() {} @@ -1382,6 +1427,7 @@ class EcT : public fp::Serializable > { order_ = 0; mulVecGLV = 0; mulVecOpti = 0; + mulEachOpti = 0; isValidOrderFast = 0; mode_ = mode; } @@ -1412,6 +1458,10 @@ class EcT : public fp::Serializable > { { mulVecOpti = f; } + static void setMulEachOpti(void f(Unit *_xVec, const Unit *_yVec, size_t yn)) + { + mulEachOpti = f; + } static inline void init(bool *pb, const char *astr, const char *bstr, int mode = ec::Jacobi) { Fp a, b; @@ -1463,12 +1513,14 @@ class EcT : public fp::Serializable > { void clear() { if (mode_ == ec::Jacobi) { - x = 1; - } else { + x = 0; + y = 0; + z.clear(); + } else { // ec::Proj x.clear(); + y = 1; + z.clear(); } - y = 1; - z.clear(); } static inline void clear(EcT& P) { @@ -2080,7 +2132,7 @@ class EcT : public fp::Serializable > { return; } if (mulVecOpti && n >= 128) { - mulVecOpti((Unit*)&z, (Unit*)xVec, (const Unit*)yVec, n); + mulVecOpti((Unit*)&z, (Unit*)xVec, yVec[0].getUnit(), n); return; } if (mulVecGLV && mulVecGLV(z, xVec, yVec, n, false)) { @@ -2133,6 +2185,20 @@ class EcT : public fp::Serializable > { mulVec(z, xVec, yVec, n); #endif } + // xVec[i] *= yVec[i] + static void mulEach(EcT *xVec, const EcT::Fr *yVec, size_t n) + { + if (mulEachOpti && n >= 8) { + size_t n8 = n & ~size_t(7); + mulEachOpti((Unit*)xVec, yVec[0].getUnit(), n8); + xVec += n8; + yVec += n8; + n -= n8; + } + for (size_t i = 0; i < n; i++) { + xVec[i] *= yVec[i]; + } + } #ifndef CYBOZU_DONT_USE_EXCEPTION static inline void init(const std::string& astr, const std::string& bstr, int mode = ec::Jacobi) { @@ -2192,6 +2258,7 @@ template bool (*EcT::mulVecGLV)(EcT& z, const EcT *x template void (*EcT::mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n); template bool (*EcT::isValidOrderFast)(const EcT& x); template int EcT::mode_; +template void (*EcT::mulEachOpti)(Unit *xVec, const Unit *yVec, size_t n); // r = the order of Ec template diff --git a/include/mcl/impl/bn_c_impl.hpp b/include/mcl/impl/bn_c_impl.hpp index 00f49c5f..77bef385 100644 --- a/include/mcl/impl/bn_c_impl.hpp +++ b/include/mcl/impl/bn_c_impl.hpp @@ -639,6 +639,10 @@ void mclBnGT_powVec(mclBnGT *z, const mclBnGT *x, const mclBnFr *y, mclSize n) { GT::powVec(*cast(z), cast(x), cast(y), n); } +void mclBnG1_mulEach(mclBnG1 *x, const mclBnFr *y, mclSize n) +{ + G1::mulEach(cast(x), cast(y), n); +} void mclBn_pairing(mclBnGT *z, const mclBnG1 *x, const mclBnG2 *y) { diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 9e544144..8e3e68c0 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -29,7 +29,7 @@ namespace mcl { -static const int version = 0x191; /* 0xABC = A.BC */ +static const int version = 0x192; /* 0xABC = A.BC */ /* specifies available string format mode for X::setIoMode() diff --git a/misc/internal.md b/misc/internal.md index b1b2fdd2..9f5d362d 100644 --- a/misc/internal.md +++ b/misc/internal.md @@ -16,6 +16,10 @@ w/o IFMA|66.498|122.666|227.042|426.498 w IFMA|46.411|87.002|153.958|300.331 speed up rate|1.43|1.41|1.47|1.42 +G1 mulEach +- w/o IFMA : 42.166Mclk +- w IFMA : 16.643Mclk + # GLV method ## Split function for BLS12-381 @@ -41,33 +45,42 @@ bit_length|64|128|255|255|128 ### Split function ```python +adj = False def split(x): b = (x * v) >> s a = x - b * L + if adj: + if a >= L: + a -= L + b += 1 return (a, b) ``` +- x in [0, r-1] - a + b L = x for (a, b) = split(x). ### Theorem -0 <= a, b < H for all x in [0, M-r]. +0 <= a < 1.11 L < H and 0 <= b < L+1 for x in [0, r-1]. ### Proof ``` -Let r0 := L S % r, then S=v L + r0 and r0 in [0, L-1]. +Let r0 := L S % r, then S=v L + r0 and r0 in [0, L-1]. In fact, r0 ~ 0.11 L. Let r1 := x v % S, then x v = b S + r1 and r1 in [0, S-1]. ``` ``` -b <= xv / S < (M-r) (S/L)/S = (M-r)/L < H. +b <= xv / S < (r-1) (S/L)/S = (r-1)/L = L+1. ``` ``` aS = (x - bL)S = xS - bSL = xS - (xv - r1)L = x(S - vL) + r1 L = r0 x + r1 L - <= r0 (M-r) + (S-1)L < S H. + <= r0 (r-1) + (S-1)L = S L + (r-1)r0 - L. +a <= L + ((r-1)r0 - L)/S +((r-1)r0 - L)/S ~ 0.10016 L < 0.11 L. ``` -Then, a < H. -So for x in [0, M-1], set x = x - r if x >= H and apply split() to x. +### Remark +If adj is true, then a is in [0, L-1]. + ## window size - 128-bit (Fr is 256 bit and use GLV method) @@ -84,14 +97,15 @@ f(w)|130|68|51|48|58|86 argmin f(w) = 4 -## Use projective coordinates +## Selection of coordinates -- psuedo code of GLV method +### psuedo code of GLV method ```python def mul(P, x): - (a, b) = split(x) - # a, b < 1<<128 + assert(0 <= x < r) + (a, b) = split(x) # x = a + b L + # a, b < H=1<<128 w = 4 for i in range(1<> (w*i)) & mask - j2 = (b >> (w*i)) & mask ### AAA - Q = add(Q, tbl1[j1]) - Q = add(Q, tbl2[j2]) + j2 = (b >> (w*i)) & mask + Q = add(Q, tbl2[j2]) # ADD1 + Q = add(Q, tbl1[j1]) # ADD2 return Q ``` -The values of tbl1[i] are 0, P, ..., 15P, and the values of tbl2[i] are 0, LP, ... , 15LP. -Since L is odd and Q is a multiple of 16 just before AAA, Q != tbl1[j1] and Q != tbl2[j2]. So we can omit the ehckd of x == y in add(x, y). +Note that the value of tbl2 is added first. + +### Theorem +We can use Jacobi additive formula add(P, Q) assuming P != Q and P+Q != 0. + +Proof. + +During the calculation, Q is monotonic increase and always in [0, P, ..., (r-1)P]. + +- ADD1 : tbl2[] is in [0, L P, ..., 15 L P] and L is odd. +After computing AAA, Q is a multiple of 16 P, so Q != tbl2[j2]. +- ADD2 : tbl1[] is in [0, P, ..., 15 P]. +After computing ADD1, if the immediately preceding tbl2[j2] is 0, then then Q is a multiple of 16 P, so Q != tbl1[j1]. +Otherwise, Q is bigger than L P, so Q != tbl1[j1]. ## Jacobi and Proj `sqr` is equal to `mul` on AVX-512. diff --git a/readme.md b/readme.md index 13eb7c5b..0a8d8d72 100644 --- a/readme.md +++ b/readme.md @@ -10,6 +10,7 @@ mcl is a library for pairing-based cryptography, which supports the optimal Ate pairing over BN curves and BLS12-381 curves. # News +- mulEach with AVX-512 IFMA is 2.5 times faster than G1::mul on BLS12-381 - mulVec (multi scalar multiplication) with AVX-512 IFMA is 1.4 times faster on Xeon w9-3495X - a little performance improvement of G1::mulVec of BLS12-381 - improve performance of Fr::inv on M1 mac diff --git a/sample/mt_test.cpp b/sample/mt_test.cpp index 9072f31a..47af39fe 100644 --- a/sample/mt_test.cpp +++ b/sample/mt_test.cpp @@ -65,6 +65,7 @@ int main(int argc, char *argv[]) P2.clear(); CYBOZU_BENCH_C("G1 multi ", C, G1::mulVecMT, P2, Pvec.data(), xVec.data(), n, cpuN); if (P1 != P2) puts("G1::mulVecMT err"); + CYBOZU_BENCH_C("G1 mulEach", C, G1::mulEach, Pvec.data(), xVec.data(), n); G2 Q1, Q2; CYBOZU_BENCH_C("G2 single", C, G2::mulVec, Q1, Qvec.data(), xVec.data(), n); CYBOZU_BENCH_C("G2 multi ", C, G2::mulVecMT, Q2, Qvec.data(), xVec.data(), n, cpuN); diff --git a/src/msm_avx.cpp b/src/msm_avx.cpp index 997660c2..386caf47 100644 --- a/src/msm_avx.cpp +++ b/src/msm_avx.cpp @@ -6,19 +6,26 @@ http://opensource.org/licenses/BSD-3-Clause */ #include -#include #ifdef _WIN32 #include #else #include #endif + +#include #define XBYAK_NO_EXCEPTION #include "xbyak/xbyak_util.h" +#if defined(__GNUC__) && !defined(__EMSCRIPTEN__) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + typedef mcl::Unit Unit; typedef __m512i Vec; typedef __mmask8 Vmask; +namespace { + static mcl::msm::Param g_param; const size_t S = sizeof(Unit)*8-1; // 63 @@ -27,12 +34,10 @@ const size_t N = 8; // = ceil(384/52) const size_t M = sizeof(Vec) / sizeof(Unit); const uint64_t g_mask = (Unit(1)< inline void toArray(Unit x[N], mpz_class mx) { @@ -250,7 +267,7 @@ inline void vrawAdd(Vec *z, const Vec *x, const Vec *y) { Vec t = vadd(x[0], y[0]); Vec c = vpsrlq(t, W); - z[0] = vand(t, vmask); + z[0] = vand(t, g_vmask); for (size_t i = 1; i < n; i++) { t = vadd(x[i], y[i]); @@ -260,7 +277,7 @@ inline void vrawAdd(Vec *z, const Vec *x, const Vec *y) return; } c = vpsrlq(t, W); - z[i] = vand(t, vmask); + z[i] = vand(t, g_vmask); } } @@ -269,12 +286,12 @@ inline Vmask vrawSub(Vec *z, const Vec *x, const Vec *y) { Vec t = vsub(x[0], y[0]); Vec c = vpsrlq(t, S); - z[0] = vand(t, vmask); + z[0] = vand(t, g_vmask); for (size_t i = 1; i < n; i++) { t = vsub(x[i], y[i]); t = vsub(t, c); c = vpsrlq(t, S); - z[i] = vand(t, vmask); + z[i] = vand(t, g_vmask); } return vcmpneq(c, vzero()); } @@ -290,7 +307,7 @@ inline void uvadd(Vec *z, const Vec *x, const Vec *y) { Vec sN[N], tN[N]; vrawAdd(sN, x, y); - Vmask c = vrawSub(tN, sN, vpN); + Vmask c = vrawSub(tN, sN, g_vpN); uvselect(z, c, sN, tN); } @@ -298,8 +315,8 @@ inline void uvsub(Vec *z, const Vec *x, const Vec *y) { Vec sN[N], tN[N]; Vmask c = vrawSub(sN, x, y); - vrawAdd(tN, sN, vpN); - tN[N-1] = vand(tN[N-1], vmask); + vrawAdd(tN, sN, g_vpN); + tN[N-1] = vand(tN[N-1], g_vmask); uvselect(z, c, tN, sN); } @@ -406,15 +423,15 @@ inline void vset(Vec *t, const Vmask& c, const Vec a[n]) inline void uvmont(Vec z[N], Vec xy[N*2]) { for (size_t i = 0; i < N; i++) { - Vec q = vmulL(xy[i], vrp); - xy[N+i] = vadd(xy[N+i], vrawMulUnitAdd(xy+i, vpN, q)); + Vec q = vmulL(xy[i], g_vrp); + xy[N+i] = vadd(xy[N+i], vrawMulUnitAdd(xy+i, g_vpN, q)); xy[i+1] = vadd(xy[i+1], vpsrlq(xy[i], W)); } for (size_t i = N; i < N*2-1; i++) { xy[i+1] = vadd(xy[i+1], vpsrlq(xy[i], W)); - xy[i] = vand(xy[i], vmask); + xy[i] = vand(xy[i], g_vmask); } - Vmask c = vrawSub(z, xy+N, vpN); + Vmask c = vrawSub(z, xy+N, g_vpN); uvselect(z, c, xy+N, z); } @@ -427,19 +444,19 @@ inline void uvmul(Vec *z, const Vec *x, const Vec *y) #else Vec t[N*2], q; vrawMulUnit(t, x, y[0]); - q = vmulL(t[0], vrp); - t[N] = vadd(t[N], vrawMulUnitAdd(t, vpN, q)); + q = vmulL(t[0], g_vrp); + t[N] = vadd(t[N], vrawMulUnitAdd(t, g_vpN, q)); for (size_t i = 1; i < N; i++) { t[N+i] = vrawMulUnitAdd(t+i, x, y[i]); t[i] = vadd(t[i], vpsrlq(t[i-1], W)); - q = vmulL(t[i], vrp); - t[N+i] = vadd(t[N+i], vrawMulUnitAdd(t+i, vpN, q)); + q = vmulL(t[i], g_vrp); + t[N+i] = vadd(t[N+i], vrawMulUnitAdd(t+i, g_vpN, q)); } for (size_t i = N; i < N*2; i++) { t[i] = vadd(t[i], vpsrlq(t[i-1], W)); - t[i-1] = vand(t[i-1], vmask); + t[i-1] = vand(t[i-1], g_vmask); } - Vmask c = vrawSub(z, t+N, vpN); + Vmask c = vrawSub(z, t+N, g_vpN); uvselect(z, c, t+N, z); #endif } @@ -471,27 +488,6 @@ inline Vec getUnitAt(const Vec *x, size_t xN, size_t bitPos) return vor(vpsrlq(x[q], r), vpsllq(x[q+1], bitSize - r)); } -inline void split(Unit a[2], Unit b[2], const Unit x[4]) -{ - /* - z = -0xd201000000010000 - L = z^2-1 = 0xac45a4010001a40200000000ffffffff - r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001 - s=255 - v = 0xbe35f678f00fd56eb1fb72917b67f718 - */ - static const uint64_t Lv[] = { 0x00000000ffffffff, 0xac45a4010001a402 }; - static const uint64_t vv[] = { 0xb1fb72917b67f718, 0xbe35f678f00fd56e }; - static const size_t n = 128 / mcl::UnitBitSize; - Unit t[n*3]; - mcl::bint::mulNM(t, x, n*2, vv, n); - mcl::bint::shrT(t, t+n*2-1, mcl::UnitBitSize-1); // >>255 - b[0] = t[0]; - b[1] = t[1]; - mcl::bint::mulT(t, t, Lv); - mcl::bint::subT(a, x, t); -} - class Montgomery { Unit v_[N]; public: @@ -507,7 +503,7 @@ class Montgomery { if (x == 0) return 0; return mcl::gmp::getUnit(x, 0) & g_mask; } - void set(const mpz_class& _p) + void init(const mpz_class& _p) { mp = _p; mR = 1; @@ -551,7 +547,6 @@ class Montgomery { } }; -Montgomery g_mont; /* |64 |64 |64 |64 |64 |64 | @@ -561,13 +556,13 @@ Montgomery g_mont; inline void split52bit(Vec y[8], const Vec x[6]) { assert(&y != &x); - y[0] = vand(x[0], vmask); - y[1] = vand(vor(vpsrlq(x[0], 52), vpsllq(x[1], 12)), vmask); - y[2] = vand(vor(vpsrlq(x[1], 40), vpsllq(x[2], 24)), vmask); - y[3] = vand(vor(vpsrlq(x[2], 28), vpsllq(x[3], 36)), vmask); - y[4] = vand(vor(vpsrlq(x[3], 16), vpsllq(x[4], 48)), vmask); - y[5] = vand(vpsrlq(x[4], 4), vmask); - y[6] = vand(vor(vpsrlq(x[4], 56), vpsllq(x[5], 8)), vmask); + y[0] = vand(x[0], g_vmask); + y[1] = vand(vor(vpsrlq(x[0], 52), vpsllq(x[1], 12)), g_vmask); + y[2] = vand(vor(vpsrlq(x[1], 40), vpsllq(x[2], 24)), g_vmask); + y[3] = vand(vor(vpsrlq(x[2], 28), vpsllq(x[3], 36)), g_vmask); + y[4] = vand(vor(vpsrlq(x[3], 16), vpsllq(x[4], 48)), g_vmask); + y[5] = vand(vpsrlq(x[4], 4), g_vmask); + y[6] = vand(vor(vpsrlq(x[4], 56), vpsllq(x[5], 8)), g_vmask); y[7] = vpsrlq(x[5], 44); } @@ -667,6 +662,7 @@ struct FpM { static FpM mR2_; static FpM m64to52_; static FpM m52to64_; + static Montgomery g_mont; static void add(FpM& z, const FpM& x, const FpM& y) { uvadd(z.v, x.v, y.v); @@ -723,6 +719,10 @@ struct FpM { mpz_class r = getRaw(i); return g_mont.fromMont(r); } + void clear() + { + memset(this, 0, sizeof(*this)); + } bool operator==(const FpM& rhs) const { for (size_t i = 0; i < N; i++) { @@ -739,6 +739,14 @@ struct FpM { } return vcmpeq(t, vzero()); } + Vmask isZero() const + { + Vec t = v[0]; + for (size_t i = 1; i < M; i++) { + t = vor(t, v[i]); + } + return vcmpeq(t, vzero()); + } static void pow(FpM& z, const FpM& x, const Vec *y, size_t yn) { const int w = 4; @@ -790,14 +798,10 @@ struct FpM { } static void inv(FpM& z, const FpM& x) { -#if 1 mcl::msm::FpA v[M]; x.getFp(v); g_param.invVecFp(v, v, M, M); z.setFp(v); -#else - pow(z, x, g_vmpM2, 6); -#endif } // condition set (set x if c) void cset(const Vmask& c, const FpM& x) @@ -806,6 +810,22 @@ struct FpM { v[i] = vselect(c, x.v[i], v[i]); } } + // return c ? a : b; + static FpM select(const Vmask& c, const FpM& a, const FpM& b) + { + FpM d; + for (size_t i = 0; i < N; i++) { + d.v[i] = vselect(c, a.v[i], b.v[i]); + } + return d; + } + static void init(const mpz_class& mp) + { + g_mont.init(mp); + } +#ifdef MCL_MSM_TEST + void dump(size_t pos, const char *msg = nullptr) const; +#endif }; FpM FpM::one_; @@ -814,16 +834,7 @@ FpM FpM::rw_; FpM FpM::mR2_; FpM FpM::m64to52_; FpM FpM::m52to64_; - -template -inline Vmask isZero(const E& P) -{ - Vec v = P.z.v[0]; - for (size_t i = 1; i < N; i++) { - v = vor(v, P.z.v[i]); - } - return vcmpeq(v, vzero()); -} +Montgomery FpM::g_mont; template inline void normalizeJacobiVec(E P[n]) @@ -831,27 +842,28 @@ inline void normalizeJacobiVec(E P[n]) assert(n >= 2); typedef typename E::Fp F; F tbl[n]; - tbl[0] = P[0].z; + tbl[0] = F::select(P[0].z.isZero(), F::one_, P[0].z); for (size_t i = 1; i < n; i++) { - F::mul(tbl[i], tbl[i-1], P[i].z); + F t = F::select(P[i].z.isZero(), F::one_, P[i].z); + F::mul(tbl[i], tbl[i-1], t); } F r; F::inv(r, tbl[n-1]); for (size_t i = 0; i < n; i++) { size_t pos = n-1-i; - F t = P[pos].z; + F& z = P[pos].z; F rz, rz2; - if (pos > 0) { - F::mul(rz, r, tbl[pos-1]); - F::mul(r, r, t); - } else { + if (pos == 0) { rz = r; + } else { + F::mul(rz, r, tbl[pos-1]); + F::mul(r, r, F::select(z.isZero(), F::one_, z)); } F::sqr(rz2, rz); F::mul(P[pos].x, P[pos].x, rz2); // xz^-2 F::mul(rz2, rz2, rz); F::mul(P[pos].y, P[pos].y, rz2); // yz^-3 - P[pos].z = F::one_; + z = F::select(z.isZero(), z, F::one_); } } @@ -862,8 +874,6 @@ template inline void addJacobiMixedNoCheck(E& R, const E& P, const E& Q) { typedef typename E::Fp F; - Vmask c = isZero(Q); - E saveP = P; F r, U1, S1, H, H3; F::sqr(r, P.z); U1 = P.x; @@ -885,7 +895,6 @@ inline void addJacobiMixedNoCheck(E& R, const E& P, const E& Q) F::mul(U1, U1, r); F::mul(H3, H3, S1); F::sub(R.y, U1, H3); - R.cset(c, saveP); } // 12M+4S+7A @@ -894,8 +903,6 @@ template inline void addJacobiNoCheck(E& R, const E& P, const E& Q) { typedef typename E::Fp F; - Vmask c = isZero(Q); - E saveP = P; F r, U1, S1, H, H3; F::sqr(r, P.z); F::sqr(S1, Q.z); @@ -920,7 +927,6 @@ inline void addJacobiNoCheck(E& R, const E& P, const E& Q) F::mul(U1, U1, r); F::mul(H3, H3, S1); F::sub(R.y, U1, H3); - R.cset(c, saveP); } // assume a = 0 @@ -969,11 +975,14 @@ struct EcM { if (isProj) { mcl::ec::addCTProj(z, x, y); } else { + EcM t; if (mixed) { - addJacobiMixedNoCheck(z, x, y); + addJacobiMixedNoCheck(t, x, y); } else { - addJacobiNoCheck(z, x, y); + addJacobiNoCheck(t, x, y); } + t = select(x.isZero(), y, t); + z = select(y.isZero(), x, t); } } template @@ -985,18 +994,26 @@ struct EcM { dblJacobiNoCheck(z, x); } } - static void init(Montgomery& mont) + static void init(const Montgomery& mont) { const int b = 4; mpz_class b3 = mont.toMont(b * 3); expandN(b3_.v, b3); - zeroJacobi_.x.set(1); - zeroJacobi_.y.set(1); + zeroJacobi_.x.set(0); + zeroJacobi_.y.set(0); zeroJacobi_.z.set(0); zeroProj_.x.set(0); zeroProj_.y.set(1); zeroProj_.z.set(0); } + static EcM select(const Vmask& c, const EcM& a, const EcM& b) + { + EcM d; + d.x = FpM::select(c, a.x, b.x); + d.y = FpM::select(c, a.y, b.y); + d.z = FpM::select(c, a.z, b.z); + return d; + } template static const EcM& zero() { @@ -1017,44 +1034,23 @@ struct EcM { } void setG1(const mcl::msm::G1A v[M], bool JacobiToProj = true) { -#if 1 setArray(v[0].v); FpM::mul(x, x, FpM::m64to52_); FpM::mul(y, y, FpM::m64to52_); FpM::mul(z, z, FpM::m64to52_); -#else - Unit a[6*3*M]; - const Unit *src = (const Unit *)v; - for (size_t i = 0; i < M*3; i++) { - mcl::bn::Fp::getOp().fromMont(a+i*6, src+i*6); + if (JacobiToProj) { + mcl::ec::JacobiToProj(*this, *this); + y = FpM::select(z.isZero(), FpM::one_, y); } - setArray(a); - x.toMont(x); - y.toMont(y); - z.toMont(z); -#endif - if (JacobiToProj) mcl::ec::JacobiToProj(*this, *this); } void getG1(mcl::msm::G1A v[M], bool ProjToJacobi = true) const { EcM T = *this; if (ProjToJacobi) mcl::ec::ProjToJacobi(T, T); -#if 1 FpM::mul(T.x, T.x, FpM::m52to64_); FpM::mul(T.y, T.y, FpM::m52to64_); FpM::mul(T.z, T.z, FpM::m52to64_); T.getArray(v[0].v); -#else - T.x.fromMont(T.x); - T.y.fromMont(T.y); - T.z.fromMont(T.z); - Unit a[6*3*M]; - T.getArray(a); - Unit *dst = (Unit *)v; - for (size_t i = 0; i < M*3; i++) { - mcl::bn::Fp::getOp().toMont(dst+i*6, a+i*6); - } -#endif } void normalize() { @@ -1067,7 +1063,7 @@ struct EcM { template static void makeTable(EcM *tbl, const EcM& P) { - tbl[0].clear(); + tbl[0].clear(); tbl[1] = P; dbl(tbl[2], P); for (size_t i = 3; i < tblN; i++) { @@ -1117,14 +1113,13 @@ struct EcM { Q.z = P.z; } template - static void mulGLV(EcM& Q, const EcM& _P, const Vec y[4]) + static void mulGLV(EcM& Q, const EcM& P, const Vec y[4]) { - EcM P = _P; - if (!isProj) mcl::ec::ProjToJacobi(P, _P); + // QQQ (n=1024) isProj=T : 36.8, isProj=F&&mixed=F : 36.0, isProj=F&&mixed=T : 34.6 Vec a[2], b[2]; EcM tbl1[tblN], tbl2[tblN]; makeTable(tbl1, P); - if (!isProj) normalizeJacobiVec(tbl1+1); + if (!isProj && mixed) normalizeJacobiVec(tbl1+1); for (size_t i = 0; i < tblN; i++) { mulLambda(tbl2[i], tbl1[i]); } @@ -1134,7 +1129,7 @@ struct EcM { for (size_t i = 0; i < M; i++) { Unit buf[4] = { src[i+M*0], src[i+M*1], src[i+M*2], src[i+M*3] }; Unit aa[2], bb[2]; - split(aa, bb, buf); + mcl::ec::local::optimizedSplitRawForBLS12_381(aa, bb, buf); pa[i+M*0] = aa[0]; pa[i+M*1] = aa[1]; pb[i+M*0] = bb[0]; pb[i+M*1] = bb[1]; } @@ -1149,16 +1144,17 @@ struct EcM { if (!first) for (int k = 0; k < w; k++) EcM::dbl(Q, Q); EcM T; Vec idx; - idx = vand(vpsrlq(v1, bitLen-w-j*w), g_vmask4); + // compute v2 first before v1. see misc/internal.md + idx = vand(vpsrlq(v2, bitLen-w-j*w), g_vmask4); if (first) { - Q.gather(tbl1, idx); + Q.gather(tbl2, idx); first = false; } else { - T.gather(tbl1, idx); + T.gather(tbl2, idx); add(Q, Q, T); } - idx = vand(vpsrlq(v2, bitLen-w-j*w), g_vmask4); - T.gather(tbl2, idx); + idx = vand(vpsrlq(v1, bitLen-w-j*w), g_vmask4); + T.gather(tbl1, idx); add(Q, Q, T); } } @@ -1167,18 +1163,6 @@ struct EcM { mul(T, T, b, 2); add(Q, Q, T); #endif - if (!isProj) mcl::ec::JacobiToProj(Q, Q); - } - static void mulGLVbn(mcl::msm::G1A _Q[8], mcl::msm::G1A _P[8], const Vec y[4]) - { - const bool isProj = false; - const bool mixed = true; -// mcl::ec::normalizeVec(_P, _P, 8); - g_param.normalizeVecG1(_P, _P, 8); - EcM P, Q; - P.setG1(_P, isProj); - mulGLV(Q, P, y); - Q.getG1(_Q); } void cset(const Vmask& c, const EcM& v) { @@ -1186,7 +1170,11 @@ struct EcM { y.cset(c, v.y); z.cset(c, v.z); } - Vmask isEqualAll(const EcM& rhs) const + Vmask isZero() const + { + return z.isZero(); + } + Vmask isEqualJacobiAll(const EcM& rhs) const { FpM s1, s2, t1, t2; Vmask v1, v2; @@ -1194,7 +1182,7 @@ struct EcM { FpM::sqr(s2, rhs.z); FpM::mul(t1, x, s2); FpM::mul(t2, rhs.x, s1); - v1 = t2.isEqualAll(s1); + v1 = t1.isEqualAll(t2); FpM::mul(t1, y, s2); FpM::mul(t2, rhs.y, s1); FpM::mul(t1, t1, rhs.z); @@ -1202,6 +1190,9 @@ struct EcM { v2 = t1.isEqualAll(t2); return mand(v1, v2); } +#ifdef MCL_MSM_TEST + void dump(bool isProj, size_t pos, const char *msg = nullptr) const; +#endif }; FpM EcM::b3_; @@ -1228,27 +1219,6 @@ inline void cvtFr8toVec4(Vec yv[4], const mcl::msm::FrA y[8]) cvt4Ux8to8Ux4(yv, ya); } -template -inline void mulVecAVX512_naive(mcl::msm::G1A& P, const mcl::msm::G1A *x, const mcl::msm::FrA *y, size_t n) -{ - assert(n % 8 == 0); - EcM R; - for (size_t i = 0; i < n; i += 8) { - Vec yv[4]; - cvtFr8toVec4(yv, y+i); - EcM T, X; - X.setG1(x+i, isProj); - if (i == 0) { - EcM::mulGLV(R, X, yv); - } else { - EcM::mulGLV(T, X, yv); - EcM::add(R, R, T); - } - } - if (!isProj) mcl::ec::JacobiToProj(R, R); - reduceSum(P, R); -} - // xVec[n], yVec[n * maxBitSize/64] // assume xVec[] is normalized inline void mulVecAVX512_inner(mcl::msm::G1A& P, const EcM *xVec, const Vec *yVec, size_t n, size_t maxBitSize) @@ -1292,47 +1262,7 @@ inline void mulVecAVX512_inner(mcl::msm::G1A& P, const EcM *xVec, const Vec *yVe Xbyak::AlignedFree(tbl); } -#if 0 -void mulVec_naive(mcl::msm::G1A& P, const mcl::msm::G1A *x, const mcl::msm::FrA *y, size_t n) -{ - size_t c = mcl::ec::argminForMulVec(n); - size_t tblN = (1 << c) - 0; - mcl::msm::G1A *tbl = (mcl::msm::G1A*)CYBOZU_ALLOCA(sizeof(mcl::msm::G1A) * tblN); - const size_t maxBitSize = 256; - const size_t winN = (maxBitSize + c-1) / c; - mcl::msm::G1A *win = (mcl::msm::G1A*)CYBOZU_ALLOCA(sizeof(mcl::msm::G1A) * winN); - - Unit *yVec = (Unit*)CYBOZU_ALLOCA(sizeof(mcl::msm::FrA) * n); - const mcl::msm::addG1Func addG1 = g_param.addG1; - const mcl::msm::dblG1Func dblG1 = g_param.dblG1; - const mcl::msm::clearG1Func clearG1 = g_param.clearG1; - for (size_t i = 0; i < n; i++) { - g_param.fr->fromMont(yVec+i*4, y[i].v); - } - for (size_t w = 0; w < winN; w++) { - for (size_t i = 0; i < tblN; i++) { - clearG1(tbl[i]); - } - for (size_t i = 0; i < n; i++) { - Unit v = mcl::fp::getUnitAt(yVec+i*4, 4, c * w) & (tblN-1); - addG1(tbl[v], tbl[v], x[i]); - } - mcl::msm::G1A sum = tbl[tblN-1]; - win[w] = sum; - for (size_t i = 1; i < tblN-1; i++) { - addG1(sum, sum, tbl[tblN - 1 - i]); - addG1(win[w], win[w], sum); - } - } - P = win[winN - 1]; - for (size_t w = 1; w < winN; w++) { - for (size_t i = 0; i < c; i++) { - dblG1(P, P); - } - addG1(P, P, win[winN - 1 - w]); - } -} -#endif +} // namespace namespace mcl { namespace msm { @@ -1357,7 +1287,7 @@ void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n) Unit ya[4]; fr->fromMont(ya, y[i*8+j].v); Unit a[2], b[2]; - split(a, b, ya); + mcl::ec::local::optimizedSplitRawForBLS12_381(a, b, ya); py[j+0] = a[0]; py[j+8] = a[1]; py[j+16] = b[0]; @@ -1387,37 +1317,61 @@ void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n) } } +void mulEachAVX512(Unit *_x, const Unit *_y, size_t n) +{ + assert(n % 8 == 0); + const bool isProj = false; + const bool mixed = true; + mcl::msm::G1A *x = (mcl::msm::G1A*)_x; + const mcl::msm::FrA *y = (const mcl::msm::FrA*)_y; + if (!isProj && mixed) g_param.normalizeVecG1(x, x, n); + for (size_t i = 0; i < n; i += 8) { + EcM P; + Vec yv[4]; + cvtFr8toVec4(yv, y+i); + P.setG1(x+i, isProj); + EcM::mulGLV(P, P, yv); + P.getG1(x+i, isProj); + } +} + bool initMsm(const mcl::CurveParam& cp, const mcl::msm::Param *param) { + assert(EcM::a_ == 0); + assert(EcM::b_ == 4); + (void)EcM::a_; // disable unused warning + (void)EcM::b_; + if (cp != mcl::BLS12_381) return false; Xbyak::util::Cpu cpu; if (!cpu.has(Xbyak::util::Cpu::tAVX512_IFMA)) return false; g_param = *param; - Montgomery& mont = g_mont; const mpz_class& mp = g_param.fp->mp; - - mont.set(mp); - toArray<6, 64>(g_mpM2, mp-2); - expand(vmask, g_mask); - expandN(vpN, mp); - expand(vrp, mont.rp); + FpM::init(mp); + Montgomery& mont = FpM::g_mont; + Unit pM2[6]; // x^(-1) = x^(p-2) mod p + toArray<6, 64>(pM2, mp-2); + expand(g_vmask, g_mask); + expandN(g_vpN, mp); + expand(g_vrp, mont.rp); + Vec vpM2[6]; // NOT 52-bit but 64-bit for (int i = 0; i < 6; i++) { - expand(g_vmpM2[i], g_mpM2[i]); + expand(vpM2[i], pM2[i]); } expand(g_vmask4, getMask(4)); for (int i = 0; i < 8; i++) { ((Unit*)&g_offset)[i] = i; } expand(g_vi192, 192); - expandN(FpM::one_.v, g_mont.toMont(1)); + expandN(FpM::one_.v, mont.toMont(1)); expandN(FpM::rawOne_.v, mpz_class(1)); - expandN(FpM::mR2_.v, g_mont.mR2); + expandN(FpM::mR2_.v, mont.mR2); { mpz_class t(1); t <<= 32; - FpM::m64to52_.set(t); - FpM::pow(FpM::m52to64_, FpM::m64to52_, g_vmpM2, 6); + FpM::m64to52_.set(t); // 2^32 + FpM::pow(FpM::m52to64_, FpM::m64to52_, vpM2, 6); } FpM::rw_.setFp(g_param.rw); EcM::init(mont); @@ -1426,3 +1380,259 @@ bool initMsm(const mcl::CurveParam& cp, const mcl::msm::Param *param) } } // mcl::msm +#ifdef MCL_MSM_TEST +#include +#include +#include +#include + +using namespace mcl::bn; + +void FpM::dump(size_t pos, const char *msg) const +{ + Fp T[8]; + getFp((mcl::msm::FpA*)T); + if (msg) printf("%s\n", msg); + printf(" [%zd]=%s\n", pos, T[pos].getStr(16).c_str()); +} + +void EcM::dump(bool isProj, size_t pos, const char *msg) const +{ + G1 T[8]; + getG1((mcl::msm::G1A*)T, isProj); + if (msg) printf("%s\n", msg); + printf(" [%zd]=%s\n", pos, T[pos].getStr(16|mcl::IoEcProj).c_str()); +// printf(" [%zd]=%s\n", pos, T[pos].getStr(16|mcl::IoEcAffine).c_str()); +} + +CYBOZU_TEST_AUTO(init) +{ + initPairing(mcl::BLS12_381); +} + +void setParam(G1 *P, Fr *x, size_t n, cybozu::XorShift& rg) +{ + for (size_t i = 0; i < n; i++) { + uint32_t v = rg.get32(); + hashAndMapToG1(P[i], &v, sizeof(v)); + if (x) x[i].setByCSPRNG(rg); + } +} + +CYBOZU_TEST_AUTO(cmp) +{ + const size_t n = 8; + Vmask v; + FpM x, y; + x.clear(); + v = x.isEqualAll(x); + CYBOZU_TEST_EQUAL(cvtToInt(v), 0xff); + for (size_t i = 0; i < n; i++) { + y.clear(); + y.set(1, i); + v = x.isEqualAll(y); + CYBOZU_TEST_EQUAL(cvtToInt(v), 0xff ^ (1<(TM, PM); + TM.getG1(TA); + for (size_t i = 0; i < n; i++) { + CYBOZU_TEST_EQUAL(R[i], T[i]); + } + + // as Jacobi + PM.setG1(PA, false); + EcM::dbl(TM, PM); + TM.getG1(TA, false); + for (size_t i = 0; i < n; i++) { + CYBOZU_TEST_EQUAL(R[i], T[i]); + } + + // test add + // R = P + Q + for (size_t i = 0; i < n; i++) { + G1::add(R[i], P[i], Q[i]); + } + + // as Proj + PM.setG1(PA); + QM.setG1(QA); + EcM::add(TM, PM, QM); + TM.getG1(TA); + for (size_t i = 0; i < n; i++) { + CYBOZU_TEST_EQUAL(R[i], T[i]); + } + + // as Jacobi + PM.setG1(PA, false); + QM.setG1(QA, false); + EcM::add(TM, PM, QM); + TM.getG1(TA, false); + for (size_t i = 0; i < n; i++) { + CYBOZU_TEST_EQUAL(R[i], T[i]); + } + + // as Jacobi (mixed) + for (size_t i = 0; i < n; i++) { + Q[i].normalize(); + } + QM.setG1(QA, false); + EcM::add(TM, PM, QM); + TM.getG1(TA, false); + for (size_t i = 0; i < n; i++) { + CYBOZU_TEST_EQUAL(R[i], T[i]); + } +#if 1 + // mulEachAVX512 + for (size_t i = 0; i < n; i++) { + Q[i] = P[i]; + G1::mul(R[i], P[i], x[i]); + } + mcl::msm::mulEachAVX512((Unit*)Q, (const Unit*)x, n); + for (size_t i = 0; i < n; i++) { + CYBOZU_TEST_EQUAL(R[i], Q[i]); + } +#endif +} + +CYBOZU_TEST_AUTO(normalizeJacobiVec) +{ + const bool isProj = false; + const size_t n = 64; + G1 P[n], Q[n], R[n]; + EcM PP[n/8]; + cybozu::XorShift rg; + setParam(P, 0, n, rg); + P[n/2].clear(); + P[n/3].clear(); + mcl::ec::normalizeVec(Q, P, n); + for (size_t i = 0; i < n/8; i++) { + PP[i].setG1((mcl::msm::G1A*)&P[i*8], isProj); + } + normalizeJacobiVec(PP); + for (size_t i = 0; i < n/8; i++) { + PP[i].getG1((mcl::msm::G1A*)&R[i*8], isProj); + } + CYBOZU_TEST_EQUAL_ARRAY(P, R, n); +} + +CYBOZU_TEST_AUTO(mulEach_special) +{ + const size_t n = 8; + G1 P[n], Q[n], R[n]; + Fr x[n]; + for (size_t i = 0; i < n; i++) P[i].clear(); + P[0].setStr("1 13de196893df2bb5b57882ff1eec37d98966aa71b828fd25125d04ed2c75ddc55d5bc68bd797bd555f9a827387ee6b28 5d59257a0fccd5215cdeb0928296a7a4d684823db76aef279120d2d71c4b54604ec885eb554f99780231ade171979a3", 16); + x[0].setStr("5b4b92c347ffcd8543904dd1b22a60d94b4a9c243046456b8befd41507bec5d", 16); +// x[0].setStr("457977620305299156129707153920788267006"); // L+L + for (size_t i = 0; i < n; i++) Q[i] = P[i]; + G1::mul(R[0], P[0], x[0]); + G1::mulEach(Q, x, 8); + CYBOZU_TEST_EQUAL(R[0], Q[0]); + mpz_class L; + L.setStr("0xac45a4010001a40200000000ffffffff"); + mpz_class tbl[] = { + 0, + 1, + L, + }; + cybozu::XorShift rg; + for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) { + const mpz_class& a = tbl[i]; + for (size_t j = 0; j < CYBOZU_NUM_OF_ARRAY(tbl); j++) { + const mpz_class& b = tbl[j]; + setParam(P, x, n, rg); + x[0].setMpz(a * L + b); + for (size_t k = 0; k < 8; k++) { + Q[k] = P[k]; + G1::mul(R[k], P[k], x[k]); + } + G1::mulEach(Q, x, n); + CYBOZU_TEST_EQUAL_ARRAY(R, Q, n); + } + } +} + +CYBOZU_TEST_AUTO(mulEach) +{ + const size_t n = 1024; + G1 P[n], Q[n], R[n]; + Fr x[n]; + cybozu::XorShift rg; + setParam(P, x, n, rg); + if (n > 32) P[32].clear(); + P[n/2].clear(); + for (size_t i = 0; i < n; i++) { + Q[i] = P[i]; + G1::mul(R[i], P[i], x[i]); + } + G1::mulEach(Q, x, n); + for (size_t i = 0; i < n; i++) { + CYBOZU_TEST_EQUAL(R[i], Q[i]); + if (R[i] != Q[i]) { + printf("P[%zd]=%s\n", i, P[i].getStr(16).c_str()); + printf("x[%zd]=%s\n", i, x[i].getStr(16).c_str()); + printf("R[%zd]=%s\n", i, R[i].getStr(16|mcl::IoEcProj).c_str()); + printf("Q[%zd]=%s\n", i, Q[i].getStr(16|mcl::IoEcProj).c_str()); + } + } +#ifdef NDEBUG + CYBOZU_BENCH_C("mulEach", 100, G1::mulEach, Q, x, n); +#endif +} +#endif diff --git a/test/bls12_test.cpp b/test/bls12_test.cpp index ee707a82..ded566c4 100644 --- a/test/bls12_test.cpp +++ b/test/bls12_test.cpp @@ -359,6 +359,32 @@ void testSerialize(const G1& P, const G2& Q) #include "bench.hpp" +void testMulVec() +{ + puts("testMulVec"); + const size_t n = 8192; + cybozu::XorShift rg; + std::vector Pvec(n); + std::vector xVec(n); + hashAndMapToG1(Pvec[0], "abc", 3); + for (size_t i = 1; i < n; i++) { + G1::add(Pvec[i], Pvec[i-1], Pvec[0]); + } + for (size_t i = 0; i < n; i++) { + xVec[i].setByCSPRNG(rg); + } + G1 P; + G1 P8191; + P8191.setStr("1 c252fef934098904eca8e3fbd9cc8c78877e434d9ce01e424ef07302cec5652dc17d341b8abd4278255a75718cebd67 17455f24f76e7e7d1dd3231d8f144a40decc40d5b129734879b8aad4a209a2e6d83d8256221e46aaf8205e254355d9ad", 16); + G1 P8192; + P8192.setStr("1 f0d44ba84af56d1db97f46660bfd12401aae239a6650cdfc168158d1076d68c5149ac3a311b9c058ad4e61ad1b8063 b2240da1e42c5f469ccf818e58901aca2283d1bd29565f5efbfa14e48cdae199c7a7981b958bfec332f6e613cf36990", 16); + G1::mulVec(P, Pvec.data(), xVec.data(), n-1); + CYBOZU_TEST_EQUAL(P, P8191); + G1::mulVec(P, Pvec.data(), xVec.data(), n); + CYBOZU_TEST_EQUAL(P, P8192); +} + + CYBOZU_TEST_AUTO(naive) { for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(g_testSetTbl); i++) { @@ -375,6 +401,7 @@ CYBOZU_TEST_AUTO(naive) clk.put(); return; #endif + testMulVec(); testSerialize(P, Q); testParam(ts); testIo(P, Q); @@ -857,7 +884,6 @@ CYBOZU_TEST_AUTO(verifyG2) CYBOZU_TEST_ASSERT(n == 0); } - typedef std::vector FpVec; void f(FpVec& zv, const FpVec& xv, const FpVec& yv) diff --git a/test/bn_c_test.hpp b/test/bn_c_test.hpp index febe27fc..40bacb61 100644 --- a/test/bn_c_test.hpp +++ b/test/bn_c_test.hpp @@ -1078,13 +1078,21 @@ CYBOZU_TEST_AUTO(mulVec) for (size_t i = 0; i < N; i++) { char c = char('a' + i); mclBnG1_hashAndMapTo(&x1Vec[i], &c, 1); + if (i == 10) { + mclBnG1_clear(&x1Vec[i]); // x1Vec[i] contains zero + } mclBnG2_hashAndMapTo(&x2Vec[i], &c, 1); mclBn_pairing(&xtVec[i], &x1Vec[i], &x2Vec[i]); - mclBnFr_setByCSPRNG(&yVec[i]); +// mclBnFr_setByCSPRNG(&yVec[i]); + mclBnFr_setHashOf(&yVec[i], &c, 1); } + mclBnG1 x1Vec2[N]; + memcpy(x1Vec2, x1Vec, sizeof(x1Vec)); + mclBnG1_mulVec(&z1, x1Vec, yVec, N); mclBnG2_mulVec(&z2, x2Vec, yVec, N); mclBnGT_powVec(&zt, xtVec, yVec, N); + mclBnG1_mulEach(x1Vec2, yVec, N); mclBnG1_clear(&w1); mclBnG2_clear(&w2); @@ -1094,6 +1102,22 @@ CYBOZU_TEST_AUTO(mulVec) mclBnG2 t2; mclBnGT tt; mclBnG1_mul(&t1, &x1Vec[i], &yVec[i]); + CYBOZU_TEST_ASSERT(mclBnG1_isEqual(&t1, &x1Vec2[i])); +#if 0 + if (mclBnG1_isEqual(&t1, &x1Vec2[i]) == 0) { + char buf[1024]; + printf("i=%zd\n", i); + mclBnG1_getStr(buf, sizeof(buf), &x1Vec[i], 10); + printf("x1=%s\n", buf); + mclBnFr_getStr(buf, sizeof(buf), &yVec[i], 10); + printf("y=%s\n", buf); + mclBnG1_getStr(buf, sizeof(buf), &t1, 10); + printf("xy=%s\n", buf); + mclBnG1_getStr(buf, sizeof(buf), &x1Vec2[i], 10); + printf("ng=%s\n", buf); + exit(1); + } +#endif mclBnG2_mul(&t2, &x2Vec[i], &yVec[i]); mclBnGT_pow(&tt, &xtVec[i], &yVec[i]); mclBnG1_add(&w1, &w1, &t1); diff --git a/test/common_test.hpp b/test/common_test.hpp index 99d04fb2..e00cd726 100644 --- a/test/common_test.hpp +++ b/test/common_test.hpp @@ -33,6 +33,9 @@ void testMulVec(const G& P) cybozu::XorShift rg; for (size_t i = 0; i < N; i++) { G::mul(x0Vec[i], P, i + 3); + if (i == 30) { + x0Vec[i].clear(); // x0Vec[i] contains zero value + } xVec[i] = x0Vec[i]; yVec[i].setByCSPRNG(rg); } @@ -57,6 +60,17 @@ void testMulVec(const G& P) CYBOZU_BENCH_C("mulVecCopy", C, mulVecCopy, Q1, xVec.data(), yVec.data(), n, x0Vec.data()); #endif } + puts("mulEach"); + for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(nTbl); i++) { + const size_t n = nTbl[i]; + xVec = x0Vec; + G::mulEach(xVec.data(), yVec.data(), n); + for (size_t j = 0; j < n; j++) { + G T; + G::mul(T, x0Vec[j], yVec[j]); + CYBOZU_TEST_EQUAL(xVec[j], T); + } + } } template