diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 00000000..2f957d54
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,32 @@
+# Contributing to mcl
+
+Thank you for considering contributing to the mcl project. This document provides guidelines on how to contribute.
+
+## Bug Reports and Feedback
+
+If you find a bug, have a feature request, or have questions, please open an issue. Include the following information:
+
+- Detailed description of the problem
+- Steps to reproduce
+- Expected behavior
+- Actual behavior
+- Environment details (OS, compiler version, etc.)
+
+## Creating Pull Requests
+
+If you want to add features or make fixes, follow these steps to create a pull request:
+
+1. Fork the repository
+2. Create a new branch: `git checkout -b my-feature-branch`
+3. Make your changes
+4. Run tests and ensure all tests pass
+5. Commit your changes: `git commit -am 'Add new feature'`
+6. Push the branch: `git push origin my-feature-branch`
+7. Create a pull request
+
+When creating a pull request, clearly describe the changes and include any related issue numbers.
+
+## License
+
+mcl is released under the BSD-3-Clause License. Any code contributions will be licensed under the same license.
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e7664c4f..6cd09109 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -283,8 +283,10 @@ endif()
foreach(bit IN ITEMS 256 384 384_256)
if (MCL_STATIC_LIB)
add_library(mclbn${bit} STATIC src/bn_c${bit}.cpp)
+ target_link_libraries(mclbn${bit} PUBLIC mcl::mcl_st)
else()
add_library(mclbn${bit} SHARED src/bn_c${bit}.cpp)
+ target_link_libraries(mclbn${bit} PUBLIC mcl::mcl)
endif()
add_library(mcl::mclbn${bit} ALIAS mclbn${bit})
set_target_properties(mclbn${bit} PROPERTIES
@@ -294,7 +296,6 @@ foreach(bit IN ITEMS 256 384 384_256)
target_compile_options(mclbn${bit} PRIVATE ${MCL_COMPILE_OPTIONS})
target_compile_definitions(mclbn${bit}
PUBLIC MCL_NO_AUTOLINK MCLBN_NO_AUTOLINK)
- target_link_libraries(mclbn${bit} PUBLIC mcl::mcl)
set_target_properties(mclbn${bit} PROPERTIES
VERSION ${mcl_VERSION}
SOVERSION ${mcl_VERSION_MAJOR})
diff --git a/Makefile b/Makefile
index ad7cf292..e334dca7 100644
--- a/Makefile
+++ b/Makefile
@@ -443,6 +443,13 @@ bin/llvm_test64.exe: test/llvm_test.cpp src/base64.ll
bin/llvm_test32.exe: test/llvm_test.cpp src/base32.ll
$(CLANG) -o $@ -Ofast -DNDEBUG -Wall -Wextra -I ./include test/llvm_test.cpp src/base32.ll -m32
+$(OBJ_DIR)/$(MSM)_test.o: src/$(MSM).cpp
+ $(PRE)$(CXX) -c $< -o $@ $(CFLAGS) -mavx512f -mavx512ifma -std=c++11 $(CFLAGS_USER) -DMCL_MSM_TEST
+MSM_TEST_OBJ=$(OBJ_DIR)/$(MSM)_test.o $(filter-out $(OBJ_DIR)/msm_avx.o,$(LIB_OBJ))
+$(EXE_DIR)/msm_test.exe: $(MSM_TEST_OBJ)
+ $(PRE)$(CXX) -o $@ $(LDFLAGS) $(MSM_TEST_OBJ)
+-include $(OBJ_DIR)/msm_test.d
+
make_tbl:
$(MAKE) ../bls/src/qcoeff-bn254.hpp
diff --git a/api.md b/api.md
index ac6e98cb..06222df0 100644
--- a/api.md
+++ b/api.md
@@ -61,7 +61,7 @@ r = |G1| = |G2| = |GT|
curveType | b| r and p |
------------|--|------------------|
BN254 | 2|r = 0x2523648240000001ba344d8000000007ff9f800000000010a10000000000000d
p = 0x2523648240000001ba344d80000000086121000000000013a700000000000013 |
-BN_SNARK1|3|r = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47
p = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47|
+BN_SNARK1|3|r = 0x30644e72e131a029b85045b68181585d2833e84879b9709143e1f593f0000001
p = 0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47|
BLS12-381 | 4|r = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001
p = 0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab |
BN381 | 2|r = 0x240026400f3d82b2e42de125b00158405b710818ac000007e0042f008e3e00000000001080046200000000000000000d
p = 0x240026400f3d82b2e42de125b00158405b710818ac00000840046200950400000000001380052e000000000000000013 |
@@ -253,7 +253,7 @@ C++
T x = ;
```
-### Set `buf[0..bufSize-1]` to `x` with masking according to the following way.
+### Set `bufSize` bytes `buf` to `x` with masking according to the following way.
```
int mclBnFp_setLittleEndian(mclBnFp *x, const void *buf, mclSize bufSize);
int mclBnFr_setLittleEndian(mclBnFr *x, const void *buf, mclSize bufSize);
@@ -270,7 +270,7 @@ T::setArrayMask(const uint8_t *buf, size_t n);
- always return 0
-### Set (`buf[0..bufSize-1]` mod `p` or `r`) to `x`.
+### Set `bufSize` bytes `buf` of mod `p` or `r` to `x`.
```
int mclBnFp_setLittleEndianMod(mclBnFp *x, const void *buf, mclSize bufSize);
int mclBnFr_setLittleEndianMod(mclBnFr *x, const void *buf, mclSize bufSize);
@@ -281,7 +281,7 @@ C++
T::setLittleEndianMod(const uint8_t *buf, mclSize bufSize);
```
-- return 0 if bufSize <= (sizeof(*x) * 8 * 2) else -1
+- return 0 if bufSize <= (sizeof(T) * 2) else -1
### Get little-endian byte sequence `buf` corresponding to `x`
```
diff --git a/include/mcl/bn.h b/include/mcl/bn.h
index 41ac9b06..8aa4b81d 100644
--- a/include/mcl/bn.h
+++ b/include/mcl/bn.h
@@ -485,6 +485,9 @@ MCLBN_DLL_API void mclBnG1_mulVec(mclBnG1 *z, mclBnG1 *x, const mclBnFr *y, mclS
MCLBN_DLL_API void mclBnG2_mulVec(mclBnG2 *z, mclBnG2 *x, const mclBnFr *y, mclSize n);
MCLBN_DLL_API void mclBnGT_powVec(mclBnGT *z, const mclBnGT *x, const mclBnFr *y, mclSize n);
+// x[i] *= y[i]
+MCLBN_DLL_API void mclBnG1_mulEach(mclBnG1 *x, const mclBnFr *y, mclSize n);
+
MCLBN_DLL_API void mclBn_pairing(mclBnGT *z, const mclBnG1 *x, const mclBnG2 *y);
MCLBN_DLL_API void mclBn_finalExp(mclBnGT *y, const mclBnGT *x);
MCLBN_DLL_API void mclBn_millerLoop(mclBnGT *z, const mclBnG1 *x, const mclBnG2 *y);
diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp
index 8e8fb9ca..480e434f 100644
--- a/include/mcl/bn.hpp
+++ b/include/mcl/bn.hpp
@@ -48,6 +48,7 @@ namespace msm {
bool initMsm(const mcl::CurveParam& cp, const Param *param);
void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n);
+void mulEachAVX512(Unit *_x, const Unit *_y, size_t n);
} // mcl::msm
#endif
@@ -692,9 +693,12 @@ struct GLV1 : mcl::GLV1T {
const mpz_class& r = Fr::getOp().mp;
B[0][0] = z * z - 1; // L
v0 = (B[0][0] << rBitSize) / r;
- if (curveType == BLS12_381.curveType && MCL_SIZEOF_UNIT == 8) {
+#if MCL_SIZEOF_UNIT == 8
+ if (curveType == BLS12_381.curveType) {
optimizedSplit = optimizedSplitForBLS12_381;
- } else {
+ } else
+#endif
+ {
optimizedSplit = splitForBLS12;
}
} else {
@@ -723,38 +727,19 @@ struct GLV1 : mcl::GLV1T {
b = (x * v0) >> rBitSize;
a = x - b * B[0][0];
}
+#if MCL_SIZEOF_UNIT == 8
static inline void optimizedSplitForBLS12_381(mpz_class u[2], const mpz_class& x)
{
- assert(sizeof(Unit) == 8);
- /*
- z = -0xd201000000010000
- L = z^2-1 = 0xac45a4010001a40200000000ffffffff
- r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001
- s=255
- v = 0xbe35f678f00fd56eb1fb72917b67f718
- */
- mpz_class& a = u[0];
- mpz_class& b = u[1];
- static const uint64_t Lv[] = { 0x00000000ffffffff, 0xac45a4010001a402 };
- static const uint64_t vv[] = { 0xb1fb72917b67f718, 0xbe35f678f00fd56e };
static const size_t n = 128 / mcl::UnitBitSize;
- Unit t[n*3];
- // n = 128 bit
- // t[n*3] = x[n*2] * vv[n]
- mcl::bint::mulNM(t, gmp::getUnit(x), n*2, (const Unit*)vv, n);
- // t[n] <- t[n*3]
- mcl::bint::shrT(t, t+n*2-1, mcl::UnitBitSize-1); // >>255
+ Unit xa[n*2], a[2], b[2];
+ mcl::gmp::getArray(xa, n*2, x);
+ ec::local::optimizedSplitRawForBLS12_381(a, b, xa);
bool dummy;
- gmp::setArray(&dummy, b, t, n);
- Unit t2[n*2];
- // t2[n*2] = t[n] * Lv[n]
- // Do not overlap I/O buffers on pre-Broadwell CPUs.
- mcl::bint::mulT(t2, t, (const Unit*)Lv);
- // t[n] = x[n*2] - t2[n*2]
- mcl::bint::subT(t, gmp::getUnit(x), t2);
- gmp::setArray(&dummy, a, t, n);
+ gmp::setArray(&dummy, u[0], a, n);
+ gmp::setArray(&dummy, u[1], b, n);
(void)dummy;
}
+#endif
};
/*
@@ -2314,6 +2299,7 @@ inline void init(bool *pb, const mcl::CurveParam& cp = mcl::BN254, fp::Mode mode
if (sizeof(Unit) == 8 && sizeof(Fp) == sizeof(mcl::msm::FpA) && sizeof(Fr) == sizeof(mcl::msm::FrA)) {
if (mcl::msm::initMsm(cp, ¶)) {
G1::setMulVecOpti(mcl::msm::mulVecAVX512);
+ G1::setMulEachOpti(mcl::msm::mulEachAVX512);
}
}
#endif
diff --git a/include/mcl/ec.hpp b/include/mcl/ec.hpp
index 88e9092b..028153ff 100644
--- a/include/mcl/ec.hpp
+++ b/include/mcl/ec.hpp
@@ -242,10 +242,54 @@ void normalizeVecT(Eout& Q, Ein& P, size_t n, size_t N = 256)
}
}
+#if MCL_SIZEOF_UNIT == 8
+/*
+ split x to (a, b) such that x = a + b L where 0 <= a, b <= L, 0 <= x <= r-1 = L^2+L
+ if adj is true, then 0 <= a < L, 0 <= b <= L+1
+*/
+inline void optimizedSplitRawForBLS12_381(Unit a[2], Unit b[2], const Unit x[4])
+{
+ const bool adj = false;
+ assert(sizeof(Unit) == 8);
+ /*
+ z = -0xd201000000010000
+ L = z^2-1 = 0xac45a4010001a40200000000ffffffff
+ r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001
+ s=255
+ v = (1<>255
+ mcl::bint::shrT(t, t+n*2-1, mcl::UnitBitSize-1); // >>255
+ b[0] = t[0];
+ b[1] = t[1];
+ Unit t2[n*2];
+ // t2[n*2] = t[n] * L[n]
+ // Do not overlap I/O buffers on pre-Broadwell CPUs.
+ mcl::bint::mulT(t2, t, L);
+ // a[n] = x[n*2] - t2[n*2]
+ mcl::bint::subT(a, x, t2);
+ if (adj) {
+ if (mcl::bint::cmpEqT(a, L)) {
+ // if a == L then b = b + 1 and a = 0
+ mcl::bint::addT(b, b, one);
+ mcl::bint::clearT(a);
+ }
+ }
+}
+#endif
+
} // mcl::ec::local
// [X:Y:Z] as Proj = (X/Z, Y/Z) as Affine = [XZ:YZ^2:Z] as Jacobi
-// Remark. convert P = [1:0:0] to Q = [0:0:0]
+// Remark. convert P = [*:*:0] to Q = [0:0:0]
template
void ProjToJacobi(E& Q, const E& P)
{
@@ -257,7 +301,7 @@ void ProjToJacobi(E& Q, const E& P)
}
// [X:Y:Z] as Jacobi = (X/Z^2, Y/Z^3) as Affine = [XZ:Y:Z^3] as Proj
-// Remark. convert P = [1:1:0] to Q = [0:1:0]
+// Remark. convert P = [*:1:0] to Q = [0:1:0]
template
void JacobiToProj(E& Q, const E& P)
{
@@ -1316,6 +1360,7 @@ class EcT : public fp::Serializable > {
static mpz_class order_;
static bool (*mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime);
static void (*mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n);
+ static void (*mulEachOpti)(Unit *xVec, const Unit *yVec, size_t n);
static bool (*isValidOrderFast)(const EcT& x);
/* default constructor is undefined value */
EcT() {}
@@ -1382,6 +1427,7 @@ class EcT : public fp::Serializable > {
order_ = 0;
mulVecGLV = 0;
mulVecOpti = 0;
+ mulEachOpti = 0;
isValidOrderFast = 0;
mode_ = mode;
}
@@ -1412,6 +1458,10 @@ class EcT : public fp::Serializable > {
{
mulVecOpti = f;
}
+ static void setMulEachOpti(void f(Unit *_xVec, const Unit *_yVec, size_t yn))
+ {
+ mulEachOpti = f;
+ }
static inline void init(bool *pb, const char *astr, const char *bstr, int mode = ec::Jacobi)
{
Fp a, b;
@@ -1463,12 +1513,14 @@ class EcT : public fp::Serializable > {
void clear()
{
if (mode_ == ec::Jacobi) {
- x = 1;
- } else {
+ x = 0;
+ y = 0;
+ z.clear();
+ } else { // ec::Proj
x.clear();
+ y = 1;
+ z.clear();
}
- y = 1;
- z.clear();
}
static inline void clear(EcT& P)
{
@@ -2080,7 +2132,7 @@ class EcT : public fp::Serializable > {
return;
}
if (mulVecOpti && n >= 128) {
- mulVecOpti((Unit*)&z, (Unit*)xVec, (const Unit*)yVec, n);
+ mulVecOpti((Unit*)&z, (Unit*)xVec, yVec[0].getUnit(), n);
return;
}
if (mulVecGLV && mulVecGLV(z, xVec, yVec, n, false)) {
@@ -2133,6 +2185,20 @@ class EcT : public fp::Serializable > {
mulVec(z, xVec, yVec, n);
#endif
}
+ // xVec[i] *= yVec[i]
+ static void mulEach(EcT *xVec, const EcT::Fr *yVec, size_t n)
+ {
+ if (mulEachOpti && n >= 8) {
+ size_t n8 = n & ~size_t(7);
+ mulEachOpti((Unit*)xVec, yVec[0].getUnit(), n8);
+ xVec += n8;
+ yVec += n8;
+ n -= n8;
+ }
+ for (size_t i = 0; i < n; i++) {
+ xVec[i] *= yVec[i];
+ }
+ }
#ifndef CYBOZU_DONT_USE_EXCEPTION
static inline void init(const std::string& astr, const std::string& bstr, int mode = ec::Jacobi)
{
@@ -2192,6 +2258,7 @@ template bool (*EcT::mulVecGLV)(EcT& z, const EcT *x
template void (*EcT::mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n);
template bool (*EcT::isValidOrderFast)(const EcT& x);
template int EcT::mode_;
+template void (*EcT::mulEachOpti)(Unit *xVec, const Unit *yVec, size_t n);
// r = the order of Ec
template
diff --git a/include/mcl/impl/bn_c_impl.hpp b/include/mcl/impl/bn_c_impl.hpp
index 00f49c5f..77bef385 100644
--- a/include/mcl/impl/bn_c_impl.hpp
+++ b/include/mcl/impl/bn_c_impl.hpp
@@ -639,6 +639,10 @@ void mclBnGT_powVec(mclBnGT *z, const mclBnGT *x, const mclBnFr *y, mclSize n)
{
GT::powVec(*cast(z), cast(x), cast(y), n);
}
+void mclBnG1_mulEach(mclBnG1 *x, const mclBnFr *y, mclSize n)
+{
+ G1::mulEach(cast(x), cast(y), n);
+}
void mclBn_pairing(mclBnGT *z, const mclBnG1 *x, const mclBnG2 *y)
{
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index 9e544144..8e3e68c0 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -29,7 +29,7 @@
namespace mcl {
-static const int version = 0x191; /* 0xABC = A.BC */
+static const int version = 0x192; /* 0xABC = A.BC */
/*
specifies available string format mode for X::setIoMode()
diff --git a/misc/internal.md b/misc/internal.md
index b1b2fdd2..9f5d362d 100644
--- a/misc/internal.md
+++ b/misc/internal.md
@@ -16,6 +16,10 @@ w/o IFMA|66.498|122.666|227.042|426.498
w IFMA|46.411|87.002|153.958|300.331
speed up rate|1.43|1.41|1.47|1.42
+G1 mulEach
+- w/o IFMA : 42.166Mclk
+- w IFMA : 16.643Mclk
+
# GLV method
## Split function for BLS12-381
@@ -41,33 +45,42 @@ bit_length|64|128|255|255|128
### Split function
```python
+adj = False
def split(x):
b = (x * v) >> s
a = x - b * L
+ if adj:
+ if a >= L:
+ a -= L
+ b += 1
return (a, b)
```
+- x in [0, r-1]
- a + b L = x for (a, b) = split(x).
### Theorem
-0 <= a, b < H for all x in [0, M-r].
+0 <= a < 1.11 L < H and 0 <= b < L+1 for x in [0, r-1].
### Proof
```
-Let r0 := L S % r, then S=v L + r0 and r0 in [0, L-1].
+Let r0 := L S % r, then S=v L + r0 and r0 in [0, L-1]. In fact, r0 ~ 0.11 L.
Let r1 := x v % S, then x v = b S + r1 and r1 in [0, S-1].
```
```
-b <= xv / S < (M-r) (S/L)/S = (M-r)/L < H.
+b <= xv / S < (r-1) (S/L)/S = (r-1)/L = L+1.
```
```
aS = (x - bL)S = xS - bSL = xS - (xv - r1)L = x(S - vL) + r1 L = r0 x + r1 L
- <= r0 (M-r) + (S-1)L < S H.
+ <= r0 (r-1) + (S-1)L = S L + (r-1)r0 - L.
+a <= L + ((r-1)r0 - L)/S
+((r-1)r0 - L)/S ~ 0.10016 L < 0.11 L.
```
-Then, a < H.
-So for x in [0, M-1], set x = x - r if x >= H and apply split() to x.
+### Remark
+If adj is true, then a is in [0, L-1].
+
## window size
- 128-bit (Fr is 256 bit and use GLV method)
@@ -84,14 +97,15 @@ f(w)|130|68|51|48|58|86
argmin f(w) = 4
-## Use projective coordinates
+## Selection of coordinates
-- psuedo code of GLV method
+### psuedo code of GLV method
```python
def mul(P, x):
- (a, b) = split(x)
- # a, b < 1<<128
+ assert(0 <= x < r)
+ (a, b) = split(x) # x = a + b L
+ # a, b < H=1<<128
w = 4
for i in range(1<> (w*i)) & mask
- j2 = (b >> (w*i)) & mask ### AAA
- Q = add(Q, tbl1[j1])
- Q = add(Q, tbl2[j2])
+ j2 = (b >> (w*i)) & mask
+ Q = add(Q, tbl2[j2]) # ADD1
+ Q = add(Q, tbl1[j1]) # ADD2
return Q
```
-The values of tbl1[i] are 0, P, ..., 15P, and the values of tbl2[i] are 0, LP, ... , 15LP.
-Since L is odd and Q is a multiple of 16 just before AAA, Q != tbl1[j1] and Q != tbl2[j2]. So we can omit the ehckd of x == y in add(x, y).
+Note that the value of tbl2 is added first.
+
+### Theorem
+We can use Jacobi additive formula add(P, Q) assuming P != Q and P+Q != 0.
+
+Proof.
+
+During the calculation, Q is monotonic increase and always in [0, P, ..., (r-1)P].
+
+- ADD1 : tbl2[] is in [0, L P, ..., 15 L P] and L is odd.
+After computing AAA, Q is a multiple of 16 P, so Q != tbl2[j2].
+- ADD2 : tbl1[] is in [0, P, ..., 15 P].
+After computing ADD1, if the immediately preceding tbl2[j2] is 0, then then Q is a multiple of 16 P, so Q != tbl1[j1].
+Otherwise, Q is bigger than L P, so Q != tbl1[j1].
## Jacobi and Proj
`sqr` is equal to `mul` on AVX-512.
diff --git a/readme.md b/readme.md
index 13eb7c5b..0a8d8d72 100644
--- a/readme.md
+++ b/readme.md
@@ -10,6 +10,7 @@ mcl is a library for pairing-based cryptography,
which supports the optimal Ate pairing over BN curves and BLS12-381 curves.
# News
+- mulEach with AVX-512 IFMA is 2.5 times faster than G1::mul on BLS12-381
- mulVec (multi scalar multiplication) with AVX-512 IFMA is 1.4 times faster on Xeon w9-3495X
- a little performance improvement of G1::mulVec of BLS12-381
- improve performance of Fr::inv on M1 mac
diff --git a/sample/mt_test.cpp b/sample/mt_test.cpp
index 9072f31a..47af39fe 100644
--- a/sample/mt_test.cpp
+++ b/sample/mt_test.cpp
@@ -65,6 +65,7 @@ int main(int argc, char *argv[])
P2.clear();
CYBOZU_BENCH_C("G1 multi ", C, G1::mulVecMT, P2, Pvec.data(), xVec.data(), n, cpuN);
if (P1 != P2) puts("G1::mulVecMT err");
+ CYBOZU_BENCH_C("G1 mulEach", C, G1::mulEach, Pvec.data(), xVec.data(), n);
G2 Q1, Q2;
CYBOZU_BENCH_C("G2 single", C, G2::mulVec, Q1, Qvec.data(), xVec.data(), n);
CYBOZU_BENCH_C("G2 multi ", C, G2::mulVecMT, Q2, Qvec.data(), xVec.data(), n, cpuN);
diff --git a/src/msm_avx.cpp b/src/msm_avx.cpp
index 997660c2..386caf47 100644
--- a/src/msm_avx.cpp
+++ b/src/msm_avx.cpp
@@ -6,19 +6,26 @@
http://opensource.org/licenses/BSD-3-Clause
*/
#include
-#include
#ifdef _WIN32
#include
#else
#include
#endif
+
+#include
#define XBYAK_NO_EXCEPTION
#include "xbyak/xbyak_util.h"
+#if defined(__GNUC__) && !defined(__EMSCRIPTEN__)
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
typedef mcl::Unit Unit;
typedef __m512i Vec;
typedef __mmask8 Vmask;
+namespace {
+
static mcl::msm::Param g_param;
const size_t S = sizeof(Unit)*8-1; // 63
@@ -27,12 +34,10 @@ const size_t N = 8; // = ceil(384/52)
const size_t M = sizeof(Vec) / sizeof(Unit);
const uint64_t g_mask = (Unit(1)<
inline void toArray(Unit x[N], mpz_class mx)
{
@@ -250,7 +267,7 @@ inline void vrawAdd(Vec *z, const Vec *x, const Vec *y)
{
Vec t = vadd(x[0], y[0]);
Vec c = vpsrlq(t, W);
- z[0] = vand(t, vmask);
+ z[0] = vand(t, g_vmask);
for (size_t i = 1; i < n; i++) {
t = vadd(x[i], y[i]);
@@ -260,7 +277,7 @@ inline void vrawAdd(Vec *z, const Vec *x, const Vec *y)
return;
}
c = vpsrlq(t, W);
- z[i] = vand(t, vmask);
+ z[i] = vand(t, g_vmask);
}
}
@@ -269,12 +286,12 @@ inline Vmask vrawSub(Vec *z, const Vec *x, const Vec *y)
{
Vec t = vsub(x[0], y[0]);
Vec c = vpsrlq(t, S);
- z[0] = vand(t, vmask);
+ z[0] = vand(t, g_vmask);
for (size_t i = 1; i < n; i++) {
t = vsub(x[i], y[i]);
t = vsub(t, c);
c = vpsrlq(t, S);
- z[i] = vand(t, vmask);
+ z[i] = vand(t, g_vmask);
}
return vcmpneq(c, vzero());
}
@@ -290,7 +307,7 @@ inline void uvadd(Vec *z, const Vec *x, const Vec *y)
{
Vec sN[N], tN[N];
vrawAdd(sN, x, y);
- Vmask c = vrawSub(tN, sN, vpN);
+ Vmask c = vrawSub(tN, sN, g_vpN);
uvselect(z, c, sN, tN);
}
@@ -298,8 +315,8 @@ inline void uvsub(Vec *z, const Vec *x, const Vec *y)
{
Vec sN[N], tN[N];
Vmask c = vrawSub(sN, x, y);
- vrawAdd(tN, sN, vpN);
- tN[N-1] = vand(tN[N-1], vmask);
+ vrawAdd(tN, sN, g_vpN);
+ tN[N-1] = vand(tN[N-1], g_vmask);
uvselect(z, c, tN, sN);
}
@@ -406,15 +423,15 @@ inline void vset(Vec *t, const Vmask& c, const Vec a[n])
inline void uvmont(Vec z[N], Vec xy[N*2])
{
for (size_t i = 0; i < N; i++) {
- Vec q = vmulL(xy[i], vrp);
- xy[N+i] = vadd(xy[N+i], vrawMulUnitAdd(xy+i, vpN, q));
+ Vec q = vmulL(xy[i], g_vrp);
+ xy[N+i] = vadd(xy[N+i], vrawMulUnitAdd(xy+i, g_vpN, q));
xy[i+1] = vadd(xy[i+1], vpsrlq(xy[i], W));
}
for (size_t i = N; i < N*2-1; i++) {
xy[i+1] = vadd(xy[i+1], vpsrlq(xy[i], W));
- xy[i] = vand(xy[i], vmask);
+ xy[i] = vand(xy[i], g_vmask);
}
- Vmask c = vrawSub(z, xy+N, vpN);
+ Vmask c = vrawSub(z, xy+N, g_vpN);
uvselect(z, c, xy+N, z);
}
@@ -427,19 +444,19 @@ inline void uvmul(Vec *z, const Vec *x, const Vec *y)
#else
Vec t[N*2], q;
vrawMulUnit(t, x, y[0]);
- q = vmulL(t[0], vrp);
- t[N] = vadd(t[N], vrawMulUnitAdd(t, vpN, q));
+ q = vmulL(t[0], g_vrp);
+ t[N] = vadd(t[N], vrawMulUnitAdd(t, g_vpN, q));
for (size_t i = 1; i < N; i++) {
t[N+i] = vrawMulUnitAdd(t+i, x, y[i]);
t[i] = vadd(t[i], vpsrlq(t[i-1], W));
- q = vmulL(t[i], vrp);
- t[N+i] = vadd(t[N+i], vrawMulUnitAdd(t+i, vpN, q));
+ q = vmulL(t[i], g_vrp);
+ t[N+i] = vadd(t[N+i], vrawMulUnitAdd(t+i, g_vpN, q));
}
for (size_t i = N; i < N*2; i++) {
t[i] = vadd(t[i], vpsrlq(t[i-1], W));
- t[i-1] = vand(t[i-1], vmask);
+ t[i-1] = vand(t[i-1], g_vmask);
}
- Vmask c = vrawSub(z, t+N, vpN);
+ Vmask c = vrawSub(z, t+N, g_vpN);
uvselect(z, c, t+N, z);
#endif
}
@@ -471,27 +488,6 @@ inline Vec getUnitAt(const Vec *x, size_t xN, size_t bitPos)
return vor(vpsrlq(x[q], r), vpsllq(x[q+1], bitSize - r));
}
-inline void split(Unit a[2], Unit b[2], const Unit x[4])
-{
- /*
- z = -0xd201000000010000
- L = z^2-1 = 0xac45a4010001a40200000000ffffffff
- r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001
- s=255
- v = 0xbe35f678f00fd56eb1fb72917b67f718
- */
- static const uint64_t Lv[] = { 0x00000000ffffffff, 0xac45a4010001a402 };
- static const uint64_t vv[] = { 0xb1fb72917b67f718, 0xbe35f678f00fd56e };
- static const size_t n = 128 / mcl::UnitBitSize;
- Unit t[n*3];
- mcl::bint::mulNM(t, x, n*2, vv, n);
- mcl::bint::shrT(t, t+n*2-1, mcl::UnitBitSize-1); // >>255
- b[0] = t[0];
- b[1] = t[1];
- mcl::bint::mulT(t, t, Lv);
- mcl::bint::subT(a, x, t);
-}
-
class Montgomery {
Unit v_[N];
public:
@@ -507,7 +503,7 @@ class Montgomery {
if (x == 0) return 0;
return mcl::gmp::getUnit(x, 0) & g_mask;
}
- void set(const mpz_class& _p)
+ void init(const mpz_class& _p)
{
mp = _p;
mR = 1;
@@ -551,7 +547,6 @@ class Montgomery {
}
};
-Montgomery g_mont;
/*
|64 |64 |64 |64 |64 |64 |
@@ -561,13 +556,13 @@ Montgomery g_mont;
inline void split52bit(Vec y[8], const Vec x[6])
{
assert(&y != &x);
- y[0] = vand(x[0], vmask);
- y[1] = vand(vor(vpsrlq(x[0], 52), vpsllq(x[1], 12)), vmask);
- y[2] = vand(vor(vpsrlq(x[1], 40), vpsllq(x[2], 24)), vmask);
- y[3] = vand(vor(vpsrlq(x[2], 28), vpsllq(x[3], 36)), vmask);
- y[4] = vand(vor(vpsrlq(x[3], 16), vpsllq(x[4], 48)), vmask);
- y[5] = vand(vpsrlq(x[4], 4), vmask);
- y[6] = vand(vor(vpsrlq(x[4], 56), vpsllq(x[5], 8)), vmask);
+ y[0] = vand(x[0], g_vmask);
+ y[1] = vand(vor(vpsrlq(x[0], 52), vpsllq(x[1], 12)), g_vmask);
+ y[2] = vand(vor(vpsrlq(x[1], 40), vpsllq(x[2], 24)), g_vmask);
+ y[3] = vand(vor(vpsrlq(x[2], 28), vpsllq(x[3], 36)), g_vmask);
+ y[4] = vand(vor(vpsrlq(x[3], 16), vpsllq(x[4], 48)), g_vmask);
+ y[5] = vand(vpsrlq(x[4], 4), g_vmask);
+ y[6] = vand(vor(vpsrlq(x[4], 56), vpsllq(x[5], 8)), g_vmask);
y[7] = vpsrlq(x[5], 44);
}
@@ -667,6 +662,7 @@ struct FpM {
static FpM mR2_;
static FpM m64to52_;
static FpM m52to64_;
+ static Montgomery g_mont;
static void add(FpM& z, const FpM& x, const FpM& y)
{
uvadd(z.v, x.v, y.v);
@@ -723,6 +719,10 @@ struct FpM {
mpz_class r = getRaw(i);
return g_mont.fromMont(r);
}
+ void clear()
+ {
+ memset(this, 0, sizeof(*this));
+ }
bool operator==(const FpM& rhs) const
{
for (size_t i = 0; i < N; i++) {
@@ -739,6 +739,14 @@ struct FpM {
}
return vcmpeq(t, vzero());
}
+ Vmask isZero() const
+ {
+ Vec t = v[0];
+ for (size_t i = 1; i < M; i++) {
+ t = vor(t, v[i]);
+ }
+ return vcmpeq(t, vzero());
+ }
static void pow(FpM& z, const FpM& x, const Vec *y, size_t yn)
{
const int w = 4;
@@ -790,14 +798,10 @@ struct FpM {
}
static void inv(FpM& z, const FpM& x)
{
-#if 1
mcl::msm::FpA v[M];
x.getFp(v);
g_param.invVecFp(v, v, M, M);
z.setFp(v);
-#else
- pow(z, x, g_vmpM2, 6);
-#endif
}
// condition set (set x if c)
void cset(const Vmask& c, const FpM& x)
@@ -806,6 +810,22 @@ struct FpM {
v[i] = vselect(c, x.v[i], v[i]);
}
}
+ // return c ? a : b;
+ static FpM select(const Vmask& c, const FpM& a, const FpM& b)
+ {
+ FpM d;
+ for (size_t i = 0; i < N; i++) {
+ d.v[i] = vselect(c, a.v[i], b.v[i]);
+ }
+ return d;
+ }
+ static void init(const mpz_class& mp)
+ {
+ g_mont.init(mp);
+ }
+#ifdef MCL_MSM_TEST
+ void dump(size_t pos, const char *msg = nullptr) const;
+#endif
};
FpM FpM::one_;
@@ -814,16 +834,7 @@ FpM FpM::rw_;
FpM FpM::mR2_;
FpM FpM::m64to52_;
FpM FpM::m52to64_;
-
-template
-inline Vmask isZero(const E& P)
-{
- Vec v = P.z.v[0];
- for (size_t i = 1; i < N; i++) {
- v = vor(v, P.z.v[i]);
- }
- return vcmpeq(v, vzero());
-}
+Montgomery FpM::g_mont;
template
inline void normalizeJacobiVec(E P[n])
@@ -831,27 +842,28 @@ inline void normalizeJacobiVec(E P[n])
assert(n >= 2);
typedef typename E::Fp F;
F tbl[n];
- tbl[0] = P[0].z;
+ tbl[0] = F::select(P[0].z.isZero(), F::one_, P[0].z);
for (size_t i = 1; i < n; i++) {
- F::mul(tbl[i], tbl[i-1], P[i].z);
+ F t = F::select(P[i].z.isZero(), F::one_, P[i].z);
+ F::mul(tbl[i], tbl[i-1], t);
}
F r;
F::inv(r, tbl[n-1]);
for (size_t i = 0; i < n; i++) {
size_t pos = n-1-i;
- F t = P[pos].z;
+ F& z = P[pos].z;
F rz, rz2;
- if (pos > 0) {
- F::mul(rz, r, tbl[pos-1]);
- F::mul(r, r, t);
- } else {
+ if (pos == 0) {
rz = r;
+ } else {
+ F::mul(rz, r, tbl[pos-1]);
+ F::mul(r, r, F::select(z.isZero(), F::one_, z));
}
F::sqr(rz2, rz);
F::mul(P[pos].x, P[pos].x, rz2); // xz^-2
F::mul(rz2, rz2, rz);
F::mul(P[pos].y, P[pos].y, rz2); // yz^-3
- P[pos].z = F::one_;
+ z = F::select(z.isZero(), z, F::one_);
}
}
@@ -862,8 +874,6 @@ template
inline void addJacobiMixedNoCheck(E& R, const E& P, const E& Q)
{
typedef typename E::Fp F;
- Vmask c = isZero(Q);
- E saveP = P;
F r, U1, S1, H, H3;
F::sqr(r, P.z);
U1 = P.x;
@@ -885,7 +895,6 @@ inline void addJacobiMixedNoCheck(E& R, const E& P, const E& Q)
F::mul(U1, U1, r);
F::mul(H3, H3, S1);
F::sub(R.y, U1, H3);
- R.cset(c, saveP);
}
// 12M+4S+7A
@@ -894,8 +903,6 @@ template
inline void addJacobiNoCheck(E& R, const E& P, const E& Q)
{
typedef typename E::Fp F;
- Vmask c = isZero(Q);
- E saveP = P;
F r, U1, S1, H, H3;
F::sqr(r, P.z);
F::sqr(S1, Q.z);
@@ -920,7 +927,6 @@ inline void addJacobiNoCheck(E& R, const E& P, const E& Q)
F::mul(U1, U1, r);
F::mul(H3, H3, S1);
F::sub(R.y, U1, H3);
- R.cset(c, saveP);
}
// assume a = 0
@@ -969,11 +975,14 @@ struct EcM {
if (isProj) {
mcl::ec::addCTProj(z, x, y);
} else {
+ EcM t;
if (mixed) {
- addJacobiMixedNoCheck(z, x, y);
+ addJacobiMixedNoCheck(t, x, y);
} else {
- addJacobiNoCheck(z, x, y);
+ addJacobiNoCheck(t, x, y);
}
+ t = select(x.isZero(), y, t);
+ z = select(y.isZero(), x, t);
}
}
template
@@ -985,18 +994,26 @@ struct EcM {
dblJacobiNoCheck(z, x);
}
}
- static void init(Montgomery& mont)
+ static void init(const Montgomery& mont)
{
const int b = 4;
mpz_class b3 = mont.toMont(b * 3);
expandN(b3_.v, b3);
- zeroJacobi_.x.set(1);
- zeroJacobi_.y.set(1);
+ zeroJacobi_.x.set(0);
+ zeroJacobi_.y.set(0);
zeroJacobi_.z.set(0);
zeroProj_.x.set(0);
zeroProj_.y.set(1);
zeroProj_.z.set(0);
}
+ static EcM select(const Vmask& c, const EcM& a, const EcM& b)
+ {
+ EcM d;
+ d.x = FpM::select(c, a.x, b.x);
+ d.y = FpM::select(c, a.y, b.y);
+ d.z = FpM::select(c, a.z, b.z);
+ return d;
+ }
template
static const EcM& zero()
{
@@ -1017,44 +1034,23 @@ struct EcM {
}
void setG1(const mcl::msm::G1A v[M], bool JacobiToProj = true)
{
-#if 1
setArray(v[0].v);
FpM::mul(x, x, FpM::m64to52_);
FpM::mul(y, y, FpM::m64to52_);
FpM::mul(z, z, FpM::m64to52_);
-#else
- Unit a[6*3*M];
- const Unit *src = (const Unit *)v;
- for (size_t i = 0; i < M*3; i++) {
- mcl::bn::Fp::getOp().fromMont(a+i*6, src+i*6);
+ if (JacobiToProj) {
+ mcl::ec::JacobiToProj(*this, *this);
+ y = FpM::select(z.isZero(), FpM::one_, y);
}
- setArray(a);
- x.toMont(x);
- y.toMont(y);
- z.toMont(z);
-#endif
- if (JacobiToProj) mcl::ec::JacobiToProj(*this, *this);
}
void getG1(mcl::msm::G1A v[M], bool ProjToJacobi = true) const
{
EcM T = *this;
if (ProjToJacobi) mcl::ec::ProjToJacobi(T, T);
-#if 1
FpM::mul(T.x, T.x, FpM::m52to64_);
FpM::mul(T.y, T.y, FpM::m52to64_);
FpM::mul(T.z, T.z, FpM::m52to64_);
T.getArray(v[0].v);
-#else
- T.x.fromMont(T.x);
- T.y.fromMont(T.y);
- T.z.fromMont(T.z);
- Unit a[6*3*M];
- T.getArray(a);
- Unit *dst = (Unit *)v;
- for (size_t i = 0; i < M*3; i++) {
- mcl::bn::Fp::getOp().toMont(dst+i*6, a+i*6);
- }
-#endif
}
void normalize()
{
@@ -1067,7 +1063,7 @@ struct EcM {
template
static void makeTable(EcM *tbl, const EcM& P)
{
- tbl[0].clear();
+ tbl[0].clear();
tbl[1] = P;
dbl(tbl[2], P);
for (size_t i = 3; i < tblN; i++) {
@@ -1117,14 +1113,13 @@ struct EcM {
Q.z = P.z;
}
template
- static void mulGLV(EcM& Q, const EcM& _P, const Vec y[4])
+ static void mulGLV(EcM& Q, const EcM& P, const Vec y[4])
{
- EcM P = _P;
- if (!isProj) mcl::ec::ProjToJacobi(P, _P);
+ // QQQ (n=1024) isProj=T : 36.8, isProj=F&&mixed=F : 36.0, isProj=F&&mixed=T : 34.6
Vec a[2], b[2];
EcM tbl1[tblN], tbl2[tblN];
makeTable(tbl1, P);
- if (!isProj) normalizeJacobiVec(tbl1+1);
+ if (!isProj && mixed) normalizeJacobiVec(tbl1+1);
for (size_t i = 0; i < tblN; i++) {
mulLambda(tbl2[i], tbl1[i]);
}
@@ -1134,7 +1129,7 @@ struct EcM {
for (size_t i = 0; i < M; i++) {
Unit buf[4] = { src[i+M*0], src[i+M*1], src[i+M*2], src[i+M*3] };
Unit aa[2], bb[2];
- split(aa, bb, buf);
+ mcl::ec::local::optimizedSplitRawForBLS12_381(aa, bb, buf);
pa[i+M*0] = aa[0]; pa[i+M*1] = aa[1];
pb[i+M*0] = bb[0]; pb[i+M*1] = bb[1];
}
@@ -1149,16 +1144,17 @@ struct EcM {
if (!first) for (int k = 0; k < w; k++) EcM::dbl(Q, Q);
EcM T;
Vec idx;
- idx = vand(vpsrlq(v1, bitLen-w-j*w), g_vmask4);
+ // compute v2 first before v1. see misc/internal.md
+ idx = vand(vpsrlq(v2, bitLen-w-j*w), g_vmask4);
if (first) {
- Q.gather(tbl1, idx);
+ Q.gather(tbl2, idx);
first = false;
} else {
- T.gather(tbl1, idx);
+ T.gather(tbl2, idx);
add(Q, Q, T);
}
- idx = vand(vpsrlq(v2, bitLen-w-j*w), g_vmask4);
- T.gather(tbl2, idx);
+ idx = vand(vpsrlq(v1, bitLen-w-j*w), g_vmask4);
+ T.gather(tbl1, idx);
add(Q, Q, T);
}
}
@@ -1167,18 +1163,6 @@ struct EcM {
mul(T, T, b, 2);
add(Q, Q, T);
#endif
- if (!isProj) mcl::ec::JacobiToProj(Q, Q);
- }
- static void mulGLVbn(mcl::msm::G1A _Q[8], mcl::msm::G1A _P[8], const Vec y[4])
- {
- const bool isProj = false;
- const bool mixed = true;
-// mcl::ec::normalizeVec(_P, _P, 8);
- g_param.normalizeVecG1(_P, _P, 8);
- EcM P, Q;
- P.setG1(_P, isProj);
- mulGLV(Q, P, y);
- Q.getG1(_Q);
}
void cset(const Vmask& c, const EcM& v)
{
@@ -1186,7 +1170,11 @@ struct EcM {
y.cset(c, v.y);
z.cset(c, v.z);
}
- Vmask isEqualAll(const EcM& rhs) const
+ Vmask isZero() const
+ {
+ return z.isZero();
+ }
+ Vmask isEqualJacobiAll(const EcM& rhs) const
{
FpM s1, s2, t1, t2;
Vmask v1, v2;
@@ -1194,7 +1182,7 @@ struct EcM {
FpM::sqr(s2, rhs.z);
FpM::mul(t1, x, s2);
FpM::mul(t2, rhs.x, s1);
- v1 = t2.isEqualAll(s1);
+ v1 = t1.isEqualAll(t2);
FpM::mul(t1, y, s2);
FpM::mul(t2, rhs.y, s1);
FpM::mul(t1, t1, rhs.z);
@@ -1202,6 +1190,9 @@ struct EcM {
v2 = t1.isEqualAll(t2);
return mand(v1, v2);
}
+#ifdef MCL_MSM_TEST
+ void dump(bool isProj, size_t pos, const char *msg = nullptr) const;
+#endif
};
FpM EcM::b3_;
@@ -1228,27 +1219,6 @@ inline void cvtFr8toVec4(Vec yv[4], const mcl::msm::FrA y[8])
cvt4Ux8to8Ux4(yv, ya);
}
-template
-inline void mulVecAVX512_naive(mcl::msm::G1A& P, const mcl::msm::G1A *x, const mcl::msm::FrA *y, size_t n)
-{
- assert(n % 8 == 0);
- EcM R;
- for (size_t i = 0; i < n; i += 8) {
- Vec yv[4];
- cvtFr8toVec4(yv, y+i);
- EcM T, X;
- X.setG1(x+i, isProj);
- if (i == 0) {
- EcM::mulGLV(R, X, yv);
- } else {
- EcM::mulGLV(T, X, yv);
- EcM::add(R, R, T);
- }
- }
- if (!isProj) mcl::ec::JacobiToProj(R, R);
- reduceSum(P, R);
-}
-
// xVec[n], yVec[n * maxBitSize/64]
// assume xVec[] is normalized
inline void mulVecAVX512_inner(mcl::msm::G1A& P, const EcM *xVec, const Vec *yVec, size_t n, size_t maxBitSize)
@@ -1292,47 +1262,7 @@ inline void mulVecAVX512_inner(mcl::msm::G1A& P, const EcM *xVec, const Vec *yVe
Xbyak::AlignedFree(tbl);
}
-#if 0
-void mulVec_naive(mcl::msm::G1A& P, const mcl::msm::G1A *x, const mcl::msm::FrA *y, size_t n)
-{
- size_t c = mcl::ec::argminForMulVec(n);
- size_t tblN = (1 << c) - 0;
- mcl::msm::G1A *tbl = (mcl::msm::G1A*)CYBOZU_ALLOCA(sizeof(mcl::msm::G1A) * tblN);
- const size_t maxBitSize = 256;
- const size_t winN = (maxBitSize + c-1) / c;
- mcl::msm::G1A *win = (mcl::msm::G1A*)CYBOZU_ALLOCA(sizeof(mcl::msm::G1A) * winN);
-
- Unit *yVec = (Unit*)CYBOZU_ALLOCA(sizeof(mcl::msm::FrA) * n);
- const mcl::msm::addG1Func addG1 = g_param.addG1;
- const mcl::msm::dblG1Func dblG1 = g_param.dblG1;
- const mcl::msm::clearG1Func clearG1 = g_param.clearG1;
- for (size_t i = 0; i < n; i++) {
- g_param.fr->fromMont(yVec+i*4, y[i].v);
- }
- for (size_t w = 0; w < winN; w++) {
- for (size_t i = 0; i < tblN; i++) {
- clearG1(tbl[i]);
- }
- for (size_t i = 0; i < n; i++) {
- Unit v = mcl::fp::getUnitAt(yVec+i*4, 4, c * w) & (tblN-1);
- addG1(tbl[v], tbl[v], x[i]);
- }
- mcl::msm::G1A sum = tbl[tblN-1];
- win[w] = sum;
- for (size_t i = 1; i < tblN-1; i++) {
- addG1(sum, sum, tbl[tblN - 1 - i]);
- addG1(win[w], win[w], sum);
- }
- }
- P = win[winN - 1];
- for (size_t w = 1; w < winN; w++) {
- for (size_t i = 0; i < c; i++) {
- dblG1(P, P);
- }
- addG1(P, P, win[winN - 1 - w]);
- }
-}
-#endif
+} // namespace
namespace mcl { namespace msm {
@@ -1357,7 +1287,7 @@ void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n)
Unit ya[4];
fr->fromMont(ya, y[i*8+j].v);
Unit a[2], b[2];
- split(a, b, ya);
+ mcl::ec::local::optimizedSplitRawForBLS12_381(a, b, ya);
py[j+0] = a[0];
py[j+8] = a[1];
py[j+16] = b[0];
@@ -1387,37 +1317,61 @@ void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n)
}
}
+void mulEachAVX512(Unit *_x, const Unit *_y, size_t n)
+{
+ assert(n % 8 == 0);
+ const bool isProj = false;
+ const bool mixed = true;
+ mcl::msm::G1A *x = (mcl::msm::G1A*)_x;
+ const mcl::msm::FrA *y = (const mcl::msm::FrA*)_y;
+ if (!isProj && mixed) g_param.normalizeVecG1(x, x, n);
+ for (size_t i = 0; i < n; i += 8) {
+ EcM P;
+ Vec yv[4];
+ cvtFr8toVec4(yv, y+i);
+ P.setG1(x+i, isProj);
+ EcM::mulGLV(P, P, yv);
+ P.getG1(x+i, isProj);
+ }
+}
+
bool initMsm(const mcl::CurveParam& cp, const mcl::msm::Param *param)
{
+ assert(EcM::a_ == 0);
+ assert(EcM::b_ == 4);
+ (void)EcM::a_; // disable unused warning
+ (void)EcM::b_;
+
if (cp != mcl::BLS12_381) return false;
Xbyak::util::Cpu cpu;
if (!cpu.has(Xbyak::util::Cpu::tAVX512_IFMA)) return false;
g_param = *param;
- Montgomery& mont = g_mont;
const mpz_class& mp = g_param.fp->mp;
-
- mont.set(mp);
- toArray<6, 64>(g_mpM2, mp-2);
- expand(vmask, g_mask);
- expandN(vpN, mp);
- expand(vrp, mont.rp);
+ FpM::init(mp);
+ Montgomery& mont = FpM::g_mont;
+ Unit pM2[6]; // x^(-1) = x^(p-2) mod p
+ toArray<6, 64>(pM2, mp-2);
+ expand(g_vmask, g_mask);
+ expandN(g_vpN, mp);
+ expand(g_vrp, mont.rp);
+ Vec vpM2[6]; // NOT 52-bit but 64-bit
for (int i = 0; i < 6; i++) {
- expand(g_vmpM2[i], g_mpM2[i]);
+ expand(vpM2[i], pM2[i]);
}
expand(g_vmask4, getMask(4));
for (int i = 0; i < 8; i++) {
((Unit*)&g_offset)[i] = i;
}
expand(g_vi192, 192);
- expandN(FpM::one_.v, g_mont.toMont(1));
+ expandN(FpM::one_.v, mont.toMont(1));
expandN(FpM::rawOne_.v, mpz_class(1));
- expandN(FpM::mR2_.v, g_mont.mR2);
+ expandN(FpM::mR2_.v, mont.mR2);
{
mpz_class t(1);
t <<= 32;
- FpM::m64to52_.set(t);
- FpM::pow(FpM::m52to64_, FpM::m64to52_, g_vmpM2, 6);
+ FpM::m64to52_.set(t); // 2^32
+ FpM::pow(FpM::m52to64_, FpM::m64to52_, vpM2, 6);
}
FpM::rw_.setFp(g_param.rw);
EcM::init(mont);
@@ -1426,3 +1380,259 @@ bool initMsm(const mcl::CurveParam& cp, const mcl::msm::Param *param)
} } // mcl::msm
+#ifdef MCL_MSM_TEST
+#include
+#include
+#include
+#include
+
+using namespace mcl::bn;
+
+void FpM::dump(size_t pos, const char *msg) const
+{
+ Fp T[8];
+ getFp((mcl::msm::FpA*)T);
+ if (msg) printf("%s\n", msg);
+ printf(" [%zd]=%s\n", pos, T[pos].getStr(16).c_str());
+}
+
+void EcM::dump(bool isProj, size_t pos, const char *msg) const
+{
+ G1 T[8];
+ getG1((mcl::msm::G1A*)T, isProj);
+ if (msg) printf("%s\n", msg);
+ printf(" [%zd]=%s\n", pos, T[pos].getStr(16|mcl::IoEcProj).c_str());
+// printf(" [%zd]=%s\n", pos, T[pos].getStr(16|mcl::IoEcAffine).c_str());
+}
+
+CYBOZU_TEST_AUTO(init)
+{
+ initPairing(mcl::BLS12_381);
+}
+
+void setParam(G1 *P, Fr *x, size_t n, cybozu::XorShift& rg)
+{
+ for (size_t i = 0; i < n; i++) {
+ uint32_t v = rg.get32();
+ hashAndMapToG1(P[i], &v, sizeof(v));
+ if (x) x[i].setByCSPRNG(rg);
+ }
+}
+
+CYBOZU_TEST_AUTO(cmp)
+{
+ const size_t n = 8;
+ Vmask v;
+ FpM x, y;
+ x.clear();
+ v = x.isEqualAll(x);
+ CYBOZU_TEST_EQUAL(cvtToInt(v), 0xff);
+ for (size_t i = 0; i < n; i++) {
+ y.clear();
+ y.set(1, i);
+ v = x.isEqualAll(y);
+ CYBOZU_TEST_EQUAL(cvtToInt(v), 0xff ^ (1<(TM, PM);
+ TM.getG1(TA);
+ for (size_t i = 0; i < n; i++) {
+ CYBOZU_TEST_EQUAL(R[i], T[i]);
+ }
+
+ // as Jacobi
+ PM.setG1(PA, false);
+ EcM::dbl(TM, PM);
+ TM.getG1(TA, false);
+ for (size_t i = 0; i < n; i++) {
+ CYBOZU_TEST_EQUAL(R[i], T[i]);
+ }
+
+ // test add
+ // R = P + Q
+ for (size_t i = 0; i < n; i++) {
+ G1::add(R[i], P[i], Q[i]);
+ }
+
+ // as Proj
+ PM.setG1(PA);
+ QM.setG1(QA);
+ EcM::add(TM, PM, QM);
+ TM.getG1(TA);
+ for (size_t i = 0; i < n; i++) {
+ CYBOZU_TEST_EQUAL(R[i], T[i]);
+ }
+
+ // as Jacobi
+ PM.setG1(PA, false);
+ QM.setG1(QA, false);
+ EcM::add(TM, PM, QM);
+ TM.getG1(TA, false);
+ for (size_t i = 0; i < n; i++) {
+ CYBOZU_TEST_EQUAL(R[i], T[i]);
+ }
+
+ // as Jacobi (mixed)
+ for (size_t i = 0; i < n; i++) {
+ Q[i].normalize();
+ }
+ QM.setG1(QA, false);
+ EcM::add(TM, PM, QM);
+ TM.getG1(TA, false);
+ for (size_t i = 0; i < n; i++) {
+ CYBOZU_TEST_EQUAL(R[i], T[i]);
+ }
+#if 1
+ // mulEachAVX512
+ for (size_t i = 0; i < n; i++) {
+ Q[i] = P[i];
+ G1::mul(R[i], P[i], x[i]);
+ }
+ mcl::msm::mulEachAVX512((Unit*)Q, (const Unit*)x, n);
+ for (size_t i = 0; i < n; i++) {
+ CYBOZU_TEST_EQUAL(R[i], Q[i]);
+ }
+#endif
+}
+
+CYBOZU_TEST_AUTO(normalizeJacobiVec)
+{
+ const bool isProj = false;
+ const size_t n = 64;
+ G1 P[n], Q[n], R[n];
+ EcM PP[n/8];
+ cybozu::XorShift rg;
+ setParam(P, 0, n, rg);
+ P[n/2].clear();
+ P[n/3].clear();
+ mcl::ec::normalizeVec(Q, P, n);
+ for (size_t i = 0; i < n/8; i++) {
+ PP[i].setG1((mcl::msm::G1A*)&P[i*8], isProj);
+ }
+ normalizeJacobiVec(PP);
+ for (size_t i = 0; i < n/8; i++) {
+ PP[i].getG1((mcl::msm::G1A*)&R[i*8], isProj);
+ }
+ CYBOZU_TEST_EQUAL_ARRAY(P, R, n);
+}
+
+CYBOZU_TEST_AUTO(mulEach_special)
+{
+ const size_t n = 8;
+ G1 P[n], Q[n], R[n];
+ Fr x[n];
+ for (size_t i = 0; i < n; i++) P[i].clear();
+ P[0].setStr("1 13de196893df2bb5b57882ff1eec37d98966aa71b828fd25125d04ed2c75ddc55d5bc68bd797bd555f9a827387ee6b28 5d59257a0fccd5215cdeb0928296a7a4d684823db76aef279120d2d71c4b54604ec885eb554f99780231ade171979a3", 16);
+ x[0].setStr("5b4b92c347ffcd8543904dd1b22a60d94b4a9c243046456b8befd41507bec5d", 16);
+// x[0].setStr("457977620305299156129707153920788267006"); // L+L
+ for (size_t i = 0; i < n; i++) Q[i] = P[i];
+ G1::mul(R[0], P[0], x[0]);
+ G1::mulEach(Q, x, 8);
+ CYBOZU_TEST_EQUAL(R[0], Q[0]);
+ mpz_class L;
+ L.setStr("0xac45a4010001a40200000000ffffffff");
+ mpz_class tbl[] = {
+ 0,
+ 1,
+ L,
+ };
+ cybozu::XorShift rg;
+ for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
+ const mpz_class& a = tbl[i];
+ for (size_t j = 0; j < CYBOZU_NUM_OF_ARRAY(tbl); j++) {
+ const mpz_class& b = tbl[j];
+ setParam(P, x, n, rg);
+ x[0].setMpz(a * L + b);
+ for (size_t k = 0; k < 8; k++) {
+ Q[k] = P[k];
+ G1::mul(R[k], P[k], x[k]);
+ }
+ G1::mulEach(Q, x, n);
+ CYBOZU_TEST_EQUAL_ARRAY(R, Q, n);
+ }
+ }
+}
+
+CYBOZU_TEST_AUTO(mulEach)
+{
+ const size_t n = 1024;
+ G1 P[n], Q[n], R[n];
+ Fr x[n];
+ cybozu::XorShift rg;
+ setParam(P, x, n, rg);
+ if (n > 32) P[32].clear();
+ P[n/2].clear();
+ for (size_t i = 0; i < n; i++) {
+ Q[i] = P[i];
+ G1::mul(R[i], P[i], x[i]);
+ }
+ G1::mulEach(Q, x, n);
+ for (size_t i = 0; i < n; i++) {
+ CYBOZU_TEST_EQUAL(R[i], Q[i]);
+ if (R[i] != Q[i]) {
+ printf("P[%zd]=%s\n", i, P[i].getStr(16).c_str());
+ printf("x[%zd]=%s\n", i, x[i].getStr(16).c_str());
+ printf("R[%zd]=%s\n", i, R[i].getStr(16|mcl::IoEcProj).c_str());
+ printf("Q[%zd]=%s\n", i, Q[i].getStr(16|mcl::IoEcProj).c_str());
+ }
+ }
+#ifdef NDEBUG
+ CYBOZU_BENCH_C("mulEach", 100, G1::mulEach, Q, x, n);
+#endif
+}
+#endif
diff --git a/test/bls12_test.cpp b/test/bls12_test.cpp
index ee707a82..ded566c4 100644
--- a/test/bls12_test.cpp
+++ b/test/bls12_test.cpp
@@ -359,6 +359,32 @@ void testSerialize(const G1& P, const G2& Q)
#include "bench.hpp"
+void testMulVec()
+{
+ puts("testMulVec");
+ const size_t n = 8192;
+ cybozu::XorShift rg;
+ std::vector Pvec(n);
+ std::vector xVec(n);
+ hashAndMapToG1(Pvec[0], "abc", 3);
+ for (size_t i = 1; i < n; i++) {
+ G1::add(Pvec[i], Pvec[i-1], Pvec[0]);
+ }
+ for (size_t i = 0; i < n; i++) {
+ xVec[i].setByCSPRNG(rg);
+ }
+ G1 P;
+ G1 P8191;
+ P8191.setStr("1 c252fef934098904eca8e3fbd9cc8c78877e434d9ce01e424ef07302cec5652dc17d341b8abd4278255a75718cebd67 17455f24f76e7e7d1dd3231d8f144a40decc40d5b129734879b8aad4a209a2e6d83d8256221e46aaf8205e254355d9ad", 16);
+ G1 P8192;
+ P8192.setStr("1 f0d44ba84af56d1db97f46660bfd12401aae239a6650cdfc168158d1076d68c5149ac3a311b9c058ad4e61ad1b8063 b2240da1e42c5f469ccf818e58901aca2283d1bd29565f5efbfa14e48cdae199c7a7981b958bfec332f6e613cf36990", 16);
+ G1::mulVec(P, Pvec.data(), xVec.data(), n-1);
+ CYBOZU_TEST_EQUAL(P, P8191);
+ G1::mulVec(P, Pvec.data(), xVec.data(), n);
+ CYBOZU_TEST_EQUAL(P, P8192);
+}
+
+
CYBOZU_TEST_AUTO(naive)
{
for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(g_testSetTbl); i++) {
@@ -375,6 +401,7 @@ CYBOZU_TEST_AUTO(naive)
clk.put();
return;
#endif
+ testMulVec();
testSerialize(P, Q);
testParam(ts);
testIo(P, Q);
@@ -857,7 +884,6 @@ CYBOZU_TEST_AUTO(verifyG2)
CYBOZU_TEST_ASSERT(n == 0);
}
-
typedef std::vector FpVec;
void f(FpVec& zv, const FpVec& xv, const FpVec& yv)
diff --git a/test/bn_c_test.hpp b/test/bn_c_test.hpp
index febe27fc..40bacb61 100644
--- a/test/bn_c_test.hpp
+++ b/test/bn_c_test.hpp
@@ -1078,13 +1078,21 @@ CYBOZU_TEST_AUTO(mulVec)
for (size_t i = 0; i < N; i++) {
char c = char('a' + i);
mclBnG1_hashAndMapTo(&x1Vec[i], &c, 1);
+ if (i == 10) {
+ mclBnG1_clear(&x1Vec[i]); // x1Vec[i] contains zero
+ }
mclBnG2_hashAndMapTo(&x2Vec[i], &c, 1);
mclBn_pairing(&xtVec[i], &x1Vec[i], &x2Vec[i]);
- mclBnFr_setByCSPRNG(&yVec[i]);
+// mclBnFr_setByCSPRNG(&yVec[i]);
+ mclBnFr_setHashOf(&yVec[i], &c, 1);
}
+ mclBnG1 x1Vec2[N];
+ memcpy(x1Vec2, x1Vec, sizeof(x1Vec));
+
mclBnG1_mulVec(&z1, x1Vec, yVec, N);
mclBnG2_mulVec(&z2, x2Vec, yVec, N);
mclBnGT_powVec(&zt, xtVec, yVec, N);
+ mclBnG1_mulEach(x1Vec2, yVec, N);
mclBnG1_clear(&w1);
mclBnG2_clear(&w2);
@@ -1094,6 +1102,22 @@ CYBOZU_TEST_AUTO(mulVec)
mclBnG2 t2;
mclBnGT tt;
mclBnG1_mul(&t1, &x1Vec[i], &yVec[i]);
+ CYBOZU_TEST_ASSERT(mclBnG1_isEqual(&t1, &x1Vec2[i]));
+#if 0
+ if (mclBnG1_isEqual(&t1, &x1Vec2[i]) == 0) {
+ char buf[1024];
+ printf("i=%zd\n", i);
+ mclBnG1_getStr(buf, sizeof(buf), &x1Vec[i], 10);
+ printf("x1=%s\n", buf);
+ mclBnFr_getStr(buf, sizeof(buf), &yVec[i], 10);
+ printf("y=%s\n", buf);
+ mclBnG1_getStr(buf, sizeof(buf), &t1, 10);
+ printf("xy=%s\n", buf);
+ mclBnG1_getStr(buf, sizeof(buf), &x1Vec2[i], 10);
+ printf("ng=%s\n", buf);
+ exit(1);
+ }
+#endif
mclBnG2_mul(&t2, &x2Vec[i], &yVec[i]);
mclBnGT_pow(&tt, &xtVec[i], &yVec[i]);
mclBnG1_add(&w1, &w1, &t1);
diff --git a/test/common_test.hpp b/test/common_test.hpp
index 99d04fb2..e00cd726 100644
--- a/test/common_test.hpp
+++ b/test/common_test.hpp
@@ -33,6 +33,9 @@ void testMulVec(const G& P)
cybozu::XorShift rg;
for (size_t i = 0; i < N; i++) {
G::mul(x0Vec[i], P, i + 3);
+ if (i == 30) {
+ x0Vec[i].clear(); // x0Vec[i] contains zero value
+ }
xVec[i] = x0Vec[i];
yVec[i].setByCSPRNG(rg);
}
@@ -57,6 +60,17 @@ void testMulVec(const G& P)
CYBOZU_BENCH_C("mulVecCopy", C, mulVecCopy, Q1, xVec.data(), yVec.data(), n, x0Vec.data());
#endif
}
+ puts("mulEach");
+ for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(nTbl); i++) {
+ const size_t n = nTbl[i];
+ xVec = x0Vec;
+ G::mulEach(xVec.data(), yVec.data(), n);
+ for (size_t j = 0; j < n; j++) {
+ G T;
+ G::mul(T, x0Vec[j], yVec[j]);
+ CYBOZU_TEST_EQUAL(xVec[j], T);
+ }
+ }
}
template