Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
herumi committed Dec 13, 2024
2 parents 7db1696 + e728ca5 commit ed3cb70
Show file tree
Hide file tree
Showing 16 changed files with 2,633 additions and 702 deletions.
2 changes: 1 addition & 1 deletion include/mcl/bn.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ namespace mcl {
namespace msm {

bool initMsm(const mcl::CurveParam& cp, const msm::Func *func);
void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n);
void mulVecAVX512(Unit *_P, Unit *_x, const Unit *_y, size_t n, size_t b);
void mulEachAVX512(Unit *_x, const Unit *_y, size_t n);

} // mcl::msm
Expand Down
136 changes: 86 additions & 50 deletions include/mcl/ec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,30 +243,45 @@ void normalizeVecT(Eout& Q, Ein& P, size_t n, size_t N = 256)
}

/*
split x to (a, b) such that x = a + b L where 0 <= a, b <= L, 0 <= x <= r-1 = L^2+L
if adj is true, then 0 <= a < L, 0 <= b <= L+1
split x in [0, r-1] to (a, b) such that x = a + b L, 0 <= a < L, 0 <= b <= L+1
a[] : 128 bit
b[] : 128 bit
x[] : 256 bit
*/
inline void optimizedSplitRawForBLS12_381(Unit *a, Unit *b, const Unit *x)
{
const bool adj = false;
/*
z = -0xd201000000010000
L = z^2-1 = 0xac45a4010001a40200000000ffffffff
r = L^2+L+1 = 0x73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001
s=255
v = (1<<s)//L = 0xbe35f678f00fd56eb1fb72917b67f718
q = (1<<s)//L = 0xbe35f678f00fd56eb1fb72917b67f718
H = 1<<128
*/
static const Unit L[] = { MCL_U64_TO_UNIT(0x00000000ffffffff), MCL_U64_TO_UNIT(0xac45a4010001a402) };
static const Unit v[] = { MCL_U64_TO_UNIT(0xb1fb72917b67f718), MCL_U64_TO_UNIT(0xbe35f678f00fd56e) };
static const Unit q[] = { MCL_U64_TO_UNIT(0xb1fb72917b67f718), MCL_U64_TO_UNIT(0xbe35f678f00fd56e) };
static const Unit one[] = { MCL_U64_TO_UNIT(1), MCL_U64_TO_UNIT(0) };
static const size_t n = 128 / mcl::UnitBitSize;
#if 1
Unit xH[n+1]; // x = xH * (H/2) + xL
mcl::bint::shrT<n+1>(xH, x+n-1, mcl::UnitBitSize-1); // >>127
Unit t[n*2];
mcl::bint::mulT<n>(t, xH, q);
mcl::bint::copyT<n>(b, t+n); // (xH * q)/H
mcl::bint::mulT<n>(t, b, L); // bL
mcl::bint::subT<n*2>(t, x, t); // x - bL
Unit d = mcl::bint::subT<n>(a, t, L);
if (t[n] - d == 0) {
mcl::bint::addT<n>(b, b, one);
} else {
mcl::bint::copyT<n>(a, t);
}
#else
const bool adj = false;
Unit t[n*3];
// n = 128 bit
// t[n*3] = x[n*2] * v[n]
mcl::bint::mulNM(t, x, n*2, v, n);
// t[n*3] = x[n*2] * q[n]
mcl::bint::mulNM(t, x, n*2, q, n);
// b[n] = t[n*3]>>255
mcl::bint::shrT<n+1>(t, t+n*2-1, mcl::UnitBitSize-1); // >>255
mcl::bint::copyT<n>(b, t);
Expand All @@ -283,6 +298,7 @@ inline void optimizedSplitRawForBLS12_381(Unit *a, Unit *b, const Unit *x)
mcl::bint::clearT<n>(a);
}
}
#endif
}

} // mcl::ec::local
Expand Down Expand Up @@ -571,21 +587,26 @@ void addJacobi(E& R, const E& P, const E& Q)

/*
accept P == Q
https://github.com/apache/incubator-milagro-crypto-c/blob/fa0a45a3/src/ecp.c.in#L767-L976
https://eprint.iacr.org/2015/1060
(x, y, z) is zero <=> x = 0, y = 1, z = 0
*/

// (b=4) 12M+27A
// (generic) 14M+19A
// Q.z = 1 if mixed
template<class E>
void addCTProj(E& R, const E& P, const E& Q)
void addCTProj(E& R, const E& P, const E& Q, bool mixed = false)
{
typedef typename E::Fp F;
assert(E::a_ == 0);
F t0, t1, t2, t3, t4, x3, y3;
F::mul(t0, P.x, Q.x);
F::mul(t1, P.y, Q.y);
F::mul(t2, P.z, Q.z);
if (mixed) {
t2 = P.z;
} else {
F::mul(t2, P.z, Q.z);
}
F::add(t3, P.x, P.y);
F::add(t4, Q.x, Q.y);
F::mul(t3, t3, t4);
Expand Down Expand Up @@ -967,37 +988,52 @@ inline size_t ilog2(size_t n)
return cybozu::bsr(n) + 1;
}

inline size_t costMulVec(size_t n, size_t x)
// The number of ADD for n-elements with bucket size b
inline size_t glvCost(size_t n, size_t b)
{
return (n + (size_t(1)<<(x+1))-1)/x;
return (n + (size_t(1)<<(b+1))-1)/b;
}
// calculate approximate value such that argmin { x : (n + 2^(x+1)-1)/x }
inline size_t argminForMulVec0(size_t n)
// approximate value such that argmin { b : glvCost(n, b) }
inline size_t estimateBucketSize(size_t n)
{
if (n <= 16) return 2;
size_t log2n = ilog2(n);
return log2n - ilog2(log2n);
}

/*
First, get approximate value x and compute costMulVec of x-1 and x+1,
First, get approximate value x and compute glvCost of x-1 and x+1,
and return the minimum value.
*/
inline size_t argminForMulVec(size_t n)
inline size_t glvGetTheoreticBucketSize(size_t n)
{
size_t x = argminForMulVec0(n);
#if 1
size_t vm1 = x > 1 ? costMulVec(n, x-1) : n;
size_t v0 = costMulVec(n, x);
size_t vp1 = costMulVec(n, x+1);
size_t x = estimateBucketSize(n);
size_t vm1 = x > 1 ? glvCost(n, x-1) : n;
size_t v0 = glvCost(n, x);
size_t vp1 = glvCost(n, x+1);
if (vm1 <= v0) return x-1;
if (vp1 < v0) return x+1;
#endif
return x;
}

// return heuristic backet size which is faster than glvGetTheoreticBucketSize
inline size_t glvGetBucketSize(size_t n)
{
if (n <= 2) return 2;
size_t log2n = mcl::ec::ilog2(n);
const size_t tblMin = 8;
if (log2n < tblMin) return 3;
// n >= 2^tblMin
static const size_t tbl[] = {
3, 4, 5, 5, 8, 8, 9, 10, 11, 12, 13, 13, 13, 16, 16, 16, 18, 19, 19, 19, 19, 19
};
if (log2n >= CYBOZU_NUM_OF_ARRAY(tbl)) return 19;
size_t ret = tbl[log2n - tblMin];
return ret;
}

#ifndef MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC
// use (1 << argminForMulVec(n)) * sizeof(G) bytes stack + alpha
// use (1 << glvGetBucketSize(n)) * sizeof(G) bytes stack + alpha
// about 18KiB (G1) or 36KiB (G2) for n = 1024
// you can decrease this value but this algorithm is slow if n < 256
#define MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC 1024
Expand Down Expand Up @@ -1039,7 +1075,7 @@ void mulVecUpdateTable(G& win, G *tbl, size_t tblN, const G *xVec, const Unit *y
fast for n >= 256
*/
template<class G>
size_t mulVecCore(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next, size_t n, bool doNormalize = true)
size_t mulVecCore(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next, size_t n, size_t b, bool doNormalize)
{
if (n == 0) {
z.clear();
Expand All @@ -1050,15 +1086,15 @@ size_t mulVecCore(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next
return 1;
}

size_t c, tblN;
size_t tblN;
G *tbl = 0;

#ifndef MCL_DONT_USE_MALLOC
G *tbl_ = 0; // malloc is used if tbl_ != 0
// if n is large then try to use malloc
if (n > MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC) {
c = argminForMulVec(n);
tblN = (1 << c) - 1;
if (b == 0) b = glvGetBucketSize(n);
tblN = (1 << b) - 1;
tbl_ = (G*)malloc(sizeof(G) * tblN);
if (tbl_) {
tbl = tbl_;
Expand All @@ -1068,49 +1104,49 @@ size_t mulVecCore(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next
#endif
// n is small or malloc fails so use stack
if (n > MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC) n = MCL_MAX_N_TO_USE_STACK_FOR_MUL_VEC;
c = argminForMulVec(n);
tblN = (1 << c) - 1;
if (b == 0) b = glvGetBucketSize(n);
tblN = (1 << b) - 1;
tbl = (G*)CYBOZU_ALLOCA(sizeof(G) * tblN);
// keep tbl_ = 0
#ifndef MCL_DONT_USE_MALLOC
main:
#endif
const size_t maxBitSize = sizeof(Unit) * yUnitSize * 8;
const size_t winN = (maxBitSize + c-1) / c;
const size_t winN = (maxBitSize + b-1) / b;

// about 10% faster
if (doNormalize) G::normalizeVec(xVec, xVec, n);

mulVecUpdateTable(z, tbl, tblN, xVec, yVec, yUnitSize, next, c * (winN-1), n, true);
mulVecUpdateTable(z, tbl, tblN, xVec, yVec, yUnitSize, next, b * (winN-1), n, true);
for (size_t w = 1; w < winN; w++) {
for (size_t i = 0; i < c; i++) {
for (size_t i = 0; i < b; i++) {
G::dbl(z, z);
}
mulVecUpdateTable(z, tbl, tblN, xVec, yVec, yUnitSize, next, c * (winN-1-w), n, false);
mulVecUpdateTable(z, tbl, tblN, xVec, yVec, yUnitSize, next, b * (winN-1-w), n, false);
}
#ifndef MCL_DONT_USE_MALLOC
if (tbl_) free(tbl_);
#endif
return n;
}
template<class G>
void mulVecLong(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next, size_t n, bool doNormalize = true)
void mulVecLong(G& z, G *xVec, const Unit *yVec, size_t yUnitSize, size_t next, size_t n, size_t b, bool doNormalize)
{
size_t done = mulVecCore(z, xVec, yVec, yUnitSize, next, n, doNormalize);
size_t done = mulVecCore(z, xVec, yVec, yUnitSize, next, n, b, doNormalize);
if (done == n) return;
do {
xVec += done;
yVec += next * done;
n -= done;
G t;
done = mulVecCore(t, xVec, yVec, yUnitSize, next, n, doNormalize);
done = mulVecCore(t, xVec, yVec, yUnitSize, next, n, b, doNormalize);
z += t;
} while (done < n);
}

// for n >= 128
template<class GLV, class G>
bool mulVecGLVlarge(G& z, const G *xVec, const void *yVec, size_t n)
bool mulVecGLVlarge(G& z, const G *xVec, const void *yVec, size_t n, size_t b)
{
const int splitN = GLV::splitN;
assert(n > 0);
Expand Down Expand Up @@ -1147,7 +1183,7 @@ bool mulVecGLVlarge(G& z, const G *xVec, const void *yVec, size_t n)
assert(b); (void)b;
}
}
mulVecLong(z, tbl, yp, next, next, n * splitN, false);
mulVecLong(z, tbl, yp, next, next, n * splitN, false, b);
free(tbl);
return true;
}
Expand Down Expand Up @@ -1345,7 +1381,7 @@ static void mulVecGLVsmall(G& z, const G *xVec, const void* yVec, size_t n)

// return false if malloc fails or n is not in a target range
template<class GLV, class G, class F>
bool mulVecGLVT(G& z, const G *xVec, const void *yVec, size_t n, bool constTime = false)
bool mulVecGLVT(G& z, const G *xVec, const void *yVec, size_t n, bool constTime = false, size_t b = 0)
{
if (n == 1 && constTime) {
local::mulGLV_CT<GLV, G>(z, xVec[0], yVec);
Expand All @@ -1356,7 +1392,7 @@ bool mulVecGLVT(G& z, const G *xVec, const void *yVec, size_t n, bool constTime
return true;
}
if (n >= 128) {
return mulVecGLVlarge<GLV, G>(z, xVec, yVec, n);
return mulVecGLVlarge<GLV, G>(z, xVec, yVec, n, b);
}
return false;
}
Expand Down Expand Up @@ -1388,8 +1424,8 @@ class EcT : public fp::Serializable<EcT<_Fp, _Fr> > {
*/
static bool verifyOrder_;
static mpz_class order_;
static bool (*mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime);
static void (*mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n);
static bool (*mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime, size_t b);
static void (*mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n, size_t b);
static void (*mulEachOpti)(Unit *xVec, const Unit *yVec, size_t n);
static bool (*isValidOrderFast)(const EcT& x);
/* default constructor is undefined value */
Expand Down Expand Up @@ -1480,11 +1516,11 @@ class EcT : public fp::Serializable<EcT<_Fp, _Fr> > {
{
isValidOrderFast = f;
}
static void setMulVecGLV(bool f(EcT& z, const EcT *xVec, const void *yVec, size_t yn, bool constTime))
static void setMulVecGLV(bool f(EcT& z, const EcT *xVec, const void *yVec, size_t yn, bool constTime, size_t b))
{
mulVecGLV = f;
}
static void setMulVecOpti(void f(Unit* _z, Unit *_xVec, const Unit *_yVec, size_t yn))
static void setMulVecOpti(void f(Unit* _z, Unit *_xVec, const Unit *_yVec, size_t yn, size_t b))
{
mulVecOpti = f;
}
Expand Down Expand Up @@ -1603,7 +1639,7 @@ class EcT : public fp::Serializable<EcT<_Fp, _Fr> > {
static inline void mul(EcT& z, const EcT& x, const EcT::Fr& y, bool constTime = false)
{
if (mulVecGLV) {
mulVecGLV(z, &x, &y, 1, constTime);
mulVecGLV(z, &x, &y, 1, constTime, 0);
return;
}
fp::Block b;
Expand Down Expand Up @@ -2155,17 +2191,17 @@ class EcT : public fp::Serializable<EcT<_Fp, _Fr> > {
GLV : 7680, 15360, 30720
Long: 9779, 16322, 24533
*/
static inline void mulVec(EcT& z, EcT *xVec, const EcT::Fr *yVec, size_t n)
static inline void mulVec(EcT& z, EcT *xVec, const EcT::Fr *yVec, size_t n, size_t b = 0)
{
if (n == 0) {
z.clear();
return;
}
if (mulVecOpti && n >= 128) {
mulVecOpti((Unit*)&z, (Unit*)xVec, yVec[0].getUnit(), n);
mulVecOpti((Unit*)&z, (Unit*)xVec, yVec[0].getUnit(), n, b);
return;
}
if (mulVecGLV && mulVecGLV(z, xVec, yVec, n, false)) {
if (mulVecGLV && mulVecGLV(z, xVec, yVec, n, false, b)) {
return;
}
EcT r;
Expand Down Expand Up @@ -2284,8 +2320,8 @@ template<class Fp, class Fr> int EcT<Fp, Fr>::specialB_;
template<class Fp, class Fr> int EcT<Fp, Fr>::ioMode_;
template<class Fp, class Fr> bool EcT<Fp, Fr>::verifyOrder_;
template<class Fp, class Fr> mpz_class EcT<Fp, Fr>::order_;
template<class Fp, class Fr> bool (*EcT<Fp, Fr>::mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime);
template<class Fp, class Fr> void (*EcT<Fp, Fr>::mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n);
template<class Fp, class Fr> bool (*EcT<Fp, Fr>::mulVecGLV)(EcT& z, const EcT *xVec, const void *yVec, size_t n, bool constTime, size_t b);
template<class Fp, class Fr> void (*EcT<Fp, Fr>::mulVecOpti)(Unit *z, Unit *xVec, const Unit *yVec, size_t n, size_t b);
template<class Fp, class Fr> bool (*EcT<Fp, Fr>::isValidOrderFast)(const EcT& x);
template<class Fp, class Fr> int EcT<Fp, Fr>::mode_;
template<class Fp, class Fr> void (*EcT<Fp, Fr>::mulEachOpti)(Unit *xVec, const Unit *yVec, size_t n);
Expand Down
3 changes: 0 additions & 3 deletions include/mcl/vint.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -696,9 +696,6 @@ class Vint {
// logical left shift (copy sign)
static void shl(Vint& y, const Vint& x, size_t shiftBit)
{
if (shiftBit > MCL_MAX_BIT_SIZE*2) {
printf("shiftBit=%zd\n", shiftBit);
}
assert(shiftBit <= MCL_MAX_BIT_SIZE * 2); // many be too big
size_t xn = x.size();
size_t yn = xn + (shiftBit + UnitBitSize - 1) / UnitBitSize;
Expand Down
Loading

0 comments on commit ed3cb70

Please sign in to comment.