-
Notifications
You must be signed in to change notification settings - Fork 39
/
Copy pathGF(p).cpp
314 lines (260 loc) · 9.86 KB
/
GF(p).cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
/// Implementation of the GF(P) Galois field
#include "SIMD.h"
#if defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__)
#define MY_CPU_AMD64
#endif
#if defined(MY_CPU_AMD64) || defined(_M_IA64)
#define MY_CPU_64BIT
#endif
#if _MSC_VER
#define MSVC_ONLY 1
#endif
#if __GNUC__ && defined(MY_CPU_64BIT)
#include <inttypes.h>
typedef unsigned __int128 uint128_t;
#elif _MSC_VER && defined(MY_CPU_64BIT)
#include <intrin.h>
#define constexpr /* incompatible with __umulh :( */
#endif
#define unlikely /* to do... */
/***********************************************************************************************************************
*** Main operations in GF(P) *******************************************************************************************
************************************************************************************************************************/
// Twice wider type to hold intermediate MUL results
template<typename Type> struct Double {};
template<> struct Double<uint32_t> {typedef uint64_t T;};
template <typename T, T P>
constexpr T GF_Sub (T X, T Y)
{
T res = X - Y;
return res + (res>X)*P; // res>X? res+P : res
}
template <typename T, T P>
constexpr T GF_Add (T X, T Y)
{
return GF_Sub<T,P> (X, P-Y);
}
#if __GNUC__ && defined(MY_CPU_64BIT)
// Alternative GF_Mul64 implementation for 64-bit GCC
template<> struct Double<uint64_t> {typedef uint128_t T;};
// 4x wider type to hold intermediate MUL results
template<typename Type> struct Quadruple {};
template<> struct Quadruple<uint32_t> {typedef uint128_t T;};
// The binary logarithm, rounded down
template <typename T>
static constexpr int trunc_log2 (T x)
{return x<=1? 0 : 1+trunc_log2(x/2);}
template <typename T, T P>
constexpr T GF_Mul64 (T X, T Y)
{
using DoubleT = typename Double<T>::T;
using QuadT = typename Quadruple<T>::T;
// See chapter "16.9 Division" in the http://www.agner.org/optimize/optimizing_assembly.pdf
constexpr int BITS = trunc_log2(P) + 8*sizeof(DoubleT);
constexpr QuadT invP2 = (QuadT(2) << BITS) / P; // double the invP value
constexpr DoubleT invP = (invP2+1) / 2; // rounded invP value
constexpr DoubleT extra = 1 - (invP2 & 1); // 1 if invP was rounded down, 0 otherwise
DoubleT res = DoubleT(X)*Y;
res -= DoubleT(((res+extra)*QuadT(invP)) >> BITS) * P;
return T(res);
}
#elif defined(MSVC_ONLY) && defined(MY_CPU_64BIT)
// Alternative GF_Mul64 implementation made with MSVC intrinsics
template <typename T, T P>
constexpr T GF_Mul64 (T X, T Y)
{
using DoubleT = typename Double<T>::T;
constexpr DoubleT estInvP = ((DoubleT(1)<<63) / P) << 1;
constexpr DoubleT invP = (estInvP*P > (estInvP+1)*P? estInvP : estInvP+1);
DoubleT res = DoubleT(X)*Y;
res -= __umulh(res,invP) * P;
return T(res>=P? res-P : res);
}
#else
// GF_Mul64 is optimized for 64-bit CPUs
template <typename T, T P>
constexpr T GF_Mul64 (T X, T Y)
{
using DoubleT = typename Double<T>::T;
return T((DoubleT(X)*Y) % P);
}
#endif
// GF_Mul32 is optimized for 32-bit CPUs, SIMD and GPUs
// It can vectorized by GCC 6 and Clang 4.0, but only for SSE>=4.2 (since 64-bit comparison "res>=P" is translated to PCMPGTQ)
template <typename T, T P>
constexpr T GF_Mul32 (T X, T Y)
{
using DoubleT = typename Double<T>::T;
// invP32 := (2**64)/P - 2**32 : if 2**31<P<2**32, then 2**32 < (2**64)/P < 2**33, and invP32 is a 32-bit value
const DoubleT estInvP = ((DoubleT(1)<<63) / P) << 1; // == invP & (~1)
const T invP32 = T(estInvP*P > (estInvP+1)*P? estInvP : estInvP+1); // we can't use 1<<64 for exact invP computation so we add the posible 1 in other way
DoubleT res = DoubleT(X)*Y;
res -= ((res + (res>>32)*invP32) >> 32) * P; // The same as res -= ((res*invP) >> 64) * P, where invP = (2**64)/P, but optimized for 32-bit computations
#if (SIMD < SSE2)
return T(res>=P? res-P : res); // optimized for scalar/GPU code
#else
return T(res-P) + (T((res-P)>>32) & P); // optimized for SIMD code
// return T(res) - (T((P-res)>>32)&P); // alternative SIMD-optimized code
#endif
}
#if defined(MY_CPU_64BIT) && SIMD<SSE2
#define GF_Mul GF_Mul64
#else
#define GF_Mul GF_Mul32
#endif
/***********************************************************************************************************************
*** Optimized operations for various P *********************************************************************************
************************************************************************************************************************/
#if 0 && defined(MSVC_ONLY) && defined(MY_CPU_64BIT)
// Optimized operations for P=0xFFF00001 (or any other 32-bit P)
template <> constexpr uint32_t GF_Add<uint32_t,0xFFF00001> (uint32_t X, uint32_t Y)
{
uint32_t res, temp;
auto carry = _addcarry_u32(0,X,Y,&temp);
return temp - (carry? 0xFFF00001 : 0);
}
template <> constexpr uint32_t GF_Sub<uint32_t,0xFFF00001> (uint32_t X, uint32_t Y)
{
uint32_t res, temp;
auto carry = _subborrow_u32(0,X,Y,&temp);
return temp + (carry? 0xFFF00001 : 0);
}
#endif
// Optimized operations for P=0x10001
template <> constexpr uint32_t GF_Add<uint32_t,0x10001> (uint32_t X, uint32_t Y)
{
uint32_t res = X+Y;
return res - (res>=0x10001)*0x10001; // res>P? res-P : res
}
template <> constexpr uint32_t GF_Sub<uint32_t,0x10001> (uint32_t X, uint32_t Y)
{
uint32_t res = X-Y;
return res + (int32_t(res)<0)*0x10001; // res<0? res+P : res
}
template <> constexpr uint32_t GF_Mul<uint32_t,0x10001> (uint32_t X, uint32_t Y)
{
uint32_t res = X*Y;
if (unlikely(res==0) && X && Y)
return 1; // 65536*65536
res = (res&0xFFFF) - (res>>16);
#ifdef MY_CPU_64BIT
return res + (int32_t(res)<0)*0x10001; // faster for gcc64
#else
return int32_t(res)<0? res+0x10001 : res; // faster for gcc32
#endif
}
// Optimized operations for P=0xFFFFFFFF
// Note: finally data should be normalized, i.e. 0xFFFFFFFF mapped to 0
template <> constexpr uint32_t GF_Add<uint32_t,0xFFFFFFFF> (uint32_t X, uint32_t Y)
{
uint64_t res = uint64_t(X)+Y;
return uint32_t(res) + uint32_t(res>>32);
}
template <> constexpr uint32_t GF_Sub<uint32_t,0xFFFFFFFF> (uint32_t X, uint32_t Y)
{
return GF_Add<uint32_t,0xFFFFFFFF> (X,~Y);
}
template <> constexpr uint32_t GF_Mul<uint32_t,0xFFFFFFFF> (uint32_t X, uint32_t Y)
{
uint64_t res = uint64_t(X) * Y;
res = (res&0xFFFFFFFF) + (res>>32);
return uint32_t(res) + uint32_t(res>>32);
}
// Optimized operations for P=0xFFFFFFFFFFFFFFFF
// Note: finally data should be normalized, i.e. 0xFFFFFFFFFFFFFFFF mapped to 0
#if __GNUC__ && defined(MY_CPU_64BIT)
template <> constexpr uint64_t GF_Add<uint64_t,0xFFFFFFFFFFFFFFFF> (uint64_t X, uint64_t Y)
{
uint64_t res = X+Y;
return res + (res<X);
}
template <> constexpr uint64_t GF_Sub<uint64_t,0xFFFFFFFFFFFFFFFF> (uint64_t X, uint64_t Y)
{
uint64_t res = X-Y;
return res - (res>X);
}
template <> constexpr uint64_t GF_Mul<uint64_t,0xFFFFFFFFFFFFFFFF> (uint64_t X, uint64_t Y)
{
uint128_t res = uint128_t(X) * Y;
uint64_t a = res, b = res>>64;
uint64_t c = a+b;
return c + (c<a);
}
#elif defined(MSVC_ONLY) && defined(MY_CPU_64BIT)
template <> constexpr uint64_t GF_Add<uint64_t,0xFFFFFFFFFFFFFFFF> (uint64_t X, uint64_t Y)
{
uint64_t res, temp;
auto carry = _addcarry_u64(0,X,Y,&temp);
_addcarry_u64(carry,temp,0,&res);
return res;
}
template <> constexpr uint64_t GF_Sub<uint64_t,0xFFFFFFFFFFFFFFFF> (uint64_t X, uint64_t Y)
{
uint64_t res, temp;
auto carry = _subborrow_u64(0,X,Y,&temp);
_subborrow_u64(carry,temp,0,&res);
return res;
}
template <> constexpr uint64_t GF_Mul<uint64_t,0xFFFFFFFFFFFFFFFF> (uint64_t X, uint64_t Y)
{
uint64_t a, b = _umul128(X,Y,&a), res, temp;
auto carry = _addcarry_u64(0,a,b,&temp);
_addcarry_u64(carry,temp,0,&res);
return res;
}
#endif
/***********************************************************************************************************************
*** Extra operations in GF(P) ******************************************************************************************
************************************************************************************************************************/
template <typename T, T P>
constexpr T GF_Pow (T X, T N)
{
T res = 1;
for ( ; N; N/=2)
{
if (N&1) res = GF_Mul<T,P> (res,X);
X = GF_Mul<T,P> (X,X);
}
return res;
}
// Some primary root of 1 of power N
template <typename T, T P>
constexpr T GF_Root (T N)
{
static_assert (P==0xFFF00001 || P==0x10001, "Only GF(0xFFF00001) and GF(0x10001) are supported by generic GF_Root");
T main_root = (P==0x10001? 3 : 19); // root of power P-1 in the GF(P)
//assert ((P-1) % N == 0);
return GF_Pow<T,P> (main_root, (P-1) / N);
}
template <> constexpr uint32_t GF_Root<uint32_t,0xFFFFFFFF> (uint32_t N)
{
uint32_t main_root = 7; // root of power 65536 in Z/mZ(0xFFFFFFFF)
//assert (65536 % N == 0);
return GF_Pow<uint32_t,0xFFFFFFFF> (main_root, 65536 / N);
}
#ifdef MY_CPU_64BIT
template <> constexpr uint64_t GF_Root<uint64_t,0xFFFFFFFFFFFFFFFF> (uint64_t N)
{
uint64_t main_root = 7; // root of power 2^16*3*5*17449 in Z/mZ(0xFFFFFFFFFFFFFFFF)
//assert (uint64_t(65536)*3*5*17449) % N == 0);
return GF_Pow<uint64_t,0xFFFFFFFFFFFFFFFF> (main_root, (uint64_t(65536)*3*5*17449) / N);
}
#endif
template <typename T, T P>
constexpr T GF_Inv (T X)
{
return GF_Pow<T,P> (X, P==0xFFFFFFFFFFFFFFFF? 0xFFFFFFFF : P==0xFFFFFFFF? 0xFFFF : P-2);
}
template <typename T, T P>
constexpr T GF_Div (T X, T Y)
{
T InvY = GF_Inv<T,P> (Y);
return GF_Mul<T,P> (X, InvY);
}
// Normalize value, i.e. return X%P
// Required after optimized operations with P=2^32-1 or 2^64-1
template <typename T, T P>
constexpr T GF_Normalize (T X)
{
return X % P;
}