curve25519-donna-sse2.h
1 /*
2  Public domain by Andrew M. <liquidsun@gmail.com>
3  See: https://github.com/floodyberry/curve25519-donna
4 
5  SSE2 curve25519 implementation
6 */
7 
8 #include <emmintrin.h>
9 typedef __m128i xmmi;
10 
11 typedef union packedelem8_t {
12  unsigned char u[16];
13  xmmi v;
14 } packedelem8;
15 
16 typedef union packedelem32_t {
17  uint32_t u[4];
18  xmmi v;
19 } packedelem32;
20 
21 typedef union packedelem64_t {
22  uint64_t u[2];
23  xmmi v;
24 } packedelem64;
25 
26 /* 10 elements + an extra 2 to fit in 3 xmm registers */
27 typedef uint32_t bignum25519[12];
28 typedef packedelem32 packed32bignum25519[5];
29 typedef packedelem64 packed64bignum25519[10];
30 
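/* Illustrative note (not part of the original file): bignum25519 stores a field
   element mod p = 2^255 - 19 in ten limbs of alternating 26/25-bit widths, so
   limb i carries bits starting at offset ceil(25.5 * i). A hypothetical table of
   those offsets, matching the shifts used by curve25519_expand later in this
   file: */
static const int bignum25519_limb_offsets_sketch[10] = {
 0, 26, 51, 77, 102, 128, 153, 179, 204, 230
};
/* value(f) = sum(f[i] << offsets[i]) mod 2^255 - 19; limbs 10 and 11 are zero
   padding so the element fills exactly three xmm registers. */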
31 static const packedelem32 bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}};
32 static const packedelem32 top32bitmask = {{0x00000000, 0xffffffff, 0x00000000, 0xffffffff}};
33 static const packedelem32 top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}};
34 static const packedelem32 bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}};
35 
36 /* reduction masks */
37 static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}};
38 static const packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}};
39 static const packedelem32 packedmask2625 = {{0x3ffffff,0,0x1ffffff,0}};
40 static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}};
41 static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}};
42 
43 /* multipliers */
44 static const packedelem64 packednineteen = {{19, 19}};
45 static const packedelem64 packednineteenone = {{19, 1}};
46 static const packedelem64 packedthirtyeight = {{38, 38}};
47 static const packedelem64 packed3819 = {{19*2,19}};
48 static const packedelem64 packed9638 = {{19*4,19*2}};
49 
50 /* 121666,121665 */
51 static const packedelem64 packed121666121665 = {{121666, 121665}};
52 
53 /* 2*(2^255 - 19) = 0 mod p */
54 static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}};
55 static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}};
56 static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}};
57 
58 static const packedelem32 packed32packed2p0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}};
59 static const packedelem32 packed32packed2p1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}};
60 
61 /* 4*(2^255 - 19) = 0 mod p */
62 static const packedelem32 packed4p0 = {{0xfffffb4,0x7fffffc,0xffffffc,0x7fffffc}};
63 static const packedelem32 packed4p1 = {{0xffffffc,0x7fffffc,0xffffffc,0x7fffffc}};
64 static const packedelem32 packed4p2 = {{0xffffffc,0x7fffffc,0x0000000,0x0000000}};
65 
66 static const packedelem32 packed32packed4p0 = {{0xfffffb4,0xfffffb4,0x7fffffc,0x7fffffc}};
67 static const packedelem32 packed32packed4p1 = {{0xffffffc,0xffffffc,0x7fffffc,0x7fffffc}};
68 
69 /* out = in */
70 DONNA_INLINE static void
71 curve25519_copy(bignum25519 out, const bignum25519 in) {
72  xmmi x0,x1,x2;
73  x0 = _mm_load_si128((xmmi*)in + 0);
74  x1 = _mm_load_si128((xmmi*)in + 1);
75  x2 = _mm_load_si128((xmmi*)in + 2);
76  _mm_store_si128((xmmi*)out + 0, x0);
77  _mm_store_si128((xmmi*)out + 1, x1);
78  _mm_store_si128((xmmi*)out + 2, x2);
79 }
80 
81 /* out = a + b */
82 DONNA_INLINE static void
83 curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
84  xmmi a0,a1,a2,b0,b1,b2;
85  a0 = _mm_load_si128((xmmi*)a + 0);
86  a1 = _mm_load_si128((xmmi*)a + 1);
87  a2 = _mm_load_si128((xmmi*)a + 2);
88  b0 = _mm_load_si128((xmmi*)b + 0);
89  b1 = _mm_load_si128((xmmi*)b + 1);
90  b2 = _mm_load_si128((xmmi*)b + 2);
91  a0 = _mm_add_epi32(a0, b0);
92  a1 = _mm_add_epi32(a1, b1);
93  a2 = _mm_add_epi32(a2, b2);
94  _mm_store_si128((xmmi*)out + 0, a0);
95  _mm_store_si128((xmmi*)out + 1, a1);
96  _mm_store_si128((xmmi*)out + 2, a2);
97 }
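/* Illustrative note (not part of the original file): this variant adds limbs
   without carrying, which is safe when the sum immediately feeds a routine that
   tolerates un-normalized limbs (e.g. a multiply). curve25519_add_reduce below
   is the carried variant; the curve25519_add_after_basic alias that follows
   maps onto it for operands that may already be one reduction wide. */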
98 
99 #define curve25519_add_after_basic curve25519_add_reduce
100 DONNA_INLINE static void
101 curve25519_add_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
102  xmmi a0,a1,a2,b0,b1,b2;
103  xmmi c1,c2,c3;
104  xmmi r0,r1,r2,r3,r4,r5;
105 
106  a0 = _mm_load_si128((xmmi*)a + 0);
107  a1 = _mm_load_si128((xmmi*)a + 1);
108  a2 = _mm_load_si128((xmmi*)a + 2);
109  b0 = _mm_load_si128((xmmi*)b + 0);
110  b1 = _mm_load_si128((xmmi*)b + 1);
111  b2 = _mm_load_si128((xmmi*)b + 2);
112  a0 = _mm_add_epi32(a0, b0);
113  a1 = _mm_add_epi32(a1, b1);
114  a2 = _mm_add_epi32(a2, b2);
115 
116  r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
117  r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
118  r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
119  r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
120  r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
121  r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
122 
123  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
124  c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
125  c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
126  c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
127  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
128 
129  _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
130  _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
131  _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
132 }
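/* Scalar model of the SIMD carry chain above (an illustrative sketch, not part
   of the original file): overflow ripples limb i -> limb i+1 with alternating
   26/25-bit masks, and the carry out of limb 9 is folded back into limb 0
   multiplied by 19, since 2^255 = 19 (mod 2^255 - 19). The vector code runs the
   same chain two limb-pairs at a time, which is why the top carry is multiplied
   by packednineteen and unpacked back onto r0. */
static void curve25519_carry_scalar_sketch(uint32_t f[10]) {
 uint64_t t;
 uint32_t c = 0;
 int i;
 for (i = 0; i < 10; i++) {
  int bits = (i & 1) ? 25 : 26; /* even limbs hold 26 bits, odd limbs 25 */
  t = (uint64_t)f[i] + c;
  c = (uint32_t)(t >> bits);
  f[i] = (uint32_t)t & ((1u << bits) - 1);
 }
 f[0] += 19 * c;                     /* wrap the top carry: 2^255 = 19 mod p */
 c = f[0] >> 26; f[0] &= 0x3ffffff;
 f[1] += c;                          /* one more step; f[1] may hold one extra
                                        carry bit, as in the vector version */
}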
133 
134 DONNA_INLINE static void
135 curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
136  xmmi a0,a1,a2,b0,b1,b2;
137  xmmi c1,c2;
138  xmmi r0,r1;
139 
140  a0 = _mm_load_si128((xmmi*)a + 0);
141  a1 = _mm_load_si128((xmmi*)a + 1);
142  a2 = _mm_load_si128((xmmi*)a + 2);
143  a0 = _mm_add_epi32(a0, packed2p0.v);
144  a1 = _mm_add_epi32(a1, packed2p1.v);
145  a2 = _mm_add_epi32(a2, packed2p2.v);
146  b0 = _mm_load_si128((xmmi*)b + 0);
147  b1 = _mm_load_si128((xmmi*)b + 1);
148  b2 = _mm_load_si128((xmmi*)b + 2);
149  a0 = _mm_sub_epi32(a0, b0);
150  a1 = _mm_sub_epi32(a1, b1);
151  a2 = _mm_sub_epi32(a2, b2);
152 
153  r0 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(2,2,0,0)), bot32bitmask.v);
154  r1 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3,3,1,1)), bot32bitmask.v);
155 
156  c1 = _mm_srli_epi32(r0, 26);
157  c2 = _mm_srli_epi32(r1, 25);
158  r0 = _mm_and_si128(r0, packedmask26.v);
159  r1 = _mm_and_si128(r1, packedmask25.v);
160  r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
161  r1 = _mm_add_epi32(r1, c1);
162 
163  a0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
164  a1 = _mm_add_epi32(a1, _mm_srli_si128(c2, 8));
165 
166  _mm_store_si128((xmmi*)out + 0, a0);
167  _mm_store_si128((xmmi*)out + 1, a1);
168  _mm_store_si128((xmmi*)out + 2, a2);
169 }
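/* Illustrative note (not from the original source): limbs are subtracted
   independently, so a[i] - b[i] could underflow. Adding a limb-wise encoding of
   2p first keeps every limb non-negative without changing the value mod p:

       a - b = (a + 2p) - b  (mod p),  2p = 2^256 - 38

   packed2p0/1/2 hold 2p in the interleaved limb layout: 2*(2^26 - 19) for
   limb 0 and 2*(2^25 - 1) or 2*(2^26 - 1) for the rest. curve25519_sub_after_basic
   below uses 4p (packed4p*) because its inputs may be one reduction looser. */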
170 
171 DONNA_INLINE static void
172 curve25519_sub_after_basic(bignum25519 out, const bignum25519 a, const bignum25519 b) {
173  xmmi a0,a1,a2,b0,b1,b2;
174  xmmi c1,c2,c3;
175  xmmi r0,r1,r2,r3,r4,r5;
176 
177  a0 = _mm_load_si128((xmmi*)a + 0);
178  a1 = _mm_load_si128((xmmi*)a + 1);
179  a2 = _mm_load_si128((xmmi*)a + 2);
180  a0 = _mm_add_epi32(a0, packed4p0.v);
181  a1 = _mm_add_epi32(a1, packed4p1.v);
182  a2 = _mm_add_epi32(a2, packed4p2.v);
183  b0 = _mm_load_si128((xmmi*)b + 0);
184  b1 = _mm_load_si128((xmmi*)b + 1);
185  b2 = _mm_load_si128((xmmi*)b + 2);
186  a0 = _mm_sub_epi32(a0, b0);
187  a1 = _mm_sub_epi32(a1, b1);
188  a2 = _mm_sub_epi32(a2, b2);
189 
190  r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
191  r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
192  r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
193  r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
194  r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
195  r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
196 
197  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
198  c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
199  c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
200  c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
201  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
202 
203  _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
204  _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
205  _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
206 }
207 
208 DONNA_INLINE static void
209 curve25519_sub_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
210  xmmi a0,a1,a2,b0,b1,b2;
211  xmmi c1,c2,c3;
212  xmmi r0,r1,r2,r3,r4,r5;
213 
214  a0 = _mm_load_si128((xmmi*)a + 0);
215  a1 = _mm_load_si128((xmmi*)a + 1);
216  a2 = _mm_load_si128((xmmi*)a + 2);
217  a0 = _mm_add_epi32(a0, packed2p0.v);
218  a1 = _mm_add_epi32(a1, packed2p1.v);
219  a2 = _mm_add_epi32(a2, packed2p2.v);
220  b0 = _mm_load_si128((xmmi*)b + 0);
221  b1 = _mm_load_si128((xmmi*)b + 1);
222  b2 = _mm_load_si128((xmmi*)b + 2);
223  a0 = _mm_sub_epi32(a0, b0);
224  a1 = _mm_sub_epi32(a1, b1);
225  a2 = _mm_sub_epi32(a2, b2);
226 
227  r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
228  r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
229  r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
230  r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
231  r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
232  r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
233 
234  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
235  c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
236  c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
237  c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
238  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
239 
240  _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
241  _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
242  _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
243 }
244 
245 
246 DONNA_INLINE static void
247 curve25519_neg(bignum25519 out, const bignum25519 b) {
248  xmmi a0,a1,a2,b0,b1,b2;
249  xmmi c1,c2,c3;
250  xmmi r0,r1,r2,r3,r4,r5;
251 
252  a0 = packed2p0.v;
253  a1 = packed2p1.v;
254  a2 = packed2p2.v;
255  b0 = _mm_load_si128((xmmi*)b + 0);
256  b1 = _mm_load_si128((xmmi*)b + 1);
257  b2 = _mm_load_si128((xmmi*)b + 2);
258  a0 = _mm_sub_epi32(a0, b0);
259  a1 = _mm_sub_epi32(a1, b1);
260  a2 = _mm_sub_epi32(a2, b2);
261 
262  r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
263  r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
264  r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
265  r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
266  r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
267  r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
268 
269  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
270  c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
271  c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
272  c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
273  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
274 
275  _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
276  _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
277  _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
278 }
279 
280 
281 /* Multiply two numbers: out = r * s */
282 static void
283 curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
284  xmmi m01,m23,m45,m67,m89;
285  xmmi m0123,m4567;
286  xmmi s0123,s4567;
287  xmmi s01,s23,s45,s67,s89;
288  xmmi s12,s34,s56,s78,s9;
289  xmmi r0,r2,r4,r6,r8;
290  xmmi r1,r3,r5,r7,r9;
291  xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
292  xmmi c1,c2,c3;
293 
294  s0123 = _mm_load_si128((xmmi*)s + 0);
295  s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
296  s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
297  s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
298  s4567 = _mm_load_si128((xmmi*)s + 1);
299  s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
300  s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
301  s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
302  s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
303  s89 = _mm_load_si128((xmmi*)s + 2);
304  s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
305  s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
306  s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));
307 
308  r0 = _mm_load_si128((xmmi*)r + 0);
309  r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
310  r1 = _mm_add_epi64(r1, _mm_and_si128(r1, top64bitmask.v));
311  r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
312  r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
313  r3 = _mm_add_epi64(r3, _mm_and_si128(r3, top64bitmask.v));
314  r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
315  r4 = _mm_load_si128((xmmi*)r + 1);
316  r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
317  r5 = _mm_add_epi64(r5, _mm_and_si128(r5, top64bitmask.v));
318  r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
319  r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
320  r7 = _mm_add_epi64(r7, _mm_and_si128(r7, top64bitmask.v));
321  r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
322  r8 = _mm_load_si128((xmmi*)r + 2);
323  r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
324  r9 = _mm_add_epi64(r9, _mm_and_si128(r9, top64bitmask.v));
325  r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));
326 
327  m01 = _mm_mul_epu32(r1,s01);
328  m23 = _mm_mul_epu32(r1,s23);
329  m45 = _mm_mul_epu32(r1,s45);
330  m67 = _mm_mul_epu32(r1,s67);
331  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
332  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
333  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
334  m89 = _mm_mul_epu32(r1,s89);
335  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
336  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
337  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
338  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
339  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
340  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
341  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));
342 
343  /* shift up */
344  m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
345  m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
346  m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
347  m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
348  m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));
349 
350  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
351  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
352  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
353  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
354  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
355  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
356  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
357  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
358  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
359  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
360  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
361  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
362  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
363  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
364  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));
365 
366  r219 = _mm_mul_epu32(r2, packednineteen.v);
367  r419 = _mm_mul_epu32(r4, packednineteen.v);
368  r619 = _mm_mul_epu32(r6, packednineteen.v);
369  r819 = _mm_mul_epu32(r8, packednineteen.v);
370  r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
371  r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
372  r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
373  r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
374  r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);
375 
376  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
377  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
378  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
379  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
380  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
381  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
382  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
383  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
384  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
385  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
386  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
387  m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
388  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
389  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
390  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
391  m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
392  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
393  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
394  m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
395  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
396  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
397  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
398  m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
399  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
400  m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));
401 
402  r0 = _mm_unpacklo_epi64(m01, m45);
403  r1 = _mm_unpackhi_epi64(m01, m45);
404  r2 = _mm_unpacklo_epi64(m23, m67);
405  r3 = _mm_unpackhi_epi64(m23, m67);
406  r4 = _mm_unpacklo_epi64(m89, m89);
407  r5 = _mm_unpackhi_epi64(m89, m89);
408 
409  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
410  c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
411  c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
412  c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
413  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
414 
415  m0123 = _mm_unpacklo_epi32(r0, r1);
416  m4567 = _mm_unpackhi_epi32(r0, r1);
417  m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
418  m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
419  m89 = _mm_unpackhi_epi32(r4, r5);
420 
421  _mm_store_si128((xmmi*)out + 0, m0123);
422  _mm_store_si128((xmmi*)out + 1, m4567);
423  _mm_store_si128((xmmi*)out + 2, m89);
424 }
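/* Illustrative note (not from the original source): this is a schoolbook
   product in the 26/25-bit limb basis. Writing c(i,j) = 2 when i and j are both
   odd (two 25-bit limbs meet, so their offsets sum one bit short) and 1
   otherwise, each output limb is

       out[k] = sum_{i+j=k} c(i,j)*r[i]*s[j]
              + 19 * sum_{i+j=k+10} c(i,j)*r[i]*s[j]

   since 2^255 = 19 (mod p). Doubling the upper copies of r1,r3,r5,r7,r9 via
   top64bitmask, and pre-multiplying the r*19 terms, implements exactly these
   c(i,j) factors and the 19 wrap. */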
425 
426 DONNA_NOINLINE static void
427 curve25519_mul_noinline(bignum25519 out, const bignum25519 r, const bignum25519 s) {
428  curve25519_mul(out, r, s);
429 }
430 
431 #define curve25519_square(r, n) curve25519_square_times(r, n, 1)
432 static void
433 curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
434  xmmi m01,m23,m45,m67,m89;
435  xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
436  xmmi r0a,r1a,r2a,r3a,r7a,r9a;
437  xmmi r0123,r4567;
438  xmmi r01,r23,r45,r67,r6x,r89,r8x;
439  xmmi r12,r34,r56,r78,r9x;
440  xmmi r5619;
441  xmmi c1,c2,c3;
442 
443  r0123 = _mm_load_si128((xmmi*)in + 0);
444  r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
445  r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
446  r4567 = _mm_load_si128((xmmi*)in + 1);
447  r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
448  r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
449  r89 = _mm_load_si128((xmmi*)in + 2);
450  r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));
451 
452  do {
453  r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
454  r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
455  r0 = _mm_add_epi64(r0, _mm_and_si128(r0, top64bitmask.v));
456  r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
457  r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
458  r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
459  r2 = _mm_add_epi64(r2, _mm_and_si128(r2, top64bitmask.v));
460  r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
461  r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
462  r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
463  r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
464  r4 = _mm_add_epi64(r4, _mm_and_si128(r4, top64bitmask.v));
465  r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
466  r5619 = _mm_mul_epu32(r56, packednineteen.v);
467  r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
468  r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
469  r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
470  r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
471  r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
472  r7 = _mm_mul_epu32(r7, packed3819.v);
473  r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
474  r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
475  r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
476  r8 = _mm_mul_epu32(r8, packednineteen.v);
477  r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
478  r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
479  r9 = _mm_mul_epu32(r9, packed3819.v);
480  r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));
481 
482  m01 = _mm_mul_epu32(r01, r0);
483  m23 = _mm_mul_epu32(r23, r0a);
484  m45 = _mm_mul_epu32(r45, r0a);
485  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
486  r23 = _mm_slli_epi32(r23, 1);
487  m67 = _mm_mul_epu32(r67, r0a);
488  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
489  m89 = _mm_mul_epu32(r89, r0a);
490  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
491  r67 = _mm_slli_epi32(r67, 1);
492  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
493  r45 = _mm_slli_epi32(r45, 1);
494 
495  r1 = _mm_slli_epi32(r1, 1);
496  r3 = _mm_slli_epi32(r3, 1);
497  r1a = _mm_add_epi64(r1, _mm_and_si128(r1, bot64bitmask.v));
498  r3a = _mm_add_epi64(r3, _mm_and_si128(r3, bot64bitmask.v));
499 
500  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
501  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
502  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
503  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
504  r34 = _mm_slli_epi32(r34, 1);
505  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
506  r78 = _mm_slli_epi32(r78, 1);
507  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
508  r56 = _mm_slli_epi32(r56, 1);
509 
510  m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
511  m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
512  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
513  m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
514  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
515  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
516  m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
517  m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
518  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
519  m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
520  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
521  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
522  m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
523  m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
524  m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));
525 
526  r0 = _mm_unpacklo_epi64(m01, m45);
527  r1 = _mm_unpackhi_epi64(m01, m45);
528  r2 = _mm_unpacklo_epi64(m23, m67);
529  r3 = _mm_unpackhi_epi64(m23, m67);
530  r4 = _mm_unpacklo_epi64(m89, m89);
531  r5 = _mm_unpackhi_epi64(m89, m89);
532 
533  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
534  c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
535  c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
536  c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
537  c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
538 
539  r01 = _mm_unpacklo_epi64(r0, r1);
540  r45 = _mm_unpackhi_epi64(r0, r1);
541  r23 = _mm_unpacklo_epi64(r2, r3);
542  r67 = _mm_unpackhi_epi64(r2, r3);
543  r89 = _mm_unpackhi_epi64(r4, r5);
544  } while (--count);
545 
546  r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
547  r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
548  r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
549  r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
550  r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));
551 
552  _mm_store_si128((xmmi*)r + 0, r0123);
553  _mm_store_si128((xmmi*)r + 1, r4567);
554  _mm_store_si128((xmmi*)r + 2, r89);
555 }
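/* Usage sketch (hypothetical variables, not from the original source): repeated
   squaring is the workhorse of the inversion/exponentiation ladders built on
   top of this file, e.g.

    bignum25519 ALIGN(16) t;
    curve25519_square_times(t, x, 5);   t = x^(2^5)
    curve25519_square(t, t);            t = x^(2^6), via the count=1 macro above
*/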
556 
557 DONNA_INLINE static void
558 curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
559  xmmi x0,x1,x2,z0,z1,z2;
560 
561  x0 = _mm_load_si128((xmmi *)(x + 0));
562  x1 = _mm_load_si128((xmmi *)(x + 4));
563  x2 = _mm_load_si128((xmmi *)(x + 8));
564  z0 = _mm_load_si128((xmmi *)(z + 0));
565  z1 = _mm_load_si128((xmmi *)(z + 4));
566  z2 = _mm_load_si128((xmmi *)(z + 8));
567 
568  out[0].v = _mm_unpacklo_epi32(x0, z0);
569  out[1].v = _mm_unpackhi_epi32(x0, z0);
570  out[2].v = _mm_unpacklo_epi32(x1, z1);
571  out[3].v = _mm_unpackhi_epi32(x1, z1);
572  out[4].v = _mm_unpacklo_epi32(x2, z2);
573 }
574 
575 DONNA_INLINE static void
576 curve25519_untangle32(bignum25519 x, bignum25519 z, const packedelem32 *in) {
577  xmmi t0,t1,t2,t3,t4,zero;
578 
579  t0 = _mm_shuffle_epi32(in[0].v, _MM_SHUFFLE(3,1,2,0));
580  t1 = _mm_shuffle_epi32(in[1].v, _MM_SHUFFLE(3,1,2,0));
581  t2 = _mm_shuffle_epi32(in[2].v, _MM_SHUFFLE(3,1,2,0));
582  t3 = _mm_shuffle_epi32(in[3].v, _MM_SHUFFLE(3,1,2,0));
583  t4 = _mm_shuffle_epi32(in[4].v, _MM_SHUFFLE(3,1,2,0));
584  zero = _mm_setzero_si128();
585  _mm_store_si128((xmmi *)x + 0, _mm_unpacklo_epi64(t0, t1));
586  _mm_store_si128((xmmi *)x + 1, _mm_unpacklo_epi64(t2, t3));
587  _mm_store_si128((xmmi *)x + 2, _mm_unpacklo_epi64(t4, zero));
588  _mm_store_si128((xmmi *)z + 0, _mm_unpackhi_epi64(t0, t1));
589  _mm_store_si128((xmmi *)z + 1, _mm_unpackhi_epi64(t2, t3));
590  _mm_store_si128((xmmi *)z + 2, _mm_unpackhi_epi64(t4, zero));
591 }
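/* Illustrative note (not from the original source): "tangling" interleaves the
   limbs of two field elements x and z so that each lane pair holds (x[i], z[i]).
   The packed32/packed64 routines below then add, subtract, multiply and square
   both halves of a Montgomery-ladder (x : z) pair with a single stream of SSE2
   instructions; untangle32/untangle64 restore the flat bignum25519 layout. */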
592 
593 DONNA_INLINE static void
594 curve25519_add_reduce_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
595  xmmi r0,r1,r2,r3,r4;
596  xmmi s0,s1,s2,s3,s4,s5;
597  xmmi c1,c2;
598 
599  r0 = _mm_add_epi32(r[0].v, s[0].v);
600  r1 = _mm_add_epi32(r[1].v, s[1].v);
601  r2 = _mm_add_epi32(r[2].v, s[2].v);
602  r3 = _mm_add_epi32(r[3].v, s[3].v);
603  r4 = _mm_add_epi32(r[4].v, s[4].v);
604 
605  s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
606  s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
607  s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
608  s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
609  s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
610  s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */
611 
612  c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
613  c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
614  c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
615  c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
616  c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
617 
618  out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
619  out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
620  out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
621  out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
622  out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
623 }
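/* Illustrative note (not from the original source): the shift-add tree
   (c2 << 4) + (c2 << 1) + c2 computes 19*c2 without a multiply, folding the
   carry out of limb 9 back into limb 0 (2^255 = 19 mod p), while slli_si128
   routes the limb-3 carry into the limb-4 lanes of the same register. */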
624 
625 DONNA_INLINE static void
626 curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
627  out[0].v = _mm_add_epi32(r[0].v, s[0].v);
628  out[1].v = _mm_add_epi32(r[1].v, s[1].v);
629  out[2].v = _mm_add_epi32(r[2].v, s[2].v);
630  out[3].v = _mm_add_epi32(r[3].v, s[3].v);
631  out[4].v = _mm_add_epi32(r[4].v, s[4].v);
632 }
633 
634 DONNA_INLINE static void
635 curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
636  xmmi r0,r1,r2,r3,r4;
637  xmmi s0,s1,s2,s3;
638  xmmi c1,c2;
639 
640  r0 = _mm_add_epi32(r[0].v, packed32packed2p0.v);
641  r1 = _mm_add_epi32(r[1].v, packed32packed2p1.v);
642  r2 = _mm_add_epi32(r[2].v, packed32packed2p1.v);
643  r3 = _mm_add_epi32(r[3].v, packed32packed2p1.v);
644  r4 = _mm_add_epi32(r[4].v, packed32packed2p1.v);
645  r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
646  r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
647  r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
648  r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
649  r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
650 
651  s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
652  s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
653  s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
654  s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
655 
656  c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
657  c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));
658 
659  out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
660  out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
661  out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
662  out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
663  out[4].v = r4;
664 }
665 
666 DONNA_INLINE static void
667 curve25519_sub_after_basic_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
668  xmmi r0,r1,r2,r3,r4;
669  xmmi s0,s1,s2,s3,s4,s5;
670  xmmi c1,c2;
671 
672  r0 = _mm_add_epi32(r[0].v, packed32packed4p0.v);
673  r1 = _mm_add_epi32(r[1].v, packed32packed4p1.v);
674  r2 = _mm_add_epi32(r[2].v, packed32packed4p1.v);
675  r3 = _mm_add_epi32(r[3].v, packed32packed4p1.v);
676  r4 = _mm_add_epi32(r[4].v, packed32packed4p1.v);
677  r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
678  r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
679  r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
680  r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
681  r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
682 
683  s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
684  s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
685  s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
686  s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
687  s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
688  s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */
689 
690  c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
691  c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
692  c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
693  c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
694  c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
695 
696  out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
697  out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
698  out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
699  out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
700  out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
701 }
702 
703 DONNA_INLINE static void
704 curve25519_tangle64_from32(packedelem64 *a, packedelem64 *b, const packedelem32 *c, const packedelem32 *d) {
705  xmmi c0,c1,c2,c3,c4,c5,t;
706  xmmi d0,d1,d2,d3,d4,d5;
707  xmmi t0,t1,t2,t3,t4,zero;
708 
709  t0 = _mm_shuffle_epi32(c[0].v, _MM_SHUFFLE(3,1,2,0));
710  t1 = _mm_shuffle_epi32(c[1].v, _MM_SHUFFLE(3,1,2,0));
711  t2 = _mm_shuffle_epi32(d[0].v, _MM_SHUFFLE(3,1,2,0));
712  t3 = _mm_shuffle_epi32(d[1].v, _MM_SHUFFLE(3,1,2,0));
713  c0 = _mm_unpacklo_epi64(t0, t1);
714  c3 = _mm_unpackhi_epi64(t0, t1);
715  d0 = _mm_unpacklo_epi64(t2, t3);
716  d3 = _mm_unpackhi_epi64(t2, t3);
717  t = _mm_unpacklo_epi64(c0, d0); a[0].v = t; a[1].v = _mm_srli_epi64(t, 32);
718  t = _mm_unpackhi_epi64(c0, d0); a[2].v = t; a[3].v = _mm_srli_epi64(t, 32);
719  t = _mm_unpacklo_epi64(c3, d3); b[0].v = t; b[1].v = _mm_srli_epi64(t, 32);
720  t = _mm_unpackhi_epi64(c3, d3); b[2].v = t; b[3].v = _mm_srli_epi64(t, 32);
721 
722  t0 = _mm_shuffle_epi32(c[2].v, _MM_SHUFFLE(3,1,2,0));
723  t1 = _mm_shuffle_epi32(c[3].v, _MM_SHUFFLE(3,1,2,0));
724  t2 = _mm_shuffle_epi32(d[2].v, _MM_SHUFFLE(3,1,2,0));
725  t3 = _mm_shuffle_epi32(d[3].v, _MM_SHUFFLE(3,1,2,0));
726  c1 = _mm_unpacklo_epi64(t0, t1);
727  c4 = _mm_unpackhi_epi64(t0, t1);
728  d1 = _mm_unpacklo_epi64(t2, t3);
729  d4 = _mm_unpackhi_epi64(t2, t3);
730  t = _mm_unpacklo_epi64(c1, d1); a[4].v = t; a[5].v = _mm_srli_epi64(t, 32);
731  t = _mm_unpackhi_epi64(c1, d1); a[6].v = t; a[7].v = _mm_srli_epi64(t, 32);
732  t = _mm_unpacklo_epi64(c4, d4); b[4].v = t; b[5].v = _mm_srli_epi64(t, 32);
733  t = _mm_unpackhi_epi64(c4, d4); b[6].v = t; b[7].v = _mm_srli_epi64(t, 32);
734 
735  t4 = _mm_shuffle_epi32(c[4].v, _MM_SHUFFLE(3,1,2,0));
736  zero = _mm_setzero_si128();
737  c2 = _mm_unpacklo_epi64(t4, zero);
738  c5 = _mm_unpackhi_epi64(t4, zero);
739  t4 = _mm_shuffle_epi32(d[4].v, _MM_SHUFFLE(3,1,2,0));
740  d2 = _mm_unpacklo_epi64(t4, zero);
741  d5 = _mm_unpackhi_epi64(t4, zero);
742  t = _mm_unpacklo_epi64(c2, d2); a[8].v = t; a[9].v = _mm_srli_epi64(t, 32);
743  t = _mm_unpacklo_epi64(c5, d5); b[8].v = t; b[9].v = _mm_srli_epi64(t, 32);
744 }
745 
746 DONNA_INLINE static void
747 curve25519_tangle64(packedelem64 *out, const bignum25519 x, const bignum25519 z) {
748  xmmi x0,x1,x2,z0,z1,z2,t;
749 
750  x0 = _mm_load_si128((xmmi *)x + 0);
751  x1 = _mm_load_si128((xmmi *)x + 1);
752  x2 = _mm_load_si128((xmmi *)x + 2);
753  z0 = _mm_load_si128((xmmi *)z + 0);
754  z1 = _mm_load_si128((xmmi *)z + 1);
755  z2 = _mm_load_si128((xmmi *)z + 2);
756 
757  t = _mm_unpacklo_epi64(x0, z0); out[0].v = t; out[1].v = _mm_srli_epi64(t, 32);
758  t = _mm_unpackhi_epi64(x0, z0); out[2].v = t; out[3].v = _mm_srli_epi64(t, 32);
759  t = _mm_unpacklo_epi64(x1, z1); out[4].v = t; out[5].v = _mm_srli_epi64(t, 32);
760  t = _mm_unpackhi_epi64(x1, z1); out[6].v = t; out[7].v = _mm_srli_epi64(t, 32);
761  t = _mm_unpacklo_epi64(x2, z2); out[8].v = t; out[9].v = _mm_srli_epi64(t, 32);
762 }
763 
764 DONNA_INLINE static void
765 curve25519_tangleone64(packedelem64 *out, const bignum25519 x) {
766  xmmi x0,x1,x2;
767 
768  x0 = _mm_load_si128((xmmi *)(x + 0));
769  x1 = _mm_load_si128((xmmi *)(x + 4));
770  x2 = _mm_load_si128((xmmi *)(x + 8));
771 
772  out[0].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(0,0,0,0));
773  out[1].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(1,1,1,1));
774  out[2].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(2,2,2,2));
775  out[3].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(3,3,3,3));
776  out[4].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0,0,0,0));
777  out[5].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,1,1,1));
778  out[6].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2,2,2,2));
779  out[7].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(3,3,3,3));
780  out[8].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0,0,0,0));
781  out[9].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1,1,1,1));
782 }
783 
784 DONNA_INLINE static void
785 curve25519_swap64(packedelem64 *out) {
786  out[0].v = _mm_shuffle_epi32(out[0].v, _MM_SHUFFLE(1,0,3,2));
787  out[1].v = _mm_shuffle_epi32(out[1].v, _MM_SHUFFLE(1,0,3,2));
788  out[2].v = _mm_shuffle_epi32(out[2].v, _MM_SHUFFLE(1,0,3,2));
789  out[3].v = _mm_shuffle_epi32(out[3].v, _MM_SHUFFLE(1,0,3,2));
790  out[4].v = _mm_shuffle_epi32(out[4].v, _MM_SHUFFLE(1,0,3,2));
791  out[5].v = _mm_shuffle_epi32(out[5].v, _MM_SHUFFLE(1,0,3,2));
792  out[6].v = _mm_shuffle_epi32(out[6].v, _MM_SHUFFLE(1,0,3,2));
793  out[7].v = _mm_shuffle_epi32(out[7].v, _MM_SHUFFLE(1,0,3,2));
794  out[8].v = _mm_shuffle_epi32(out[8].v, _MM_SHUFFLE(1,0,3,2));
795  out[9].v = _mm_shuffle_epi32(out[9].v, _MM_SHUFFLE(1,0,3,2));
796 }
797 
798 DONNA_INLINE static void
799 curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
800  _mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
801  _mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
802  _mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v) );
803  _mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
804  _mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
805  _mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v) );
806 }
807 
808 DONNA_INLINE static void
809 curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
810  xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
811  xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
812  xmmi c1,c2;
813 
814  out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
815  out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
816  r1_2 = _mm_slli_epi32(r[1].v, 1);
817  out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
818  out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
819  r3_2 = _mm_slli_epi32(r[3].v, 1);
820  out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
821  out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
822  r5_2 = _mm_slli_epi32(r[5].v, 1);
823  out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
824  out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v))))))));
825  r7_2 = _mm_slli_epi32(r[7].v, 1);
826  out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
827  out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));
828 
829  r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
830  r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
831  r1_2 = _mm_slli_epi32(r1, 1);
832  r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
833  r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
834  r3_2 = _mm_slli_epi32(r3, 1);
835  r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
836  r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
837  r5_2 = _mm_slli_epi32(r5, 1);
838  r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
839  r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
840  r7_2 = _mm_slli_epi32(r7, 1);
841  r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
842  r9_2 = _mm_slli_epi32(r9, 1);
843 
844  out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
845  out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
846  out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
847  out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
848  out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
849  out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
850  out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
851  out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v)));
852  out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));
853 
854  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
855  c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
856  c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
857  c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
858  c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
859  c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
860  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
861 }
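/* Illustrative note (not from the original source): in the packed64 layout each
   register holds limb i of both tangled elements, so the carry chain above is
   the plain 0 -> 1 -> ... -> 9 -> 0 ripple of the scalar reduction, executed
   once but carrying two independent field elements in parallel lanes. */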
862 
863 DONNA_INLINE static void
864 curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
865  xmmi r0,r1,r2,r3;
866  xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
867  xmmi d5,d6,d7,d8,d9;
868  xmmi c1,c2;
869 
870  r0 = r[0].v;
871  r1 = r[1].v;
872  r2 = r[2].v;
873  r3 = r[3].v;
874 
875  out[0].v = _mm_mul_epu32(r0, r0);
876  r0 = _mm_slli_epi32(r0, 1);
877  out[1].v = _mm_mul_epu32(r0, r1);
878  r1_2 = _mm_slli_epi32(r1, 1);
879  out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));
880  r1 = r1_2;
881  out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));
882  r3_2 = _mm_slli_epi32(r3, 1);
883  out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));
884  r2 = _mm_slli_epi32(r2, 1);
885  out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
886  r5_2 = _mm_slli_epi32(r[5].v, 1);
887  out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));
888  r3 = r3_2;
889  out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
890  r7_2 = _mm_slli_epi32(r[7].v, 1);
891  out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));
892  out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));
893 
894  d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
895  d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
896  d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
897  d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
898  d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);
899 
900  r4_2 = _mm_slli_epi32(r[4].v, 1);
901  r6_2 = _mm_slli_epi32(r[6].v, 1);
902  out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
903  out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 )))));
904  out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v)))));
905  out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
906  out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
907  out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 )));
908  out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v)));
909  out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
910  out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));
911 
912  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
913  c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
914  c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
915  c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
916  c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
917  c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
918  c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
919 }
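/* Illustrative note (not from the original source): squaring folds the doubled
   cross terms ahead of time, which is why the wrap multipliers here are
   38 = 2*19 (packedthirtyeight, packed3819) rather than plain 19 wherever the
   limb-pair doubling and the 2^255 = 19 wrap combine; the *_2 registers supply
   the remaining cross-term doublings. */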
920 
921 
922 /* Take a little-endian, 32-byte number and expand it into polynomial form */
923 static void
924 curve25519_expand(bignum25519 out, const unsigned char in[32]) {
925  uint32_t x0,x1,x2,x3,x4,x5,x6,x7;
926 
927  x0 = *(uint32_t *)(in + 0);
928  x1 = *(uint32_t *)(in + 4);
929  x2 = *(uint32_t *)(in + 8);
930  x3 = *(uint32_t *)(in + 12);
931  x4 = *(uint32_t *)(in + 16);
932  x5 = *(uint32_t *)(in + 20);
933  x6 = *(uint32_t *)(in + 24);
934  x7 = *(uint32_t *)(in + 28);
935 
936  out[0] = ( x0 ) & 0x3ffffff;
937  out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
938  out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
939  out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
940  out[4] = (( x3) >> 6) & 0x3ffffff;
941  out[5] = ( x4 ) & 0x1ffffff;
942  out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
943  out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
944  out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
945  out[9] = (( x7) >> 6) & 0x1ffffff;
946  out[10] = 0;
947  out[11] = 0;
948 }
949 
950 /* Take a fully reduced polynomial form number and contract it into a
951  * little-endian, 32-byte array
952  */
953 static void
954 curve25519_contract(unsigned char out[32], const bignum25519 in) {
955  bignum25519 ALIGN(16) f;
956  curve25519_copy(f, in);
957 
958  #define carry_pass() \
959  f[1] += f[0] >> 26; f[0] &= 0x3ffffff; \
960  f[2] += f[1] >> 25; f[1] &= 0x1ffffff; \
961  f[3] += f[2] >> 26; f[2] &= 0x3ffffff; \
962  f[4] += f[3] >> 25; f[3] &= 0x1ffffff; \
963  f[5] += f[4] >> 26; f[4] &= 0x3ffffff; \
964  f[6] += f[5] >> 25; f[5] &= 0x1ffffff; \
965  f[7] += f[6] >> 26; f[6] &= 0x3ffffff; \
966  f[8] += f[7] >> 25; f[7] &= 0x1ffffff; \
967  f[9] += f[8] >> 26; f[8] &= 0x3ffffff;
968 
969  #define carry_pass_full() \
970  carry_pass() \
971  f[0] += 19 * (f[9] >> 25); f[9] &= 0x1ffffff;
972 
973  #define carry_pass_final() \
974  carry_pass() \
975  f[9] &= 0x1ffffff;
976 
977  carry_pass_full()
978  carry_pass_full()
979 
980  /* now t is between 0 and 2^255-1, properly carried. */
981  /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
982  f[0] += 19;
983  carry_pass_full()
984 
985  /* now between 19 and 2^255-1 in both cases, and offset by 19. */
986  f[0] += (1 << 26) - 19;
987  f[1] += (1 << 25) - 1;
988  f[2] += (1 << 26) - 1;
989  f[3] += (1 << 25) - 1;
990  f[4] += (1 << 26) - 1;
991  f[5] += (1 << 25) - 1;
992  f[6] += (1 << 26) - 1;
993  f[7] += (1 << 25) - 1;
994  f[8] += (1 << 26) - 1;
995  f[9] += (1 << 25) - 1;
996 
997  /* now between 2^255 and 2^256-20, and offset by 2^255. */
998  carry_pass_final()
999 
1000  #undef carry_pass
1001  #undef carry_pass_full
1002  #undef carry_pass_final
1003 
1004  f[1] <<= 2;
1005  f[2] <<= 3;
1006  f[3] <<= 5;
1007  f[4] <<= 6;
1008  f[6] <<= 1;
1009  f[7] <<= 3;
1010  f[8] <<= 4;
1011  f[9] <<= 6;
1012 
1013  #define F(i, s) \
1014  out[s+0] |= (unsigned char )(f[i] & 0xff); \
1015  out[s+1] = (unsigned char )((f[i] >> 8) & 0xff); \
1016  out[s+2] = (unsigned char )((f[i] >> 16) & 0xff); \
1017  out[s+3] = (unsigned char )((f[i] >> 24) & 0xff);
1018 
1019  out[0] = 0;
1020  out[16] = 0;
1021  F(0,0);
1022  F(1,3);
1023  F(2,6);
1024  F(3,9);
1025  F(4,12);
1026  F(5,16);
1027  F(6,19);
1028  F(7,22);
1029  F(8,25);
1030  F(9,28);
1031  #undef F
1032 }
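/* Round-trip sketch (an illustrative test, not part of the original file; the
   memcmp assumes <string.h>). curve25519_contract fully reduces, so expand
   followed by contract reproduces the input bytes only for canonical encodings,
   i.e. values below 2^255 - 19 (expand ignores bit 255). */
static int curve25519_expand_contract_roundtrip_sketch(const unsigned char in[32]) {
 bignum25519 ALIGN(16) f;
 unsigned char back[32];
 curve25519_expand(f, in);
 curve25519_contract(back, f);
 return memcmp(back, in, 32) == 0;
}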
1033 
1034 /* if (iswap) swap(a, b) */
1035 DONNA_INLINE static void
1036 curve25519_swap_conditional(bignum25519 a, bignum25519 b, uint32_t iswap) {
1037  const uint32_t swap = (uint32_t)(-(int32_t)iswap);
1038  xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
1039  xmmi mask = _mm_cvtsi32_si128(swap);
1040  mask = _mm_shuffle_epi32(mask, 0);
1041  a0 = _mm_load_si128((xmmi *)a + 0);
1042  a1 = _mm_load_si128((xmmi *)a + 1);
1043  b0 = _mm_load_si128((xmmi *)b + 0);
1044  b1 = _mm_load_si128((xmmi *)b + 1);
1045  b0 = _mm_xor_si128(a0, b0);
1046  b1 = _mm_xor_si128(a1, b1);
1047  x0 = _mm_and_si128(b0, mask);
1048  x1 = _mm_and_si128(b1, mask);
1049  x0 = _mm_xor_si128(x0, a0);
1050  x1 = _mm_xor_si128(x1, a1);
1051  a0 = _mm_xor_si128(x0, b0);
1052  a1 = _mm_xor_si128(x1, b1);
1053  _mm_store_si128((xmmi *)a + 0, x0);
1054  _mm_store_si128((xmmi *)a + 1, x1);
1055  _mm_store_si128((xmmi *)b + 0, a0);
1056  _mm_store_si128((xmmi *)b + 1, a1);
1057 
1058  a2 = _mm_load_si128((xmmi *)a + 2);
1059  b2 = _mm_load_si128((xmmi *)b + 2);
1060  b2 = _mm_xor_si128(a2, b2);
1061  x2 = _mm_and_si128(b2, mask);
1062  x2 = _mm_xor_si128(x2, a2);
1063  a2 = _mm_xor_si128(x2, b2);
1064  _mm_store_si128((xmmi *)b + 2, a2);
1065  _mm_store_si128((xmmi *)a + 2, x2);
1066 }
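/* Scalar model of the branch-free swap above (an illustrative sketch, not from
   the original source): mask is all-ones when iswap is 1 and zero otherwise, so
   (a ^ b) & mask is either a ^ b or 0, and the xor pair swaps exactly when
   requested, with no secret-dependent branch or load address. */
static void curve25519_swap_scalar_sketch(uint32_t a[12], uint32_t b[12], uint32_t iswap) {
 const uint32_t mask = (uint32_t)(-(int32_t)iswap); /* 0 or 0xffffffff */
 int i;
 for (i = 0; i < 12; i++) {
  uint32_t t = (a[i] ^ b[i]) & mask;
  a[i] ^= t;
  b[i] ^= t;
 }
}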
1067 
1068 /* out = (flag) ? in : out */
1069 DONNA_INLINE static void
1070 curve25519_move_conditional_bytes(uint8_t out[96], const uint8_t in[96], uint32_t flag) {
1071  xmmi a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5;
1072  const uint32_t nb = flag - 1;
1073  xmmi masknb = _mm_shuffle_epi32(_mm_cvtsi32_si128(nb),0);
1074  a0 = _mm_load_si128((xmmi *)in + 0);
1075  a1 = _mm_load_si128((xmmi *)in + 1);
1076  a2 = _mm_load_si128((xmmi *)in + 2);
1077  b0 = _mm_load_si128((xmmi *)out + 0);
1078  b1 = _mm_load_si128((xmmi *)out + 1);
1079  b2 = _mm_load_si128((xmmi *)out + 2);
1080  a0 = _mm_andnot_si128(masknb, a0);
1081  a1 = _mm_andnot_si128(masknb, a1);
1082  a2 = _mm_andnot_si128(masknb, a2);
1083  b0 = _mm_and_si128(masknb, b0);
1084  b1 = _mm_and_si128(masknb, b1);
1085  b2 = _mm_and_si128(masknb, b2);
1086  a0 = _mm_or_si128(a0, b0);
1087  a1 = _mm_or_si128(a1, b1);
1088  a2 = _mm_or_si128(a2, b2);
1089  _mm_store_si128((xmmi*)out + 0, a0);
1090  _mm_store_si128((xmmi*)out + 1, a1);
1091  _mm_store_si128((xmmi*)out + 2, a2);
1092 
1093  a3 = _mm_load_si128((xmmi *)in + 3);
1094  a4 = _mm_load_si128((xmmi *)in + 4);
1095  a5 = _mm_load_si128((xmmi *)in + 5);
1096  b3 = _mm_load_si128((xmmi *)out + 3);
1097  b4 = _mm_load_si128((xmmi *)out + 4);
1098  b5 = _mm_load_si128((xmmi *)out + 5);
1099  a3 = _mm_andnot_si128(masknb, a3);
1100  a4 = _mm_andnot_si128(masknb, a4);
1101  a5 = _mm_andnot_si128(masknb, a5);
1102  b3 = _mm_and_si128(masknb, b3);
1103  b4 = _mm_and_si128(masknb, b4);
1104  b5 = _mm_and_si128(masknb, b5);
1105  a3 = _mm_or_si128(a3, b3);
1106  a4 = _mm_or_si128(a4, b4);
1107  a5 = _mm_or_si128(a5, b5);
1108  _mm_store_si128((xmmi*)out + 3, a3);
1109  _mm_store_si128((xmmi*)out + 4, a4);
1110  _mm_store_si128((xmmi*)out + 5, a5);
1111 }
1112 