static const packedelem32 bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}};
static const packedelem32 top32bitmask = {{0x00000000, 0xffffffff, 0x00000000, 0xffffffff}};
static const packedelem32 top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}};
static const packedelem32 bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}};

static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}};
static const packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}};
static const packedelem32 packedmask2625 = {{0x3ffffff,0,0x1ffffff,0}};
static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}};
static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}};
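/* Reduction multipliers for p = 2^255 - 19: a carry out of the top limb folds back
   into limb 0 multiplied by 19. Presumed definitions, reconstructed from how the
   names are used in the carry chains and the squaring below (packednineteen scales
   both lanes by 19; packed3819 scales the low lane by 2*19 and the high lane by 19): */
static const packedelem64 packednineteen = {{19, 19}};
static const packedelem64 packed3819 = {{19*2, 19}};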
static const packedelem64 packedthirtyeight = {{38, 38}};

static const packedelem64 packed121666121665 = {{121666, 121665}};
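/* 2*p and 4*p in 26/25-bit limb form (limb 0 of 2*p is 2*0x3ffffed = 0x7ffffda).
   Adding a multiple of p before a subtraction keeps every limb non-negative
   without changing the result mod p. */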
static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}};

static const packedelem32 packed32packed2p0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}};
static const packedelem32 packed32packed2p1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}};

static const packedelem32 packed4p0 = {{0xfffffb4,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p1 = {{0xffffffc,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p2 = {{0xffffffc,0x7fffffc,0x0000000,0x0000000}};

static const packedelem32 packed32packed4p0 = {{0xfffffb4,0xfffffb4,0x7fffffc,0x7fffffc}};
static const packedelem32 packed32packed4p1 = {{0xffffffc,0xffffffc,0x7fffffc,0x7fffffc}};
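/*
 * A bignum25519 stores a field element mod p = 2^255 - 19 in ten limbs of
 * alternating 26/25 bits; the vector code below keeps two limbs per 64-bit
 * lane and reduces with the interleaved carry chains that recur throughout
 * this file. As a reference, a purely scalar carry pass over this layout
 * would look like the following hypothetical helper (not part of the
 * original file):
 */
static void bignum25519_carry_sketch(uint32_t f[10]) {
	uint32_t c = 0;
	int i;
	for (i = 0; i < 10; i++) {
		int bits = (i & 1) ? 25 : 26; /* even limbs hold 26 bits, odd limbs 25 */
		f[i] += c;
		c = f[i] >> bits;
		f[i] &= (1u << bits) - 1;
	}
	f[0] += 19 * c; /* 2^255 == 19 mod p, so the top carry re-enters times 19 */
}

/* out = in */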
x0 = _mm_load_si128((xmmi*)in + 0);
x1 = _mm_load_si128((xmmi*)in + 1);
x2 = _mm_load_si128((xmmi*)in + 2);
_mm_store_si128((xmmi*)out + 0, x0);
_mm_store_si128((xmmi*)out + 1, x1);
_mm_store_si128((xmmi*)out + 2, x2);
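/* out = a + b, no carry reduction (result limbs may exceed 26/25 bits) */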
xmmi a0,a1,a2,b0,b1,b2;

a0 = _mm_load_si128((xmmi*)a + 0);
a1 = _mm_load_si128((xmmi*)a + 1);
a2 = _mm_load_si128((xmmi*)a + 2);
b0 = _mm_load_si128((xmmi*)b + 0);
b1 = _mm_load_si128((xmmi*)b + 1);
b2 = _mm_load_si128((xmmi*)b + 2);
a0 = _mm_add_epi32(a0, b0);
a1 = _mm_add_epi32(a1, b1);
a2 = _mm_add_epi32(a2, b2);
_mm_store_si128((xmmi*)out + 0, a0);
_mm_store_si128((xmmi*)out + 1, a1);
_mm_store_si128((xmmi*)out + 2, a2);
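/* out = a + b, followed by a carry pass so the result limbs fit back in 26/25 bits */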
#define curve25519_add_after_basic curve25519_add_reduce

xmmi a0,a1,a2,b0,b1,b2;
xmmi c1,c2,c3;
xmmi r0,r1,r2,r3,r4,r5;
a0 = _mm_load_si128((xmmi*)a + 0);
a1 = _mm_load_si128((xmmi*)a + 1);
a2 = _mm_load_si128((xmmi*)a + 2);
b0 = _mm_load_si128((xmmi*)b + 0);
b1 = _mm_load_si128((xmmi*)b + 1);
b2 = _mm_load_si128((xmmi*)b + 2);
a0 = _mm_add_epi32(a0, b0);
a1 = _mm_add_epi32(a1, b1);
a2 = _mm_add_epi32(a2, b2);

r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
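/* out = (a + 2*p) - b, carrying only the low four limbs; inputs are expected
   to be reduced already */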
xmmi a0,a1,a2,b0,b1,b2;
xmmi c1,c2;
xmmi r0,r1;
a0 = _mm_load_si128((xmmi*)a + 0);
a1 = _mm_load_si128((xmmi*)a + 1);
a2 = _mm_load_si128((xmmi*)a + 2);
a0 = _mm_add_epi32(a0, packed2p0.v);
a1 = _mm_add_epi32(a1, packed2p1.v);
a2 = _mm_add_epi32(a2, packed2p2.v);
b0 = _mm_load_si128((xmmi*)b + 0);
b1 = _mm_load_si128((xmmi*)b + 1);
b2 = _mm_load_si128((xmmi*)b + 2);
a0 = _mm_sub_epi32(a0, b0);
a1 = _mm_sub_epi32(a1, b1);
a2 = _mm_sub_epi32(a2, b2);

r0 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(2,2,0,0)), bot32bitmask.v);
r1 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3,3,1,1)), bot32bitmask.v);

c1 = _mm_srli_epi32(r0, 26);
c2 = _mm_srli_epi32(r1, 25);
r0 = _mm_and_si128(r0, packedmask26.v);
r1 = _mm_and_si128(r1, packedmask25.v);
r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
r1 = _mm_add_epi32(r1, c1);

a0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
a1 = _mm_add_epi32(a1, _mm_srli_si128(c2, 8));

_mm_store_si128((xmmi*)out + 0, a0);
_mm_store_si128((xmmi*)out + 1, a1);
_mm_store_si128((xmmi*)out + 2, a2);
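/* out = (a + 4*p) - b with a full carry pass; the larger 4*p bias covers
   operands that are not yet fully reduced */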
xmmi a0,a1,a2,b0,b1,b2;
xmmi c1,c2,c3;
xmmi r0,r1,r2,r3,r4,r5;
a0 = _mm_load_si128((xmmi*)a + 0);
a1 = _mm_load_si128((xmmi*)a + 1);
a2 = _mm_load_si128((xmmi*)a + 2);
a0 = _mm_add_epi32(a0, packed4p0.v);
a1 = _mm_add_epi32(a1, packed4p1.v);
a2 = _mm_add_epi32(a2, packed4p2.v);
b0 = _mm_load_si128((xmmi*)b + 0);
b1 = _mm_load_si128((xmmi*)b + 1);
b2 = _mm_load_si128((xmmi*)b + 2);
a0 = _mm_sub_epi32(a0, b0);
a1 = _mm_sub_epi32(a1, b1);
a2 = _mm_sub_epi32(a2, b2);

r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
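/* out = (a + 2*p) - b with a full carry pass */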
xmmi a0,a1,a2,b0,b1,b2;
xmmi c1,c2,c3;
xmmi r0,r1,r2,r3,r4,r5;
a0 = _mm_load_si128((xmmi*)a + 0);
a1 = _mm_load_si128((xmmi*)a + 1);
a2 = _mm_load_si128((xmmi*)a + 2);
a0 = _mm_add_epi32(a0, packed2p0.v);
a1 = _mm_add_epi32(a1, packed2p1.v);
a2 = _mm_add_epi32(a2, packed2p2.v);
b0 = _mm_load_si128((xmmi*)b + 0);
b1 = _mm_load_si128((xmmi*)b + 1);
b2 = _mm_load_si128((xmmi*)b + 2);
a0 = _mm_sub_epi32(a0, b0);
a1 = _mm_sub_epi32(a1, b1);
a2 = _mm_sub_epi32(a2, b2);

r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
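/* out = -b, computed as (a multiple of p, staged in a0..a2) - b and then
   carry-reduced */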
xmmi a0,a1,a2,b0,b1,b2;
xmmi c1,c2,c3;
xmmi r0,r1,r2,r3,r4,r5;
b0 = _mm_load_si128((xmmi*)b + 0);
b1 = _mm_load_si128((xmmi*)b + 1);
b2 = _mm_load_si128((xmmi*)b + 2);
a0 = _mm_sub_epi32(a0, b0);
a1 = _mm_sub_epi32(a1, b1);
a2 = _mm_sub_epi32(a2, b2);

r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
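/* out = r * s mod p: schoolbook product of the ten limbs. The limbs of r are
   pre-multiplied by 19 (r119..r919) so that product terms of degree 10 and up
   fold straight back into the low limbs; odd limbs get doubled in the upper
   lane (the adds with top64bitmask) to account for the mixed 26/25-bit radix. */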
xmmi m01,m23,m45,m67,m89;
xmmi m0123,m4567;
xmmi s0123,s4567;
xmmi s01,s23,s45,s67,s89;
xmmi s12,s34,s56,s78,s9;
xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
xmmi c1,c2,c3;
s0123 = _mm_load_si128((xmmi*)s + 0);
s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
s4567 = _mm_load_si128((xmmi*)s + 1);
s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
s89 = _mm_load_si128((xmmi*)s + 2);
s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

r0 = _mm_load_si128((xmmi*)r + 0);
r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
r1 = _mm_add_epi64(r1, _mm_and_si128(r1, top64bitmask.v));
r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
r3 = _mm_add_epi64(r3, _mm_and_si128(r3, top64bitmask.v));
r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
r4 = _mm_load_si128((xmmi*)r + 1);
r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
r5 = _mm_add_epi64(r5, _mm_and_si128(r5, top64bitmask.v));
r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
r7 = _mm_add_epi64(r7, _mm_and_si128(r7, top64bitmask.v));
r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
r8 = _mm_load_si128((xmmi*)r + 2);
r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
r9 = _mm_add_epi64(r9, _mm_and_si128(r9, top64bitmask.v));
r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));

m01 = _mm_mul_epu32(r1,s01);
m23 = _mm_mul_epu32(r1,s23);
m45 = _mm_mul_epu32(r1,s45);
m67 = _mm_mul_epu32(r1,s67);
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
m89 = _mm_mul_epu32(r1,s89);
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));

m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));

m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));

r219 = _mm_mul_epu32(r2, packednineteen.v);
r419 = _mm_mul_epu32(r4, packednineteen.v);
r619 = _mm_mul_epu32(r6, packednineteen.v);
r819 = _mm_mul_epu32(r8, packednineteen.v);
r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);

m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));

r0 = _mm_unpacklo_epi64(m01, m45);
r1 = _mm_unpackhi_epi64(m01, m45);
r2 = _mm_unpacklo_epi64(m23, m67);
r3 = _mm_unpackhi_epi64(m23, m67);
r4 = _mm_unpacklo_epi64(m89, m89);
r5 = _mm_unpackhi_epi64(m89, m89);

c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

m0123 = _mm_unpacklo_epi32(r0, r1);
m4567 = _mm_unpackhi_epi32(r0, r1);
m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
m89 = _mm_unpackhi_epi32(r4, r5);

_mm_store_si128((xmmi*)out + 0, m0123);
_mm_store_si128((xmmi*)out + 1, m4567);
_mm_store_si128((xmmi*)out + 2, m89);
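/* plain call wrapper around curve25519_mul (e.g. for a non-inlined variant) */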
curve25519_mul(out, r, s);
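/* repeated squaring: square the input count times; a single squaring is
   exposed through the curve25519_square macro below */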
#define curve25519_square(r, n) curve25519_square_times(r, n, 1)

xmmi m01,m23,m45,m67,m89;
xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
xmmi r0a,r1a,r2a,r3a,r7a,r9a;
xmmi r0123,r4567,r5619;
xmmi r01,r23,r45,r67,r6x,r89,r8x;
xmmi r12,r34,r56,r78,r9x;
xmmi c1,c2,c3;
r0123 = _mm_load_si128((xmmi*)in + 0);
r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
r4567 = _mm_load_si128((xmmi*)in + 1);
r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
r89 = _mm_load_si128((xmmi*)in + 2);
r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));

r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
r0 = _mm_add_epi64(r0, _mm_and_si128(r0, top64bitmask.v));
r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
r2 = _mm_add_epi64(r2, _mm_and_si128(r2, top64bitmask.v));
r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
r4 = _mm_add_epi64(r4, _mm_and_si128(r4, top64bitmask.v));
r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
r5619 = _mm_mul_epu32(r56, packednineteen.v);
r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
r7 = _mm_mul_epu32(r7, packed3819.v);
r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
r8 = _mm_mul_epu32(r8, packednineteen.v);
r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
r9 = _mm_mul_epu32(r9, packed3819.v);
r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));

m01 = _mm_mul_epu32(r01, r0);
m23 = _mm_mul_epu32(r23, r0a);
m45 = _mm_mul_epu32(r45, r0a);
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
r23 = _mm_slli_epi32(r23, 1);
m67 = _mm_mul_epu32(r67, r0a);
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
m89 = _mm_mul_epu32(r89, r0a);
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
r67 = _mm_slli_epi32(r67, 1);
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
r45 = _mm_slli_epi32(r45, 1);

r1 = _mm_slli_epi32(r1, 1);
r3 = _mm_slli_epi32(r3, 1);
r1a = _mm_add_epi64(r1, _mm_and_si128(r1, bot64bitmask.v));
r3a = _mm_add_epi64(r3, _mm_and_si128(r3, bot64bitmask.v));

m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
r34 = _mm_slli_epi32(r34, 1);
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
r78 = _mm_slli_epi32(r78, 1);
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
r56 = _mm_slli_epi32(r56, 1);

m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));

r0 = _mm_unpacklo_epi64(m01, m45);
r1 = _mm_unpackhi_epi64(m01, m45);
r2 = _mm_unpacklo_epi64(m23, m67);
r3 = _mm_unpackhi_epi64(m23, m67);
r4 = _mm_unpacklo_epi64(m89, m89);
r5 = _mm_unpackhi_epi64(m89, m89);

c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

r01 = _mm_unpacklo_epi64(r0, r1);
r45 = _mm_unpackhi_epi64(r0, r1);
r23 = _mm_unpacklo_epi64(r2, r3);
r67 = _mm_unpackhi_epi64(r2, r3);
r89 = _mm_unpackhi_epi64(r4, r5);

r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));

_mm_store_si128((xmmi*)r + 0, r0123);
_mm_store_si128((xmmi*)r + 1, r4567);
_mm_store_si128((xmmi*)r + 2, r89);
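/* interleave the limbs of x and z into one packed array: two field elements
   side by side, one 32-bit lane each */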
xmmi x0,x1,x2,z0,z1,z2;

x0 = _mm_load_si128((xmmi *)(x + 0));
x1 = _mm_load_si128((xmmi *)(x + 4));
x2 = _mm_load_si128((xmmi *)(x + 8));
z0 = _mm_load_si128((xmmi *)(z + 0));
z1 = _mm_load_si128((xmmi *)(z + 4));
z2 = _mm_load_si128((xmmi *)(z + 8));

out[0].v = _mm_unpacklo_epi32(x0, z0);
out[1].v = _mm_unpackhi_epi32(x0, z0);
out[2].v = _mm_unpacklo_epi32(x1, z1);
out[3].v = _mm_unpackhi_epi32(x1, z1);
out[4].v = _mm_unpacklo_epi32(x2, z2);
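/* split a packed array back out into x and z */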
xmmi t0,t1,t2,t3,t4,zero;

t0 = _mm_shuffle_epi32(in[0].v, _MM_SHUFFLE(3,1,2,0));
t1 = _mm_shuffle_epi32(in[1].v, _MM_SHUFFLE(3,1,2,0));
t2 = _mm_shuffle_epi32(in[2].v, _MM_SHUFFLE(3,1,2,0));
t3 = _mm_shuffle_epi32(in[3].v, _MM_SHUFFLE(3,1,2,0));
t4 = _mm_shuffle_epi32(in[4].v, _MM_SHUFFLE(3,1,2,0));
zero = _mm_setzero_si128();
_mm_store_si128((xmmi *)x + 0, _mm_unpacklo_epi64(t0, t1));
_mm_store_si128((xmmi *)x + 1, _mm_unpacklo_epi64(t2, t3));
_mm_store_si128((xmmi *)x + 2, _mm_unpacklo_epi64(t4, zero));
_mm_store_si128((xmmi *)z + 0, _mm_unpackhi_epi64(t0, t1));
_mm_store_si128((xmmi *)z + 1, _mm_unpackhi_epi64(t2, t3));
_mm_store_si128((xmmi *)z + 2, _mm_unpackhi_epi64(t4, zero));
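/* packed out = r + s with carry reduction; the carry out of the top limb is
   folded back as c*19 = (c<<4) + (c<<1) + c, computed with shifts */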
xmmi r0,r1,r2,r3,r4;
xmmi s0,s1,s2,s3,s4,s5;
xmmi c1,c2;
r0 = _mm_add_epi32(r[0].v, s[0].v);
r1 = _mm_add_epi32(r[1].v, s[1].v);
r2 = _mm_add_epi32(r[2].v, s[2].v);
r3 = _mm_add_epi32(r[3].v, s[3].v);
r4 = _mm_add_epi32(r[4].v, s[4].v);

s0 = _mm_unpacklo_epi64(r0, r2);
s1 = _mm_unpackhi_epi64(r0, r2);
s2 = _mm_unpacklo_epi64(r1, r3);
s3 = _mm_unpackhi_epi64(r1, r3);
s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4);
s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4);

c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

out[0].v = _mm_unpacklo_epi64(s0, s1);
out[1].v = _mm_unpacklo_epi64(s2, s3);
out[2].v = _mm_unpackhi_epi64(s0, s1);
out[3].v = _mm_unpackhi_epi64(s2, s3);
out[4].v = _mm_unpackhi_epi64(s4, s5);
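/* packed out = r + s, no carry */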
out[0].v = _mm_add_epi32(r[0].v, s[0].v);
out[1].v = _mm_add_epi32(r[1].v, s[1].v);
out[2].v = _mm_add_epi32(r[2].v, s[2].v);
out[3].v = _mm_add_epi32(r[3].v, s[3].v);
out[4].v = _mm_add_epi32(r[4].v, s[4].v);
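/* packed out = (r + 2*p) - s, with a short carry pass over the low limbs */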
xmmi r0,r1,r2,r3,r4;
xmmi s0,s1,s2,s3;
xmmi c1,c2;

r0 = _mm_add_epi32(r[0].v, packed32packed2p0.v);
r1 = _mm_add_epi32(r[1].v, packed32packed2p1.v);
r2 = _mm_add_epi32(r[2].v, packed32packed2p1.v);
r3 = _mm_add_epi32(r[3].v, packed32packed2p1.v);
r4 = _mm_add_epi32(r[4].v, packed32packed2p1.v);
r0 = _mm_sub_epi32(r0, s[0].v);
r1 = _mm_sub_epi32(r1, s[1].v);
r2 = _mm_sub_epi32(r2, s[2].v);
r3 = _mm_sub_epi32(r3, s[3].v);
r4 = _mm_sub_epi32(r4, s[4].v);

s0 = _mm_unpacklo_epi64(r0, r2);
s1 = _mm_unpackhi_epi64(r0, r2);
s2 = _mm_unpacklo_epi64(r1, r3);
s3 = _mm_unpackhi_epi64(r1, r3);

c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));

out[0].v = _mm_unpacklo_epi64(s0, s1);
out[1].v = _mm_unpacklo_epi64(s2, s3);
out[2].v = _mm_unpackhi_epi64(s0, s1);
out[3].v = _mm_unpackhi_epi64(s2, s3);
out[4].v = r4;
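/* packed out = (r + 4*p) - s, with a full carry pass */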
xmmi r0,r1,r2,r3,r4;
xmmi s0,s1,s2,s3,s4,s5;
xmmi c1,c2;
r0 = _mm_add_epi32(r[0].v, packed32packed4p0.v);
r1 = _mm_add_epi32(r[1].v, packed32packed4p1.v);
r2 = _mm_add_epi32(r[2].v, packed32packed4p1.v);
r3 = _mm_add_epi32(r[3].v, packed32packed4p1.v);
r4 = _mm_add_epi32(r[4].v, packed32packed4p1.v);
r0 = _mm_sub_epi32(r0, s[0].v);
r1 = _mm_sub_epi32(r1, s[1].v);
r2 = _mm_sub_epi32(r2, s[2].v);
r3 = _mm_sub_epi32(r3, s[3].v);
r4 = _mm_sub_epi32(r4, s[4].v);

s0 = _mm_unpacklo_epi64(r0, r2);
s1 = _mm_unpackhi_epi64(r0, r2);
s2 = _mm_unpacklo_epi64(r1, r3);
s3 = _mm_unpackhi_epi64(r1, r3);
s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4);
s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4);

c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

out[0].v = _mm_unpacklo_epi64(s0, s1);
out[1].v = _mm_unpacklo_epi64(s2, s3);
out[2].v = _mm_unpackhi_epi64(s0, s1);
out[3].v = _mm_unpackhi_epi64(s2, s3);
out[4].v = _mm_unpackhi_epi64(s4, s5);
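/* re-tangle two packed 32-bit arrays (c, d) into 64-bit lane form: a receives
   the first element of each and b the second, one limb per 64-bit lane, ready
   for _mm_mul_epu32 */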
xmmi c0,c1,c2,c3,c4,c5,t;
xmmi d0,d1,d2,d3,d4,d5;
xmmi t0,t1,t2,t3,t4,zero;
t0 = _mm_shuffle_epi32(c[0].v, _MM_SHUFFLE(3,1,2,0));
t1 = _mm_shuffle_epi32(c[1].v, _MM_SHUFFLE(3,1,2,0));
t2 = _mm_shuffle_epi32(d[0].v, _MM_SHUFFLE(3,1,2,0));
t3 = _mm_shuffle_epi32(d[1].v, _MM_SHUFFLE(3,1,2,0));
c0 = _mm_unpacklo_epi64(t0, t1);
c3 = _mm_unpackhi_epi64(t0, t1);
d0 = _mm_unpacklo_epi64(t2, t3);
d3 = _mm_unpackhi_epi64(t2, t3);
t = _mm_unpacklo_epi64(c0, d0); a[0].v = t; a[1].v = _mm_srli_epi64(t, 32);
t = _mm_unpackhi_epi64(c0, d0); a[2].v = t; a[3].v = _mm_srli_epi64(t, 32);
t = _mm_unpacklo_epi64(c3, d3); b[0].v = t; b[1].v = _mm_srli_epi64(t, 32);
t = _mm_unpackhi_epi64(c3, d3); b[2].v = t; b[3].v = _mm_srli_epi64(t, 32);

t0 = _mm_shuffle_epi32(c[2].v, _MM_SHUFFLE(3,1,2,0));
t1 = _mm_shuffle_epi32(c[3].v, _MM_SHUFFLE(3,1,2,0));
t2 = _mm_shuffle_epi32(d[2].v, _MM_SHUFFLE(3,1,2,0));
t3 = _mm_shuffle_epi32(d[3].v, _MM_SHUFFLE(3,1,2,0));
c1 = _mm_unpacklo_epi64(t0, t1);
c4 = _mm_unpackhi_epi64(t0, t1);
d1 = _mm_unpacklo_epi64(t2, t3);
d4 = _mm_unpackhi_epi64(t2, t3);
t = _mm_unpacklo_epi64(c1, d1); a[4].v = t; a[5].v = _mm_srli_epi64(t, 32);
t = _mm_unpackhi_epi64(c1, d1); a[6].v = t; a[7].v = _mm_srli_epi64(t, 32);
t = _mm_unpacklo_epi64(c4, d4); b[4].v = t; b[5].v = _mm_srli_epi64(t, 32);
t = _mm_unpackhi_epi64(c4, d4); b[6].v = t; b[7].v = _mm_srli_epi64(t, 32);

t4 = _mm_shuffle_epi32(c[4].v, _MM_SHUFFLE(3,1,2,0));
zero = _mm_setzero_si128();
c2 = _mm_unpacklo_epi64(t4, zero);
c5 = _mm_unpackhi_epi64(t4, zero);
t4 = _mm_shuffle_epi32(d[4].v, _MM_SHUFFLE(3,1,2,0));
d2 = _mm_unpacklo_epi64(t4, zero);
d5 = _mm_unpackhi_epi64(t4, zero);
t = _mm_unpacklo_epi64(c2, d2); a[8].v = t; a[9].v = _mm_srli_epi64(t, 32);
t = _mm_unpacklo_epi64(c5, d5); b[8].v = t; b[9].v = _mm_srli_epi64(t, 32);
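/* interleave x and z into 64-bit lane form, one limb of each per vector */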
xmmi x0,x1,x2,z0,z1,z2,t;

x0 = _mm_load_si128((xmmi *)x + 0);
x1 = _mm_load_si128((xmmi *)x + 1);
x2 = _mm_load_si128((xmmi *)x + 2);
z0 = _mm_load_si128((xmmi *)z + 0);
z1 = _mm_load_si128((xmmi *)z + 1);
z2 = _mm_load_si128((xmmi *)z + 2);

t = _mm_unpacklo_epi64(x0, z0); out[0].v = t; out[1].v = _mm_srli_epi64(t, 32);
t = _mm_unpackhi_epi64(x0, z0); out[2].v = t; out[3].v = _mm_srli_epi64(t, 32);
t = _mm_unpacklo_epi64(x1, z1); out[4].v = t; out[5].v = _mm_srli_epi64(t, 32);
t = _mm_unpackhi_epi64(x1, z1); out[6].v = t; out[7].v = _mm_srli_epi64(t, 32);
t = _mm_unpacklo_epi64(x2, z2); out[8].v = t; out[9].v = _mm_srli_epi64(t, 32);
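/* broadcast each 32-bit limb of x across its own vector (e.g. as the scalar
   operand of a packed multiply) */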
xmmi x0,x1,x2;

x0 = _mm_load_si128((xmmi *)(x + 0));
x1 = _mm_load_si128((xmmi *)(x + 4));
x2 = _mm_load_si128((xmmi *)(x + 8));

out[0].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(0,0,0,0));
out[1].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(1,1,1,1));
out[2].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(2,2,2,2));
out[3].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(3,3,3,3));
out[4].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0,0,0,0));
out[5].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,1,1,1));
out[6].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2,2,2,2));
out[7].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(3,3,3,3));
out[8].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0,0,0,0));
out[9].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1,1,1,1));
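/* swap the two 64-bit halves of every element */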
out[0].v = _mm_shuffle_epi32(out[0].v, _MM_SHUFFLE(1,0,3,2));
out[1].v = _mm_shuffle_epi32(out[1].v, _MM_SHUFFLE(1,0,3,2));
out[2].v = _mm_shuffle_epi32(out[2].v, _MM_SHUFFLE(1,0,3,2));
out[3].v = _mm_shuffle_epi32(out[3].v, _MM_SHUFFLE(1,0,3,2));
out[4].v = _mm_shuffle_epi32(out[4].v, _MM_SHUFFLE(1,0,3,2));
out[5].v = _mm_shuffle_epi32(out[5].v, _MM_SHUFFLE(1,0,3,2));
out[6].v = _mm_shuffle_epi32(out[6].v, _MM_SHUFFLE(1,0,3,2));
out[7].v = _mm_shuffle_epi32(out[7].v, _MM_SHUFFLE(1,0,3,2));
out[8].v = _mm_shuffle_epi32(out[8].v, _MM_SHUFFLE(1,0,3,2));
out[9].v = _mm_shuffle_epi32(out[9].v, _MM_SHUFFLE(1,0,3,2));
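/* write the packed 64-bit lanes back out to x and z */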
_mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
_mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
_mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v));
_mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
_mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
_mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v));
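/* packed out = r * s mod p on two field elements at once: schoolbook products,
   19-folding of the high-degree terms, then an interleaved carry chain over
   out[0..9] */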
xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
xmmi c1,c2;
out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
r1_2 = _mm_slli_epi32(r[1].v, 1);
out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
r3_2 = _mm_slli_epi32(r[3].v, 1);
out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
r5_2 = _mm_slli_epi32(r[5].v, 1);
out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v))))))));
r7_2 = _mm_slli_epi32(r[7].v, 1);
out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));

r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
r1_2 = _mm_slli_epi32(r1, 1);
r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
r3_2 = _mm_slli_epi32(r3, 1);
r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
r5_2 = _mm_slli_epi32(r5, 1);
r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
r7_2 = _mm_slli_epi32(r7, 1);
r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
r9_2 = _mm_slli_epi32(r9, 1);

out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v)));
out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));

c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
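/* packed out = r^2 mod p, with the same folding (d5..d9 carry the 19 and 38
   factors) and carry structure as the packed multiply */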
xmmi r0,r1,r2,r3;
xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
xmmi d5,d6,d7,d8,d9;
xmmi c1,c2;

r0 = r[0].v;
r1 = r[1].v;
r2 = r[2].v;
r3 = r[3].v;
out[0].v = _mm_mul_epu32(r0, r0);
r0 = _mm_slli_epi32(r0, 1);
out[1].v = _mm_mul_epu32(r0, r1);
r1_2 = _mm_slli_epi32(r1, 1);
out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));
out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));
r3_2 = _mm_slli_epi32(r3, 1);
out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));
r2 = _mm_slli_epi32(r2, 1);
out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
r5_2 = _mm_slli_epi32(r[5].v, 1);
out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));
out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
r7_2 = _mm_slli_epi32(r[7].v, 1);
out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));
out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));

d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);

r4_2 = _mm_slli_epi32(r[4].v, 1);
r6_2 = _mm_slli_epi32(r[6].v, 1);
out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 )))));
out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v)))));
out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 )));
out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v)));
out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));

c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
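/* take a little-endian, 32-byte number and expand it into polynomial form */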
static void curve25519_expand(bignum25519 out, const unsigned char in[32]) {
	/* x0..x7 hold the eight little-endian 32-bit words of in */
	out[0] = ( x0 ) & 0x3ffffff;
	out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
	out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
	out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
	out[4] = (( x3) >> 6) & 0x3ffffff;
	out[5] = ( x4 ) & 0x1ffffff;
	out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
	out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
	out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
	out[9] = (( x7) >> 6) & 0x1ffffff;
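/* take a fully reduced polynomial form number and contract it into a
   little-endian, 32-byte array */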
static void curve25519_contract(unsigned char out[32], const bignum25519 in) {
	bignum25519 f;
	curve25519_copy(f, in);

	#define carry_pass() \
		f[1] += f[0] >> 26; f[0] &= 0x3ffffff; \
		f[2] += f[1] >> 25; f[1] &= 0x1ffffff; \
		f[3] += f[2] >> 26; f[2] &= 0x3ffffff; \
		f[4] += f[3] >> 25; f[3] &= 0x1ffffff; \
		f[5] += f[4] >> 26; f[4] &= 0x3ffffff; \
		f[6] += f[5] >> 25; f[5] &= 0x1ffffff; \
		f[7] += f[6] >> 26; f[6] &= 0x3ffffff; \
		f[8] += f[7] >> 25; f[7] &= 0x1ffffff; \
		f[9] += f[8] >> 26; f[8] &= 0x3ffffff;

	#define carry_pass_full() \
		carry_pass() \
		f[0] += 19 * (f[9] >> 25); f[9] &= 0x1ffffff;

	#define carry_pass_final() \
		carry_pass() \
		f[9] &= 0x1ffffff;

	carry_pass_full()
	carry_pass_full()

	/* now fully carried; adding 19 pushes any value in [2^255-19, 2^255-1] over 2^255 */
	f[0] += 19;
	carry_pass_full()

	/* now between 19 and 2^255-1 in both cases, and offset by 19 */
	f[0] += (1 << 26) - 19;
	f[1] += (1 << 25) - 1;
	f[2] += (1 << 26) - 1;
	f[3] += (1 << 25) - 1;
	f[4] += (1 << 26) - 1;
	f[5] += (1 << 25) - 1;
	f[6] += (1 << 26) - 1;
	f[7] += (1 << 25) - 1;
	f[8] += (1 << 26) - 1;
	f[9] += (1 << 25) - 1;
	/* presumed macro header (lost from this copy): write limb i little-endian at byte offset s */
	#define F(i, s) \
		out[s+0] |= (unsigned char )(f[i] & 0xff); \
		out[s+1] = (unsigned char )((f[i] >> 8) & 0xff); \
		out[s+2] = (unsigned char )((f[i] >> 16) & 0xff); \
		out[s+3] = (unsigned char )((f[i] >> 24) & 0xff);

/* if (swap) exchange the contents of a and b, without branching */
xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
xmmi mask = _mm_cvtsi32_si128(swap);
mask = _mm_shuffle_epi32(mask, 0);
a0 = _mm_load_si128((xmmi *)a + 0);
a1 = _mm_load_si128((xmmi *)a + 1);
b0 = _mm_load_si128((xmmi *)b + 0);
b1 = _mm_load_si128((xmmi *)b + 1);
b0 = _mm_xor_si128(a0, b0);
b1 = _mm_xor_si128(a1, b1);
x0 = _mm_and_si128(b0, mask);
x1 = _mm_and_si128(b1, mask);
x0 = _mm_xor_si128(x0, a0);
x1 = _mm_xor_si128(x1, a1);
a0 = _mm_xor_si128(x0, b0);
a1 = _mm_xor_si128(x1, b1);
_mm_store_si128((xmmi *)a + 0, x0);
_mm_store_si128((xmmi *)a + 1, x1);
_mm_store_si128((xmmi *)b + 0, a0);
_mm_store_si128((xmmi *)b + 1, a1);

a2 = _mm_load_si128((xmmi *)a + 2);
b2 = _mm_load_si128((xmmi *)b + 2);
b2 = _mm_xor_si128(a2, b2);
x2 = _mm_and_si128(b2, mask);
x2 = _mm_xor_si128(x2, a2);
a2 = _mm_xor_si128(x2, b2);
_mm_store_si128((xmmi *)b + 2, a2);
_mm_store_si128((xmmi *)a + 2, x2);
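/* branch-free conditional move over 96 bytes: with nb = 0 copy in to out,
   with nb = all ones keep out unchanged (nb must be a full 32-bit mask) */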
xmmi a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5;
xmmi masknb = _mm_shuffle_epi32(_mm_cvtsi32_si128(nb),0);

a0 = _mm_load_si128((xmmi *)in + 0);
a1 = _mm_load_si128((xmmi *)in + 1);
a2 = _mm_load_si128((xmmi *)in + 2);
b0 = _mm_load_si128((xmmi *)out + 0);
b1 = _mm_load_si128((xmmi *)out + 1);
b2 = _mm_load_si128((xmmi *)out + 2);
a0 = _mm_andnot_si128(masknb, a0);
a1 = _mm_andnot_si128(masknb, a1);
a2 = _mm_andnot_si128(masknb, a2);
b0 = _mm_and_si128(masknb, b0);
b1 = _mm_and_si128(masknb, b1);
b2 = _mm_and_si128(masknb, b2);
a0 = _mm_or_si128(a0, b0);
a1 = _mm_or_si128(a1, b1);
a2 = _mm_or_si128(a2, b2);
_mm_store_si128((xmmi*)out + 0, a0);
_mm_store_si128((xmmi*)out + 1, a1);
_mm_store_si128((xmmi*)out + 2, a2);

a3 = _mm_load_si128((xmmi *)in + 3);
a4 = _mm_load_si128((xmmi *)in + 4);
a5 = _mm_load_si128((xmmi *)in + 5);
b3 = _mm_load_si128((xmmi *)out + 3);
b4 = _mm_load_si128((xmmi *)out + 4);
b5 = _mm_load_si128((xmmi *)out + 5);
a3 = _mm_andnot_si128(masknb, a3);
a4 = _mm_andnot_si128(masknb, a4);
a5 = _mm_andnot_si128(masknb, a5);
b3 = _mm_and_si128(masknb, b3);
b4 = _mm_and_si128(masknb, b4);
b5 = _mm_and_si128(masknb, b5);
a3 = _mm_or_si128(a3, b3);
a4 = _mm_or_si128(a4, b4);
a5 = _mm_or_si128(a5, b5);
_mm_store_si128((xmmi*)out + 3, a3);
_mm_store_si128((xmmi*)out + 4, a4);
_mm_store_si128((xmmi*)out + 5, a5);