curve25519-donna.c source code [olm/lib/curve25519-donna/curve25519-donna.c]

1	/ Copyright 2008, Google Inc.*
2	* All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are
6	* met:
7	*
8	* * Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* * Redistributions in binary form must reproduce the above
11	* copyright notice, this list of conditions and the following disclaimer
12	* in the documentation and/or other materials provided with the
13	* distribution.
14	* * Neither the name of Google Inc. nor the names of its
15	* contributors may be used to endorse or promote products derived from
16	* this software without specific prior written permission.
17	*
18	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29	*
30	* curve25519-donna: Curve25519 elliptic curve, public key function
31	*
32	* http://code.google.com/p/curve25519-donna/
33	*
34	* Adam Langley <agl@imperialviolet.org>
35	*
36	* Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
37	*
38	* More information about curve25519 can be found here
39	* http://cr.yp.to/ecdh.html
40	*
41	* djb's sample implementation of curve25519 is written in a special assembly
42	* language called qhasm and uses the floating point registers.
43	*
44	* This is, almost, a clean room reimplementation from the curve25519 paper. It
45	* uses many of the tricks described therein. Only the crecip function is taken
46	* from the sample implementation. */
47
48	#include <string.h>
49	#include <stdint.h>
50
51	#ifdef _MSC_VER
52	#define inline __inline
53	#endif
54
55	typedef uint8_t u8;
56	typedef int32_t s32;
57	typedef int64_t limb;
58
59	/ Field element representation:*
60	*
61	* Field elements are written as an array of signed, 64-bit limbs, least
62	* significant first. The value of the field element is:
63	* x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
64	*
65	* i.e. the limbs are 26, 25, 26, 25, ... bits wide. */
66
67	/ Sum two numbers: output += in /
68	static void fsum(limb output, const* limb *in) {
69	unsigned i;
70	for (i = `0`; i < `10`; i += `2`) {
71	output[`0`+i] = output[`0`+i] + in[`0`+i];
72	output[`1`+i] = output[`1`+i] + in[`1`+i];
73	}
74	}
75
76	/ Find the difference of two numbers: output = in - output*
77	* (note the order of the arguments!). */
78	static void fdifference(limb output, const* limb *in) {
79	unsigned i;
80	for (i = `0`; i < `10`; ++i) {
81	output[i] = in[i] - output[i];
82	}
83	}
84
85	/ Multiply a number by a scalar: output = in * scalar /
86	static void fscalar_product(limb output, const* limb in, const* limb scalar) {
87	unsigned i;
88	for (i = `0`; i < `10`; ++i) {
89	output[i] = in[i] * scalar;
90	}
91	}
92
93	/ Multiply two numbers: output = in2 * in*
94	*
95	* output must be distinct to both inputs. The inputs are reduced coefficient
96	* form, the output is not.
97	*
98	* output[x] <= 14 * the largest product of the input limbs. */
99	static void fproduct(limb output, const* limb in2, const* limb *in) {
100	output[`0`] = ((limb) ((s32) in2[`0`])) * ((s32) in[`0`]);
101	output[`1`] = ((limb) ((s32) in2[`0`])) * ((s32) in[`1`]) +
102	((limb) ((s32) in2[`1`])) * ((s32) in[`0`]);
103	output[`2`] = `2` * ((limb) ((s32) in2[`1`])) * ((s32) in[`1`]) +
104	((limb) ((s32) in2[`0`])) * ((s32) in[`2`]) +
105	((limb) ((s32) in2[`2`])) * ((s32) in[`0`]);
106	output[`3`] = ((limb) ((s32) in2[`1`])) * ((s32) in[`2`]) +
107	((limb) ((s32) in2[`2`])) * ((s32) in[`1`]) +
108	((limb) ((s32) in2[`0`])) * ((s32) in[`3`]) +
109	((limb) ((s32) in2[`3`])) * ((s32) in[`0`]);
110	output[`4`] = ((limb) ((s32) in2[`2`])) * ((s32) in[`2`]) +
111	`2` * (((limb) ((s32) in2[`1`])) * ((s32) in[`3`]) +
112	((limb) ((s32) in2[`3`])) * ((s32) in[`1`])) +
113	((limb) ((s32) in2[`0`])) * ((s32) in[`4`]) +
114	((limb) ((s32) in2[`4`])) * ((s32) in[`0`]);
115	output[`5`] = ((limb) ((s32) in2[`2`])) * ((s32) in[`3`]) +
116	((limb) ((s32) in2[`3`])) * ((s32) in[`2`]) +
117	((limb) ((s32) in2[`1`])) * ((s32) in[`4`]) +
118	((limb) ((s32) in2[`4`])) * ((s32) in[`1`]) +
119	((limb) ((s32) in2[`0`])) * ((s32) in[`5`]) +
120	((limb) ((s32) in2[`5`])) * ((s32) in[`0`]);
121	output[`6`] = `2` * (((limb) ((s32) in2[`3`])) * ((s32) in[`3`]) +
122	((limb) ((s32) in2[`1`])) * ((s32) in[`5`]) +
123	((limb) ((s32) in2[`5`])) * ((s32) in[`1`])) +
124	((limb) ((s32) in2[`2`])) * ((s32) in[`4`]) +
125	((limb) ((s32) in2[`4`])) * ((s32) in[`2`]) +
126	((limb) ((s32) in2[`0`])) * ((s32) in[`6`]) +
127	((limb) ((s32) in2[`6`])) * ((s32) in[`0`]);
128	output[`7`] = ((limb) ((s32) in2[`3`])) * ((s32) in[`4`]) +
129	((limb) ((s32) in2[`4`])) * ((s32) in[`3`]) +
130	((limb) ((s32) in2[`2`])) * ((s32) in[`5`]) +
131	((limb) ((s32) in2[`5`])) * ((s32) in[`2`]) +
132	((limb) ((s32) in2[`1`])) * ((s32) in[`6`]) +
133	((limb) ((s32) in2[`6`])) * ((s32) in[`1`]) +
134	((limb) ((s32) in2[`0`])) * ((s32) in[`7`]) +
135	((limb) ((s32) in2[`7`])) * ((s32) in[`0`]);
136	output[`8`] = ((limb) ((s32) in2[`4`])) * ((s32) in[`4`]) +
137	`2` * (((limb) ((s32) in2[`3`])) * ((s32) in[`5`]) +
138	((limb) ((s32) in2[`5`])) * ((s32) in[`3`]) +
139	((limb) ((s32) in2[`1`])) * ((s32) in[`7`]) +
140	((limb) ((s32) in2[`7`])) * ((s32) in[`1`])) +
141	((limb) ((s32) in2[`2`])) * ((s32) in[`6`]) +
142	((limb) ((s32) in2[`6`])) * ((s32) in[`2`]) +
143	((limb) ((s32) in2[`0`])) * ((s32) in[`8`]) +
144	((limb) ((s32) in2[`8`])) * ((s32) in[`0`]);
145	output[`9`] = ((limb) ((s32) in2[`4`])) * ((s32) in[`5`]) +
146	((limb) ((s32) in2[`5`])) * ((s32) in[`4`]) +
147	((limb) ((s32) in2[`3`])) * ((s32) in[`6`]) +
148	((limb) ((s32) in2[`6`])) * ((s32) in[`3`]) +
149	((limb) ((s32) in2[`2`])) * ((s32) in[`7`]) +
150	((limb) ((s32) in2[`7`])) * ((s32) in[`2`]) +
151	((limb) ((s32) in2[`1`])) * ((s32) in[`8`]) +
152	((limb) ((s32) in2[`8`])) * ((s32) in[`1`]) +
153	((limb) ((s32) in2[`0`])) * ((s32) in[`9`]) +
154	((limb) ((s32) in2[`9`])) * ((s32) in[`0`]);
155	output[`10`] = `2` * (((limb) ((s32) in2[`5`])) * ((s32) in[`5`]) +
156	((limb) ((s32) in2[`3`])) * ((s32) in[`7`]) +
157	((limb) ((s32) in2[`7`])) * ((s32) in[`3`]) +
158	((limb) ((s32) in2[`1`])) * ((s32) in[`9`]) +
159	((limb) ((s32) in2[`9`])) * ((s32) in[`1`])) +
160	((limb) ((s32) in2[`4`])) * ((s32) in[`6`]) +
161	((limb) ((s32) in2[`6`])) * ((s32) in[`4`]) +
162	((limb) ((s32) in2[`2`])) * ((s32) in[`8`]) +
163	((limb) ((s32) in2[`8`])) * ((s32) in[`2`]);
164	output[`11`] = ((limb) ((s32) in2[`5`])) * ((s32) in[`6`]) +
165	((limb) ((s32) in2[`6`])) * ((s32) in[`5`]) +
166	((limb) ((s32) in2[`4`])) * ((s32) in[`7`]) +
167	((limb) ((s32) in2[`7`])) * ((s32) in[`4`]) +
168	((limb) ((s32) in2[`3`])) * ((s32) in[`8`]) +
169	((limb) ((s32) in2[`8`])) * ((s32) in[`3`]) +
170	((limb) ((s32) in2[`2`])) * ((s32) in[`9`]) +
171	((limb) ((s32) in2[`9`])) * ((s32) in[`2`]);
172	output[`12`] = ((limb) ((s32) in2[`6`])) * ((s32) in[`6`]) +
173	`2` * (((limb) ((s32) in2[`5`])) * ((s32) in[`7`]) +
174	((limb) ((s32) in2[`7`])) * ((s32) in[`5`]) +
175	((limb) ((s32) in2[`3`])) * ((s32) in[`9`]) +
176	((limb) ((s32) in2[`9`])) * ((s32) in[`3`])) +
177	((limb) ((s32) in2[`4`])) * ((s32) in[`8`]) +
178	((limb) ((s32) in2[`8`])) * ((s32) in[`4`]);
179	output[`13`] = ((limb) ((s32) in2[`6`])) * ((s32) in[`7`]) +
180	((limb) ((s32) in2[`7`])) * ((s32) in[`6`]) +
181	((limb) ((s32) in2[`5`])) * ((s32) in[`8`]) +
182	((limb) ((s32) in2[`8`])) * ((s32) in[`5`]) +
183	((limb) ((s32) in2[`4`])) * ((s32) in[`9`]) +
184	((limb) ((s32) in2[`9`])) * ((s32) in[`4`]);
185	output[`14`] = `2` * (((limb) ((s32) in2[`7`])) * ((s32) in[`7`]) +
186	((limb) ((s32) in2[`5`])) * ((s32) in[`9`]) +
187	((limb) ((s32) in2[`9`])) * ((s32) in[`5`])) +
188	((limb) ((s32) in2[`6`])) * ((s32) in[`8`]) +
189	((limb) ((s32) in2[`8`])) * ((s32) in[`6`]);
190	output[`15`] = ((limb) ((s32) in2[`7`])) * ((s32) in[`8`]) +
191	((limb) ((s32) in2[`8`])) * ((s32) in[`7`]) +
192	((limb) ((s32) in2[`6`])) * ((s32) in[`9`]) +
193	((limb) ((s32) in2[`9`])) * ((s32) in[`6`]);
194	output[`16`] = ((limb) ((s32) in2[`8`])) * ((s32) in[`8`]) +
195	`2` * (((limb) ((s32) in2[`7`])) * ((s32) in[`9`]) +
196	((limb) ((s32) in2[`9`])) * ((s32) in[`7`]));
197	output[`17`] = ((limb) ((s32) in2[`8`])) * ((s32) in[`9`]) +
198	((limb) ((s32) in2[`9`])) * ((s32) in[`8`]);
199	output[`18`] = `2` * ((limb) ((s32) in2[`9`])) * ((s32) in[`9`]);
200	}
201
202	/ Reduce a long form to a short form by taking the input mod 2^255 - 19.*
203	*
204	* On entry: \|output[i]\| < 14*2^54
205	* On exit: \|output[0..8]\| < 2802^54 /
206	static void freduce_degree(limb *output) {
207	/ Each of these shifts and adds ends up multiplying the value by 19.*
208	*
209	* For output[0..8], the absolute entry value is < 14*2^54 and we add, at
210	* most, 19142^54 thus, on exit, \|output[0..8]\| < 2802^54. /
211	output[`8`] += output[`18`] << `4`;
212	output[`8`] += output[`18`] << `1`;
213	output[`8`] += output[`18`];
214	output[`7`] += output[`17`] << `4`;
215	output[`7`] += output[`17`] << `1`;
216	output[`7`] += output[`17`];
217	output[`6`] += output[`16`] << `4`;
218	output[`6`] += output[`16`] << `1`;
219	output[`6`] += output[`16`];
220	output[`5`] += output[`15`] << `4`;
221	output[`5`] += output[`15`] << `1`;
222	output[`5`] += output[`15`];
223	output[`4`] += output[`14`] << `4`;
224	output[`4`] += output[`14`] << `1`;
225	output[`4`] += output[`14`];
226	output[`3`] += output[`13`] << `4`;
227	output[`3`] += output[`13`] << `1`;
228	output[`3`] += output[`13`];
229	output[`2`] += output[`12`] << `4`;
230	output[`2`] += output[`12`] << `1`;
231	output[`2`] += output[`12`];
232	output[`1`] += output[`11`] << `4`;
233	output[`1`] += output[`11`] << `1`;
234	output[`1`] += output[`11`];
235	output[`0`] += output[`10`] << `4`;
236	output[`0`] += output[`10`] << `1`;
237	output[`0`] += output[`10`];
238	}
239
240	#if (-1 & 3) != 3
241	#error "This code only works on a two's complement system"
242	#endif
243
244	/ return v / 2^26, using only shifts and adds.*
245	*
246	* On entry: v can take any value. */
247	static inline limb
248	div_by_2_26(const limb v)
249	{
250	/ High word of v; no shift needed. /
251	const uint32_t highword = (uint32_t) (((uint64_t) v) >> `32`);
252	/ Set to all 1s if v was negative; else set to 0s. /
253	const int32_t sign = ((int32_t) highword) >> `31`;
254	/ Set to 0x3ffffff if v was negative; else set to 0. /
255	const int32_t roundoff = ((uint32_t) sign) >> `6`;
256	/ Should return v / (1<<26) /
257	return (v + roundoff) >> `26`;
258	}
259
260	/ return v / (2^25), using only shifts and adds.*
261	*
262	* On entry: v can take any value. */
263	static inline limb
264	div_by_2_25(const limb v)
265	{
266	/ High word of v; no shift needed/
267	const uint32_t highword = (uint32_t) (((uint64_t) v) >> `32`);
268	/ Set to all 1s if v was negative; else set to 0s. /
269	const int32_t sign = ((int32_t) highword) >> `31`;
270	/ Set to 0x1ffffff if v was negative; else set to 0. /
271	const int32_t roundoff = ((uint32_t) sign) >> `7`;
272	/ Should return v / (1<<25) /
273	return (v + roundoff) >> `25`;
274	}
275
276	/ Reduce all coefficients of the short form input so that \|x\| < 2^26.*
277	*
278	* On entry: \|output[i]\| < 2802^54 /
279	static void freduce_coefficients(limb *output) {
280	unsigned i;
281
282	output[`10`] = `0`;
283
284	for (i = `0`; i < `10`; i += `2`) {
285	limb over = div_by_2_26(v: output[i]);
286	/ The entry condition (that \|output[i]\| < 2802^54) means that over is, at
287	* most, 280*2^28 in the first iteration of this loop. This is added to the
288	* next limb and we can approximate the resulting bound of that limb by
289	* 2812^54. /
290	output[i] -= over << `26`;
291	output[i+`1`] += over;
292
293	/ For the first iteration, \|output[i+1]\| < 2812^54, thus \|over\| <
294	* 281*2^29. When this is added to the next limb, the resulting bound can
295	* be approximated as 281*2^54.
296	*
297	* For subsequent iterations of the loop, 281*2^54 remains a conservative
298	* bound and no overflow occurs. */
299	over = div_by_2_25(v: output[i+`1`]);
300	output[i+`1`] -= over << `25`;
301	output[i+`2`] += over;
302	}
303	/ Now \|output[10]\| < 2812^29 and all other coefficients are reduced. /*
304	output[`0`] += output[`10`] << `4`;
305	output[`0`] += output[`10`] << `1`;
306	output[`0`] += output[`10`];
307
308	output[`10`] = `0`;
309
310	/ Now output[1..9] are reduced, and \|output[0]\| < 2^26 + 192812^29*
311	* So \|over\| will be no more than 2^16. */
312	{
313	limb over = div_by_2_26(v: output[`0`]);
314	output[`0`] -= over << `26`;
315	output[`1`] += over;
316	}
317
318	/ Now output[0,2..9] are reduced, and \|output[1]\| < 2^25 + 2^16 < 2^26. The*
319	* bound on \|output[1]\| is sufficient to meet our needs. */
320	}
321
322	/ A helpful wrapper around fproduct: output = in * in2.*
323	*
324	* On entry: \|in[i]\| < 2^27 and \|in2[i]\| < 2^27.
325	*
326	* output must be distinct to both inputs. The output is reduced degree
327	* (indeed, one need only provide storage for 10 limbs) and \|output[i]\| < 2^26. */
328	static void
329	fmul(limb output, const* limb in, const* limb *in2) {
330	limb t[`19`];
331	fproduct(output: t, in2: in, in: in2);
332	/ \|t[i]\| < 142^54 /*
333	freduce_degree(output: t);
334	freduce_coefficients(output: t);
335	/ \|t[i]\| < 2^26 /
336	memcpy(dest: output, src: t, n: sizeof(limb) * `10`);
337	}
338
339	/ Square a number: output = in*2
340	*
341	* output must be distinct from the input. The inputs are reduced coefficient
342	* form, the output is not.
343	*
344	* output[x] <= 14 * the largest product of the input limbs. */
345	static void fsquare_inner(limb output, const* limb *in) {
346	output[`0`] = ((limb) ((s32) in[`0`])) * ((s32) in[`0`]);
347	output[`1`] = `2` * ((limb) ((s32) in[`0`])) * ((s32) in[`1`]);
348	output[`2`] = `2` * (((limb) ((s32) in[`1`])) * ((s32) in[`1`]) +
349	((limb) ((s32) in[`0`])) * ((s32) in[`2`]));
350	output[`3`] = `2` * (((limb) ((s32) in[`1`])) * ((s32) in[`2`]) +
351	((limb) ((s32) in[`0`])) * ((s32) in[`3`]));
352	output[`4`] = ((limb) ((s32) in[`2`])) * ((s32) in[`2`]) +
353	`4` * ((limb) ((s32) in[`1`])) * ((s32) in[`3`]) +
354	`2` * ((limb) ((s32) in[`0`])) * ((s32) in[`4`]);
355	output[`5`] = `2` * (((limb) ((s32) in[`2`])) * ((s32) in[`3`]) +
356	((limb) ((s32) in[`1`])) * ((s32) in[`4`]) +
357	((limb) ((s32) in[`0`])) * ((s32) in[`5`]));
358	output[`6`] = `2` * (((limb) ((s32) in[`3`])) * ((s32) in[`3`]) +
359	((limb) ((s32) in[`2`])) * ((s32) in[`4`]) +
360	((limb) ((s32) in[`0`])) * ((s32) in[`6`]) +
361	`2` * ((limb) ((s32) in[`1`])) * ((s32) in[`5`]));
362	output[`7`] = `2` * (((limb) ((s32) in[`3`])) * ((s32) in[`4`]) +
363	((limb) ((s32) in[`2`])) * ((s32) in[`5`]) +
364	((limb) ((s32) in[`1`])) * ((s32) in[`6`]) +
365	((limb) ((s32) in[`0`])) * ((s32) in[`7`]));
366	output[`8`] = ((limb) ((s32) in[`4`])) * ((s32) in[`4`]) +
367	`2` * (((limb) ((s32) in[`2`])) * ((s32) in[`6`]) +
368	((limb) ((s32) in[`0`])) * ((s32) in[`8`]) +
369	`2` * (((limb) ((s32) in[`1`])) * ((s32) in[`7`]) +
370	((limb) ((s32) in[`3`])) * ((s32) in[`5`])));
371	output[`9`] = `2` * (((limb) ((s32) in[`4`])) * ((s32) in[`5`]) +
372	((limb) ((s32) in[`3`])) * ((s32) in[`6`]) +
373	((limb) ((s32) in[`2`])) * ((s32) in[`7`]) +
374	((limb) ((s32) in[`1`])) * ((s32) in[`8`]) +
375	((limb) ((s32) in[`0`])) * ((s32) in[`9`]));
376	output[`10`] = `2` * (((limb) ((s32) in[`5`])) * ((s32) in[`5`]) +
377	((limb) ((s32) in[`4`])) * ((s32) in[`6`]) +
378	((limb) ((s32) in[`2`])) * ((s32) in[`8`]) +
379	`2` * (((limb) ((s32) in[`3`])) * ((s32) in[`7`]) +
380	((limb) ((s32) in[`1`])) * ((s32) in[`9`])));
381	output[`11`] = `2` * (((limb) ((s32) in[`5`])) * ((s32) in[`6`]) +
382	((limb) ((s32) in[`4`])) * ((s32) in[`7`]) +
383	((limb) ((s32) in[`3`])) * ((s32) in[`8`]) +
384	((limb) ((s32) in[`2`])) * ((s32) in[`9`]));
385	output[`12`] = ((limb) ((s32) in[`6`])) * ((s32) in[`6`]) +
386	`2` * (((limb) ((s32) in[`4`])) * ((s32) in[`8`]) +
387	`2` * (((limb) ((s32) in[`5`])) * ((s32) in[`7`]) +
388	((limb) ((s32) in[`3`])) * ((s32) in[`9`])));
389	output[`13`] = `2` * (((limb) ((s32) in[`6`])) * ((s32) in[`7`]) +
390	((limb) ((s32) in[`5`])) * ((s32) in[`8`]) +
391	((limb) ((s32) in[`4`])) * ((s32) in[`9`]));
392	output[`14`] = `2` * (((limb) ((s32) in[`7`])) * ((s32) in[`7`]) +
393	((limb) ((s32) in[`6`])) * ((s32) in[`8`]) +
394	`2` * ((limb) ((s32) in[`5`])) * ((s32) in[`9`]));
395	output[`15`] = `2` * (((limb) ((s32) in[`7`])) * ((s32) in[`8`]) +
396	((limb) ((s32) in[`6`])) * ((s32) in[`9`]));
397	output[`16`] = ((limb) ((s32) in[`8`])) * ((s32) in[`8`]) +
398	`4` * ((limb) ((s32) in[`7`])) * ((s32) in[`9`]);
399	output[`17`] = `2` * ((limb) ((s32) in[`8`])) * ((s32) in[`9`]);
400	output[`18`] = `2` * ((limb) ((s32) in[`9`])) * ((s32) in[`9`]);
401	}
402
403	/ fsquare sets output = in^2.*
404	*
405	* On entry: The \|in\| argument is in reduced coefficients form and \|in[i]\| <
406	* 2^27.
407	*
408	* On exit: The \|output\| argument is in reduced coefficients form (indeed, one
409	* need only provide storage for 10 limbs) and \|out[i]\| < 2^26. */
410	static void
411	fsquare(limb output, const* limb *in) {
412	limb t[`19`];
413	fsquare_inner(output: t, in);
414	/ \|t[i]\| < 142^54 because the largest product of two limbs will be <
415	* 2^(27+27) and fsquare_inner adds together, at most, 14 of those
416	* products. */
417	freduce_degree(output: t);
418	freduce_coefficients(output: t);
419	/ \|t[i]\| < 2^26 /
420	memcpy(dest: output, src: t, n: sizeof(limb) * `10`);
421	}
422
423	/ Take a little-endian, 32-byte number and expand it into polynomial form /
424	static void
425	fexpand(limb output, const* u8 *input) {
426	#define F(n,start,shift,mask) \
427	output[n] = ((((limb) input[start + 0]) \| \
428	((limb) input[start + 1]) << 8 \| \
429	((limb) input[start + 2]) << 16 \| \
430	((limb) input[start + 3]) << 24) >> shift) & mask;
431	F(`0`, `0`, `0`, `0x3ffffff`);
432	F(`1`, `3`, `2`, `0x1ffffff`);
433	F(`2`, `6`, `3`, `0x3ffffff`);
434	F(`3`, `9`, `5`, `0x1ffffff`);
435	F(`4`, `12`, `6`, `0x3ffffff`);
436	F(`5`, `16`, `0`, `0x1ffffff`);
437	F(`6`, `19`, `1`, `0x3ffffff`);
438	F(`7`, `22`, `3`, `0x1ffffff`);
439	F(`8`, `25`, `4`, `0x3ffffff`);
440	F(`9`, `28`, `6`, `0x1ffffff`);
441	#undef F
442	}
443
444	#if (-32 >> 1) != -16
445	#error "This code only works when >> does sign-extension on negative numbers"
446	#endif
447
448	/ s32_eq returns 0xffffffff iff a == b and zero otherwise. /
449	static s32 s32_eq(s32 a, s32 b) {
450	a = ~(a ^ b);
451	a &= a << `16`;
452	a &= a << `8`;
453	a &= a << `4`;
454	a &= a << `2`;
455	a &= a << `1`;
456	return a >> `31`;
457	}
458
459	/ s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are*
460	* both non-negative. */
461	static s32 s32_gte(s32 a, s32 b) {
462	a -= b;
463	/ a >= 0 iff a >= b. /
464	return ~(a >> `31`);
465	}
466
467	/ Take a fully reduced polynomial form number and contract it into a*
468	* little-endian, 32-byte array.
469	*
470	* On entry: \|input_limbs[i]\| < 2^26 */
471	static void
472	fcontract(u8 output, limb input_limbs) {
473	int i;
474	int j;
475	s32 input[`10`];
476	s32 mask;
477
478	/ \|input_limbs[i]\| < 2^26, so it's valid to convert to an s32. /
479	for (i = `0`; i < `10`; i++) {
480	input[i] = input_limbs[i];
481	}
482
483	for (j = `0`; j < `2`; ++j) {
484	for (i = `0`; i < `9`; ++i) {
485	if ((i & `1`) == `1`) {
486	/ This calculation is a time-invariant way to make input[i]*
487	* non-negative by borrowing from the next-larger limb. */
488	const s32 mask = input[i] >> `31`;
489	const s32 carry = -((input[i] & mask) >> `25`);
490	input[i] = input[i] + (carry << `25`);
491	input[i+`1`] = input[i+`1`] - carry;
492	} else {
493	const s32 mask = input[i] >> `31`;
494	const s32 carry = -((input[i] & mask) >> `26`);
495	input[i] = input[i] + (carry << `26`);
496	input[i+`1`] = input[i+`1`] - carry;
497	}
498	}
499
500	/ There's no greater limb for input[9] to borrow from, but we can multiply*
501	* by 19 and borrow from input[0], which is valid mod 2^255-19. */
502	{
503	const s32 mask = input[`9`] >> `31`;
504	const s32 carry = -((input[`9`] & mask) >> `25`);
505	input[`9`] = input[`9`] + (carry << `25`);
506	input[`0`] = input[`0`] - (carry * `19`);
507	}
508
509	/ After the first iteration, input[1..9] are non-negative and fit within*
510	* 25 or 26 bits, depending on position. However, input[0] may be
511	* negative. */
512	}
513
514	/ The first borrow-propagation pass above ended with every limb*
515	except (possibly) input[0] non-negative.
516
517	If input[0] was negative after the first pass, then it was because of a
518	carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most,
519	one, since (226-1) >> 25 = 1. Thus input[0] >= -19.
520
521	In the second pass, each limb is decreased by at most one. Thus the second
522	borrow-propagation pass could only have wrapped around to decrease
523	input[0] again if the first pass left input[0] negative and* input[1]*
524	through input[9] were all zero. In that case, input[1] is now 2^25 - 1,
525	and this last borrow-propagation step will leave input[1] non-negative. /*
526	{
527	const s32 mask = input[`0`] >> `31`;
528	const s32 carry = -((input[`0`] & mask) >> `26`);
529	input[`0`] = input[`0`] + (carry << `26`);
530	input[`1`] = input[`1`] - carry;
531	}
532
533	/ All input[i] are now non-negative. However, there might be values between*
534	* 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */
535	for (j = `0`; j < `2`; j++) {
536	for (i = `0`; i < `9`; i++) {
537	if ((i & `1`) == `1`) {
538	const s32 carry = input[i] >> `25`;
539	input[i] &= `0x1ffffff`;
540	input[i+`1`] += carry;
541	} else {
542	const s32 carry = input[i] >> `26`;
543	input[i] &= `0x3ffffff`;
544	input[i+`1`] += carry;
545	}
546	}
547
548	{
549	const s32 carry = input[`9`] >> `25`;
550	input[`9`] &= `0x1ffffff`;
551	input[`0`] += `19`*carry;
552	}
553	}
554
555	/ If the first carry-chain pass, just above, ended up with a carry from*
556	* input[9], and that caused input[0] to be out-of-bounds, then input[0] was
557	* < 2^26 + 2*19, because the carry was, at most, two.
558	*
559	* If the second pass carried from input[9] again then input[0] is < 2*19 and
560	* the input[9] -> input[0] carry didn't push input[0] out of bounds. */
561
562	/ It still remains the case that input might be between 2^255-19 and 2^255.*
563	* In this case, input[1..9] must take their maximum value and input[0] must
564	* be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */
565	mask = s32_gte(a: input[`0`], b: `0x3ffffed`);
566	for (i = `1`; i < `10`; i++) {
567	if ((i & `1`) == `1`) {
568	mask &= s32_eq(a: input[i], b: `0x1ffffff`);
569	} else {
570	mask &= s32_eq(a: input[i], b: `0x3ffffff`);
571	}
572	}
573
574	/ mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus*
575	* this conditionally subtracts 2^255-19. */
576	input[`0`] -= mask & `0x3ffffed`;
577
578	for (i = `1`; i < `10`; i++) {
579	if ((i & `1`) == `1`) {
580	input[i] -= mask & `0x1ffffff`;
581	} else {
582	input[i] -= mask & `0x3ffffff`;
583	}
584	}
585
586	input[`1`] <<= `2`;
587	input[`2`] <<= `3`;
588	input[`3`] <<= `5`;
589	input[`4`] <<= `6`;
590	input[`6`] <<= `1`;
591	input[`7`] <<= `3`;
592	input[`8`] <<= `4`;
593	input[`9`] <<= `6`;
594	#define F(i, s) \
595	output[s+0] \|= input[i] & 0xff; \
596	output[s+1] = (input[i] >> 8) & 0xff; \
597	output[s+2] = (input[i] >> 16) & 0xff; \
598	output[s+3] = (input[i] >> 24) & 0xff;
599	output[`0`] = `0`;
600	output[`16`] = `0`;
601	F(`0`,`0`);
602	F(`1`,`3`);
603	F(`2`,`6`);
604	F(`3`,`9`);
605	F(`4`,`12`);
606	F(`5`,`16`);
607	F(`6`,`19`);
608	F(`7`,`22`);
609	F(`8`,`25`);
610	F(`9`,`28`);
611	#undef F
612	}
613
614	/ Input: Q, Q', Q-Q'*
615	* Output: 2Q, Q+Q'
616	*
617	* x2 z3: long form
618	* x3 z3: long form
619	* x z: short form, destroyed
620	* xprime zprime: short form, destroyed
621	* qmqp: short form, preserved
622	*
623	* On entry and exit, the absolute value of the limbs of all inputs and outputs
624	* are < 2^26. */
625	static void fmonty(limb x2, limb z2, / output 2Q /
626	limb x3, limb z3, / output Q + Q' /
627	limb x, limb z, / input Q /
628	limb xprime, limb zprime, / input Q' /
629	const limb qmqp /* input Q - Q' /) {
630	limb origx[`10`], origxprime[`10`], zzz[`19`], xx[`19`], zz[`19`], xxprime[`19`],
631	zzprime[`19`], zzzprime[`19`], xxxprime[`19`];
632
633	memcpy(dest: origx, src: x, n: `10` * sizeof(limb));
634	fsum(output: x, in: z);
635	/ \|x[i]\| < 2^27 /
636	fdifference(output: z, in: origx); / does x - z /
637	/ \|z[i]\| < 2^27 /
638
639	memcpy(dest: origxprime, src: xprime, n: sizeof(limb) * `10`);
640	fsum(output: xprime, in: zprime);
641	/ \|xprime[i]\| < 2^27 /
642	fdifference(output: zprime, in: origxprime);
643	/ \|zprime[i]\| < 2^27 /
644	fproduct(output: xxprime, in2: xprime, in: z);
645	/ \|xxprime[i]\| < 142^54: the largest product of two limbs will be <
646	* 2^(27+27) and fproduct adds together, at most, 14 of those products.
647	* (Approximating that to 2^58 doesn't work out.) */
648	fproduct(output: zzprime, in2: x, in: zprime);
649	/ \|zzprime[i]\| < 142^54 /*
650	freduce_degree(output: xxprime);
651	freduce_coefficients(output: xxprime);
652	/ \|xxprime[i]\| < 2^26 /
653	freduce_degree(output: zzprime);
654	freduce_coefficients(output: zzprime);
655	/ \|zzprime[i]\| < 2^26 /
656	memcpy(dest: origxprime, src: xxprime, n: sizeof(limb) * `10`);
657	fsum(output: xxprime, in: zzprime);
658	/ \|xxprime[i]\| < 2^27 /
659	fdifference(output: zzprime, in: origxprime);
660	/ \|zzprime[i]\| < 2^27 /
661	fsquare(output: xxxprime, in: xxprime);
662	/ \|xxxprime[i]\| < 2^26 /
663	fsquare(output: zzzprime, in: zzprime);
664	/ \|zzzprime[i]\| < 2^26 /
665	fproduct(output: zzprime, in2: zzzprime, in: qmqp);
666	/ \|zzprime[i]\| < 142^52 /*
667	freduce_degree(output: zzprime);
668	freduce_coefficients(output: zzprime);
669	/ \|zzprime[i]\| < 2^26 /
670	memcpy(dest: x3, src: xxxprime, n: sizeof(limb) * `10`);
671	memcpy(dest: z3, src: zzprime, n: sizeof(limb) * `10`);
672
673	fsquare(output: xx, in: x);
674	/ \|xx[i]\| < 2^26 /
675	fsquare(output: zz, in: z);
676	/ \|zz[i]\| < 2^26 /
677	fproduct(output: x2, in2: xx, in: zz);
678	/ \|x2[i]\| < 142^52 /*
679	freduce_degree(output: x2);
680	freduce_coefficients(output: x2);
681	/ \|x2[i]\| < 2^26 /
682	fdifference(output: zz, in: xx); // does zz = xx - zz
683	/ \|zz[i]\| < 2^27 /
684	memset(s: zzz + `10`, c: `0`, n: sizeof(limb) * `9`);
685	fscalar_product(output: zzz, in: zz, scalar: `121665`);
686	/ \|zzz[i]\| < 2^(27+17) /
687	/ No need to call freduce_degree here:*
688	fscalar_product doesn't increase the degree of its input. /*
689	freduce_coefficients(output: zzz);
690	/ \|zzz[i]\| < 2^26 /
691	fsum(output: zzz, in: xx);
692	/ \|zzz[i]\| < 2^27 /
693	fproduct(output: z2, in2: zz, in: zzz);
694	/ \|z2[i]\| < 142^(26+27) /*
695	freduce_degree(output: z2);
696	freduce_coefficients(output: z2);
697	/ \|z2\|i\| < 2^26 /
698	}
699
700	/ Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave*
701	* them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
702	* side-channel attacks.
703	*
704	* NOTE that this function requires that 'iswap' be 1 or 0; other values give
705	* wrong results. Also, the two limb arrays must be in reduced-coefficient,
706	* reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
707	* and all all values in a[0..9],b[0..9] must have magnitude less than
708	* INT32_MAX. */
709	static void
710	swap_conditional(limb a[`19`], limb b[`19`], limb iswap) {
711	unsigned i;
712	const s32 swap = (s32) -iswap;
713
714	for (i = `0`; i < `10`; ++i) {
715	const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
716	a[i] = ((s32)a[i]) ^ x;
717	b[i] = ((s32)b[i]) ^ x;
718	}
719	}
720
721	/ Calculates nQ where Q is the x-coordinate of a point on the curve*
722	*
723	* resultx/resultz: the x coordinate of the resulting curve point (short form)
724	* n: a little endian, 32-byte number
725	* q: a point of the curve (short form) */
726	static void
727	cmult(limb resultx, limb resultz, const u8 n, const* limb *q) {
728	limb a[`19`] = {`0`}, b[`19`] = {`1`}, c[`19`] = {`1`}, d[`19`] = {`0`};
729	limb nqpqx = a, nqpqz = b, nqx = c, nqz = d, *t;
730	limb e[`19`] = {`0`}, f[`19`] = {`1`}, g[`19`] = {`0`}, h[`19`] = {`1`};
731	limb nqpqx2 = e, nqpqz2 = f, nqx2 = g, nqz2 = h;
732
733	unsigned i, j;
734
735	memcpy(dest: nqpqx, src: q, n: sizeof(limb) * `10`);
736
737	for (i = `0`; i < `32`; ++i) {
738	u8 byte = n[`31` - i];
739	for (j = `0`; j < `8`; ++j) {
740	const limb bit = byte >> `7`;
741
742	swap_conditional(a: nqx, b: nqpqx, iswap: bit);
743	swap_conditional(a: nqz, b: nqpqz, iswap: bit);
744	fmonty(x2: nqx2, z2: nqz2,
745	x3: nqpqx2, z3: nqpqz2,
746	x: nqx, z: nqz,
747	xprime: nqpqx, zprime: nqpqz,
748	qmqp: q);
749	swap_conditional(a: nqx2, b: nqpqx2, iswap: bit);
750	swap_conditional(a: nqz2, b: nqpqz2, iswap: bit);
751
752	t = nqx;
753	nqx = nqx2;
754	nqx2 = t;
755	t = nqz;
756	nqz = nqz2;
757	nqz2 = t;
758	t = nqpqx;
759	nqpqx = nqpqx2;
760	nqpqx2 = t;
761	t = nqpqz;
762	nqpqz = nqpqz2;
763	nqpqz2 = t;
764
765	byte <<= `1`;
766	}
767	}
768
769	memcpy(dest: resultx, src: nqx, n: sizeof(limb) * `10`);
770	memcpy(dest: resultz, src: nqz, n: sizeof(limb) * `10`);
771	}
772
773	// -----------------------------------------------------------------------------
774	// Shamelessly copied from djb's code
775	// -----------------------------------------------------------------------------
776	static void
777	crecip(limb out, const* limb *z) {
778	limb z2[`10`];
779	limb z9[`10`];
780	limb z11[`10`];
781	limb z2_5_0[`10`];
782	limb z2_10_0[`10`];
783	limb z2_20_0[`10`];
784	limb z2_50_0[`10`];
785	limb z2_100_0[`10`];
786	limb t0[`10`];
787	limb t1[`10`];
788	int i;
789
790	/ 2 / fsquare(output: z2,in: z);
791	/ 4 / fsquare(output: t1,in: z2);
792	/ 8 / fsquare(output: t0,in: t1);
793	/ 9 / fmul(output: z9,in: t0,in2: z);
794	/ 11 / fmul(output: z11,in: z9,in2: z2);
795	/ 22 / fsquare(output: t0,in: z11);
796	/ 2^5 - 2^0 = 31 / fmul(output: z2_5_0,in: t0,in2: z9);
797
798	/ 2^6 - 2^1 / fsquare(output: t0,in: z2_5_0);
799	/ 2^7 - 2^2 / fsquare(output: t1,in: t0);
800	/ 2^8 - 2^3 / fsquare(output: t0,in: t1);
801	/ 2^9 - 2^4 / fsquare(output: t1,in: t0);
802	/ 2^10 - 2^5 / fsquare(output: t0,in: t1);
803	/ 2^10 - 2^0 / fmul(output: z2_10_0,in: t0,in2: z2_5_0);
804
805	/ 2^11 - 2^1 / fsquare(output: t0,in: z2_10_0);
806	/ 2^12 - 2^2 / fsquare(output: t1,in: t0);
807	/ 2^20 - 2^10 / for (i = `2`;i < `10`;i += `2`) { fsquare(output: t0,in: t1); fsquare(output: t1,in: t0); }
808	/ 2^20 - 2^0 / fmul(output: z2_20_0,in: t1,in2: z2_10_0);
809
810	/ 2^21 - 2^1 / fsquare(output: t0,in: z2_20_0);
811	/ 2^22 - 2^2 / fsquare(output: t1,in: t0);
812	/ 2^40 - 2^20 / for (i = `2`;i < `20`;i += `2`) { fsquare(output: t0,in: t1); fsquare(output: t1,in: t0); }
813	/ 2^40 - 2^0 / fmul(output: t0,in: t1,in2: z2_20_0);
814
815	/ 2^41 - 2^1 / fsquare(output: t1,in: t0);
816	/ 2^42 - 2^2 / fsquare(output: t0,in: t1);
817	/ 2^50 - 2^10 / for (i = `2`;i < `10`;i += `2`) { fsquare(output: t1,in: t0); fsquare(output: t0,in: t1); }
818	/ 2^50 - 2^0 / fmul(output: z2_50_0,in: t0,in2: z2_10_0);
819
820	/ 2^51 - 2^1 / fsquare(output: t0,in: z2_50_0);
821	/ 2^52 - 2^2 / fsquare(output: t1,in: t0);
822	/ 2^100 - 2^50 / for (i = `2`;i < `50`;i += `2`) { fsquare(output: t0,in: t1); fsquare(output: t1,in: t0); }
823	/ 2^100 - 2^0 / fmul(output: z2_100_0,in: t1,in2: z2_50_0);
824
825	/ 2^101 - 2^1 / fsquare(output: t1,in: z2_100_0);
826	/ 2^102 - 2^2 / fsquare(output: t0,in: t1);
827	/ 2^200 - 2^100 / for (i = `2`;i < `100`;i += `2`) { fsquare(output: t1,in: t0); fsquare(output: t0,in: t1); }
828	/ 2^200 - 2^0 / fmul(output: t1,in: t0,in2: z2_100_0);
829
830	/ 2^201 - 2^1 / fsquare(output: t0,in: t1);
831	/ 2^202 - 2^2 / fsquare(output: t1,in: t0);
832	/ 2^250 - 2^50 / for (i = `2`;i < `50`;i += `2`) { fsquare(output: t0,in: t1); fsquare(output: t1,in: t0); }
833	/ 2^250 - 2^0 / fmul(output: t0,in: t1,in2: z2_50_0);
834
835	/ 2^251 - 2^1 / fsquare(output: t1,in: t0);
836	/ 2^252 - 2^2 / fsquare(output: t0,in: t1);
837	/ 2^253 - 2^3 / fsquare(output: t1,in: t0);
838	/ 2^254 - 2^4 / fsquare(output: t0,in: t1);
839	/ 2^255 - 2^5 / fsquare(output: t1,in: t0);
840	/ 2^255 - 21 / fmul(output: out,in: t1,in2: z11);
841	}
842
843	int
844	curve25519_donna(u8 mypublic, const* u8 secret, const* u8 *basepoint) {
845	limb bp[`10`], x[`10`], z[`11`], zmone[`10`];
846	uint8_t e[`32`];
847	int i;
848
849	for (i = `0`; i < `32`; ++i) e[i] = secret[i];
850	e[`0`] &= `248`;
851	e[`31`] &= `127`;
852	e[`31`] \|= `64`;
853
854	fexpand(output: bp, input: basepoint);
855	cmult(resultx: x, resultz: z, n: e, q: bp);
856	crecip(out: zmone, z);
857	fmul(output: z, in: x, in2: zmone);
858	fcontract(output: mypublic, input_limbs: z);
859	return `0`;
860	}
861

Definitions

fsum
fdifference
fscalar_product
fproduct
freduce_degree
div_by_2_26
div_by_2_25
freduce_coefficients
fmul
fsquare_inner
fsquare
fexpand
s32_eq
s32_gte
fcontract
fmonty
swap_conditional
cmult
crecip

Browse the source code of olm/lib/curve25519-donna/curve25519-donna.c

Definitions