Use fiat-crypto's freeze function for fe_tobytes.

It requires a handful of additional intrinsics for now.

Fiat's freeze function only works on the tight bounds, so fe_isnonzero
gains an extra fe_carry. But all other calls of fe_tobytes are of tight
bounds anyway.

Change-Id: I834858cee7863c7344e456d7a7dbf4f414f04ae5
Reviewed-on: https://boringssl-review.googlesource.com/24545
Reviewed-by: Adam Langley <agl@google.com>
This commit is contained in:
David Benjamin 2018-01-03 17:38:28 -05:00 committed by Adam Langley
parent 2f9b47fb19
commit 00208b443c
2 changed files with 140 additions and 110 deletions

View File

@ -7,7 +7,7 @@ licensed under the MIT license. (See LICENSE file.)
## Curve25519
To generate the field arithmetic procedures in `curve25519.c` from a fiat-crypto
checkout (as of `c47f48268f15e202a28b556845f231b2038cb426`), run
checkout (as of `693d62c6fd7370bf71b8eb3b9a5825dfd071fcac`), run
`make src/Specific/solinas32_2e255m19_10limbs/femul.c` (replacing `femul` with
the desired field operation). The "source" file specifying the finite field and
referencing the desired implementation strategy is

View File

@ -41,9 +41,6 @@
#include "../../crypto/internal.h"
static const int64_t kBottom25Bits = INT64_C(0x1ffffff);
static const int64_t kBottom26Bits = INT64_C(0x3ffffff);
static uint64_t load_3(const uint8_t *in) {
uint64_t result;
result = (uint64_t)in[0];
@ -73,6 +70,12 @@ static uint64_t load_4(const uint8_t *in) {
} \
} while (0)
#define assert_fe_frozen(f) do { \
for (unsigned _assert_fe_i = 0; _assert_fe_i< 10; _assert_fe_i++) { \
assert(f[_assert_fe_i] < (1u<<(26-(_assert_fe_i&1)))); \
} \
} while (0)
static void fe_frombytes_impl(uint32_t h[10], const uint8_t *s) {
// Ignores top bit of s.
uint32_t a0 = load_4(s);
@ -100,115 +103,140 @@ static void fe_frombytes(fe *h, const uint8_t *s) {
fe_frombytes_impl(h->v, s);
}
// Preconditions:
// |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
//
// Write p=2^255-19; q=floor(h/p).
// Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
//
// Proof:
// Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
// Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
//
// Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
// Then 0<y<1.
//
// Write r=h-pq.
// Have 0<=r<=p-1=2^255-20.
// Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
//
// Write x=r+19(2^-255)r+y.
// Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
//
// Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
// so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
static void fe_tobytes_impl(uint8_t s[32], const uint32_t h[10]) {
assert_fe_loose(h);
int32_t h0 = h[0];
int32_t h1 = h[1];
int32_t h2 = h[2];
int32_t h3 = h[3];
int32_t h4 = h[4];
int32_t h5 = h[5];
int32_t h6 = h[6];
int32_t h7 = h[7];
int32_t h8 = h[8];
int32_t h9 = h[9];
int32_t q;
q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;
q = (h2 + q) >> 26;
q = (h3 + q) >> 25;
q = (h4 + q) >> 26;
q = (h5 + q) >> 25;
q = (h6 + q) >> 26;
q = (h7 + q) >> 25;
q = (h8 + q) >> 26;
q = (h9 + q) >> 25;
// Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20.
h0 += 19 * q;
// Goal: Output h-2^255 q, which is between 0 and 2^255-20.
h1 += h0 >> 26; h0 &= kBottom26Bits;
h2 += h1 >> 25; h1 &= kBottom25Bits;
h3 += h2 >> 26; h2 &= kBottom26Bits;
h4 += h3 >> 25; h3 &= kBottom25Bits;
h5 += h4 >> 26; h4 &= kBottom26Bits;
h6 += h5 >> 25; h5 &= kBottom25Bits;
h7 += h6 >> 26; h6 &= kBottom26Bits;
h8 += h7 >> 25; h7 &= kBottom25Bits;
h9 += h8 >> 26; h8 &= kBottom26Bits;
h9 &= kBottom25Bits;
// h10 = carry9
// Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
// Have h0+...+2^230 h9 between 0 and 2^255-1;
// evidently 2^255 h10-2^255 q = 0.
// Goal: Output h0+...+2^230 h9.
s[0] = h0 >> 0;
s[1] = h0 >> 8;
s[2] = h0 >> 16;
s[3] = (h0 >> 24) | ((uint32_t)(h1) << 2);
s[4] = h1 >> 6;
s[5] = h1 >> 14;
s[6] = (h1 >> 22) | ((uint32_t)(h2) << 3);
s[7] = h2 >> 5;
s[8] = h2 >> 13;
s[9] = (h2 >> 21) | ((uint32_t)(h3) << 5);
s[10] = h3 >> 3;
s[11] = h3 >> 11;
s[12] = (h3 >> 19) | ((uint32_t)(h4) << 6);
s[13] = h4 >> 2;
s[14] = h4 >> 10;
s[15] = h4 >> 18;
s[16] = h5 >> 0;
s[17] = h5 >> 8;
s[18] = h5 >> 16;
s[19] = (h5 >> 24) | ((uint32_t)(h6) << 1);
s[20] = h6 >> 7;
s[21] = h6 >> 15;
s[22] = (h6 >> 23) | ((uint32_t)(h7) << 3);
s[23] = h7 >> 5;
s[24] = h7 >> 13;
s[25] = (h7 >> 21) | ((uint32_t)(h8) << 4);
s[26] = h8 >> 4;
s[27] = h8 >> 12;
s[28] = (h8 >> 20) | ((uint32_t)(h9) << 6);
s[29] = h9 >> 2;
s[30] = h9 >> 10;
s[31] = h9 >> 18;
static uint8_t /*bool*/ addcarryx_u25(uint8_t /*bool*/ c, uint32_t a,
uint32_t b, uint32_t *low) {
// This function extracts 25 bits of result and 1 bit of carry (26 total), so
// a 32-bit intermediate is sufficient.
uint32_t x = a + b + c;
*low = x & ((1 << 25) - 1);
return (x >> 25) & 1;
}
static void fe_tobytes(uint8_t s[32], const fe *h) {
fe_tobytes_impl(s, h->v);
static uint8_t /*bool*/ addcarryx_u26(uint8_t /*bool*/ c, uint32_t a,
uint32_t b, uint32_t *low) {
// This function extracts 26 bits of result and 1 bit of carry (27 total), so
// a 32-bit intermediate is sufficient.
uint32_t x = a + b + c;
*low = x & ((1 << 26) - 1);
return (x >> 26) & 1;
}
static void fe_loose_tobytes(uint8_t s[32], const fe_loose *h) {
fe_tobytes_impl(s, h->v);
static uint8_t /*bool*/ subborrow_u25(uint8_t /*bool*/ c, uint32_t a,
uint32_t b, uint32_t *low) {
// This function extracts 25 bits of result and 1 bit of borrow (26 total), so
// a 32-bit intermediate is sufficient.
uint32_t x = a - b - c;
*low = x & ((1 << 25) - 1);
return x >> 31;
}
static uint8_t /*bool*/ subborrow_u26(uint8_t /*bool*/ c, uint32_t a,
uint32_t b, uint32_t *low) {
// This function extracts 26 bits of result and 1 bit of borrow (27 total), so
// a 32-bit intermediate is sufficient.
uint32_t x = a - b - c;
*low = x & ((1 << 26) - 1);
return x >> 31;
}
static uint32_t cmovznz32(uint32_t t, uint32_t z, uint32_t nz) {
t = -!!t; // all set if nonzero, 0 if 0
return (t&nz) | ((~t)&z);
}
static void fe_freeze(uint32_t out[10], const uint32_t in1[10]) {
{ const uint32_t x17 = in1[9];
{ const uint32_t x18 = in1[8];
{ const uint32_t x16 = in1[7];
{ const uint32_t x14 = in1[6];
{ const uint32_t x12 = in1[5];
{ const uint32_t x10 = in1[4];
{ const uint32_t x8 = in1[3];
{ const uint32_t x6 = in1[2];
{ const uint32_t x4 = in1[1];
{ const uint32_t x2 = in1[0];
{ uint32_t x20; uint8_t/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20);
{ uint32_t x23; uint8_t/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23);
{ uint32_t x26; uint8_t/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26);
{ uint32_t x29; uint8_t/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29);
{ uint32_t x32; uint8_t/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32);
{ uint32_t x35; uint8_t/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35);
{ uint32_t x38; uint8_t/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38);
{ uint32_t x41; uint8_t/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41);
{ uint32_t x44; uint8_t/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44);
{ uint32_t x47; uint8_t/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47);
{ uint32_t x49 = cmovznz32(x48, 0x0, 0xffffffff);
{ uint32_t x50 = (x49 & 0x3ffffed);
{ uint32_t x52; uint8_t/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52);
{ uint32_t x54 = (x49 & 0x1ffffff);
{ uint32_t x56; uint8_t/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56);
{ uint32_t x58 = (x49 & 0x3ffffff);
{ uint32_t x60; uint8_t/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60);
{ uint32_t x62 = (x49 & 0x1ffffff);
{ uint32_t x64; uint8_t/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64);
{ uint32_t x66 = (x49 & 0x3ffffff);
{ uint32_t x68; uint8_t/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68);
{ uint32_t x70 = (x49 & 0x1ffffff);
{ uint32_t x72; uint8_t/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72);
{ uint32_t x74 = (x49 & 0x3ffffff);
{ uint32_t x76; uint8_t/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76);
{ uint32_t x78 = (x49 & 0x1ffffff);
{ uint32_t x80; uint8_t/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80);
{ uint32_t x82 = (x49 & 0x3ffffff);
{ uint32_t x84; uint8_t/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84);
{ uint32_t x86 = (x49 & 0x1ffffff);
{ uint32_t x88; addcarryx_u25(x85, x47, x86, &x88);
out[0] = x52;
out[1] = x56;
out[2] = x60;
out[3] = x64;
out[4] = x68;
out[5] = x72;
out[6] = x76;
out[7] = x80;
out[8] = x84;
out[9] = x88;
}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
}
static void fe_tobytes(uint8_t s[32], const fe *f) {
assert_fe(f->v);
uint32_t h[10];
fe_freeze(h, f->v);
assert_fe_frozen(h);
s[0] = h[0] >> 0;
s[1] = h[0] >> 8;
s[2] = h[0] >> 16;
s[3] = (h[0] >> 24) | (h[1] << 2);
s[4] = h[1] >> 6;
s[5] = h[1] >> 14;
s[6] = (h[1] >> 22) | (h[2] << 3);
s[7] = h[2] >> 5;
s[8] = h[2] >> 13;
s[9] = (h[2] >> 21) | (h[3] << 5);
s[10] = h[3] >> 3;
s[11] = h[3] >> 11;
s[12] = (h[3] >> 19) | (h[4] << 6);
s[13] = h[4] >> 2;
s[14] = h[4] >> 10;
s[15] = h[4] >> 18;
s[16] = h[5] >> 0;
s[17] = h[5] >> 8;
s[18] = h[5] >> 16;
s[19] = (h[5] >> 24) | (h[6] << 1);
s[20] = h[6] >> 7;
s[21] = h[6] >> 15;
s[22] = (h[6] >> 23) | (h[7] << 3);
s[23] = h[7] >> 5;
s[24] = h[7] >> 13;
s[25] = (h[7] >> 21) | (h[8] << 4);
s[26] = h[8] >> 4;
s[27] = h[8] >> 12;
s[28] = (h[8] >> 20) | (h[9] << 6);
s[29] = h[9] >> 2;
s[30] = h[9] >> 10;
s[31] = h[9] >> 18;
}
// h = f
@ -775,8 +803,10 @@ static void fe_cmov(fe_loose *f, const fe_loose *g, unsigned b) {
// return 0 if f == 0
// return 1 if f != 0
static int fe_isnonzero(const fe_loose *f) {
fe tight;
fe_carry(&tight, f);
uint8_t s[32];
fe_loose_tobytes(s, f);
fe_tobytes(s, &tight);
static const uint8_t zero[32] = {0};
return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;