Remove x86_64 x25519 assembly.
Now that we have 64-bit C code, courtesy of fiat-crypto, the tradeoff for carrying the assembly changes:

Assembly:
Did 16000 Curve25519 base-point multiplication operations in 1059932us (15095.3 ops/sec)
Did 16000 Curve25519 arbitrary point multiplication operations in 1060023us (15094.0 ops/sec)

fiat64:
Did 39000 Curve25519 base-point multiplication operations in 1004712us (38817.1 ops/sec)
Did 14000 Curve25519 arbitrary point multiplication operations in 1006827us (13905.1 ops/sec)

The assembly is still about 9% faster than fiat64 for arbitrary point multiplication, but fiat64 gets to use the Ed25519 tables for the base point multiplication, so overall (the combined rate of one base-point plus one arbitrary point multiplication) it is actually faster to disable the assembly:

>>> 1/(1/15094.0 + 1/15095.3)
7547.324986004976
>>> 1/(1/38817.1 + 1/13905.1)
10237.73016319501

(At the cost of touching a 30kB table.)

The assembly implementation is no longer pulling its weight. Remove it and use the fiat code in all build configurations.

Change-Id: Id736873177d5568bb16ea06994b9fcb1af104e33
Reviewed-on: https://boringssl-review.googlesource.com/25524
Reviewed-by: Adam Langley <agl@google.com>
This commit is contained in:
parent
fa65113400
commit
884086e0e2
@ -8,21 +8,12 @@ if (${ARCH} STREQUAL "arm")
|
||||
)
|
||||
endif()
|
||||
|
||||
if (${ARCH} STREQUAL "x86_64")
|
||||
set(
|
||||
CURVE25519_ARCH_SOURCES
|
||||
|
||||
asm/x25519-asm-x86_64.S
|
||||
)
|
||||
endif()
|
||||
|
||||
add_library(
|
||||
curve25519
|
||||
|
||||
OBJECT
|
||||
|
||||
spake25519.c
|
||||
x25519-x86_64.c
|
||||
|
||||
${CURVE25519_ARCH_SOURCES}
|
||||
)
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,247 +0,0 @@
|
||||
/* Copyright (c) 2015, Google Inc.
|
||||
*
|
||||
* Permission to use, copy, modify, and/or distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||||
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
// This code is mostly taken from the ref10 version of Ed25519 in SUPERCOP
|
||||
// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
|
||||
// public domain but this file has the ISC license just to keep licensing
|
||||
// simple.
|
||||
//
|
||||
// The field functions are shared by Ed25519 and X25519 where possible.
|
||||
|
||||
#include <openssl/curve25519.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "../internal.h"
|
||||
#include "../../third_party/fiat/internal.h"
|
||||
|
||||
|
||||
#if defined(BORINGSSL_X25519_X86_64)
|
||||
|
||||
typedef struct { uint64_t v[5]; } fe25519;
|
||||
|
||||
// These functions are defined in asm/x25519-asm-x86_64.S
|
||||
void x25519_x86_64_work_cswap(fe25519 *, uint64_t);
|
||||
void x25519_x86_64_mul(fe25519 *out, const fe25519 *a, const fe25519 *b);
|
||||
void x25519_x86_64_square(fe25519 *out, const fe25519 *a);
|
||||
void x25519_x86_64_freeze(fe25519 *);
|
||||
void x25519_x86_64_ladderstep(fe25519 *work);
|
||||
|
||||
// Stores the small integer |v| in |r|: limb 0 receives |v| and the four
// remaining limbs are cleared.
static void fe25519_setint(fe25519 *r, unsigned v) {
  int limb;

  r->v[0] = v;
  for (limb = 1; limb < 5; limb++) {
    r->v[limb] = 0;
  }
}
|
||||
|
||||
// Assumes input x being reduced below 2^255
|
||||
static void fe25519_pack(unsigned char r[32], const fe25519 *x) {
|
||||
fe25519 t;
|
||||
t = *x;
|
||||
x25519_x86_64_freeze(&t);
|
||||
|
||||
r[0] = (uint8_t)(t.v[0] & 0xff);
|
||||
r[1] = (uint8_t)((t.v[0] >> 8) & 0xff);
|
||||
r[2] = (uint8_t)((t.v[0] >> 16) & 0xff);
|
||||
r[3] = (uint8_t)((t.v[0] >> 24) & 0xff);
|
||||
r[4] = (uint8_t)((t.v[0] >> 32) & 0xff);
|
||||
r[5] = (uint8_t)((t.v[0] >> 40) & 0xff);
|
||||
r[6] = (uint8_t)((t.v[0] >> 48));
|
||||
|
||||
r[6] ^= (uint8_t)((t.v[1] << 3) & 0xf8);
|
||||
r[7] = (uint8_t)((t.v[1] >> 5) & 0xff);
|
||||
r[8] = (uint8_t)((t.v[1] >> 13) & 0xff);
|
||||
r[9] = (uint8_t)((t.v[1] >> 21) & 0xff);
|
||||
r[10] = (uint8_t)((t.v[1] >> 29) & 0xff);
|
||||
r[11] = (uint8_t)((t.v[1] >> 37) & 0xff);
|
||||
r[12] = (uint8_t)((t.v[1] >> 45));
|
||||
|
||||
r[12] ^= (uint8_t)((t.v[2] << 6) & 0xc0);
|
||||
r[13] = (uint8_t)((t.v[2] >> 2) & 0xff);
|
||||
r[14] = (uint8_t)((t.v[2] >> 10) & 0xff);
|
||||
r[15] = (uint8_t)((t.v[2] >> 18) & 0xff);
|
||||
r[16] = (uint8_t)((t.v[2] >> 26) & 0xff);
|
||||
r[17] = (uint8_t)((t.v[2] >> 34) & 0xff);
|
||||
r[18] = (uint8_t)((t.v[2] >> 42) & 0xff);
|
||||
r[19] = (uint8_t)((t.v[2] >> 50));
|
||||
|
||||
r[19] ^= (uint8_t)((t.v[3] << 1) & 0xfe);
|
||||
r[20] = (uint8_t)((t.v[3] >> 7) & 0xff);
|
||||
r[21] = (uint8_t)((t.v[3] >> 15) & 0xff);
|
||||
r[22] = (uint8_t)((t.v[3] >> 23) & 0xff);
|
||||
r[23] = (uint8_t)((t.v[3] >> 31) & 0xff);
|
||||
r[24] = (uint8_t)((t.v[3] >> 39) & 0xff);
|
||||
r[25] = (uint8_t)((t.v[3] >> 47));
|
||||
|
||||
r[25] ^= (uint8_t)((t.v[4] << 4) & 0xf0);
|
||||
r[26] = (uint8_t)((t.v[4] >> 4) & 0xff);
|
||||
r[27] = (uint8_t)((t.v[4] >> 12) & 0xff);
|
||||
r[28] = (uint8_t)((t.v[4] >> 20) & 0xff);
|
||||
r[29] = (uint8_t)((t.v[4] >> 28) & 0xff);
|
||||
r[30] = (uint8_t)((t.v[4] >> 36) & 0xff);
|
||||
r[31] = (uint8_t)((t.v[4] >> 44));
|
||||
}
|
||||
|
||||
// Decodes the 32-byte little-endian string |x| into the five limbs of |r|,
// 51 consecutive bits per limb. The most significant bit of x[31] is masked
// off and does not influence the result.
static void fe25519_unpack(fe25519 *r, const uint8_t x[32]) {
  // Bits 0..50: bytes 0..5 and the low 3 bits of byte 6.
  const uint64_t limb0 = (uint64_t)x[0] |
                         ((uint64_t)x[1] << 8) |
                         ((uint64_t)x[2] << 16) |
                         ((uint64_t)x[3] << 24) |
                         ((uint64_t)x[4] << 32) |
                         ((uint64_t)x[5] << 40) |
                         (((uint64_t)x[6] & 7) << 48);

  // Bits 51..101: high 5 bits of byte 6, bytes 7..11, low 6 bits of byte 12.
  const uint64_t limb1 = (uint64_t)(x[6] >> 3) |
                         ((uint64_t)x[7] << 5) |
                         ((uint64_t)x[8] << 13) |
                         ((uint64_t)x[9] << 21) |
                         ((uint64_t)x[10] << 29) |
                         ((uint64_t)x[11] << 37) |
                         (((uint64_t)x[12] & 63) << 45);

  // Bits 102..152: high 2 bits of byte 12, bytes 13..18, low bit of byte 19.
  const uint64_t limb2 = (uint64_t)(x[12] >> 6) |
                         ((uint64_t)x[13] << 2) |
                         ((uint64_t)x[14] << 10) |
                         ((uint64_t)x[15] << 18) |
                         ((uint64_t)x[16] << 26) |
                         ((uint64_t)x[17] << 34) |
                         ((uint64_t)x[18] << 42) |
                         (((uint64_t)x[19] & 1) << 50);

  // Bits 153..203: high 7 bits of byte 19, bytes 20..24, low 4 bits of byte 25.
  const uint64_t limb3 = (uint64_t)(x[19] >> 1) |
                         ((uint64_t)x[20] << 7) |
                         ((uint64_t)x[21] << 15) |
                         ((uint64_t)x[22] << 23) |
                         ((uint64_t)x[23] << 31) |
                         ((uint64_t)x[24] << 39) |
                         (((uint64_t)x[25] & 15) << 47);

  // Bits 204..254: high 4 bits of byte 25, bytes 26..30, low 7 bits of byte 31.
  const uint64_t limb4 = (uint64_t)(x[25] >> 4) |
                         ((uint64_t)x[26] << 4) |
                         ((uint64_t)x[27] << 12) |
                         ((uint64_t)x[28] << 20) |
                         ((uint64_t)x[29] << 28) |
                         ((uint64_t)x[30] << 36) |
                         (((uint64_t)x[31] & 127) << 44);

  r->v[0] = limb0;
  r->v[1] = limb1;
  r->v[2] = limb2;
  r->v[3] = limb3;
  r->v[4] = limb4;
}
|
||||
|
||||
// Squares |r| in place |count| times using the assembly squaring routine.
static void fe25519_square_times(fe25519 *r, int count) {
  while (count-- > 0) {
    x25519_x86_64_square(r, r);
  }
}

// Sets |r| to |x| raised to the power 2^255 - 21 using a fixed chain of
// squarings and multiplications. 2^255 - 21 equals p - 2 for p = 2^255 - 19,
// so, assuming the assembly routines are multiplication and squaring modulo
// p, this is the inverse of |x|. |r| is written only by the final
// multiplication, so |r| may alias |x|.
//
// The trailing comment on each step gives the exponent of |x| held by the
// value just computed.
static void fe25519_invert(fe25519 *r, const fe25519 *x) {
  fe25519 x2;    // x^2
  fe25519 x9;    // x^9
  fe25519 x11;   // x^11
  fe25519 e5;    // x^(2^5 - 1)
  fe25519 e10;   // x^(2^10 - 1)
  fe25519 e20;   // x^(2^20 - 1)
  fe25519 e50;   // x^(2^50 - 1)
  fe25519 e100;  // x^(2^100 - 1)
  fe25519 acc;

  x25519_x86_64_square(&x2, x);         // 2
  x25519_x86_64_square(&acc, &x2);      // 4
  x25519_x86_64_square(&acc, &acc);     // 8
  x25519_x86_64_mul(&x9, &acc, x);      // 9
  x25519_x86_64_mul(&x11, &x9, &x2);    // 11
  x25519_x86_64_square(&acc, &x11);     // 22
  x25519_x86_64_mul(&e5, &acc, &x9);    // 2^5 - 2^0 = 31

  x25519_x86_64_square(&acc, &e5);      // 2^6 - 2^1
  fe25519_square_times(&acc, 4);        // 2^10 - 2^5
  x25519_x86_64_mul(&e10, &acc, &e5);   // 2^10 - 2^0

  x25519_x86_64_square(&acc, &e10);     // 2^11 - 2^1
  fe25519_square_times(&acc, 9);        // 2^20 - 2^10
  x25519_x86_64_mul(&e20, &acc, &e10);  // 2^20 - 2^0

  x25519_x86_64_square(&acc, &e20);     // 2^21 - 2^1
  fe25519_square_times(&acc, 19);       // 2^40 - 2^20
  x25519_x86_64_mul(&acc, &acc, &e20);  // 2^40 - 2^0

  fe25519_square_times(&acc, 10);       // 2^50 - 2^10
  x25519_x86_64_mul(&e50, &acc, &e10);  // 2^50 - 2^0

  x25519_x86_64_square(&acc, &e50);     // 2^51 - 2^1
  fe25519_square_times(&acc, 49);       // 2^100 - 2^50
  x25519_x86_64_mul(&e100, &acc, &e50); // 2^100 - 2^0

  x25519_x86_64_square(&acc, &e100);    // 2^101 - 2^1
  fe25519_square_times(&acc, 99);       // 2^200 - 2^100
  x25519_x86_64_mul(&acc, &acc, &e100); // 2^200 - 2^0

  fe25519_square_times(&acc, 50);       // 2^250 - 2^50
  x25519_x86_64_mul(&acc, &acc, &e50);  // 2^250 - 2^0

  fe25519_square_times(&acc, 5);        // 2^255 - 2^5
  x25519_x86_64_mul(r, &acc, &x11);     // 2^255 - 21
}
|
||||
|
||||
// Runs the ladder over bits 254 down to 0 of the little-endian scalar |s|
// (bit 255 is never read), starting from the value in |*xr|. On return |*xr|
// and |*zr| hold entries 1 and 2 of the five-element working array.
//
// No conditional swap is performed after the last step; the visible caller
// clears the low three bits of the scalar, so the final bit processed is 0.
// NOTE(review): the meaning of each working-array slot is fixed by the
// assembly routines |x25519_x86_64_work_cswap| and |x25519_x86_64_ladderstep|
// -- confirm against the .S file before relying on it.
static void mladder(fe25519 *xr, fe25519 *zr, const uint8_t s[32]) {
  fe25519 state[5];
  uint8_t last_bit = 0;
  int pos;

  state[0] = *xr;
  fe25519_setint(&state[1], 1);
  fe25519_setint(&state[2], 0);
  state[3] = *xr;
  fe25519_setint(&state[4], 1);

  for (pos = 254; pos >= 0; pos--) {
    const uint8_t cur_bit = (uint8_t)((s[pos >> 3] >> (pos & 7)) & 1);
    // Swap only when this bit differs from the previous one.
    const uint64_t do_swap = (uint64_t)(cur_bit ^ last_bit);
    last_bit = cur_bit;

    x25519_x86_64_work_cswap(&state[1], do_swap);
    x25519_x86_64_ladderstep(state);
  }

  *xr = state[1];
  *zr = state[2];
}
|
||||
|
||||
void x25519_x86_64(uint8_t out[32], const uint8_t scalar[32],
|
||||
const uint8_t point[32]) {
|
||||
uint8_t e[32];
|
||||
OPENSSL_memcpy(e, scalar, sizeof(e));
|
||||
|
||||
e[0] &= 248;
|
||||
e[31] &= 127;
|
||||
e[31] |= 64;
|
||||
|
||||
fe25519 t;
|
||||
fe25519 z;
|
||||
fe25519_unpack(&t, point);
|
||||
mladder(&t, &z, e);
|
||||
fe25519_invert(&z, &z);
|
||||
x25519_x86_64_mul(&t, &t, &z);
|
||||
fe25519_pack(out, &t);
|
||||
}
|
||||
|
||||
#endif // BORINGSSL_X25519_X86_64
|
36
third_party/fiat/curve25519.c
vendored
36
third_party/fiat/curve25519.c
vendored
@ -512,8 +512,6 @@ static void fe_sq_tt(fe *h, const fe *f) {
|
||||
fe_sqr_impl(h->v, f->v);
|
||||
}
|
||||
|
||||
#if !defined(BORINGSSL_X25519_X86_64)
|
||||
|
||||
// Replace (f,g) with (g,f) if b == 1;
|
||||
// replace (f,g) with (f,g) if b == 0.
|
||||
//
|
||||
@ -589,8 +587,6 @@ static void fe_mul121666(fe *h, const fe_loose *f) {
|
||||
assert_fe(h->v);
|
||||
}
|
||||
|
||||
#endif // !BORINGSSL_X25519_X86_64
|
||||
|
||||
// Adapted from Fiat-synthesized |fe_sub_impl| with |out| = 0.
|
||||
static void fe_neg_impl(uint64_t out[5], const uint64_t in2[5]) {
|
||||
{ const uint64_t x10 = 0;
|
||||
@ -1201,8 +1197,6 @@ static void fe_sq_tt(fe *h, const fe *f) {
|
||||
fe_sqr_impl(h->v, f->v);
|
||||
}
|
||||
|
||||
#if !defined(BORINGSSL_X25519_X86_64)
|
||||
|
||||
// Replace (f,g) with (g,f) if b == 1;
|
||||
// replace (f,g) with (f,g) if b == 0.
|
||||
//
|
||||
@ -1342,8 +1336,6 @@ static void fe_mul121666(fe *h, const fe_loose *f) {
|
||||
assert_fe(h->v);
|
||||
}
|
||||
|
||||
#endif // !BORINGSSL_X25519_X86_64
|
||||
|
||||
// Adapted from Fiat-synthesized |fe_sub_impl| with |out| = 0.
|
||||
static void fe_neg_impl(uint32_t out[10], const uint32_t in2[10]) {
|
||||
{ const uint32_t x20 = 0;
|
||||
@ -3063,15 +3055,6 @@ void ED25519_keypair_from_seed(uint8_t out_public_key[32],
|
||||
}
|
||||
|
||||
|
||||
#if defined(BORINGSSL_X25519_X86_64)
|
||||
|
||||
// Variant of |x25519_scalar_mult| compiled when |BORINGSSL_X25519_X86_64| is
// defined: forwards |out|, |scalar| and |point| unchanged to the
// assembly-backed |x25519_x86_64|.
static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
|
||||
const uint8_t point[32]) {
|
||||
x25519_x86_64(out, scalar, point);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void x25519_scalar_mult_generic(uint8_t out[32],
|
||||
const uint8_t scalar[32],
|
||||
const uint8_t point[32]) {
|
||||
@ -3166,9 +3149,6 @@ static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
|
||||
x25519_scalar_mult_generic(out, scalar, point);
|
||||
}
|
||||
|
||||
#endif // BORINGSSL_X25519_X86_64
|
||||
|
||||
|
||||
void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) {
|
||||
RAND_bytes(out_private_key, 32);
|
||||
|
||||
@ -3200,20 +3180,6 @@ int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
|
||||
return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0;
|
||||
}
|
||||
|
||||
#if defined(BORINGSSL_X25519_X86_64)
|
||||
|
||||
// When |BORINGSSL_X25519_X86_64| is set, base point multiplication is done with
|
||||
// the Montgomery ladder because it's faster. Otherwise it's done using the
|
||||
// Ed25519 tables.
|
||||
|
||||
// Derives the public value for |private_key| by running the general scalar
// multiplication against the fixed base point and writes the 32-byte result
// to |out_public_value|.
void X25519_public_from_private(uint8_t out_public_value[32],
                                const uint8_t private_key[32]) {
  // Encoded base point: first byte 9, all remaining bytes zero.
  static const uint8_t kMontgomeryBasePoint[32] = {9};

  x25519_scalar_mult(out_public_value, private_key, kMontgomeryBasePoint);
}
|
||||
|
||||
#else
|
||||
|
||||
void X25519_public_from_private(uint8_t out_public_value[32],
|
||||
const uint8_t private_key[32]) {
|
||||
#if defined(BORINGSSL_X25519_NEON)
|
||||
@ -3243,5 +3209,3 @@ void X25519_public_from_private(uint8_t out_public_value[32],
|
||||
fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv);
|
||||
fe_tobytes(out_public_value, &zminusy_inv);
|
||||
}
|
||||
|
||||
#endif // BORINGSSL_X25519_X86_64
|
||||
|
9
third_party/fiat/internal.h
vendored
9
third_party/fiat/internal.h
vendored
@ -32,15 +32,6 @@ extern "C" {
|
||||
#include "../../crypto/internal.h"
|
||||
|
||||
|
||||
#if defined(OPENSSL_X86_64) && !defined(OPENSSL_SMALL) && \
|
||||
!defined(OPENSSL_WINDOWS) && !defined(OPENSSL_NO_ASM)
|
||||
#define BORINGSSL_X25519_X86_64
|
||||
|
||||
void x25519_x86_64(uint8_t out[32], const uint8_t scalar[32],
|
||||
const uint8_t point[32]);
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_APPLE)
|
||||
#define BORINGSSL_X25519_NEON
|
||||
|
||||
|
@ -44,12 +44,6 @@ NON_PERL_FILES = {
|
||||
'src/crypto/curve25519/asm/x25519-asm-arm.S',
|
||||
'src/crypto/poly1305/poly1305_arm_asm.S',
|
||||
],
|
||||
('linux', 'x86_64'): [
|
||||
'src/crypto/curve25519/asm/x25519-asm-x86_64.S',
|
||||
],
|
||||
('mac', 'x86_64'): [
|
||||
'src/crypto/curve25519/asm/x25519-asm-x86_64.S',
|
||||
],
|
||||
}
|
||||
|
||||
PREFIX = None
|
||||
|
Loading…
x
Reference in New Issue
Block a user