Merge pull request #1548 from briansmith/b/merge-boringssl-4

Merge BoringSSL fa3fbda: P-256 assembly optimisations for Aarch64.
2022-11-03 09:01:20 -07:00 · 2022-11-03 09:01:20 -07:00 · 383317656b
commit 383317656b
parent 83298f55aa 4b9d4adc99
10 changed files with 1705 additions and 137 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -64,6 +64,7 @@ include = [
    "crypto/fipsmodule/bn/internal.h",
    "crypto/fipsmodule/bn/montgomery.c",
    "crypto/fipsmodule/bn/montgomery_inv.c",
+    "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl",
    "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl",
    "crypto/fipsmodule/ec/ecp_nistz.c",
    "crypto/fipsmodule/ec/ecp_nistz.h",
@ -72,9 +73,9 @@ include = [
    "crypto/fipsmodule/ec/gfp_p256.c",
    "crypto/fipsmodule/ec/gfp_p384.c",
    "crypto/fipsmodule/ec/p256.c",
-    "crypto/fipsmodule/ec/p256-x86_64-table.h",
-    "crypto/fipsmodule/ec/p256-x86_64.c",
-    "crypto/fipsmodule/ec/p256-x86_64.h",
+    "crypto/fipsmodule/ec/p256-nistz-table.h",
+    "crypto/fipsmodule/ec/p256-nistz.c",
+    "crypto/fipsmodule/ec/p256-nistz.h",
    "crypto/fipsmodule/ec/p256_shared.h",
    "crypto/fipsmodule/ec/p256_table.h",
    "crypto/fipsmodule/ec/util.h",
--- a/build.rs
+++ b/build.rs
@ -60,7 +60,6 @@ const RING_SRCS: &[(&[&str], &str)] = &[
    (&[X86_64], "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl"),
    (&[X86_64], "crypto/fipsmodule/bn/asm/x86_64-mont.pl"),
    (&[X86_64], "crypto/fipsmodule/bn/asm/x86_64-mont5.pl"),
-    (&[X86_64], "crypto/fipsmodule/ec/p256-x86_64.c"),
    (&[X86_64], "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl"),
    (&[X86_64], "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"),
    (&[X86_64], "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"),
@ -68,6 +67,8 @@ const RING_SRCS: &[(&[&str], &str)] = &[
    (&[X86_64], SHA512_X86_64),
    (&[X86_64], "crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl"),

+    (&[AARCH64, X86_64], "crypto/fipsmodule/ec/p256-nistz.c"),
+
    (&[AARCH64, ARM], "crypto/fipsmodule/aes/asm/aesv8-armx.pl"),
    (&[AARCH64, ARM], "crypto/fipsmodule/modes/asm/ghashv8-armx.pl"),

@ -84,6 +85,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[

    (&[AARCH64], "crypto/fipsmodule/aes/asm/vpaes-armv8.pl"),
    (&[AARCH64], "crypto/fipsmodule/bn/asm/armv8-mont.pl"),
+    (&[AARCH64], "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl"),
    (&[AARCH64], "crypto/chacha/asm/chacha-armv8.pl"),
    (&[AARCH64], "crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl"),
    (&[AARCH64], SHA512_ARMV8),
@ -903,6 +905,18 @@ fn generate_prefix_symbols_header(
 }

 fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
+    // Rename some nistz256 assembly functions to match the names of their
+    // polyfills.
+    static SYMBOLS_TO_RENAME: &[(&str, &str)] = &[
+        ("ecp_nistz256_point_double", "p256_point_double"),
+        ("ecp_nistz256_point_add", "p256_point_add"),
+        ("ecp_nistz256_point_add_affine", "p256_point_add_affine"),
+        ("ecp_nistz256_ord_mul_mont", "p256_scalar_mul_mont"),
+        ("ecp_nistz256_ord_sqr_mont", "p256_scalar_sqr_rep_mont"),
+        ("ecp_nistz256_mul_mont", "p256_mul_mont"),
+        ("ecp_nistz256_sqr_mont", "p256_sqr_mont"),
+    ];
+
    static SYMBOLS_TO_PREFIX: &[&str] = &[
        "CRYPTO_poly1305_finish",
        "CRYPTO_poly1305_finish_neon",
@ -961,9 +975,9 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
        "gcm_init_neon",
        "limbs_mul_add_limb",
        "little_endian_bytes_from_scalar",
-        "nistz256_neg",
-        "nistz256_select_w5",
-        "nistz256_select_w7",
+        "ecp_nistz256_neg",
+        "ecp_nistz256_select_w5",
+        "ecp_nistz256_select_w7",
        "nistz384_point_add",
        "nistz384_point_double",
        "nistz384_point_mul",
@ -1007,6 +1021,17 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {

    let mut out = String::new();

+    for (old, new) in SYMBOLS_TO_RENAME {
+        let line = format!(
+            "{pp}define {prefix_prefix}{old} {prefix_prefix}{new}\n",
+            pp = pp,
+            prefix_prefix = prefix_prefix,
+            old = old,
+            new = new
+        );
+        out += &line;
+    }
+
    for symbol in SYMBOLS_TO_PREFIX {
        let line = format!(
            "{pp}define {prefix_prefix}{symbol} {prefix_prefix}{prefix}{symbol}\n",
--- a/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
+++ b/crypto/fipsmodule/ec/asm/p256-armv8-asm.pl
--- a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
+++ b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
@ -90,11 +90,11 @@ my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
 $code.=<<___;

 ################################################################################
-# void nistz256_neg(uint64_t res[4], uint64_t a[4]);
-.globl	nistz256_neg
-.type	nistz256_neg,\@function,2
+# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
+.globl	ecp_nistz256_neg
+.type	ecp_nistz256_neg,\@function,2
 .align	32
-nistz256_neg:
+ecp_nistz256_neg:
 .cfi_startproc
 	push	%r12
 .cfi_push	%r12
@ -143,7 +143,7 @@ nistz256_neg:
 .Lneg_epilogue:
 	ret
 .cfi_endproc
-.size	nistz256_neg,.-nistz256_neg
+.size	ecp_nistz256_neg,.-ecp_nistz256_neg
 ___
 }
 {
@ -154,15 +154,15 @@ my ($poly1,$poly3)=($acc6,$acc7);

 $code.=<<___;
 ################################################################################
-# void p256_scalar_mul_mont(
+# void ecp_nistz256_ord_mul_mont(
 #   uint64_t res[4],
 #   uint64_t a[4],
 #   uint64_t b[4]);

-.globl	p256_scalar_mul_mont
-.type	p256_scalar_mul_mont,\@function,3
+.globl	ecp_nistz256_ord_mul_mont
+.type	ecp_nistz256_ord_mul_mont,\@function,3
 .align	32
-p256_scalar_mul_mont:
+ecp_nistz256_ord_mul_mont:
 .cfi_startproc
 ___
 $code.=<<___	if ($addx);
@ -482,18 +482,18 @@ $code.=<<___;
 .Lord_mul_epilogue:
 	ret
 .cfi_endproc
-.size	p256_scalar_mul_mont,.-p256_scalar_mul_mont
+.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

 ################################################################################
-# void p256_scalar_sqr_rep_mont(
+# void ecp_nistz256_ord_sqr_mont(
 #   uint64_t res[4],
 #   uint64_t a[4],
 #   uint64_t rep);

-.globl	p256_scalar_sqr_rep_mont
-.type	p256_scalar_sqr_rep_mont,\@function,3
+.globl	ecp_nistz256_ord_sqr_mont
+.type	ecp_nistz256_ord_sqr_mont,\@function,3
 .align	32
-p256_scalar_sqr_rep_mont:
+ecp_nistz256_ord_sqr_mont:
 .cfi_startproc
 ___
 $code.=<<___	if ($addx);
@ -783,7 +783,7 @@ $code.=<<___;
 .Lord_sqr_epilogue:
 	ret
 .cfi_endproc
-.size	p256_scalar_sqr_rep_mont,.-p256_scalar_sqr_rep_mont
+.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
 ___

 $code.=<<___	if ($addx);
@ -1235,15 +1235,15 @@ ___

 $code.=<<___;
 ################################################################################
-# void p256_mul_mont(
+# void ecp_nistz256_mul_mont(
 #   uint64_t res[4],
 #   uint64_t a[4],
 #   uint64_t b[4]);

-.globl	p256_mul_mont
-.type	p256_mul_mont,\@function,3
+.globl	ecp_nistz256_mul_mont
+.type	ecp_nistz256_mul_mont,\@function,3
 .align	32
-p256_mul_mont:
+ecp_nistz256_mul_mont:
 .cfi_startproc
 ___
 $code.=<<___	if ($addx);
@ -1315,7 +1315,7 @@ $code.=<<___;
 .Lmul_epilogue:
 	ret
 .cfi_endproc
-.size	p256_mul_mont,.-p256_mul_mont
+.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

 .type	__ecp_nistz256_mul_montq,\@abi-omnipotent
 .align	32
@ -1536,16 +1536,16 @@ __ecp_nistz256_mul_montq:
 .size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq

 ################################################################################
-# void p256_sqr_mont(
+# void ecp_nistz256_sqr_mont(
 #   uint64_t res[4],
 #   uint64_t a[4]);

 # we optimize the square according to S.Gueron and V.Krasnov,
 # "Speeding up Big-Number Squaring"
-.globl	p256_sqr_mont
-.type	p256_sqr_mont,\@function,2
+.globl	ecp_nistz256_sqr_mont
+.type	ecp_nistz256_sqr_mont,\@function,2
 .align	32
-p256_sqr_mont:
+ecp_nistz256_sqr_mont:
 .cfi_startproc
 ___
 $code.=<<___	if ($addx);
@ -1612,7 +1612,7 @@ $code.=<<___;
 .Lsqr_epilogue:
 	ret
 .cfi_endproc
-.size	p256_sqr_mont,.-p256_sqr_mont
+.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

 .type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
 .align	32
@ -2090,11 +2090,11 @@ my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));

 $code.=<<___;
 ################################################################################
-# void nistz256_select_w5(uint64_t *val, uint64_t *in_t, crypto_word index);
-.globl	nistz256_select_w5
-.type	nistz256_select_w5,\@abi-omnipotent
+# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl	ecp_nistz256_select_w5
+.type	ecp_nistz256_select_w5,\@abi-omnipotent
 .align	32
-nistz256_select_w5:
+ecp_nistz256_select_w5:
 .cfi_startproc
 ___
 $code.=<<___	if ($avx>1);
@ -2105,7 +2105,7 @@ $code.=<<___	if ($avx>1);
 ___
 $code.=<<___	if ($win64);
 	lea	-0x88(%rsp), %rax
-.LSEH_begin_nistz256_select_w5:
+.LSEH_begin_ecp_nistz256_select_w5:
 	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
 	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
 	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
@ -2186,15 +2186,15 @@ ___
 $code.=<<___;
 	ret
 .cfi_endproc
-.LSEH_end_nistz256_select_w5:
-.size	nistz256_select_w5,.-nistz256_select_w5
+.LSEH_end_ecp_nistz256_select_w5:
+.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5

 ################################################################################
-# void nistz256_select_w7(uint64_t *val, uint64_t *in_t, crypto_word index);
-.globl	nistz256_select_w7
-.type	nistz256_select_w7,\@abi-omnipotent
+# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl	ecp_nistz256_select_w7
+.type	ecp_nistz256_select_w7,\@abi-omnipotent
 .align	32
-nistz256_select_w7:
+ecp_nistz256_select_w7:
 .cfi_startproc
 ___
 $code.=<<___	if ($avx>1);
@ -2205,7 +2205,7 @@ $code.=<<___	if ($avx>1);
 ___
 $code.=<<___	if ($win64);
 	lea	-0x88(%rsp), %rax
-.LSEH_begin_nistz256_select_w7:
+.LSEH_begin_ecp_nistz256_select_w7:
 	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
 	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
 	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
@ -2275,8 +2275,8 @@ ___
 $code.=<<___;
 	ret
 .cfi_endproc
-.LSEH_end_nistz256_select_w7:
-.size	nistz256_select_w7,.-nistz256_select_w7
+.LSEH_end_ecp_nistz256_select_w7:
+.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
 ___
 }
 if ($avx>1) {
@ -2700,10 +2700,10 @@ sub gen_double () {
 	$bias = 0;

 $code.=<<___;
-.globl	p256_point_double
-.type	p256_point_double,\@function,2
+.globl	ecp_nistz256_point_double
+.type	ecp_nistz256_point_double,\@function,2
 .align	32
-p256_point_double:
+ecp_nistz256_point_double:
 .cfi_startproc
 ___
 $code.=<<___	if ($addx);
@ -2719,9 +2719,9 @@ ___
 	$bias = 128;

 $code.=<<___;
-.type	p256_point_doublex,\@function,2
+.type	ecp_nistz256_point_doublex,\@function,2
 .align	32
-p256_point_doublex:
+ecp_nistz256_point_doublex:
 .cfi_startproc
 .Lpoint_doublex:
 ___
@ -2931,7 +2931,7 @@ $code.=<<___;
 .Lpoint_double${x}_epilogue:
 	ret
 .cfi_endproc
-.size	p256_point_double$sfx,.-p256_point_double$sfx
+.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
 ___
 }
 &gen_double("q");
@ -2952,10 +2952,10 @@ sub gen_add () {
 	$bias = 0;

 $code.=<<___;
-.globl	p256_point_add
-.type	p256_point_add,\@function,3
+.globl	ecp_nistz256_point_add
+.type	ecp_nistz256_point_add,\@function,3
 .align	32
-p256_point_add:
+ecp_nistz256_point_add:
 .cfi_startproc
 ___
 $code.=<<___	if ($addx);
@ -2971,9 +2971,9 @@ ___
 	$bias = 128;

 $code.=<<___;
-.type	p256_point_addx,\@function,3
+.type	ecp_nistz256_point_addx,\@function,3
 .align	32
-p256_point_addx:
+ecp_nistz256_point_addx:
 .cfi_startproc
 .Lpoint_addx:
 ___
@ -3330,7 +3330,7 @@ $code.=<<___;
 .Lpoint_add${x}_epilogue:
 	ret
 .cfi_endproc
-.size	p256_point_add$sfx,.-p256_point_add$sfx
+.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
 ___
 }
 &gen_add("q");
@ -3350,10 +3350,10 @@ sub gen_add_affine () {
 	$bias = 0;

 $code.=<<___;
-.globl	p256_point_add_affine
-.type	p256_point_add_affine,\@function,3
+.globl	ecp_nistz256_point_add_affine
+.type	ecp_nistz256_point_add_affine,\@function,3
 .align	32
-p256_point_add_affine:
+ecp_nistz256_point_add_affine:
 .cfi_startproc
 ___
 $code.=<<___	if ($addx);
@ -3369,9 +3369,9 @@ ___
 	$bias = 128;

 $code.=<<___;
-.type	p256_point_add_affinex,\@function,3
+.type	ecp_nistz256_point_add_affinex,\@function,3
 .align	32
-p256_point_add_affinex:
+ecp_nistz256_point_add_affinex:
 .cfi_startproc
 .Lpoint_add_affinex:
 ___
@ -3655,7 +3655,7 @@ $code.=<<___;
 .Ladd_affine${x}_epilogue:
 	ret
 .cfi_endproc
-.size	p256_point_add_affine$sfx,.-p256_point_add_affine$sfx
+.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
 ___
 }
 &gen_add_affine("q");
@ -3956,17 +3956,17 @@ full_handler:

 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_nistz256_neg
-	.rva	.LSEH_end_nistz256_neg
-	.rva	.LSEH_info_nistz256_neg
+	.rva	.LSEH_begin_ecp_nistz256_neg
+	.rva	.LSEH_end_ecp_nistz256_neg
+	.rva	.LSEH_info_ecp_nistz256_neg

-	.rva	.LSEH_begin_p256_scalar_mul_mont
-	.rva	.LSEH_end_p256_scalar_mul_mont
-	.rva	.LSEH_info_p256_scalar_mul_mont
+	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
+	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
+	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont

-	.rva	.LSEH_begin_p256_scalar_sqr_rep_mont
-	.rva	.LSEH_end_p256_scalar_sqr_rep_mont
-	.rva	.LSEH_info_p256_scalar_sqr_rep_mont
+	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
+	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
+	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
 ___
 $code.=<<___	if ($addx);
 	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
@ -3978,20 +3978,20 @@ $code.=<<___	if ($addx);
 	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
 ___
 $code.=<<___;
-	.rva	.LSEH_begin_p256_mul_mont
-	.rva	.LSEH_end_p256_mul_mont
-	.rva	.LSEH_info_p256_mul_mont
+	.rva	.LSEH_begin_ecp_nistz256_mul_mont
+	.rva	.LSEH_end_ecp_nistz256_mul_mont
+	.rva	.LSEH_info_ecp_nistz256_mul_mont

-	.rva	.LSEH_begin_p256_sqr_mont
-	.rva	.LSEH_end_p256_sqr_mont
-	.rva	.LSEH_info_p256_sqr_mont
+	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
+	.rva	.LSEH_end_ecp_nistz256_sqr_mont
+	.rva	.LSEH_info_ecp_nistz256_sqr_mont

-	.rva	.LSEH_begin_nistz256_select_w5
-	.rva	.LSEH_end_nistz256_select_w5
+	.rva	.LSEH_begin_ecp_nistz256_select_w5
+	.rva	.LSEH_end_ecp_nistz256_select_w5
 	.rva	.LSEH_info_ecp_nistz256_select_wX

-	.rva	.LSEH_begin_nistz256_select_w7
-	.rva	.LSEH_end_nistz256_select_w7
+	.rva	.LSEH_begin_ecp_nistz256_select_w7
+	.rva	.LSEH_end_ecp_nistz256_select_w7
 	.rva	.LSEH_info_ecp_nistz256_select_wX
 ___
 $code.=<<___	if ($avx>1);
@ -4004,45 +4004,45 @@ $code.=<<___	if ($avx>1);
 	.rva	.LSEH_info_ecp_nistz256_avx2_select_wX
 ___
 $code.=<<___;
-	.rva	.LSEH_begin_p256_point_double
-	.rva	.LSEH_end_p256_point_double
-	.rva	.LSEH_info_p256_point_double
+	.rva	.LSEH_begin_ecp_nistz256_point_double
+	.rva	.LSEH_end_ecp_nistz256_point_double
+	.rva	.LSEH_info_ecp_nistz256_point_double

-	.rva	.LSEH_begin_p256_point_add
-	.rva	.LSEH_end_p256_point_add
-	.rva	.LSEH_info_p256_point_add
+	.rva	.LSEH_begin_ecp_nistz256_point_add
+	.rva	.LSEH_end_ecp_nistz256_point_add
+	.rva	.LSEH_info_ecp_nistz256_point_add

-	.rva	.LSEH_begin_p256_point_add_affine
-	.rva	.LSEH_end_p256_point_add_affine
-	.rva	.LSEH_info_p256_point_add_affine
+	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
+	.rva	.LSEH_end_ecp_nistz256_point_add_affine
+	.rva	.LSEH_info_ecp_nistz256_point_add_affine
 ___
 $code.=<<___ if ($addx);
-	.rva	.LSEH_begin_p256_point_doublex
-	.rva	.LSEH_end_p256_point_doublex
-	.rva	.LSEH_info_p256_point_doublex
+	.rva	.LSEH_begin_ecp_nistz256_point_doublex
+	.rva	.LSEH_end_ecp_nistz256_point_doublex
+	.rva	.LSEH_info_ecp_nistz256_point_doublex

-	.rva	.LSEH_begin_p256_point_addx
-	.rva	.LSEH_end_p256_point_addx
-	.rva	.LSEH_info_p256_point_addx
+	.rva	.LSEH_begin_ecp_nistz256_point_addx
+	.rva	.LSEH_end_ecp_nistz256_point_addx
+	.rva	.LSEH_info_ecp_nistz256_point_addx

-	.rva	.LSEH_begin_p256_point_add_affinex
-	.rva	.LSEH_end_p256_point_add_affinex
-	.rva	.LSEH_info_p256_point_add_affinex
+	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
+	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
+	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
 ___
 $code.=<<___;

 .section	.xdata
 .align	8
-.LSEH_info_nistz256_neg:
+.LSEH_info_ecp_nistz256_neg:
 	.byte	9,0,0,0
 	.rva	short_handler
 	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
-.LSEH_info_p256_scalar_mul_mont:
+.LSEH_info_ecp_nistz256_ord_mul_mont:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
 	.long	48,0
-.LSEH_info_p256_scalar_sqr_rep_mont:
+.LSEH_info_ecp_nistz256_ord_sqr_mont:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
@ -4061,12 +4061,12 @@ $code.=<<___ if ($addx);
 	.long	48,0
 ___
 $code.=<<___;
-.LSEH_info_p256_mul_mont:
+.LSEH_info_ecp_nistz256_mul_mont:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
 	.long	48,0
-.LSEH_info_p256_sqr_mont:
+.LSEH_info_ecp_nistz256_sqr_mont:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
@ -4104,17 +4104,17 @@ $code.=<<___	if ($avx>1);
 	.align	8
 ___
 $code.=<<___;
-.LSEH_info_p256_point_double:
+.LSEH_info_ecp_nistz256_point_double:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
 	.long	32*5+56,0
-.LSEH_info_p256_point_add:
+.LSEH_info_ecp_nistz256_point_add:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
 	.long	32*18+56,0
-.LSEH_info_p256_point_add_affine:
+.LSEH_info_ecp_nistz256_point_add_affine:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
@ -4122,17 +4122,17 @@ $code.=<<___;
 ___
 $code.=<<___ if ($addx);
 .align	8
-.LSEH_info_p256_point_doublex:
+.LSEH_info_ecp_nistz256_point_doublex:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
 	.long	32*5+56,0
-.LSEH_info_p256_point_addx:
+.LSEH_info_ecp_nistz256_point_addx:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
 	.long	32*18+56,0
-.LSEH_info_p256_point_add_affinex:
+.LSEH_info_ecp_nistz256_point_add_affinex:
 	.byte	9,0,0,0
 	.rva	full_handler
 	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
--- a/crypto/fipsmodule/ec/make_tables.go
+++ b/crypto/fipsmodule/ec/make_tables.go
@ -23,8 +23,8 @@ import (
 )

 func main() {
-	if err := writeP256X86_64Table("p256-x86_64-table.h"); err != nil {
-		fmt.Fprintf(os.Stderr, "Error writing p256-x86_64-table.h: %s\n", err)
+	if err := writeP256NistzTable("p256-nistz-table.h"); err != nil {
+		fmt.Fprintf(os.Stderr, "Error writing p256-nistz-table.h: %s\n", err)
 		os.Exit(1)
 	}

@ -34,7 +34,7 @@ func main() {
 	}
 }

-func writeP256X86_64Table(path string) error {
+func writeP256NistzTable(path string) error {
 	curve := elliptic.P256()
 	tables := make([][][2]*big.Int, 0, 37)
 	for shift := 0; shift < 256; shift += 7 {
@ -59,7 +59,7 @@ func writeP256X86_64Table(path string) error {
 */

 // This is the precomputed constant time access table for the code in
-// p256-x86_64.c, for the default generator. The table consists of 37
+// p256-nistz.c, for the default generator. The table consists of 37
 // subtables, each subtable contains 64 affine points. The affine points are
 // encoded as eight uint64's, four for the x coordinate and four for the y.
 // Both values are in little-endian order. There are 37 tables because a
--- a/crypto/fipsmodule/ec/p256-x86_64-table.h
+++ b/crypto/fipsmodule/ec/p256-nistz-table.h
@ -9,7 +9,7 @@
 */

 // This is the precomputed constant time access table for the code in
-// p256-x86_64.c, for the default generator. The table consists of 37
+// p256-nistz.c, for the default generator. The table consists of 37
 // subtables, each subtable contains 64 affine points. The affine points are
 // encoded as eight uint64's, four for the x coordinate and four for the y.
 // Both values are in little-endian order. There are 37 tables because a
--- a/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/crypto/fipsmodule/ec/p256-nistz.c
@ -22,7 +22,7 @@

 #include <stdint.h>

-#include "p256-x86_64.h"
+#include "p256-nistz.h"

 #if defined(OPENSSL_USE_NISTZ256)

@ -35,7 +35,7 @@ static const BN_ULONG ONE[P256_LIMBS] = {
 };

 // Precomputed tables for the default generator
-#include "p256-x86_64-table.h"
+#include "p256-nistz-table.h"

 // Recode window to a signed digit, see |nistp_recode_scalar_bits| in
 // util.c for details
@ -168,7 +168,7 @@ static void ecp_nistz256_windowed_mul(P256_POINT *r,
  crypto_word wvalue = p_str[(index - 1) / 8];
  wvalue = (wvalue >> ((index - 1) % 8)) & kMask;

-  ecp_nistz256_select_w5(r, table, booth_recode_w5(wvalue) >> 1);
+  ecp_nistz256_select_w5(r, table, (int)(booth_recode_w5(wvalue) >> 1));

  while (index >= 5) {
    if (index != 255) {
@ -179,7 +179,7 @@ static void ecp_nistz256_windowed_mul(P256_POINT *r,

      wvalue = booth_recode_w5(wvalue);

-      ecp_nistz256_select_w5(&h, table, wvalue >> 1);
+      ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1));

      ecp_nistz256_neg(tmp, h.Y);
      copy_conditional(h.Y, tmp, (wvalue & 1));
@ -202,7 +202,7 @@ static void ecp_nistz256_windowed_mul(P256_POINT *r,

  wvalue = booth_recode_w5(wvalue);

-  ecp_nistz256_select_w5(&h, table, wvalue >> 1);
+  ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1));

  ecp_nistz256_neg(tmp, h.Y);
  copy_conditional(h.Y, tmp, wvalue & 1);
@ -258,7 +258,7 @@ void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) {
  size_t index = 0;
  crypto_word wvalue = calc_first_wvalue(&index, p_str);

-  ecp_nistz256_select_w7(&p.a, ecp_nistz256_precomputed[0], wvalue >> 1);
+  ecp_nistz256_select_w7(&p.a, ecp_nistz256_precomputed[0], (int)(wvalue >> 1));
  ecp_nistz256_neg(p.p.Z, p.p.Y);
  copy_conditional(p.p.Y, p.p.Z, wvalue & 1);

@ -271,7 +271,7 @@ void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) {
  for (int i = 1; i < 37; i++) {
    wvalue = calc_wvalue(&index, p_str);

-    ecp_nistz256_select_w7(&t.a, ecp_nistz256_precomputed[i], wvalue >> 1);
+    ecp_nistz256_select_w7(&t.a, ecp_nistz256_precomputed[i], (int)(wvalue >> 1));

    ecp_nistz256_neg(t.p.Z, t.a.Y);
    copy_conditional(t.a.Y, t.p.Z, wvalue & 1);
--- a/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/crypto/fipsmodule/ec/p256-nistz.h
@ -27,13 +27,6 @@

 #if defined(OPENSSL_USE_NISTZ256)

-#define ecp_nistz256_neg nistz256_neg
-#define ecp_nistz256_select_w5 nistz256_select_w5
-#define ecp_nistz256_select_w7 nistz256_select_w7
-#define ecp_nistz256_point_double p256_point_double
-#define ecp_nistz256_point_add p256_point_add
-#define ecp_nistz256_point_add_affine p256_point_add_affine
-
 // ecp_nistz256_neg sets |res| to -|a| mod P.
 void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]);

@ -82,14 +75,14 @@ typedef struct {
 // and all zeros (the point at infinity) if |index| is 0. This is done in
 // constant time.
 void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16],
-                            crypto_word index);
+                            int index);

 // ecp_nistz256_select_w7 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 64
 // and all zeros (the point at infinity) if |index| is 0. This is done in
 // constant time.
 void ecp_nistz256_select_w7(P256_POINT_AFFINE *val,
                            const P256_POINT_AFFINE in_t[64],
-                            crypto_word index);
+                            int index);

 // ecp_nistz256_point_double sets |r| to |a| doubled.
 void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
--- a/crypto/fipsmodule/ec/p256_shared.h
+++ b/crypto/fipsmodule/ec/p256_shared.h
@ -23,7 +23,8 @@

 #include "../bn/internal.h"

-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
+#if !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
    !defined(OPENSSL_SMALL)
 # define OPENSSL_USE_NISTZ256
 #endif
--- a/src/ec/suite_b/ops.rs
+++ b/src/ec/suite_b/ops.rs
@ -584,16 +584,16 @@ mod tests {
        })
    }

-    // There is no `nistz256_neg` on other targets.
+    // `ecp_nistz256_neg` exists only for nistz256 targets; this test covers x86_64 only.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn p256_elem_neg_test() {
        prefixed_extern! {
-            fn nistz256_neg(r: *mut Limb, a: *const Limb);
+            fn ecp_nistz256_neg(r: *mut Limb, a: *const Limb);
        }
        elem_neg_test(
            &p256::COMMON_OPS,
-            nistz256_neg,
+            ecp_nistz256_neg,
            test_file!("ops/p256_elem_neg_tests.txt"),
        );
    }