aarch64 support.
This is an initial cut at aarch64 support. I have only qemu to test it however—hopefully hardware will be coming soon. This also affects 32-bit ARM in that aarch64 chips can run 32-bit code and we would like to be able to take advantage of the crypto operations even in 32-bit mode. AES and GHASH should Just Work in this case: the -armx.pl files can be built for either 32- or 64-bit mode based on the flavour argument given to the Perl script. SHA-1 and SHA-256 don't work like this however because they've never support for multiple implementations, thus BoringSSL built for 32-bit won't use the SHA instructions on an aarch64 chip. No dedicated ChaCha20 or Poly1305 support yet. Change-Id: Ib275bc4894a365c8ec7c42f4e91af6dba3bd686c Reviewed-on: https://boringssl-review.googlesource.com/2801 Reviewed-by: Adam Langley <agl@google.com>
This commit is contained in:
parent
bc44c089fb
commit
3e6526575a
@ -32,6 +32,8 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686")
|
||||
set(ARCH "x86")
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm")
|
||||
set(ARCH "arm")
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
|
||||
set(ARCH "aarch64")
|
||||
else()
|
||||
message(FATAL_ERROR "Unknown processor:" ${CMAKE_SYSTEM_PROCESSOR})
|
||||
endif()
|
||||
|
@ -5,7 +5,13 @@ if(APPLE)
|
||||
set(ASM_EXT S)
|
||||
enable_language(ASM)
|
||||
elseif(UNIX)
|
||||
set(PERLASM_STYLE elf)
|
||||
if (${ARCH} STREQUAL "aarch64")
|
||||
# The "armx" Perl scripts look for "64" in the style argument
|
||||
# in order to decide whether to generate 32- or 64-bit asm.
|
||||
set(PERLASM_STYLE linux64)
|
||||
else()
|
||||
set(PERLASM_STYLE elf)
|
||||
endif()
|
||||
set(ASM_EXT S)
|
||||
enable_language(ASM)
|
||||
else()
|
||||
@ -59,7 +65,14 @@ if (${ARCH} STREQUAL "arm")
|
||||
set(
|
||||
CRYPTO_ARCH_SOURCES
|
||||
|
||||
cpu-x86-asm.${ASM_EXT}
|
||||
cpu-arm.c
|
||||
)
|
||||
endif()
|
||||
|
||||
if (${ARCH} STREQUAL "aarch64")
|
||||
set(
|
||||
CRYPTO_ARCH_SOURCES
|
||||
|
||||
cpu-arm.c
|
||||
)
|
||||
endif()
|
||||
|
@ -27,6 +27,15 @@ if (${ARCH} STREQUAL "arm")
|
||||
|
||||
aes-armv4.${ASM_EXT}
|
||||
bsaes-armv7.${ASM_EXT}
|
||||
aesv8-armx.${ASM_EXT}
|
||||
)
|
||||
endif()
|
||||
|
||||
if (${ARCH} STREQUAL "aarch64")
|
||||
set(
|
||||
AES_ARCH_SOURCES
|
||||
|
||||
aesv8-armx.${ASM_EXT}
|
||||
)
|
||||
endif()
|
||||
|
||||
@ -50,3 +59,4 @@ perlasm(vpaes-x86.${ASM_EXT} asm/vpaes-x86.pl)
|
||||
perlasm(aesni-x86.${ASM_EXT} asm/aesni-x86.pl)
|
||||
perlasm(aes-armv4.${ASM_EXT} asm/aes-armv4.pl)
|
||||
perlasm(bsaes-armv7.${ASM_EXT} asm/bsaes-armv7.pl)
|
||||
perlasm(aesv8-armx.${ASM_EXT} asm/aesv8-armx.pl)
|
||||
|
962
crypto/aes/asm/aesv8-armx.pl
Normal file
962
crypto/aes/asm/aesv8-armx.pl
Normal file
@ -0,0 +1,962 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# This module implements support for ARMv8 AES instructions. The
|
||||
# module is endian-agnostic in sense that it supports both big- and
|
||||
# little-endian cases. As does it support both 32- and 64-bit modes
|
||||
# of operation. Latter is achieved by limiting amount of utilized
|
||||
# registers to 16, which implies additional NEON load and integer
|
||||
# instructions. This has no effect on mighty Apple A7, where results
|
||||
# are literally equal to the theoretical estimates based on AES
|
||||
# instruction latencies and issue rates. On Cortex-A53, an in-order
|
||||
# execution core, this costs up to 10-15%, which is partially
|
||||
# compensated by implementing dedicated code path for 128-bit
|
||||
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
|
||||
# seems to be limited by sheer amount of NEON instructions...
|
||||
#
|
||||
# Performance in cycles per byte processed with 128-bit key:
|
||||
#
|
||||
# CBC enc CBC dec CTR
|
||||
# Apple A7 2.39 1.20 1.20
|
||||
# Cortex-A53 2.45 1.87 1.94
|
||||
# Cortex-A57 3.64 1.34 1.32
|
||||
|
||||
$flavour = shift;
|
||||
open STDOUT,">".shift;
|
||||
|
||||
$prefix="aes_v8";
|
||||
|
||||
$code=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.text
|
||||
___
|
||||
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
|
||||
$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
|
||||
#^^^^^^ this is done to simplify adoption by not depending
|
||||
# on latest binutils.
|
||||
|
||||
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
|
||||
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
|
||||
# maintain both 32- and 64-bit codes within single module and
|
||||
# transliterate common code to either flavour with regex vodoo.
|
||||
#
|
||||
{{{
|
||||
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
|
||||
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
|
||||
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
|
||||
|
||||
|
||||
$code.=<<___;
|
||||
.align 5
|
||||
rcon:
|
||||
.long 0x01,0x01,0x01,0x01
|
||||
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
|
||||
.long 0x1b,0x1b,0x1b,0x1b
|
||||
|
||||
.globl ${prefix}_set_encrypt_key
|
||||
.type ${prefix}_set_encrypt_key,%function
|
||||
.align 5
|
||||
${prefix}_set_encrypt_key:
|
||||
.Lenc_key:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
___
|
||||
$code.=<<___;
|
||||
mov $ptr,#-1
|
||||
cmp $inp,#0
|
||||
b.eq .Lenc_key_abort
|
||||
cmp $out,#0
|
||||
b.eq .Lenc_key_abort
|
||||
mov $ptr,#-2
|
||||
cmp $bits,#128
|
||||
b.lt .Lenc_key_abort
|
||||
cmp $bits,#256
|
||||
b.gt .Lenc_key_abort
|
||||
tst $bits,#0x3f
|
||||
b.ne .Lenc_key_abort
|
||||
|
||||
adr $ptr,rcon
|
||||
cmp $bits,#192
|
||||
|
||||
veor $zero,$zero,$zero
|
||||
vld1.8 {$in0},[$inp],#16
|
||||
mov $bits,#8 // reuse $bits
|
||||
vld1.32 {$rcon,$mask},[$ptr],#32
|
||||
|
||||
b.lt .Loop128
|
||||
b.eq .L192
|
||||
b .L256
|
||||
|
||||
.align 4
|
||||
.Loop128:
|
||||
vtbl.8 $key,{$in0},$mask
|
||||
vext.8 $tmp,$zero,$in0,#12
|
||||
vst1.32 {$in0},[$out],#16
|
||||
aese $key,$zero
|
||||
subs $bits,$bits,#1
|
||||
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $key,$key,$rcon
|
||||
veor $in0,$in0,$tmp
|
||||
vshl.u8 $rcon,$rcon,#1
|
||||
veor $in0,$in0,$key
|
||||
b.ne .Loop128
|
||||
|
||||
vld1.32 {$rcon},[$ptr]
|
||||
|
||||
vtbl.8 $key,{$in0},$mask
|
||||
vext.8 $tmp,$zero,$in0,#12
|
||||
vst1.32 {$in0},[$out],#16
|
||||
aese $key,$zero
|
||||
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $key,$key,$rcon
|
||||
veor $in0,$in0,$tmp
|
||||
vshl.u8 $rcon,$rcon,#1
|
||||
veor $in0,$in0,$key
|
||||
|
||||
vtbl.8 $key,{$in0},$mask
|
||||
vext.8 $tmp,$zero,$in0,#12
|
||||
vst1.32 {$in0},[$out],#16
|
||||
aese $key,$zero
|
||||
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $key,$key,$rcon
|
||||
veor $in0,$in0,$tmp
|
||||
veor $in0,$in0,$key
|
||||
vst1.32 {$in0},[$out]
|
||||
add $out,$out,#0x50
|
||||
|
||||
mov $rounds,#10
|
||||
b .Ldone
|
||||
|
||||
.align 4
|
||||
.L192:
|
||||
vld1.8 {$in1},[$inp],#8
|
||||
vmov.i8 $key,#8 // borrow $key
|
||||
vst1.32 {$in0},[$out],#16
|
||||
vsub.i8 $mask,$mask,$key // adjust the mask
|
||||
|
||||
.Loop192:
|
||||
vtbl.8 $key,{$in1},$mask
|
||||
vext.8 $tmp,$zero,$in0,#12
|
||||
vst1.32 {$in1},[$out],#8
|
||||
aese $key,$zero
|
||||
subs $bits,$bits,#1
|
||||
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
|
||||
vdup.32 $tmp,${in0}[3]
|
||||
veor $tmp,$tmp,$in1
|
||||
veor $key,$key,$rcon
|
||||
vext.8 $in1,$zero,$in1,#12
|
||||
vshl.u8 $rcon,$rcon,#1
|
||||
veor $in1,$in1,$tmp
|
||||
veor $in0,$in0,$key
|
||||
veor $in1,$in1,$key
|
||||
vst1.32 {$in0},[$out],#16
|
||||
b.ne .Loop192
|
||||
|
||||
mov $rounds,#12
|
||||
add $out,$out,#0x20
|
||||
b .Ldone
|
||||
|
||||
.align 4
|
||||
.L256:
|
||||
vld1.8 {$in1},[$inp]
|
||||
mov $bits,#7
|
||||
mov $rounds,#14
|
||||
vst1.32 {$in0},[$out],#16
|
||||
|
||||
.Loop256:
|
||||
vtbl.8 $key,{$in1},$mask
|
||||
vext.8 $tmp,$zero,$in0,#12
|
||||
vst1.32 {$in1},[$out],#16
|
||||
aese $key,$zero
|
||||
subs $bits,$bits,#1
|
||||
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in0,$in0,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $key,$key,$rcon
|
||||
veor $in0,$in0,$tmp
|
||||
vshl.u8 $rcon,$rcon,#1
|
||||
veor $in0,$in0,$key
|
||||
vst1.32 {$in0},[$out],#16
|
||||
b.eq .Ldone
|
||||
|
||||
vdup.32 $key,${in0}[3] // just splat
|
||||
vext.8 $tmp,$zero,$in1,#12
|
||||
aese $key,$zero
|
||||
|
||||
veor $in1,$in1,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in1,$in1,$tmp
|
||||
vext.8 $tmp,$zero,$tmp,#12
|
||||
veor $in1,$in1,$tmp
|
||||
|
||||
veor $in1,$in1,$key
|
||||
b .Loop256
|
||||
|
||||
.Ldone:
|
||||
str $rounds,[$out]
|
||||
mov $ptr,#0
|
||||
|
||||
.Lenc_key_abort:
|
||||
mov x0,$ptr // return value
|
||||
`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
|
||||
ret
|
||||
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
|
||||
|
||||
.globl ${prefix}_set_decrypt_key
|
||||
.type ${prefix}_set_decrypt_key,%function
|
||||
.align 5
|
||||
${prefix}_set_decrypt_key:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
stmdb sp!,{r4,lr}
|
||||
___
|
||||
$code.=<<___;
|
||||
bl .Lenc_key
|
||||
|
||||
cmp x0,#0
|
||||
b.ne .Ldec_key_abort
|
||||
|
||||
sub $out,$out,#240 // restore original $out
|
||||
mov x4,#-16
|
||||
add $inp,$out,x12,lsl#4 // end of key schedule
|
||||
|
||||
vld1.32 {v0.16b},[$out]
|
||||
vld1.32 {v1.16b},[$inp]
|
||||
vst1.32 {v0.16b},[$inp],x4
|
||||
vst1.32 {v1.16b},[$out],#16
|
||||
|
||||
.Loop_imc:
|
||||
vld1.32 {v0.16b},[$out]
|
||||
vld1.32 {v1.16b},[$inp]
|
||||
aesimc v0.16b,v0.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
vst1.32 {v0.16b},[$inp],x4
|
||||
vst1.32 {v1.16b},[$out],#16
|
||||
cmp $inp,$out
|
||||
b.hi .Loop_imc
|
||||
|
||||
vld1.32 {v0.16b},[$out]
|
||||
aesimc v0.16b,v0.16b
|
||||
vst1.32 {v0.16b},[$inp]
|
||||
|
||||
eor x0,x0,x0 // return value
|
||||
.Ldec_key_abort:
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
ldmia sp!,{r4,pc}
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
ldp x29,x30,[sp],#16
|
||||
ret
|
||||
___
|
||||
$code.=<<___;
|
||||
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
|
||||
___
|
||||
}}}
|
||||
{{{
|
||||
sub gen_block () {
|
||||
my $dir = shift;
|
||||
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
|
||||
my ($inp,$out,$key)=map("x$_",(0..2));
|
||||
my $rounds="w3";
|
||||
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
|
||||
|
||||
$code.=<<___;
|
||||
.globl ${prefix}_${dir}crypt
|
||||
.type ${prefix}_${dir}crypt,%function
|
||||
.align 5
|
||||
${prefix}_${dir}crypt:
|
||||
ldr $rounds,[$key,#240]
|
||||
vld1.32 {$rndkey0},[$key],#16
|
||||
vld1.8 {$inout},[$inp]
|
||||
sub $rounds,$rounds,#2
|
||||
vld1.32 {$rndkey1},[$key],#16
|
||||
|
||||
.Loop_${dir}c:
|
||||
aes$e $inout,$rndkey0
|
||||
vld1.32 {$rndkey0},[$key],#16
|
||||
aes$mc $inout,$inout
|
||||
subs $rounds,$rounds,#2
|
||||
aes$e $inout,$rndkey1
|
||||
vld1.32 {$rndkey1},[$key],#16
|
||||
aes$mc $inout,$inout
|
||||
b.gt .Loop_${dir}c
|
||||
|
||||
aes$e $inout,$rndkey0
|
||||
vld1.32 {$rndkey0},[$key]
|
||||
aes$mc $inout,$inout
|
||||
aes$e $inout,$rndkey1
|
||||
veor $inout,$inout,$rndkey0
|
||||
|
||||
vst1.8 {$inout},[$out]
|
||||
ret
|
||||
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
|
||||
___
|
||||
}
|
||||
&gen_block("en");
|
||||
&gen_block("de");
|
||||
}}}
|
||||
{{{
|
||||
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
|
||||
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
|
||||
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
|
||||
|
||||
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
|
||||
|
||||
### q8-q15 preloaded key schedule
|
||||
|
||||
$code.=<<___;
|
||||
.globl ${prefix}_cbc_encrypt
|
||||
.type ${prefix}_cbc_encrypt,%function
|
||||
.align 5
|
||||
${prefix}_cbc_encrypt:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
mov ip,sp
|
||||
stmdb sp!,{r4-r8,lr}
|
||||
vstmdb sp!,{d8-d15} @ ABI specification says so
|
||||
ldmia ip,{r4-r5} @ load remaining args
|
||||
___
|
||||
$code.=<<___;
|
||||
subs $len,$len,#16
|
||||
mov $step,#16
|
||||
b.lo .Lcbc_abort
|
||||
cclr $step,eq
|
||||
|
||||
cmp $enc,#0 // en- or decrypting?
|
||||
ldr $rounds,[$key,#240]
|
||||
and $len,$len,#-16
|
||||
vld1.8 {$ivec},[$ivp]
|
||||
vld1.8 {$dat},[$inp],$step
|
||||
|
||||
vld1.32 {q8-q9},[$key] // load key schedule...
|
||||
sub $rounds,$rounds,#6
|
||||
add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
|
||||
sub $rounds,$rounds,#2
|
||||
vld1.32 {q10-q11},[$key_],#32
|
||||
vld1.32 {q12-q13},[$key_],#32
|
||||
vld1.32 {q14-q15},[$key_],#32
|
||||
vld1.32 {$rndlast},[$key_]
|
||||
|
||||
add $key_,$key,#32
|
||||
mov $cnt,$rounds
|
||||
b.eq .Lcbc_dec
|
||||
|
||||
cmp $rounds,#2
|
||||
veor $dat,$dat,$ivec
|
||||
veor $rndzero_n_last,q8,$rndlast
|
||||
b.eq .Lcbc_enc128
|
||||
|
||||
.Loop_cbc_enc:
|
||||
aese $dat,q8
|
||||
vld1.32 {q8},[$key_],#16
|
||||
aesmc $dat,$dat
|
||||
subs $cnt,$cnt,#2
|
||||
aese $dat,q9
|
||||
vld1.32 {q9},[$key_],#16
|
||||
aesmc $dat,$dat
|
||||
b.gt .Loop_cbc_enc
|
||||
|
||||
aese $dat,q8
|
||||
aesmc $dat,$dat
|
||||
subs $len,$len,#16
|
||||
aese $dat,q9
|
||||
aesmc $dat,$dat
|
||||
cclr $step,eq
|
||||
aese $dat,q10
|
||||
aesmc $dat,$dat
|
||||
add $key_,$key,#16
|
||||
aese $dat,q11
|
||||
aesmc $dat,$dat
|
||||
vld1.8 {q8},[$inp],$step
|
||||
aese $dat,q12
|
||||
aesmc $dat,$dat
|
||||
veor q8,q8,$rndzero_n_last
|
||||
aese $dat,q13
|
||||
aesmc $dat,$dat
|
||||
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
|
||||
aese $dat,q14
|
||||
aesmc $dat,$dat
|
||||
aese $dat,q15
|
||||
|
||||
mov $cnt,$rounds
|
||||
veor $ivec,$dat,$rndlast
|
||||
vst1.8 {$ivec},[$out],#16
|
||||
b.hs .Loop_cbc_enc
|
||||
|
||||
b .Lcbc_done
|
||||
|
||||
.align 5
|
||||
.Lcbc_enc128:
|
||||
vld1.32 {$in0-$in1},[$key_]
|
||||
aese $dat,q8
|
||||
aesmc $dat,$dat
|
||||
b .Lenter_cbc_enc128
|
||||
.Loop_cbc_enc128:
|
||||
aese $dat,q8
|
||||
aesmc $dat,$dat
|
||||
vst1.8 {$ivec},[$out],#16
|
||||
.Lenter_cbc_enc128:
|
||||
aese $dat,q9
|
||||
aesmc $dat,$dat
|
||||
subs $len,$len,#16
|
||||
aese $dat,$in0
|
||||
aesmc $dat,$dat
|
||||
cclr $step,eq
|
||||
aese $dat,$in1
|
||||
aesmc $dat,$dat
|
||||
aese $dat,q10
|
||||
aesmc $dat,$dat
|
||||
aese $dat,q11
|
||||
aesmc $dat,$dat
|
||||
vld1.8 {q8},[$inp],$step
|
||||
aese $dat,q12
|
||||
aesmc $dat,$dat
|
||||
aese $dat,q13
|
||||
aesmc $dat,$dat
|
||||
aese $dat,q14
|
||||
aesmc $dat,$dat
|
||||
veor q8,q8,$rndzero_n_last
|
||||
aese $dat,q15
|
||||
veor $ivec,$dat,$rndlast
|
||||
b.hs .Loop_cbc_enc128
|
||||
|
||||
vst1.8 {$ivec},[$out],#16
|
||||
b .Lcbc_done
|
||||
___
|
||||
{
|
||||
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
|
||||
$code.=<<___;
|
||||
.align 5
|
||||
.Lcbc_dec:
|
||||
vld1.8 {$dat2},[$inp],#16
|
||||
subs $len,$len,#32 // bias
|
||||
add $cnt,$rounds,#2
|
||||
vorr $in1,$dat,$dat
|
||||
vorr $dat1,$dat,$dat
|
||||
vorr $in2,$dat2,$dat2
|
||||
b.lo .Lcbc_dec_tail
|
||||
|
||||
vorr $dat1,$dat2,$dat2
|
||||
vld1.8 {$dat2},[$inp],#16
|
||||
vorr $in0,$dat,$dat
|
||||
vorr $in1,$dat1,$dat1
|
||||
vorr $in2,$dat2,$dat2
|
||||
|
||||
.Loop3x_cbc_dec:
|
||||
aesd $dat0,q8
|
||||
aesd $dat1,q8
|
||||
aesd $dat2,q8
|
||||
vld1.32 {q8},[$key_],#16
|
||||
aesimc $dat0,$dat0
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
subs $cnt,$cnt,#2
|
||||
aesd $dat0,q9
|
||||
aesd $dat1,q9
|
||||
aesd $dat2,q9
|
||||
vld1.32 {q9},[$key_],#16
|
||||
aesimc $dat0,$dat0
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
b.gt .Loop3x_cbc_dec
|
||||
|
||||
aesd $dat0,q8
|
||||
aesd $dat1,q8
|
||||
aesd $dat2,q8
|
||||
veor $tmp0,$ivec,$rndlast
|
||||
aesimc $dat0,$dat0
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
veor $tmp1,$in0,$rndlast
|
||||
aesd $dat0,q9
|
||||
aesd $dat1,q9
|
||||
aesd $dat2,q9
|
||||
veor $tmp2,$in1,$rndlast
|
||||
subs $len,$len,#0x30
|
||||
aesimc $dat0,$dat0
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
vorr $ivec,$in2,$in2
|
||||
mov.lo x6,$len // x6, $cnt, is zero at this point
|
||||
aesd $dat0,q12
|
||||
aesd $dat1,q12
|
||||
aesd $dat2,q12
|
||||
add $inp,$inp,x6 // $inp is adjusted in such way that
|
||||
// at exit from the loop $dat1-$dat2
|
||||
// are loaded with last "words"
|
||||
aesimc $dat0,$dat0
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
mov $key_,$key
|
||||
aesd $dat0,q13
|
||||
aesd $dat1,q13
|
||||
aesd $dat2,q13
|
||||
vld1.8 {$in0},[$inp],#16
|
||||
aesimc $dat0,$dat0
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
vld1.8 {$in1},[$inp],#16
|
||||
aesd $dat0,q14
|
||||
aesd $dat1,q14
|
||||
aesd $dat2,q14
|
||||
vld1.8 {$in2},[$inp],#16
|
||||
aesimc $dat0,$dat0
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
|
||||
aesd $dat0,q15
|
||||
aesd $dat1,q15
|
||||
aesd $dat2,q15
|
||||
|
||||
add $cnt,$rounds,#2
|
||||
veor $tmp0,$tmp0,$dat0
|
||||
veor $tmp1,$tmp1,$dat1
|
||||
veor $dat2,$dat2,$tmp2
|
||||
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
|
||||
vorr $dat0,$in0,$in0
|
||||
vst1.8 {$tmp0},[$out],#16
|
||||
vorr $dat1,$in1,$in1
|
||||
vst1.8 {$tmp1},[$out],#16
|
||||
vst1.8 {$dat2},[$out],#16
|
||||
vorr $dat2,$in2,$in2
|
||||
b.hs .Loop3x_cbc_dec
|
||||
|
||||
cmn $len,#0x30
|
||||
b.eq .Lcbc_done
|
||||
nop
|
||||
|
||||
.Lcbc_dec_tail:
|
||||
aesd $dat1,q8
|
||||
aesd $dat2,q8
|
||||
vld1.32 {q8},[$key_],#16
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
subs $cnt,$cnt,#2
|
||||
aesd $dat1,q9
|
||||
aesd $dat2,q9
|
||||
vld1.32 {q9},[$key_],#16
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
b.gt .Lcbc_dec_tail
|
||||
|
||||
aesd $dat1,q8
|
||||
aesd $dat2,q8
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
aesd $dat1,q9
|
||||
aesd $dat2,q9
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
aesd $dat1,q12
|
||||
aesd $dat2,q12
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
cmn $len,#0x20
|
||||
aesd $dat1,q13
|
||||
aesd $dat2,q13
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
veor $tmp1,$ivec,$rndlast
|
||||
aesd $dat1,q14
|
||||
aesd $dat2,q14
|
||||
aesimc $dat1,$dat1
|
||||
aesimc $dat2,$dat2
|
||||
veor $tmp2,$in1,$rndlast
|
||||
aesd $dat1,q15
|
||||
aesd $dat2,q15
|
||||
b.eq .Lcbc_dec_one
|
||||
veor $tmp1,$tmp1,$dat1
|
||||
veor $tmp2,$tmp2,$dat2
|
||||
vorr $ivec,$in2,$in2
|
||||
vst1.8 {$tmp1},[$out],#16
|
||||
vst1.8 {$tmp2},[$out],#16
|
||||
b .Lcbc_done
|
||||
|
||||
.Lcbc_dec_one:
|
||||
veor $tmp1,$tmp1,$dat2
|
||||
vorr $ivec,$in2,$in2
|
||||
vst1.8 {$tmp1},[$out],#16
|
||||
|
||||
.Lcbc_done:
|
||||
vst1.8 {$ivec},[$ivp]
|
||||
.Lcbc_abort:
|
||||
___
|
||||
}
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
vldmia sp!,{d8-d15}
|
||||
ldmia sp!,{r4-r8,pc}
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
ldr x29,[sp],#16
|
||||
ret
|
||||
___
|
||||
$code.=<<___;
|
||||
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
|
||||
___
|
||||
}}}
|
||||
{{{
|
||||
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
|
||||
my ($rounds,$cnt,$key_)=("w5","w6","x7");
|
||||
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
|
||||
my $step="x12"; # aliases with $tctr2
|
||||
|
||||
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
|
||||
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
|
||||
|
||||
my ($dat,$tmp)=($dat0,$tmp0);
|
||||
|
||||
### q8-q15 preloaded key schedule
|
||||
|
||||
$code.=<<___;
|
||||
.globl ${prefix}_ctr32_encrypt_blocks
|
||||
.type ${prefix}_ctr32_encrypt_blocks,%function
|
||||
.align 5
|
||||
${prefix}_ctr32_encrypt_blocks:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
mov ip,sp
|
||||
stmdb sp!,{r4-r10,lr}
|
||||
vstmdb sp!,{d8-d15} @ ABI specification says so
|
||||
ldr r4, [ip] @ load remaining arg
|
||||
___
|
||||
$code.=<<___;
|
||||
ldr $rounds,[$key,#240]
|
||||
|
||||
ldr $ctr, [$ivp, #12]
|
||||
vld1.32 {$dat0},[$ivp]
|
||||
|
||||
vld1.32 {q8-q9},[$key] // load key schedule...
|
||||
sub $rounds,$rounds,#4
|
||||
mov $step,#16
|
||||
cmp $len,#2
|
||||
add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
|
||||
sub $rounds,$rounds,#2
|
||||
vld1.32 {q12-q13},[$key_],#32
|
||||
vld1.32 {q14-q15},[$key_],#32
|
||||
vld1.32 {$rndlast},[$key_]
|
||||
add $key_,$key,#32
|
||||
mov $cnt,$rounds
|
||||
cclr $step,lo
|
||||
#ifndef __ARMEB__
|
||||
rev $ctr, $ctr
|
||||
#endif
|
||||
vorr $dat1,$dat0,$dat0
|
||||
add $tctr1, $ctr, #1
|
||||
vorr $dat2,$dat0,$dat0
|
||||
add $ctr, $ctr, #2
|
||||
vorr $ivec,$dat0,$dat0
|
||||
rev $tctr1, $tctr1
|
||||
vmov.32 ${dat1}[3],$tctr1
|
||||
b.ls .Lctr32_tail
|
||||
rev $tctr2, $ctr
|
||||
sub $len,$len,#3 // bias
|
||||
vmov.32 ${dat2}[3],$tctr2
|
||||
b .Loop3x_ctr32
|
||||
|
||||
.align 4
|
||||
.Loop3x_ctr32:
|
||||
aese $dat0,q8
|
||||
aese $dat1,q8
|
||||
aese $dat2,q8
|
||||
vld1.32 {q8},[$key_],#16
|
||||
aesmc $dat0,$dat0
|
||||
aesmc $dat1,$dat1
|
||||
aesmc $dat2,$dat2
|
||||
subs $cnt,$cnt,#2
|
||||
aese $dat0,q9
|
||||
aese $dat1,q9
|
||||
aese $dat2,q9
|
||||
vld1.32 {q9},[$key_],#16
|
||||
aesmc $dat0,$dat0
|
||||
aesmc $dat1,$dat1
|
||||
aesmc $dat2,$dat2
|
||||
b.gt .Loop3x_ctr32
|
||||
|
||||
aese $dat0,q8
|
||||
aese $dat1,q8
|
||||
aese $dat2,q8
|
||||
mov $key_,$key
|
||||
aesmc $tmp0,$dat0
|
||||
vld1.8 {$in0},[$inp],#16
|
||||
aesmc $tmp1,$dat1
|
||||
aesmc $dat2,$dat2
|
||||
vorr $dat0,$ivec,$ivec
|
||||
aese $tmp0,q9
|
||||
vld1.8 {$in1},[$inp],#16
|
||||
aese $tmp1,q9
|
||||
aese $dat2,q9
|
||||
vorr $dat1,$ivec,$ivec
|
||||
aesmc $tmp0,$tmp0
|
||||
vld1.8 {$in2},[$inp],#16
|
||||
aesmc $tmp1,$tmp1
|
||||
aesmc $tmp2,$dat2
|
||||
vorr $dat2,$ivec,$ivec
|
||||
add $tctr0,$ctr,#1
|
||||
aese $tmp0,q12
|
||||
aese $tmp1,q12
|
||||
aese $tmp2,q12
|
||||
veor $in0,$in0,$rndlast
|
||||
add $tctr1,$ctr,#2
|
||||
aesmc $tmp0,$tmp0
|
||||
aesmc $tmp1,$tmp1
|
||||
aesmc $tmp2,$tmp2
|
||||
veor $in1,$in1,$rndlast
|
||||
add $ctr,$ctr,#3
|
||||
aese $tmp0,q13
|
||||
aese $tmp1,q13
|
||||
aese $tmp2,q13
|
||||
veor $in2,$in2,$rndlast
|
||||
rev $tctr0,$tctr0
|
||||
aesmc $tmp0,$tmp0
|
||||
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
|
||||
aesmc $tmp1,$tmp1
|
||||
aesmc $tmp2,$tmp2
|
||||
vmov.32 ${dat0}[3], $tctr0
|
||||
rev $tctr1,$tctr1
|
||||
aese $tmp0,q14
|
||||
aese $tmp1,q14
|
||||
aese $tmp2,q14
|
||||
vmov.32 ${dat1}[3], $tctr1
|
||||
rev $tctr2,$ctr
|
||||
aesmc $tmp0,$tmp0
|
||||
aesmc $tmp1,$tmp1
|
||||
aesmc $tmp2,$tmp2
|
||||
vmov.32 ${dat2}[3], $tctr2
|
||||
subs $len,$len,#3
|
||||
aese $tmp0,q15
|
||||
aese $tmp1,q15
|
||||
aese $tmp2,q15
|
||||
|
||||
mov $cnt,$rounds
|
||||
veor $in0,$in0,$tmp0
|
||||
veor $in1,$in1,$tmp1
|
||||
veor $in2,$in2,$tmp2
|
||||
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
|
||||
vst1.8 {$in0},[$out],#16
|
||||
vst1.8 {$in1},[$out],#16
|
||||
vst1.8 {$in2},[$out],#16
|
||||
b.hs .Loop3x_ctr32
|
||||
|
||||
adds $len,$len,#3
|
||||
b.eq .Lctr32_done
|
||||
cmp $len,#1
|
||||
mov $step,#16
|
||||
cclr $step,eq
|
||||
|
||||
.Lctr32_tail:
|
||||
aese $dat0,q8
|
||||
aese $dat1,q8
|
||||
vld1.32 {q8},[$key_],#16
|
||||
aesmc $dat0,$dat0
|
||||
aesmc $dat1,$dat1
|
||||
subs $cnt,$cnt,#2
|
||||
aese $dat0,q9
|
||||
aese $dat1,q9
|
||||
vld1.32 {q9},[$key_],#16
|
||||
aesmc $dat0,$dat0
|
||||
aesmc $dat1,$dat1
|
||||
b.gt .Lctr32_tail
|
||||
|
||||
aese $dat0,q8
|
||||
aese $dat1,q8
|
||||
aesmc $dat0,$dat0
|
||||
aesmc $dat1,$dat1
|
||||
aese $dat0,q9
|
||||
aese $dat1,q9
|
||||
aesmc $dat0,$dat0
|
||||
aesmc $dat1,$dat1
|
||||
vld1.8 {$in0},[$inp],$step
|
||||
aese $dat0,q12
|
||||
aese $dat1,q12
|
||||
vld1.8 {$in1},[$inp]
|
||||
aesmc $dat0,$dat0
|
||||
aesmc $dat1,$dat1
|
||||
aese $dat0,q13
|
||||
aese $dat1,q13
|
||||
aesmc $dat0,$dat0
|
||||
aesmc $dat1,$dat1
|
||||
aese $dat0,q14
|
||||
aese $dat1,q14
|
||||
veor $in0,$in0,$rndlast
|
||||
aesmc $dat0,$dat0
|
||||
aesmc $dat1,$dat1
|
||||
veor $in1,$in1,$rndlast
|
||||
aese $dat0,q15
|
||||
aese $dat1,q15
|
||||
|
||||
cmp $len,#1
|
||||
veor $in0,$in0,$dat0
|
||||
veor $in1,$in1,$dat1
|
||||
vst1.8 {$in0},[$out],#16
|
||||
b.eq .Lctr32_done
|
||||
vst1.8 {$in1},[$out]
|
||||
|
||||
.Lctr32_done:
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
vldmia sp!,{d8-d15}
|
||||
ldmia sp!,{r4-r10,pc}
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
ldr x29,[sp],#16
|
||||
ret
|
||||
___
|
||||
$code.=<<___;
|
||||
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
|
||||
___
|
||||
}}}
|
||||
$code.=<<___;
|
||||
#endif
|
||||
___
|
||||
########################################
|
||||
if ($flavour =~ /64/) { ######## 64-bit code
|
||||
my %opcode = (
|
||||
"aesd" => 0x4e285800, "aese" => 0x4e284800,
|
||||
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
|
||||
|
||||
local *unaes = sub {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
|
||||
sprintf ".inst\t0x%08x\t//%s %s",
|
||||
$opcode{$mnemonic}|$1|($2<<5),
|
||||
$mnemonic,$arg;
|
||||
};
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval($1)/geo;
|
||||
|
||||
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
|
||||
s/@\s/\/\//o; # old->new style commentary
|
||||
|
||||
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
|
||||
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
|
||||
s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
|
||||
s/vmov\.i8/movi/o or # fix up legacy mnemonics
|
||||
s/vext\.8/ext/o or
|
||||
s/vrev32\.8/rev32/o or
|
||||
s/vtst\.8/cmtst/o or
|
||||
s/vshr/ushr/o or
|
||||
s/^(\s+)v/$1/o or # strip off v prefix
|
||||
s/\bbx\s+lr\b/ret/o;
|
||||
|
||||
# fix up remainig legacy suffixes
|
||||
s/\.[ui]?8//o;
|
||||
m/\],#8/o and s/\.16b/\.8b/go;
|
||||
s/\.[ui]?32//o and s/\.16b/\.4s/go;
|
||||
s/\.[ui]?64//o and s/\.16b/\.2d/go;
|
||||
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
} else { ######## 32-bit code
|
||||
my %opcode = (
|
||||
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
|
||||
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
|
||||
|
||||
local *unaes = sub {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
|
||||
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<1) |(($2&8)<<2);
|
||||
# since ARMv7 instructions are always encoded little-endian.
|
||||
# correct solution is to use .inst directive, but older
|
||||
# assemblers don't implement it:-(
|
||||
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||||
$word&0xff,($word>>8)&0xff,
|
||||
($word>>16)&0xff,($word>>24)&0xff,
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
};
|
||||
|
||||
sub unvtbl {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
|
||||
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
|
||||
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
|
||||
}
|
||||
|
||||
sub unvdup32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
|
||||
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||||
}
|
||||
|
||||
sub unvmov32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
|
||||
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
|
||||
}
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval($1)/geo;
|
||||
|
||||
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
|
||||
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
|
||||
s/\/\/\s?/@ /o; # new->old style commentary
|
||||
|
||||
# fix up remainig new-style suffixes
|
||||
s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
|
||||
s/\],#[0-9]+/]!/o;
|
||||
|
||||
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
|
||||
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
|
||||
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
|
||||
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
|
||||
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
|
||||
s/^(\s+)b\./$1b/o or
|
||||
s/^(\s+)mov\./$1mov/o or
|
||||
s/^(\s+)ret/$1bx\tlr/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
}
|
||||
|
||||
close STDOUT;
|
@ -62,11 +62,22 @@
|
||||
# define __ARMEL__
|
||||
# endif
|
||||
# elif defined(__GNUC__)
|
||||
# if defined(__aarch64__)
|
||||
# define __ARM_ARCH__ 8
|
||||
# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
# define __ARMEB__
|
||||
# else
|
||||
# define __ARMEL__
|
||||
# endif
|
||||
/* Why doesn't gcc define __ARM_ARCH__? Instead it defines
|
||||
* bunch of below macros. See all_architectires[] table in
|
||||
* gcc/config/arm/arm.c. On a side note it defines
|
||||
* __ARMEL__/__ARMEB__ for little-/big-endian. */
|
||||
# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
|
||||
# elif defined(__ARM_ARCH)
|
||||
# define __ARM_ARCH__ __ARM_ARCH
|
||||
# elif defined(__ARM_ARCH_8A__)
|
||||
# define __ARM_ARCH__ 8
|
||||
# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
|
||||
defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
|
||||
defined(__ARM_ARCH_7EM__)
|
||||
# define __ARM_ARCH__ 7
|
||||
@ -87,6 +98,10 @@
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* Even when building for 32-bit ARM, support for aarch64 crypto instructions
|
||||
* will be included. */
|
||||
#define __ARM_MAX_ARCH__ 8
|
||||
|
||||
#if !__ASSEMBLER__
|
||||
|
||||
/* OPENSSL_armcap_P contains flags describing the capabilities of the CPU and
|
||||
@ -94,6 +109,8 @@
|
||||
* |cpu.h|. */
|
||||
extern uint32_t OPENSSL_armcap_P;
|
||||
|
||||
#endif /* !__ASSEMBLER__ */
|
||||
|
||||
/* ARMV7_NEON is true when a NEON unit is present in the current CPU. */
|
||||
#define ARMV7_NEON (1 << 0)
|
||||
|
||||
@ -101,9 +118,19 @@ extern uint32_t OPENSSL_armcap_P;
|
||||
* The Poly1305 NEON code is known to trigger bugs in the NEON units of some
|
||||
* phones. If this bit isn't set then the Poly1305 NEON code won't be used.
|
||||
* See https://code.google.com/p/chromium/issues/detail?id=341598. */
|
||||
#define ARMV7_NEON_FUNCTIONAL (1 << 1)
|
||||
#define ARMV7_NEON_FUNCTIONAL (1 << 10)
|
||||
|
||||
#endif /* !__ASSEMBLER__ */
|
||||
/* ARMV8_AES indicates support for hardware AES instructions. */
|
||||
#define ARMV8_AES (1 << 2)
|
||||
|
||||
/* ARMV8_SHA1 indicates support for hardware SHA-1 instructions. */
|
||||
#define ARMV8_SHA1 (1 << 3)
|
||||
|
||||
/* ARMV8_SHA256 indicates support for hardware SHA-256 instructions. */
|
||||
#define ARMV8_SHA256 (1 << 4)
|
||||
|
||||
/* ARMV8_PMULL indicates support for carryless multiplication. */
|
||||
#define ARMV8_PMULL (1 << 5)
|
||||
|
||||
|
||||
#endif /* OPENSSL_HEADER_THREAD_H */
|
||||
|
@ -263,7 +263,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
|
||||
#ifdef BN_LLONG
|
||||
BN_ULLONG t2;
|
||||
|
||||
#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(div_asm)
|
||||
#if defined(BN_LLONG) && !defined(div_asm)
|
||||
q = (BN_ULONG)(((((BN_ULLONG)n0) << BN_BITS2) | n1) / d0);
|
||||
#else
|
||||
q = div_asm(n0, n1, d0);
|
||||
|
@ -130,40 +130,8 @@
|
||||
BN_UMULT_LOHI(r0, r1, tmp, tmp); \
|
||||
}
|
||||
|
||||
#elif defined(BN_UMULT_HIGH)
|
||||
#define mul_add(r, a, w, c) \
|
||||
{ \
|
||||
BN_ULONG high, low, ret, tmp = (a); \
|
||||
ret = (r); \
|
||||
high = BN_UMULT_HIGH(w, tmp); \
|
||||
ret += (c); \
|
||||
low = (w) * tmp; \
|
||||
(c) = (ret < (c)) ? 1 : 0; \
|
||||
(c) += high; \
|
||||
ret += low; \
|
||||
(c) += (ret < low) ? 1 : 0; \
|
||||
(r) = ret; \
|
||||
}
|
||||
|
||||
#define mul(r, a, w, c) \
|
||||
{ \
|
||||
BN_ULONG high, low, ret, ta = (a); \
|
||||
low = (w) * ta; \
|
||||
high = BN_UMULT_HIGH(w, ta); \
|
||||
ret = low + (c); \
|
||||
(c) = high; \
|
||||
(c) += (ret < low) ? 1 : 0; \
|
||||
(r) = ret; \
|
||||
}
|
||||
|
||||
#define sqr(r0, r1, a) \
|
||||
{ \
|
||||
BN_ULONG tmp = (a); \
|
||||
(r0) = tmp * tmp; \
|
||||
(r1) = BN_UMULT_HIGH(tmp, tmp); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*************************************************************
|
||||
* No long long type
|
||||
*/
|
||||
@ -424,7 +392,7 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
|
||||
|
||||
#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
|
||||
|
||||
#if defined(BN_LLONG) && defined(BN_DIV2W)
|
||||
#if defined(BN_LLONG)
|
||||
|
||||
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
|
||||
return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
|
||||
@ -502,7 +470,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
|
||||
#endif /* !defined(BN_LLONG) */
|
||||
|
||||
#ifdef BN_LLONG
|
||||
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
|
||||
@ -749,49 +717,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
|
||||
|
||||
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
|
||||
|
||||
#elif defined(BN_UMULT_HIGH)
|
||||
|
||||
/* Keep in mind that additions to hi can not overflow, because
|
||||
* the high word of a multiplication result cannot be all-ones. */
|
||||
#define mul_add_c(a, b, c0, c1, c2) \
|
||||
do { \
|
||||
BN_ULONG ta = (a), tb = (b); \
|
||||
BN_ULONG lo = ta * tb; \
|
||||
BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \
|
||||
c0 += lo; \
|
||||
hi += (c0 < lo) ? 1 : 0; \
|
||||
c1 += hi; \
|
||||
c2 += (c1 < hi) ? 1 : 0; \
|
||||
} while (0)
|
||||
|
||||
#define mul_add_c2(a, b, c0, c1, c2) \
|
||||
do { \
|
||||
BN_ULONG ta = (a), tb = (b), tt; \
|
||||
BN_ULONG lo = ta * tb; \
|
||||
BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \
|
||||
c0 += lo; \
|
||||
tt = hi + ((c0 < lo) ? 1 : 0); \
|
||||
c1 += tt; \
|
||||
c2 += (c1 < tt) ? 1 : 0; \
|
||||
c0 += lo; \
|
||||
hi += (c0 < lo) ? 1 : 0; \
|
||||
c1 += hi; \
|
||||
c2 += (c1 < hi) ? 1 : 0; \
|
||||
} while (0)
|
||||
|
||||
#define sqr_add_c(a, i, c0, c1, c2) \
|
||||
do { \
|
||||
BN_ULONG ta = (a)[i]; \
|
||||
BN_ULONG lo = ta * ta; \
|
||||
BN_ULONG hi = BN_UMULT_HIGH(ta, ta); \
|
||||
c0 += lo; \
|
||||
hi += (c0 < lo) ? 1 : 0; \
|
||||
c1 += hi; \
|
||||
c2 += (c1 < hi) ? 1 : 0; \
|
||||
} while (0)
|
||||
|
||||
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
|
||||
|
||||
#else /* !BN_LLONG */
|
||||
|
||||
/* Keep in mind that additions to hi can not overflow, because
|
||||
|
@ -142,8 +142,14 @@ BIGNUM *bn_expand(BIGNUM *bn, unsigned bits);
|
||||
|
||||
#if defined(OPENSSL_64_BIT)
|
||||
|
||||
#define BN_ULLONG unsigned long long
|
||||
#if !defined(_MSC_VER)
|
||||
/* MSVC doesn't support two-word integers on 64-bit. */
|
||||
#define BN_LLONG __int128_t
|
||||
#define BN_ULLONG __uint128_t
|
||||
#endif
|
||||
|
||||
#define BN_BITS 128
|
||||
#define BN_BITS2 64
|
||||
#define BN_BYTES 8
|
||||
#define BN_BITS4 32
|
||||
#define BN_MASK (0xffffffffffffffffffffffffffffffffLL)
|
||||
@ -160,9 +166,11 @@ BIGNUM *bn_expand(BIGNUM *bn, unsigned bits);
|
||||
|
||||
#elif defined(OPENSSL_32_BIT)
|
||||
|
||||
#define BN_ULLONG unsigned long long
|
||||
#define BN_MASK (0xffffffffffffffffLL)
|
||||
#define BN_LLONG int64_t
|
||||
#define BN_ULLONG uint64_t
|
||||
#define BN_MASK (0xffffffffffffffffLL)
|
||||
#define BN_BITS 64
|
||||
#define BN_BITS2 32
|
||||
#define BN_BYTES 4
|
||||
#define BN_BITS4 16
|
||||
#define BN_MASK2 (0xffffffffL)
|
||||
@ -188,6 +196,11 @@ BIGNUM *bn_expand(BIGNUM *bn, unsigned bits);
|
||||
#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32) /* 32 */
|
||||
#define BN_MONT_CTX_SET_SIZE_WORD (64) /* 32 */
|
||||
|
||||
#if defined(BN_LLONG)
|
||||
#define Lw(t) (((BN_ULONG)(t))&BN_MASK2)
|
||||
#define Hw(t) (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
|
||||
#endif
|
||||
|
||||
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
|
||||
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
|
||||
void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
|
||||
@ -213,6 +226,8 @@ int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl);
|
||||
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
||||
const BN_ULONG *np, const BN_ULONG *n0, int num);
|
||||
|
||||
#if !defined(BN_LLONG)
|
||||
|
||||
#define LBITS(a) ((a) & BN_MASK2l)
|
||||
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
|
||||
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
|
||||
@ -243,6 +258,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
||||
(h) = ht; \
|
||||
}
|
||||
|
||||
#endif /* !defined(BN_LLONG) */
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
|
||||
# if defined(__GNUC__) && __GNUC__ >= 2
|
||||
# define BN_UMULT_HIGH(a,b) ({ \
|
||||
|
@ -103,14 +103,33 @@ static char bsaes_capable(void) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
|
||||
#elif !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
|
||||
#include "../arm_arch.h"
|
||||
#if __ARM_ARCH__ >= 7
|
||||
|
||||
#if defined(OPENSSL_ARM)
|
||||
#define BSAES
|
||||
static char bsaes_capable(void) {
|
||||
return CRYPTO_is_NEON_capable();
|
||||
}
|
||||
#endif /* __ARM_ARCH__ >= 7 */
|
||||
#endif
|
||||
|
||||
#define HWAES
|
||||
static char hwaes_capable(void) {
|
||||
return (OPENSSL_armcap_P & ARMV8_AES) != 0;
|
||||
}
|
||||
|
||||
int aes_v8_set_encrypt_key(const uint8_t *user_key, const int bits,
|
||||
AES_KEY *key);
|
||||
int aes_v8_set_decrypt_key(const uint8_t *user_key, const int bits,
|
||||
AES_KEY *key);
|
||||
void aes_v8_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
|
||||
void aes_v8_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
|
||||
void aes_v8_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
|
||||
const AES_KEY *key, uint8_t *ivec, const int enc);
|
||||
void aes_v8_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
|
||||
const AES_KEY *key, const uint8_t ivec[16]);
|
||||
|
||||
#endif /* OPENSSL_ARM */
|
||||
|
||||
#if defined(BSAES)
|
||||
@ -174,6 +193,41 @@ void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(HWAES)
|
||||
/* If HWAES isn't defined then we provide dummy functions for each of the hwaes
|
||||
* functions. */
|
||||
int hwaes_capable() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int aes_v8_set_encrypt_key(const uint8_t *user_key, int bits,
|
||||
AES_KEY *key) {
|
||||
abort();
|
||||
}
|
||||
|
||||
int aes_v8_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) {
|
||||
abort();
|
||||
}
|
||||
|
||||
void aes_v8_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
|
||||
abort();
|
||||
}
|
||||
|
||||
void aes_v8_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
|
||||
abort();
|
||||
}
|
||||
|
||||
void aes_v8_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
|
||||
const AES_KEY *key, uint8_t *ivec, int enc) {
|
||||
abort();
|
||||
}
|
||||
|
||||
void aes_v8_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
|
||||
const AES_KEY *key, const uint8_t ivec[16]) {
|
||||
abort();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86_64) || defined(OPENSSL_X86))
|
||||
int aesni_set_encrypt_key(const uint8_t *userKey, int bits, AES_KEY *key);
|
||||
@ -227,7 +281,14 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
|
||||
|
||||
mode = ctx->cipher->flags & EVP_CIPH_MODE_MASK;
|
||||
if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) {
|
||||
if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) {
|
||||
if (hwaes_capable()) {
|
||||
ret = aes_v8_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
|
||||
dat->block = (block128_f)aes_v8_decrypt;
|
||||
dat->stream.cbc = NULL;
|
||||
if (mode == EVP_CIPH_CBC_MODE) {
|
||||
dat->stream.cbc = (cbc128_f)aes_v8_cbc_encrypt;
|
||||
}
|
||||
} else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) {
|
||||
ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
|
||||
dat->block = (block128_f)AES_decrypt;
|
||||
dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt;
|
||||
@ -242,6 +303,15 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
|
||||
dat->stream.cbc =
|
||||
mode == EVP_CIPH_CBC_MODE ? (cbc128_f)AES_cbc_encrypt : NULL;
|
||||
}
|
||||
} else if (hwaes_capable()) {
|
||||
ret = aes_v8_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
|
||||
dat->block = (block128_f)aes_v8_encrypt;
|
||||
dat->stream.cbc = NULL;
|
||||
if (mode == EVP_CIPH_CBC_MODE) {
|
||||
dat->stream.cbc = (cbc128_f)aes_v8_cbc_encrypt;
|
||||
} else if (mode == EVP_CIPH_CTR_MODE) {
|
||||
dat->stream.ctr = (ctr128_f)aes_v8_ctr32_encrypt_blocks;
|
||||
}
|
||||
} else if (bsaes_capable() && mode == EVP_CIPH_CTR_MODE) {
|
||||
ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
|
||||
dat->block = (block128_f)AES_encrypt;
|
||||
@ -316,6 +386,12 @@ static int aes_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
|
||||
|
||||
static ctr128_f aes_gcm_set_key(AES_KEY *aes_key, GCM128_CONTEXT *gcm_ctx,
|
||||
const uint8_t *key, size_t key_len) {
|
||||
if (hwaes_capable()) {
|
||||
aes_v8_set_encrypt_key(key, key_len * 8, aes_key);
|
||||
CRYPTO_gcm128_init(gcm_ctx, aes_key, (block128_f)aes_v8_encrypt);
|
||||
return (ctr128_f)aes_v8_ctr32_encrypt_blocks;
|
||||
}
|
||||
|
||||
if (bsaes_capable()) {
|
||||
AES_set_encrypt_key(key, key_len * 8, aes_key);
|
||||
CRYPTO_gcm128_init(gcm_ctx, aes_key, (block128_f)AES_encrypt);
|
||||
@ -1274,6 +1350,8 @@ const EVP_AEAD *EVP_aead_aes_256_key_wrap(void) { return &aead_aes_256_key_wrap;
|
||||
int EVP_has_aes_hardware(void) {
|
||||
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
|
||||
return aesni_capable() && crypto_gcm_clmul_enabled();
|
||||
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
|
||||
return hwaes_capable() && (OPENSSL_armcap_P & ARMV8_PMULL);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
|
@ -56,18 +56,14 @@
|
||||
|
||||
#include <openssl/cpu.h>
|
||||
|
||||
#if defined(OPENSSL_ARM)
|
||||
#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
#include "arm_arch.h"
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
uint32_t OPENSSL_armcap_P = ARMV7_NEON | ARMV7_NEON_FUNCTIONAL;
|
||||
#else
|
||||
uint32_t OPENSSL_armcap_P = ARMV7_NEON_FUNCTIONAL;
|
||||
#endif
|
||||
|
||||
char CRYPTO_is_NEON_capable(void) {
|
||||
return (OPENSSL_armcap_P & ARMV7_NEON) != 0;
|
||||
@ -94,4 +90,53 @@ void CRYPTO_set_NEON_functional(char neon_functional) {
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* defined(OPENSSL_ARM) */
|
||||
void OPENSSL_cpuid_setup(void) {
|
||||
unsigned long hwcap = getauxval(AT_HWCAP);
|
||||
|
||||
#if defined(OPENSSL_ARM)
|
||||
static const unsigned long kNEON = 1 << 12;
|
||||
if ((hwcap & kNEON) == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* In 32-bit mode, the ARMv8 feature bits are in a different aux vector
|
||||
* value. */
|
||||
hwcap = getauxval(AT_HWCAP2);
|
||||
|
||||
/* See /usr/include/asm/hwcap.h on an ARM installation for the source of
|
||||
* these values. */
|
||||
static const unsigned long kAES = 1 << 0;
|
||||
static const unsigned long kPMULL = 1 << 1;
|
||||
static const unsigned long kSHA1 = 1 << 2;
|
||||
static const unsigned long kSHA256 = 1 << 3;
|
||||
#elif defined(OPENSSL_AARCH64)
|
||||
/* See /usr/include/asm/hwcap.h on an aarch64 installation for the source of
|
||||
* these values. */
|
||||
static const unsigned long kNEON = 1 << 1;
|
||||
static const unsigned long kAES = 1 << 3;
|
||||
static const unsigned long kPMULL = 1 << 4;
|
||||
static const unsigned long kSHA1 = 1 << 5;
|
||||
static const unsigned long kSHA256 = 1 << 6;
|
||||
|
||||
if ((hwcap & kNEON) == 0) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
OPENSSL_armcap_P |= ARMV7_NEON | ARMV7_NEON_FUNCTIONAL;
|
||||
|
||||
if (hwcap & kAES) {
|
||||
OPENSSL_armcap_P |= ARMV8_AES;
|
||||
}
|
||||
if (hwcap & kPMULL) {
|
||||
OPENSSL_armcap_P |= ARMV8_PMULL;
|
||||
}
|
||||
if (hwcap & kSHA1) {
|
||||
OPENSSL_armcap_P |= ARMV8_SHA1;
|
||||
}
|
||||
if (hwcap & kSHA256) {
|
||||
OPENSSL_armcap_P |= ARMV8_SHA256;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) */
|
||||
|
@ -66,15 +66,6 @@
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
/* This value must be explicitly initialised to zero in order to work around a
|
||||
* bug in libtool or the linker on OS X.
|
||||
*
|
||||
* If not initialised then it becomes a "common symbol". When put into an
|
||||
* archive, linking on OS X will fail to resolve common symbols. By
|
||||
* initialising it to zero, it becomes a "data symbol", which isn't so
|
||||
* affected. */
|
||||
uint32_t OPENSSL_ia32cap_P[4] = {0};
|
||||
|
||||
/* OPENSSL_ia32_cpuid is defined in cpu-x86_64-asm.pl. */
|
||||
extern uint64_t OPENSSL_ia32_cpuid(uint32_t*);
|
||||
|
||||
|
@ -16,10 +16,12 @@
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
|
||||
/* x86 and x86_64 need to record the result of a cpuid call for the asm to work
|
||||
* correctly, unless compiled without asm code. */
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
|
||||
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
|
||||
/* x86, x86_64 and the ARMs need to record the result of a cpuid call for the
|
||||
* asm to work correctly, unless compiled without asm code. */
|
||||
#define NEED_CPUID
|
||||
|
||||
#else
|
||||
@ -30,7 +32,39 @@
|
||||
#define BORINGSSL_NO_STATIC_INITIALIZER
|
||||
#endif
|
||||
|
||||
#endif /* !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64) */
|
||||
#endif /* !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64 ||
|
||||
OPENSSL_ARM || OPENSSL_AARCH64) */
|
||||
|
||||
|
||||
/* The capability variables are defined in this file in order to work around a
|
||||
* linker bug. When linking with a .a, if no symbols in a .o are referenced
|
||||
* then the .o is discarded, even if it has constructor functions.
|
||||
*
|
||||
* This still means that any binaries that don't include some functionality
|
||||
* that tests the capability values will still skip the constructor but, so
|
||||
* far, the init constructor function only sets the capability variables. */
|
||||
|
||||
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
|
||||
/* This value must be explicitly initialised to zero in order to work around a
|
||||
* bug in libtool or the linker on OS X.
|
||||
*
|
||||
* If not initialised then it becomes a "common symbol". When put into an
|
||||
* archive, linking on OS X will fail to resolve common symbols. By
|
||||
* initialising it to zero, it becomes a "data symbol", which isn't so
|
||||
* affected. */
|
||||
uint32_t OPENSSL_ia32cap_P[4] = {0};
|
||||
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
|
||||
|
||||
#include "arm_arch.h"
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
uint32_t OPENSSL_armcap_P = ARMV7_NEON | ARMV7_NEON_FUNCTIONAL;
|
||||
#else
|
||||
uint32_t OPENSSL_armcap_P = ARMV7_NEON_FUNCTIONAL;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(OPENSSL_WINDOWS)
|
||||
#define OPENSSL_CDECL __cdecl
|
||||
@ -53,6 +87,8 @@ __declspec(allocate(".CRT$XCU")) void(*library_init_constructor)(void) =
|
||||
* BORINGSSL_NO_STATIC_INITIALIZER isn't defined, this is set as a static
|
||||
* initializer. Otherwise, it is called by CRYPTO_library_init. */
|
||||
static void OPENSSL_CDECL do_library_init(void) {
|
||||
/* WARNING: this function may only configure the capability variables. See the
|
||||
* note above about the linker bug. */
|
||||
#if defined(NEED_CPUID)
|
||||
OPENSSL_cpuid_setup();
|
||||
#endif
|
||||
|
@ -144,7 +144,8 @@ struct st_CRYPTO_EX_DATA_IMPL {
|
||||
|
||||
#endif /* defined(_MSC_VER) */
|
||||
|
||||
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
|
||||
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM) || \
|
||||
defined(OPENSSL_AARCH64)
|
||||
/* OPENSSL_cpuid_setup initializes OPENSSL_ia32cap_P. */
|
||||
void OPENSSL_cpuid_setup(void);
|
||||
#endif
|
||||
|
@ -22,6 +22,15 @@ if (${ARCH} STREQUAL "arm")
|
||||
MODES_ARCH_SOURCES
|
||||
|
||||
ghash-armv4.${ASM_EXT}
|
||||
ghashv8-armx.${ASM_EXT}
|
||||
)
|
||||
endif()
|
||||
|
||||
if (${ARCH} STREQUAL "aarch64")
|
||||
set(
|
||||
MODES_ARCH_SOURCES
|
||||
|
||||
ghashv8-armx.${ASM_EXT}
|
||||
)
|
||||
endif()
|
||||
|
||||
@ -43,6 +52,7 @@ perlasm(aesni-gcm-x86_64.${ASM_EXT} asm/aesni-gcm-x86_64.pl)
|
||||
perlasm(ghash-x86_64.${ASM_EXT} asm/ghash-x86_64.pl)
|
||||
perlasm(ghash-x86.${ASM_EXT} asm/ghash-x86.pl)
|
||||
perlasm(ghash-armv4.${ASM_EXT} asm/ghash-armv4.pl)
|
||||
perlasm(ghashv8-armx.${ASM_EXT} asm/ghashv8-armx.pl)
|
||||
|
||||
add_executable(
|
||||
gcm_test
|
||||
|
241
crypto/modes/asm/ghashv8-armx.pl
Normal file
241
crypto/modes/asm/ghashv8-armx.pl
Normal file
@ -0,0 +1,241 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
|
||||
#
|
||||
# June 2014
|
||||
#
|
||||
# Initial version was developed in tight cooperation with Ard
|
||||
# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
|
||||
# other assembly modules. Just like aesv8-armx.pl this module
|
||||
# supports both AArch32 and AArch64 execution modes.
|
||||
#
|
||||
# Current performance in cycles per processed byte:
|
||||
#
|
||||
# PMULL[2] 32-bit NEON(*)
|
||||
# Apple A7 1.76 5.62
|
||||
# Cortex-A53 1.45 8.39
|
||||
# Cortex-A57 2.22 7.61
|
||||
#
|
||||
# (*) presented for reference/comparison purposes;
|
||||
|
||||
$flavour = shift;
|
||||
open STDOUT,">".shift;
|
||||
|
||||
$Xi="x0"; # argument block
|
||||
$Htbl="x1";
|
||||
$inp="x2";
|
||||
$len="x3";
|
||||
|
||||
$inc="x12";
|
||||
|
||||
{
|
||||
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
|
||||
my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
|
||||
|
||||
$code=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
.text
|
||||
___
|
||||
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
|
||||
$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
|
||||
|
||||
$code.=<<___;
|
||||
.global gcm_init_v8
|
||||
.type gcm_init_v8,%function
|
||||
.align 4
|
||||
gcm_init_v8:
|
||||
vld1.64 {$t1},[x1] @ load H
|
||||
vmov.i8 $t0,#0xe1
|
||||
vext.8 $IN,$t1,$t1,#8
|
||||
vshl.i64 $t0,$t0,#57
|
||||
vshr.u64 $t2,$t0,#63
|
||||
vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01
|
||||
vdup.32 $t1,${t1}[1]
|
||||
vshr.u64 $t3,$IN,#63
|
||||
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
|
||||
vand $t3,$t3,$t0
|
||||
vshl.i64 $IN,$IN,#1
|
||||
vext.8 $t3,$t3,$t3,#8
|
||||
vand $t0,$t0,$t1
|
||||
vorr $IN,$IN,$t3 @ H<<<=1
|
||||
veor $IN,$IN,$t0 @ twisted H
|
||||
vst1.64 {$IN},[x0]
|
||||
|
||||
ret
|
||||
.size gcm_init_v8,.-gcm_init_v8
|
||||
|
||||
.global gcm_gmult_v8
|
||||
.type gcm_gmult_v8,%function
|
||||
.align 4
|
||||
gcm_gmult_v8:
|
||||
vld1.64 {$t1},[$Xi] @ load Xi
|
||||
vmov.i8 $t3,#0xe1
|
||||
vld1.64 {$H},[$Htbl] @ load twisted H
|
||||
vshl.u64 $t3,$t3,#57
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t1,$t1
|
||||
#endif
|
||||
vext.8 $Hhl,$H,$H,#8
|
||||
mov $len,#0
|
||||
vext.8 $IN,$t1,$t1,#8
|
||||
mov $inc,#0
|
||||
veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
|
||||
mov $inp,$Xi
|
||||
b .Lgmult_v8
|
||||
.size gcm_gmult_v8,.-gcm_gmult_v8
|
||||
|
||||
.global gcm_ghash_v8
|
||||
.type gcm_ghash_v8,%function
|
||||
.align 4
|
||||
gcm_ghash_v8:
|
||||
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
|
||||
subs $len,$len,#16
|
||||
vmov.i8 $t3,#0xe1
|
||||
mov $inc,#16
|
||||
vld1.64 {$H},[$Htbl] @ load twisted H
|
||||
cclr $inc,eq
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
vshl.u64 $t3,$t3,#57
|
||||
vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
|
||||
vext.8 $Hhl,$H,$H,#8
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $Xl,$Xl
|
||||
vrev64.8 $t1,$t1
|
||||
#endif
|
||||
veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
|
||||
vext.8 $IN,$t1,$t1,#8
|
||||
b .Loop_v8
|
||||
|
||||
.align 4
|
||||
.Loop_v8:
|
||||
vext.8 $t2,$Xl,$Xl,#8
|
||||
veor $IN,$IN,$Xl @ inp^=Xi
|
||||
veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi
|
||||
|
||||
.Lgmult_v8:
|
||||
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
|
||||
veor $t1,$t1,$IN @ Karatsuba pre-processing
|
||||
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
|
||||
subs $len,$len,#16
|
||||
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||||
cclr $inc,eq
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$t3 @ 1st phase
|
||||
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t1,$t1
|
||||
#endif
|
||||
veor $Xl,$Xm,$t2
|
||||
vext.8 $IN,$t1,$t1,#8
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
|
||||
vpmull.p64 $Xl,$Xl,$t3
|
||||
veor $t2,$t2,$Xh
|
||||
veor $Xl,$Xl,$t2
|
||||
b.hs .Loop_v8
|
||||
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
vst1.64 {$Xl},[$Xi] @ write out Xi
|
||||
|
||||
ret
|
||||
.size gcm_ghash_v8,.-gcm_ghash_v8
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
||||
if ($flavour =~ /64/) { ######## 64-bit code
|
||||
sub unvmov {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
|
||||
sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
|
||||
}
|
||||
foreach(split("\n",$code)) {
|
||||
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
|
||||
s/vmov\.i8/movi/o or # fix up legacy mnemonics
|
||||
s/vmov\s+(.*)/unvmov($1)/geo or
|
||||
s/vext\.8/ext/o or
|
||||
s/vshr\.s/sshr\.s/o or
|
||||
s/vshr/ushr/o or
|
||||
s/^(\s+)v/$1/o or # strip off v prefix
|
||||
s/\bbx\s+lr\b/ret/o;
|
||||
|
||||
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
|
||||
s/@\s/\/\//o; # old->new style commentary
|
||||
|
||||
# fix up remainig legacy suffixes
|
||||
s/\.[ui]?8(\s)/$1/o;
|
||||
s/\.[uis]?32//o and s/\.16b/\.4s/go;
|
||||
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
|
||||
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
|
||||
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
|
||||
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
} else { ######## 32-bit code
|
||||
sub unvdup32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
|
||||
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||||
}
|
||||
sub unvpmullp64 {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
|
||||
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<17)|(($2&8)<<4)
|
||||
|(($3&7)<<1) |(($3&8)<<2);
|
||||
$word |= 0x00010001 if ($mnemonic =~ "2");
|
||||
# since ARMv7 instructions are always encoded little-endian.
|
||||
# correct solution is to use .inst directive, but older
|
||||
# assemblers don't implement it:-(
|
||||
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||||
$word&0xff,($word>>8)&0xff,
|
||||
($word>>16)&0xff,($word>>24)&0xff,
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
}
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
|
||||
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
|
||||
s/\/\/\s?/@ /o; # new->old style commentary
|
||||
|
||||
# fix up remainig new-style suffixes
|
||||
s/\],#[0-9]+/]!/o;
|
||||
|
||||
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
|
||||
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
|
||||
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||||
s/^(\s+)b\./$1b/o or
|
||||
s/^(\s+)ret/$1bx\tlr/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
}
|
||||
|
||||
close STDOUT; # enforce flush
|
@ -57,8 +57,9 @@
|
||||
#include "../internal.h"
|
||||
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
|
||||
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
|
||||
#define GHASH_ASM
|
||||
#endif
|
||||
|
||||
@ -140,7 +141,7 @@ static void gcm_init_4bit(u128 Htable[16], uint64_t H[2]) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !defined(GHASH_ASM)
|
||||
#if !defined(GHASH_ASM) || defined(OPENSSL_AARCH64)
|
||||
static const size_t rem_4bit[16] = {
|
||||
PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
|
||||
PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
|
||||
@ -346,15 +347,48 @@ void gcm_gmult_4bit_x86(uint64_t Xi[2], const u128 Htable[16]);
|
||||
void gcm_ghash_4bit_x86(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
|
||||
size_t len);
|
||||
#endif
|
||||
#elif defined(OPENSSL_ARM)
|
||||
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
|
||||
#include "../arm_arch.h"
|
||||
#if __ARM_ARCH__ >= 7
|
||||
#define GHASH_ASM_ARM
|
||||
#define GCM_FUNCREF_4BIT
|
||||
void gcm_init_neon(u128 Htable[16],const uint64_t Xi[2]);
|
||||
|
||||
static int pmull_capable() {
|
||||
return (OPENSSL_armcap_P & ARMV8_PMULL) != 0;
|
||||
}
|
||||
|
||||
void gcm_init_v8(u128 Htable[16], const uint64_t Xi[2]);
|
||||
void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
|
||||
void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
|
||||
size_t len);
|
||||
|
||||
#if defined(OPENSSL_ARM)
|
||||
/* 32-bit ARM also has support for doing GCM with NEON instructions. */
|
||||
static int neon_capable() {
|
||||
return CRYPTO_is_NEON_capable();
|
||||
}
|
||||
|
||||
void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]);
|
||||
void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
|
||||
void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
|
||||
size_t len);
|
||||
#else
|
||||
/* AArch64 only has the ARMv8 versions of functions. */
|
||||
static int neon_capable() {
|
||||
return 0;
|
||||
}
|
||||
void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) {
|
||||
abort();
|
||||
}
|
||||
void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) {
|
||||
abort();
|
||||
}
|
||||
void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
|
||||
size_t len) {
|
||||
abort();
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
@ -433,7 +467,11 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block) {
|
||||
ctx->ghash = gcm_ghash_4bit;
|
||||
#endif
|
||||
#elif defined(GHASH_ASM_ARM)
|
||||
if (CRYPTO_is_NEON_capable()) {
|
||||
if (pmull_capable()) {
|
||||
gcm_init_v8(ctx->Htable, ctx->H.u);
|
||||
ctx->gmult = gcm_gmult_v8;
|
||||
ctx->ghash = gcm_ghash_v8;
|
||||
} else if (neon_capable()) {
|
||||
gcm_init_neon(ctx->Htable,ctx->H.u);
|
||||
ctx->gmult = gcm_gmult_neon;
|
||||
ctx->ghash = gcm_ghash_neon;
|
||||
@ -443,9 +481,9 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block) {
|
||||
ctx->ghash = gcm_ghash_4bit;
|
||||
}
|
||||
#else
|
||||
gcm_init_4bit(ctx->Htable, ctx->H.u);
|
||||
ctx->gmult = gcm_gmult_4bit;
|
||||
ctx->ghash = gcm_ghash_4bit;
|
||||
gcm_init_4bit(ctx->Htable, ctx->H.u);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -30,6 +30,16 @@ if (${ARCH} STREQUAL "arm")
|
||||
)
|
||||
endif()
|
||||
|
||||
if (${ARCH} STREQUAL "aarch64")
|
||||
set(
|
||||
SHA_ARCH_SOURCES
|
||||
|
||||
sha1-armv8.${ASM_EXT}
|
||||
sha256-armv8.${ASM_EXT}
|
||||
sha512-armv8.${ASM_EXT}
|
||||
)
|
||||
endif()
|
||||
|
||||
add_library(
|
||||
sha
|
||||
|
||||
@ -51,3 +61,6 @@ perlasm(sha512-586.${ASM_EXT} asm/sha512-586.pl)
|
||||
perlasm(sha1-armv4-large.${ASM_EXT} asm/sha1-armv4-large.pl)
|
||||
perlasm(sha256-armv4.${ASM_EXT} asm/sha256-armv4.pl)
|
||||
perlasm(sha512-armv4.${ASM_EXT} asm/sha512-armv4.pl)
|
||||
perlasm(sha1-armv8.${ASM_EXT} asm/sha1-armv8.pl)
|
||||
perlasm(sha256-armv8.${ASM_EXT} asm/sha512-armv8.pl sha256)
|
||||
perlasm(sha512-armv8.${ASM_EXT} asm/sha512-armv8.pl sha512)
|
||||
|
334
crypto/sha/asm/sha1-armv8.pl
Normal file
334
crypto/sha/asm/sha1-armv8.pl
Normal file
@ -0,0 +1,334 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# SHA1 for ARMv8.
|
||||
#
|
||||
# Performance in cycles per processed byte and improvement coefficient
|
||||
# over code generated with "default" compiler:
|
||||
#
|
||||
# hardware-assisted software(*)
|
||||
# Apple A7 2.31 4.13 (+14%)
|
||||
# Cortex-A53 2.19 8.73 (+108%)
|
||||
# Cortex-A57 2.35 7.88 (+74%)
|
||||
#
|
||||
# (*) Software results are presented mostly for reference purposes.
|
||||
|
||||
$flavour = shift;
|
||||
open STDOUT,">".shift;
|
||||
|
||||
($ctx,$inp,$num)=("x0","x1","x2");
|
||||
@Xw=map("w$_",(3..17,19));
|
||||
@Xx=map("x$_",(3..17,19));
|
||||
@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
|
||||
($t0,$t1,$t2,$K)=map("w$_",(25..28));
|
||||
|
||||
|
||||
sub BODY_00_19 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=($i+2)&15;
|
||||
|
||||
$code.=<<___ if ($i<15 && !($i&1));
|
||||
lsr @Xx[$i+1],@Xx[$i],#32
|
||||
___
|
||||
$code.=<<___ if ($i<14 && !($i&1));
|
||||
ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
|
||||
___
|
||||
$code.=<<___ if ($i<14 && ($i&1));
|
||||
#ifdef __ARMEB__
|
||||
ror @Xx[$i+1],@Xx[$i+1],#32
|
||||
#else
|
||||
rev32 @Xx[$i+1],@Xx[$i+1]
|
||||
#endif
|
||||
___
|
||||
$code.=<<___ if ($i<14);
|
||||
bic $t0,$d,$b
|
||||
and $t1,$c,$b
|
||||
ror $t2,$a,#27
|
||||
add $d,$d,$K // future e+=K
|
||||
orr $t0,$t0,$t1
|
||||
add $e,$e,$t2 // e+=rot(a,5)
|
||||
ror $b,$b,#2
|
||||
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||||
add $e,$e,$t0 // e+=F(b,c,d)
|
||||
___
|
||||
$code.=<<___ if ($i==19);
|
||||
movz $K,#0xeba1
|
||||
movk $K,#0x6ed9,lsl#16
|
||||
___
|
||||
$code.=<<___ if ($i>=14);
|
||||
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
|
||||
bic $t0,$d,$b
|
||||
and $t1,$c,$b
|
||||
ror $t2,$a,#27
|
||||
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
|
||||
add $d,$d,$K // future e+=K
|
||||
orr $t0,$t0,$t1
|
||||
add $e,$e,$t2 // e+=rot(a,5)
|
||||
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
|
||||
ror $b,$b,#2
|
||||
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||||
add $e,$e,$t0 // e+=F(b,c,d)
|
||||
ror @Xw[$j],@Xw[$j],#31
|
||||
___
|
||||
}
|
||||
|
||||
sub BODY_40_59 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=($i+2)&15;
|
||||
|
||||
$code.=<<___ if ($i==59);
|
||||
movz $K,#0xc1d6
|
||||
movk $K,#0xca62,lsl#16
|
||||
___
|
||||
$code.=<<___;
|
||||
orr $t0,$b,$c
|
||||
and $t1,$b,$c
|
||||
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
|
||||
ror $t2,$a,#27
|
||||
and $t0,$t0,$d
|
||||
add $d,$d,$K // future e+=K
|
||||
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
|
||||
add $e,$e,$t2 // e+=rot(a,5)
|
||||
orr $t0,$t0,$t1
|
||||
ror $b,$b,#2
|
||||
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
|
||||
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||||
add $e,$e,$t0 // e+=F(b,c,d)
|
||||
ror @Xw[$j],@Xw[$j],#31
|
||||
___
|
||||
}
|
||||
|
||||
sub BODY_20_39 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=($i+2)&15;
|
||||
|
||||
$code.=<<___ if ($i==39);
|
||||
movz $K,#0xbcdc
|
||||
movk $K,#0x8f1b,lsl#16
|
||||
___
|
||||
$code.=<<___ if ($i<78);
|
||||
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
|
||||
eor $t0,$d,$b
|
||||
ror $t2,$a,#27
|
||||
add $d,$d,$K // future e+=K
|
||||
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
|
||||
eor $t0,$t0,$c
|
||||
add $e,$e,$t2 // e+=rot(a,5)
|
||||
ror $b,$b,#2
|
||||
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
|
||||
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||||
add $e,$e,$t0 // e+=F(b,c,d)
|
||||
ror @Xw[$j],@Xw[$j],#31
|
||||
___
|
||||
$code.=<<___ if ($i==78);
|
||||
ldp @Xw[1],@Xw[2],[$ctx]
|
||||
eor $t0,$d,$b
|
||||
ror $t2,$a,#27
|
||||
add $d,$d,$K // future e+=K
|
||||
eor $t0,$t0,$c
|
||||
add $e,$e,$t2 // e+=rot(a,5)
|
||||
ror $b,$b,#2
|
||||
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
|
||||
add $e,$e,$t0 // e+=F(b,c,d)
|
||||
___
|
||||
$code.=<<___ if ($i==79);
|
||||
ldp @Xw[3],@Xw[4],[$ctx,#8]
|
||||
eor $t0,$d,$b
|
||||
ror $t2,$a,#27
|
||||
eor $t0,$t0,$c
|
||||
add $e,$e,$t2 // e+=rot(a,5)
|
||||
ror $b,$b,#2
|
||||
ldr @Xw[5],[$ctx,#16]
|
||||
add $e,$e,$t0 // e+=F(b,c,d)
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
.text
|
||||
|
||||
.globl sha1_block_data_order
|
||||
.type sha1_block_data_order,%function
|
||||
.align 6
|
||||
sha1_block_data_order:
|
||||
ldr x16,.LOPENSSL_armcap_P
|
||||
adr x17,.LOPENSSL_armcap_P
|
||||
add x16,x16,x17
|
||||
ldr w16,[x16]
|
||||
tst w16,#ARMV8_SHA1
|
||||
b.ne .Lv8_entry
|
||||
|
||||
stp x29,x30,[sp,#-96]!
|
||||
add x29,sp,#0
|
||||
stp x19,x20,[sp,#16]
|
||||
stp x21,x22,[sp,#32]
|
||||
stp x23,x24,[sp,#48]
|
||||
stp x25,x26,[sp,#64]
|
||||
stp x27,x28,[sp,#80]
|
||||
|
||||
ldp $A,$B,[$ctx]
|
||||
ldp $C,$D,[$ctx,#8]
|
||||
ldr $E,[$ctx,#16]
|
||||
|
||||
.Loop:
|
||||
ldr @Xx[0],[$inp],#64
|
||||
movz $K,#0x7999
|
||||
sub $num,$num,#1
|
||||
movk $K,#0x5a82,lsl#16
|
||||
#ifdef __ARMEB__
|
||||
ror $Xx[0],@Xx[0],#32
|
||||
#else
|
||||
rev32 @Xx[0],@Xx[0]
|
||||
#endif
|
||||
add $E,$E,$K // warm it up
|
||||
add $E,$E,@Xw[0]
|
||||
___
|
||||
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
|
||||
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
|
||||
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
add $B,$B,@Xw[2]
|
||||
add $C,$C,@Xw[3]
|
||||
add $A,$A,@Xw[1]
|
||||
add $D,$D,@Xw[4]
|
||||
add $E,$E,@Xw[5]
|
||||
stp $A,$B,[$ctx]
|
||||
stp $C,$D,[$ctx,#8]
|
||||
str $E,[$ctx,#16]
|
||||
cbnz $num,.Loop
|
||||
|
||||
ldp x19,x20,[sp,#16]
|
||||
ldp x21,x22,[sp,#32]
|
||||
ldp x23,x24,[sp,#48]
|
||||
ldp x25,x26,[sp,#64]
|
||||
ldp x27,x28,[sp,#80]
|
||||
ldr x29,[sp],#96
|
||||
ret
|
||||
.size sha1_block_data_order,.-sha1_block_data_order
|
||||
___
|
||||
{{{
|
||||
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
|
||||
my @MSG=map("v$_.16b",(4..7));
|
||||
my @Kxx=map("v$_.4s",(16..19));
|
||||
my ($W0,$W1)=("v20.4s","v21.4s");
|
||||
my $ABCD_SAVE="v22.16b";
|
||||
|
||||
$code.=<<___;
|
||||
.type sha1_block_armv8,%function
|
||||
.align 6
|
||||
sha1_block_armv8:
|
||||
.Lv8_entry:
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
|
||||
adr x4,.Lconst
|
||||
eor $E,$E,$E
|
||||
ld1.32 {$ABCD},[$ctx],#16
|
||||
ld1.32 {$E}[0],[$ctx]
|
||||
sub $ctx,$ctx,#16
|
||||
ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
|
||||
|
||||
.Loop_hw:
|
||||
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
|
||||
sub $num,$num,#1
|
||||
rev32 @MSG[0],@MSG[0]
|
||||
rev32 @MSG[1],@MSG[1]
|
||||
|
||||
add.i32 $W0,@Kxx[0],@MSG[0]
|
||||
rev32 @MSG[2],@MSG[2]
|
||||
orr $ABCD_SAVE,$ABCD,$ABCD // offload
|
||||
|
||||
add.i32 $W1,@Kxx[0],@MSG[1]
|
||||
rev32 @MSG[3],@MSG[3]
|
||||
sha1h $E1,$ABCD
|
||||
sha1c $ABCD,$E,$W0 // 0
|
||||
add.i32 $W0,@Kxx[$j],@MSG[2]
|
||||
sha1su0 @MSG[0],@MSG[1],@MSG[2]
|
||||
___
|
||||
for ($j=0,$i=1;$i<20-3;$i++) {
|
||||
my $f=("c","p","m","p")[$i/5];
|
||||
$code.=<<___;
|
||||
sha1h $E0,$ABCD // $i
|
||||
sha1$f $ABCD,$E1,$W1
|
||||
add.i32 $W1,@Kxx[$j],@MSG[3]
|
||||
sha1su1 @MSG[0],@MSG[3]
|
||||
___
|
||||
$code.=<<___ if ($i<20-4);
|
||||
sha1su0 @MSG[1],@MSG[2],@MSG[3]
|
||||
___
|
||||
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
|
||||
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
|
||||
}
|
||||
$code.=<<___;
|
||||
sha1h $E0,$ABCD // $i
|
||||
sha1p $ABCD,$E1,$W1
|
||||
add.i32 $W1,@Kxx[$j],@MSG[3]
|
||||
|
||||
sha1h $E1,$ABCD // 18
|
||||
sha1p $ABCD,$E0,$W0
|
||||
|
||||
sha1h $E0,$ABCD // 19
|
||||
sha1p $ABCD,$E1,$W1
|
||||
|
||||
add.i32 $E,$E,$E0
|
||||
add.i32 $ABCD,$ABCD,$ABCD_SAVE
|
||||
|
||||
cbnz $num,.Loop_hw
|
||||
|
||||
st1.32 {$ABCD},[$ctx],#16
|
||||
st1.32 {$E}[0],[$ctx]
|
||||
|
||||
ldr x29,[sp],#16
|
||||
ret
|
||||
.size sha1_block_armv8,.-sha1_block_armv8
|
||||
.align 6
|
||||
.Lconst:
|
||||
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
|
||||
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
|
||||
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
|
||||
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
|
||||
.LOPENSSL_armcap_P:
|
||||
.quad OPENSSL_armcap_P-.
|
||||
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
___
|
||||
}}}
|
||||
|
||||
{ my %opcode = (
|
||||
"sha1c" => 0x5e000000, "sha1p" => 0x5e001000,
|
||||
"sha1m" => 0x5e002000, "sha1su0" => 0x5e003000,
|
||||
"sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 );
|
||||
|
||||
sub unsha1 {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
|
||||
&&
|
||||
sprintf ".inst\t0x%08x\t//%s %s",
|
||||
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
}
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
|
||||
s/\`([^\`]*)\`/eval($1)/geo;
|
||||
|
||||
s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
|
||||
|
||||
s/\.\w?32\b//o and s/\.16b/\.4s/go;
|
||||
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
close STDOUT;
|
419
crypto/sha/asm/sha512-armv8.pl
Normal file
419
crypto/sha/asm/sha512-armv8.pl
Normal file
@ -0,0 +1,419 @@
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# SHA256/512 for ARMv8.
|
||||
#
|
||||
# Performance in cycles per processed byte and improvement coefficient
|
||||
# over code generated with "default" compiler:
|
||||
#
|
||||
# SHA256-hw SHA256(*) SHA512
|
||||
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
|
||||
# Cortex-A53 2.38 15.6 (+110%) 10.1 (+190%(***))
|
||||
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
|
||||
#
|
||||
# (*) Software SHA256 results are of lesser relevance, presented
|
||||
# mostly for informational purposes.
|
||||
# (**) The result is a trade-off: it's possible to improve it by
|
||||
# 10% (or by 1 cycle per round), but at the cost of 20% loss
|
||||
# on Cortex-A53 (or by 4 cycles per round).
|
||||
# (***) Super-impressive coefficients over gcc-generated code are
|
||||
# indication of some compiler "pathology", most notably code
|
||||
# generated with -mgeneral-regs-only is significanty faster
|
||||
# and lags behind assembly only by 50-90%.
|
||||
|
||||
$flavour=shift;
|
||||
$output=shift;
|
||||
|
||||
if ($output =~ /512/) {
|
||||
$BITS=512;
|
||||
$SZ=8;
|
||||
@Sigma0=(28,34,39);
|
||||
@Sigma1=(14,18,41);
|
||||
@sigma0=(1, 8, 7);
|
||||
@sigma1=(19,61, 6);
|
||||
$rounds=80;
|
||||
$reg_t="x";
|
||||
} else {
|
||||
$BITS=256;
|
||||
$SZ=4;
|
||||
@Sigma0=( 2,13,22);
|
||||
@Sigma1=( 6,11,25);
|
||||
@sigma0=( 7,18, 3);
|
||||
@sigma1=(17,19,10);
|
||||
$rounds=64;
|
||||
$reg_t="w";
|
||||
}
|
||||
|
||||
$func="sha${BITS}_block_data_order";
|
||||
|
||||
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
|
||||
|
||||
@X=map("$reg_t$_",(3..15,0..2));
|
||||
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
|
||||
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
|
||||
|
||||
sub BODY_00_xx {
|
||||
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
|
||||
my $j=($i+1)&15;
|
||||
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
|
||||
$T0=@X[$i+3] if ($i<11);
|
||||
|
||||
$code.=<<___ if ($i<16);
|
||||
#ifndef __ARMEB__
|
||||
rev @X[$i],@X[$i] // $i
|
||||
#endif
|
||||
___
|
||||
$code.=<<___ if ($i<13 && ($i&1));
|
||||
ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
|
||||
___
|
||||
$code.=<<___ if ($i==13);
|
||||
ldp @X[14],@X[15],[$inp]
|
||||
___
|
||||
$code.=<<___ if ($i>=14);
|
||||
ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
|
||||
___
|
||||
$code.=<<___ if ($i>0 && $i<16);
|
||||
add $a,$a,$t1 // h+=Sigma0(a)
|
||||
___
|
||||
$code.=<<___ if ($i>=11);
|
||||
str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
|
||||
___
|
||||
# While ARMv8 specifies merged rotate-n-logical operation such as
|
||||
# 'eor x,y,z,ror#n', it was found to negatively affect performance
|
||||
# on Apple A7. The reason seems to be that it requires even 'y' to
|
||||
# be available earlier. This means that such merged instruction is
|
||||
# not necessarily best choice on critical path... On the other hand
|
||||
# Cortex-A5x handles merged instructions much better than disjoint
|
||||
# rotate and logical... See (**) footnote above.
|
||||
$code.=<<___ if ($i<15);
|
||||
ror $t0,$e,#$Sigma1[0]
|
||||
add $h,$h,$t2 // h+=K[i]
|
||||
eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
|
||||
and $t1,$f,$e
|
||||
bic $t2,$g,$e
|
||||
add $h,$h,@X[$i&15] // h+=X[i]
|
||||
orr $t1,$t1,$t2 // Ch(e,f,g)
|
||||
eor $t2,$a,$b // a^b, b^c in next round
|
||||
eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
|
||||
ror $T0,$a,#$Sigma0[0]
|
||||
add $h,$h,$t1 // h+=Ch(e,f,g)
|
||||
eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
|
||||
add $h,$h,$t0 // h+=Sigma1(e)
|
||||
and $t3,$t3,$t2 // (b^c)&=(a^b)
|
||||
add $d,$d,$h // d+=h
|
||||
eor $t3,$t3,$b // Maj(a,b,c)
|
||||
eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
|
||||
add $h,$h,$t3 // h+=Maj(a,b,c)
|
||||
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
|
||||
//add $h,$h,$t1 // h+=Sigma0(a)
|
||||
___
|
||||
$code.=<<___ if ($i>=15);
|
||||
ror $t0,$e,#$Sigma1[0]
|
||||
add $h,$h,$t2 // h+=K[i]
|
||||
ror $T1,@X[($j+1)&15],#$sigma0[0]
|
||||
and $t1,$f,$e
|
||||
ror $T2,@X[($j+14)&15],#$sigma1[0]
|
||||
bic $t2,$g,$e
|
||||
ror $T0,$a,#$Sigma0[0]
|
||||
add $h,$h,@X[$i&15] // h+=X[i]
|
||||
eor $t0,$t0,$e,ror#$Sigma1[1]
|
||||
eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
|
||||
orr $t1,$t1,$t2 // Ch(e,f,g)
|
||||
eor $t2,$a,$b // a^b, b^c in next round
|
||||
eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
|
||||
eor $T0,$T0,$a,ror#$Sigma0[1]
|
||||
add $h,$h,$t1 // h+=Ch(e,f,g)
|
||||
and $t3,$t3,$t2 // (b^c)&=(a^b)
|
||||
eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
|
||||
eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
|
||||
add $h,$h,$t0 // h+=Sigma1(e)
|
||||
eor $t3,$t3,$b // Maj(a,b,c)
|
||||
eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
|
||||
eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
|
||||
add @X[$j],@X[$j],@X[($j+9)&15]
|
||||
add $d,$d,$h // d+=h
|
||||
add $h,$h,$t3 // h+=Maj(a,b,c)
|
||||
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
|
||||
add @X[$j],@X[$j],$T1
|
||||
add $h,$h,$t1 // h+=Sigma0(a)
|
||||
add @X[$j],@X[$j],$T2
|
||||
___
|
||||
($t2,$t3)=($t3,$t2);
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
.text
|
||||
|
||||
.globl $func
|
||||
.type $func,%function
|
||||
.align 6
|
||||
$func:
|
||||
___
|
||||
$code.=<<___ if ($SZ==4);
|
||||
ldr x16,.LOPENSSL_armcap_P
|
||||
adr x17,.LOPENSSL_armcap_P
|
||||
add x16,x16,x17
|
||||
ldr w16,[x16]
|
||||
tst w16,#ARMV8_SHA256
|
||||
b.ne .Lv8_entry
|
||||
___
|
||||
$code.=<<___;
|
||||
stp x29,x30,[sp,#-128]!
|
||||
add x29,sp,#0
|
||||
|
||||
stp x19,x20,[sp,#16]
|
||||
stp x21,x22,[sp,#32]
|
||||
stp x23,x24,[sp,#48]
|
||||
stp x25,x26,[sp,#64]
|
||||
stp x27,x28,[sp,#80]
|
||||
sub sp,sp,#4*$SZ
|
||||
|
||||
ldp $A,$B,[$ctx] // load context
|
||||
ldp $C,$D,[$ctx,#2*$SZ]
|
||||
ldp $E,$F,[$ctx,#4*$SZ]
|
||||
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
|
||||
ldp $G,$H,[$ctx,#6*$SZ]
|
||||
adr $Ktbl,K$BITS
|
||||
stp $ctx,$num,[x29,#96]
|
||||
|
||||
.Loop:
|
||||
ldp @X[0],@X[1],[$inp],#2*$SZ
|
||||
ldr $t2,[$Ktbl],#$SZ // *K++
|
||||
eor $t3,$B,$C // magic seed
|
||||
str $inp,[x29,#112]
|
||||
___
|
||||
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=".Loop_16_xx:\n";
|
||||
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
cbnz $t2,.Loop_16_xx
|
||||
|
||||
ldp $ctx,$num,[x29,#96]
|
||||
ldr $inp,[x29,#112]
|
||||
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
|
||||
|
||||
ldp @X[0],@X[1],[$ctx]
|
||||
ldp @X[2],@X[3],[$ctx,#2*$SZ]
|
||||
add $inp,$inp,#14*$SZ // advance input pointer
|
||||
ldp @X[4],@X[5],[$ctx,#4*$SZ]
|
||||
add $A,$A,@X[0]
|
||||
ldp @X[6],@X[7],[$ctx,#6*$SZ]
|
||||
add $B,$B,@X[1]
|
||||
add $C,$C,@X[2]
|
||||
add $D,$D,@X[3]
|
||||
stp $A,$B,[$ctx]
|
||||
add $E,$E,@X[4]
|
||||
add $F,$F,@X[5]
|
||||
stp $C,$D,[$ctx,#2*$SZ]
|
||||
add $G,$G,@X[6]
|
||||
add $H,$H,@X[7]
|
||||
cmp $inp,$num
|
||||
stp $E,$F,[$ctx,#4*$SZ]
|
||||
stp $G,$H,[$ctx,#6*$SZ]
|
||||
b.ne .Loop
|
||||
|
||||
ldp x19,x20,[x29,#16]
|
||||
add sp,sp,#4*$SZ
|
||||
ldp x21,x22,[x29,#32]
|
||||
ldp x23,x24,[x29,#48]
|
||||
ldp x25,x26,[x29,#64]
|
||||
ldp x27,x28,[x29,#80]
|
||||
ldp x29,x30,[sp],#128
|
||||
ret
|
||||
.size $func,.-$func
|
||||
|
||||
.align 6
|
||||
.type K$BITS,%object
|
||||
K$BITS:
|
||||
___
|
||||
$code.=<<___ if ($SZ==8);
|
||||
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
|
||||
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
|
||||
.quad 0x3956c25bf348b538,0x59f111f1b605d019
|
||||
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
|
||||
.quad 0xd807aa98a3030242,0x12835b0145706fbe
|
||||
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
|
||||
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
|
||||
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
|
||||
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
|
||||
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
|
||||
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
|
||||
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
|
||||
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
|
||||
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
|
||||
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
|
||||
.quad 0x06ca6351e003826f,0x142929670a0e6e70
|
||||
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
|
||||
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
|
||||
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
|
||||
.quad 0x81c2c92e47edaee6,0x92722c851482353b
|
||||
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
|
||||
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
|
||||
.quad 0xd192e819d6ef5218,0xd69906245565a910
|
||||
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
|
||||
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
|
||||
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
|
||||
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
|
||||
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
|
||||
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
|
||||
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
|
||||
.quad 0x90befffa23631e28,0xa4506cebde82bde9
|
||||
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
|
||||
.quad 0xca273eceea26619c,0xd186b8c721c0c207
|
||||
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
|
||||
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
|
||||
.quad 0x113f9804bef90dae,0x1b710b35131c471b
|
||||
.quad 0x28db77f523047d84,0x32caab7b40c72493
|
||||
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
|
||||
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
|
||||
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
|
||||
.quad 0 // terminator
|
||||
___
|
||||
$code.=<<___ if ($SZ==4);
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
.long 0 //terminator
|
||||
___
|
||||
$code.=<<___;
|
||||
.size K$BITS,.-K$BITS
|
||||
.align 3
|
||||
.LOPENSSL_armcap_P:
|
||||
.quad OPENSSL_armcap_P-.
|
||||
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
||||
if ($SZ==4) {
|
||||
my $Ktbl="x3";
|
||||
|
||||
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
|
||||
my @MSG=map("v$_.16b",(4..7));
|
||||
my ($W0,$W1)=("v16.4s","v17.4s");
|
||||
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
|
||||
|
||||
$code.=<<___;
|
||||
.type sha256_block_armv8,%function
|
||||
.align 6
|
||||
sha256_block_armv8:
|
||||
.Lv8_entry:
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
|
||||
ld1.32 {$ABCD,$EFGH},[$ctx]
|
||||
adr $Ktbl,K256
|
||||
|
||||
.Loop_hw:
|
||||
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
|
||||
sub $num,$num,#1
|
||||
ld1.32 {$W0},[$Ktbl],#16
|
||||
rev32 @MSG[0],@MSG[0]
|
||||
rev32 @MSG[1],@MSG[1]
|
||||
rev32 @MSG[2],@MSG[2]
|
||||
rev32 @MSG[3],@MSG[3]
|
||||
orr $ABCD_SAVE,$ABCD,$ABCD // offload
|
||||
orr $EFGH_SAVE,$EFGH,$EFGH
|
||||
___
|
||||
for($i=0;$i<12;$i++) {
|
||||
$code.=<<___;
|
||||
ld1.32 {$W1},[$Ktbl],#16
|
||||
add.i32 $W0,$W0,@MSG[0]
|
||||
sha256su0 @MSG[0],@MSG[1]
|
||||
orr $abcd,$ABCD,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W0
|
||||
sha256h2 $EFGH,$abcd,$W0
|
||||
sha256su1 @MSG[0],@MSG[2],@MSG[3]
|
||||
___
|
||||
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
|
||||
}
|
||||
$code.=<<___;
|
||||
ld1.32 {$W1},[$Ktbl],#16
|
||||
add.i32 $W0,$W0,@MSG[0]
|
||||
orr $abcd,$ABCD,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W0
|
||||
sha256h2 $EFGH,$abcd,$W0
|
||||
|
||||
ld1.32 {$W0},[$Ktbl],#16
|
||||
add.i32 $W1,$W1,@MSG[1]
|
||||
orr $abcd,$ABCD,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W1
|
||||
sha256h2 $EFGH,$abcd,$W1
|
||||
|
||||
ld1.32 {$W1},[$Ktbl]
|
||||
add.i32 $W0,$W0,@MSG[2]
|
||||
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
|
||||
orr $abcd,$ABCD,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W0
|
||||
sha256h2 $EFGH,$abcd,$W0
|
||||
|
||||
add.i32 $W1,$W1,@MSG[3]
|
||||
orr $abcd,$ABCD,$ABCD
|
||||
sha256h $ABCD,$EFGH,$W1
|
||||
sha256h2 $EFGH,$abcd,$W1
|
||||
|
||||
add.i32 $ABCD,$ABCD,$ABCD_SAVE
|
||||
add.i32 $EFGH,$EFGH,$EFGH_SAVE
|
||||
|
||||
cbnz $num,.Loop_hw
|
||||
|
||||
st1.32 {$ABCD,$EFGH},[$ctx]
|
||||
|
||||
ldr x29,[sp],#16
|
||||
ret
|
||||
.size sha256_block_armv8,.-sha256_block_armv8
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
___
|
||||
|
||||
{ my %opcode = (
|
||||
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
|
||||
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
|
||||
|
||||
sub unsha256 {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
|
||||
&&
|
||||
sprintf ".inst\t0x%08x\t//%s %s",
|
||||
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
}
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
|
||||
s/\`([^\`]*)\`/eval($1)/geo;
|
||||
|
||||
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
|
||||
|
||||
s/\.\w?32\b//o and s/\.16b/\.4s/go;
|
||||
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
close STDOUT;
|
@ -61,8 +61,9 @@
|
||||
#include <openssl/mem.h>
|
||||
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
|
||||
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
|
||||
#define SHA1_ASM
|
||||
#endif
|
||||
|
||||
|
@ -61,8 +61,9 @@
|
||||
#include <openssl/mem.h>
|
||||
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
|
||||
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
|
||||
#define SHA256_ASM
|
||||
#endif
|
||||
|
||||
|
@ -87,8 +87,9 @@
|
||||
* apply a number of optimizations to mitigate potential performance
|
||||
* penalties caused by previous design decision; */
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
|
||||
#if !defined(OPENSSL_NO_ASM) && \
|
||||
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
|
||||
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
|
||||
#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
|
||||
#define SHA512_ASM
|
||||
#endif
|
||||
|
@ -80,6 +80,9 @@ extern "C" {
|
||||
#elif defined(__arm) || defined(__arm__) || defined(_M_ARM)
|
||||
#define OPENSSL_32_BIT
|
||||
#define OPENSSL_ARM
|
||||
#elif defined(__aarch64__)
|
||||
#define OPENSSL_64_BIT
|
||||
#define OPENSSL_AARCH64
|
||||
#elif defined(__mips__)
|
||||
#define OPENSSL_32_BIT
|
||||
#define OPENSSL_MIPS
|
||||
|
@ -89,7 +89,7 @@ extern "C" {
|
||||
extern uint32_t OPENSSL_ia32cap_P[4];
|
||||
#endif
|
||||
|
||||
#if defined(OPENSSL_ARM)
|
||||
#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
|
||||
/* CRYPTO_is_NEON_capable returns true if the current CPU has a NEON unit. Note
|
||||
* that |OPENSSL_armcap_P| also exists and contains the same information in a
|
||||
* form that's easier for assembly to use. */
|
||||
|
@ -206,14 +206,6 @@
|
||||
*((c)++) = (uint8_t)(((l) >> 8) & 0xff), \
|
||||
*((c)++) = (uint8_t)(((l)) & 0xff))
|
||||
|
||||
#define l2n6(l, c) \
|
||||
(*((c)++) = (uint8_t)(((l) >> 40) & 0xff), \
|
||||
*((c)++) = (uint8_t)(((l) >> 32) & 0xff), \
|
||||
*((c)++) = (uint8_t)(((l) >> 24) & 0xff), \
|
||||
*((c)++) = (uint8_t)(((l) >> 16) & 0xff), \
|
||||
*((c)++) = (uint8_t)(((l) >> 8) & 0xff), \
|
||||
*((c)++) = (uint8_t)(((l)) & 0xff))
|
||||
|
||||
#define l2n8(l, c) \
|
||||
(*((c)++) = (uint8_t)(((l) >> 56) & 0xff), \
|
||||
*((c)++) = (uint8_t)(((l) >> 48) & 0xff), \
|
||||
@ -224,11 +216,6 @@
|
||||
*((c)++) = (uint8_t)(((l) >> 8) & 0xff), \
|
||||
*((c)++) = (uint8_t)(((l)) & 0xff))
|
||||
|
||||
#define n2l6(c, l) \
|
||||
(l = ((BN_ULLONG)(*((c)++))) << 40, l |= ((BN_ULLONG)(*((c)++))) << 32, \
|
||||
l |= ((BN_ULLONG)(*((c)++))) << 24, l |= ((BN_ULLONG)(*((c)++))) << 16, \
|
||||
l |= ((BN_ULLONG)(*((c)++))) << 8, l |= ((BN_ULLONG)(*((c)++))))
|
||||
|
||||
/* NOTE - c is not incremented as per l2c */
|
||||
#define l2cn(l1, l2, c, n) \
|
||||
{ \
|
||||
|
@ -12,7 +12,11 @@
|
||||
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <openssl/base.h>
|
||||
|
||||
// This file isn't built on ARM or Aarch64 because we link statically in those
|
||||
// builds and trying to override malloc in a static link doesn't work.
|
||||
#if defined(__linux__) && !defined(OPENSSL_ARM) && !defined(OPENSSL_AARCH64)
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
@ -123,4 +127,4 @@ void *realloc(void *ptr, size_t size) {
|
||||
|
||||
} // extern "C"
|
||||
|
||||
#endif /* defined(linux) */
|
||||
#endif /* defined(linux) && !ARM && !AARCH64 */
|
||||
|
6
util/aarch64-toolchain.cmake
Normal file
6
util/aarch64-toolchain.cmake
Normal file
@ -0,0 +1,6 @@
|
||||
set(CMAKE_SYSTEM_NAME Linux)
|
||||
set(CMAKE_SYSTEM_VERSION 1)
|
||||
set(CMAKE_SYSTEM_PROCESSOR "aarch64")
|
||||
set(CMAKE_CXX_COMPILER "/opt/gcc-linaro-4.9-2014.11-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-g++")
|
||||
set(CMAKE_C_COMPILER "/opt/gcc-linaro-4.9-2014.11-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-gcc")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "-static")
|
@ -1,6 +1,6 @@
|
||||
set(CMAKE_SYSTEM_NAME Linux)
|
||||
set(CMAKE_SYSTEM_VERSION 1)
|
||||
set(CMAKE_SYSTEM_PROCESSOR "arm")
|
||||
set(CMAKE_CXX_COMPILER "/opt/gcc-linaro-arm-linux-gnueabihf-4.8-2013.10_linux/bin/arm-linux-gnueabihf-g++")
|
||||
set(CMAKE_C_COMPILER "/opt/gcc-linaro-arm-linux-gnueabihf-4.8-2013.10_linux/bin/arm-linux-gnueabihf-gcc")
|
||||
set(CMAKE_CXX_COMPILER "/opt/gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf/bin/arm-linux-gnueabihf-g++")
|
||||
set(CMAKE_C_COMPILER "/opt/gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "-static")
|
||||
|
Loading…
x
Reference in New Issue
Block a user