aarch64 support.

This is an initial cut at aarch64 support. I only have qemu to test it,
however; hopefully hardware will be coming soon.

This also affects 32-bit ARM in that aarch64 chips can run 32-bit code
and we would like to be able to take advantage of the crypto operations
even in 32-bit mode. AES and GHASH should Just Work in this case: the
-armx.pl files can be built for either 32- or 64-bit mode based on the
flavour argument given to the Perl script.

SHA-1 and SHA-256 don't work like this, however, because they have never
had support for multiple implementations, so a BoringSSL built for 32-bit
won't use the SHA instructions on an aarch64 chip.

No dedicated ChaCha20 or Poly1305 support yet.

Change-Id: Ib275bc4894a365c8ec7c42f4e91af6dba3bd686c
Reviewed-on: https://boringssl-review.googlesource.com/2801
Reviewed-by: Adam Langley <agl@google.com>
Author: Adam Langley
Date:   2015-01-09 15:44:37 -08:00
Commit: 3e6526575a
Parent: bc44c089fb
28 changed files with 2309 additions and 144 deletions


@ -32,6 +32,8 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686")
set(ARCH "x86")
elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm")
set(ARCH "arm")
elseif (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
set(ARCH "aarch64")
else()
message(FATAL_ERROR "Unknown processor:" ${CMAKE_SYSTEM_PROCESSOR})
endif()


@ -5,7 +5,13 @@ if(APPLE)
set(ASM_EXT S)
enable_language(ASM)
elseif(UNIX)
set(PERLASM_STYLE elf)
if (${ARCH} STREQUAL "aarch64")
# The "armx" Perl scripts look for "64" in the style argument
# in order to decide whether to generate 32- or 64-bit asm.
set(PERLASM_STYLE linux64)
else()
set(PERLASM_STYLE elf)
endif()
set(ASM_EXT S)
enable_language(ASM)
else()
@ -59,7 +65,14 @@ if (${ARCH} STREQUAL "arm")
set(
CRYPTO_ARCH_SOURCES
cpu-x86-asm.${ASM_EXT}
cpu-arm.c
)
endif()
if (${ARCH} STREQUAL "aarch64")
set(
CRYPTO_ARCH_SOURCES
cpu-arm.c
)
endif()


@ -27,6 +27,15 @@ if (${ARCH} STREQUAL "arm")
aes-armv4.${ASM_EXT}
bsaes-armv7.${ASM_EXT}
aesv8-armx.${ASM_EXT}
)
endif()
if (${ARCH} STREQUAL "aarch64")
set(
AES_ARCH_SOURCES
aesv8-armx.${ASM_EXT}
)
endif()
@ -50,3 +59,4 @@ perlasm(vpaes-x86.${ASM_EXT} asm/vpaes-x86.pl)
perlasm(aesni-x86.${ASM_EXT} asm/aesni-x86.pl)
perlasm(aes-armv4.${ASM_EXT} asm/aes-armv4.pl)
perlasm(bsaes-armv7.${ASM_EXT} asm/bsaes-armv7.pl)
perlasm(aesv8-armx.${ASM_EXT} asm/aesv8-armx.pl)


@ -0,0 +1,962 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases, and it likewise supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
# CBC enc CBC dec CTR
# Apple A7 2.39 1.20 1.20
# Cortex-A53 2.45 1.87 1.94
# Cortex-A57 3.64 1.34 1.32
$flavour = shift;
open STDOUT,">".shift;
$prefix="aes_v8";
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
#^^^^^^ this is done to simplify adoption by not depending
# on latest binutils.
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is to
# maintain both 32- and 64-bit code within a single module and
# transliterate the common code to either flavour with regex voodoo.
#
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
$code.=<<___;
.align 5
rcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___;
mov $ptr,#-1
cmp $inp,#0
b.eq .Lenc_key_abort
cmp $out,#0
b.eq .Lenc_key_abort
mov $ptr,#-2
cmp $bits,#128
b.lt .Lenc_key_abort
cmp $bits,#256
b.gt .Lenc_key_abort
tst $bits,#0x3f
b.ne .Lenc_key_abort
adr $ptr,rcon
cmp $bits,#192
veor $zero,$zero,$zero
vld1.8 {$in0},[$inp],#16
mov $bits,#8 // reuse $bits
vld1.32 {$rcon,$mask},[$ptr],#32
b.lt .Loop128
b.eq .L192
b .L256
.align 4
.Loop128:
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
b.ne .Loop128
vld1.32 {$rcon},[$ptr]
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
veor $in0,$in0,$key
vst1.32 {$in0},[$out]
add $out,$out,#0x50
mov $rounds,#10
b .Ldone
.align 4
.L192:
vld1.8 {$in1},[$inp],#8
vmov.i8 $key,#8 // borrow $key
vst1.32 {$in0},[$out],#16
vsub.i8 $mask,$mask,$key // adjust the mask
.Loop192:
vtbl.8 $key,{$in1},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in1},[$out],#8
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vdup.32 $tmp,${in0}[3]
veor $tmp,$tmp,$in1
veor $key,$key,$rcon
vext.8 $in1,$zero,$in1,#12
vshl.u8 $rcon,$rcon,#1
veor $in1,$in1,$tmp
veor $in0,$in0,$key
veor $in1,$in1,$key
vst1.32 {$in0},[$out],#16
b.ne .Loop192
mov $rounds,#12
add $out,$out,#0x20
b .Ldone
.align 4
.L256:
vld1.8 {$in1},[$inp]
mov $bits,#7
mov $rounds,#14
vst1.32 {$in0},[$out],#16
.Loop256:
vtbl.8 $key,{$in1},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in1},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vst1.32 {$in0},[$out],#16
b.eq .Ldone
vdup.32 $key,${in0}[3] // just splat
vext.8 $tmp,$zero,$in1,#12
aese $key,$zero
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
veor $in1,$in1,$key
b .Loop256
.Ldone:
str $rounds,[$out]
mov $ptr,#0
.Lenc_key_abort:
mov x0,$ptr // return value
`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
stmdb sp!,{r4,lr}
___
$code.=<<___;
bl .Lenc_key
cmp x0,#0
b.ne .Ldec_key_abort
sub $out,$out,#240 // restore original $out
mov x4,#-16
add $inp,$out,x12,lsl#4 // end of key schedule
vld1.32 {v0.16b},[$out]
vld1.32 {v1.16b},[$inp]
vst1.32 {v0.16b},[$inp],x4
vst1.32 {v1.16b},[$out],#16
.Loop_imc:
vld1.32 {v0.16b},[$out]
vld1.32 {v1.16b},[$inp]
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
vst1.32 {v0.16b},[$inp],x4
vst1.32 {v1.16b},[$out],#16
cmp $inp,$out
b.hi .Loop_imc
vld1.32 {v0.16b},[$out]
aesimc v0.16b,v0.16b
vst1.32 {v0.16b},[$inp]
eor x0,x0,x0 // return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
ldmia sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldp x29,x30,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
ldr $rounds,[$key,#240]
vld1.32 {$rndkey0},[$key],#16
vld1.8 {$inout},[$inp]
sub $rounds,$rounds,#2
vld1.32 {$rndkey1},[$key],#16
.Loop_${dir}c:
aes$e $inout,$rndkey0
vld1.32 {$rndkey0},[$key],#16
aes$mc $inout,$inout
subs $rounds,$rounds,#2
aes$e $inout,$rndkey1
vld1.32 {$rndkey1},[$key],#16
aes$mc $inout,$inout
b.gt .Loop_${dir}c
aes$e $inout,$rndkey0
vld1.32 {$rndkey0},[$key]
aes$mc $inout,$inout
aes$e $inout,$rndkey1
veor $inout,$inout,$rndkey0
vst1.8 {$inout},[$out]
ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
### q8-q15 preloaded key schedule
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r8,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load remaining args
___
$code.=<<___;
subs $len,$len,#16
mov $step,#16
b.lo .Lcbc_abort
cclr $step,eq
cmp $enc,#0 // en- or decrypting?
ldr $rounds,[$key,#240]
and $len,$len,#-16
vld1.8 {$ivec},[$ivp]
vld1.8 {$dat},[$inp],$step
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#6
add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
sub $rounds,$rounds,#2
vld1.32 {q10-q11},[$key_],#32
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
b.eq .Lcbc_dec
cmp $rounds,#2
veor $dat,$dat,$ivec
veor $rndzero_n_last,q8,$rndlast
b.eq .Lcbc_enc128
.Loop_cbc_enc:
aese $dat,q8
vld1.32 {q8},[$key_],#16
aesmc $dat,$dat
subs $cnt,$cnt,#2
aese $dat,q9
vld1.32 {q9},[$key_],#16
aesmc $dat,$dat
b.gt .Loop_cbc_enc
aese $dat,q8
aesmc $dat,$dat
subs $len,$len,#16
aese $dat,q9
aesmc $dat,$dat
cclr $step,eq
aese $dat,q10
aesmc $dat,$dat
add $key_,$key,#16
aese $dat,q11
aesmc $dat,$dat
vld1.8 {q8},[$inp],$step
aese $dat,q12
aesmc $dat,$dat
veor q8,q8,$rndzero_n_last
aese $dat,q13
aesmc $dat,$dat
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
aese $dat,q14
aesmc $dat,$dat
aese $dat,q15
mov $cnt,$rounds
veor $ivec,$dat,$rndlast
vst1.8 {$ivec},[$out],#16
b.hs .Loop_cbc_enc
b .Lcbc_done
.align 5
.Lcbc_enc128:
vld1.32 {$in0-$in1},[$key_]
aese $dat,q8
aesmc $dat,$dat
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese $dat,q8
aesmc $dat,$dat
vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
aese $dat,q9
aesmc $dat,$dat
subs $len,$len,#16
aese $dat,$in0
aesmc $dat,$dat
cclr $step,eq
aese $dat,$in1
aesmc $dat,$dat
aese $dat,q10
aesmc $dat,$dat
aese $dat,q11
aesmc $dat,$dat
vld1.8 {q8},[$inp],$step
aese $dat,q12
aesmc $dat,$dat
aese $dat,q13
aesmc $dat,$dat
aese $dat,q14
aesmc $dat,$dat
veor q8,q8,$rndzero_n_last
aese $dat,q15
veor $ivec,$dat,$rndlast
b.hs .Loop_cbc_enc128
vst1.8 {$ivec},[$out],#16
b .Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align 5
.Lcbc_dec:
vld1.8 {$dat2},[$inp],#16
subs $len,$len,#32 // bias
add $cnt,$rounds,#2
vorr $in1,$dat,$dat
vorr $dat1,$dat,$dat
vorr $in2,$dat2,$dat2
b.lo .Lcbc_dec_tail
vorr $dat1,$dat2,$dat2
vld1.8 {$dat2},[$inp],#16
vorr $in0,$dat,$dat
vorr $in1,$dat1,$dat1
vorr $in2,$dat2,$dat2
.Loop3x_cbc_dec:
aesd $dat0,q8
aesd $dat1,q8
aesd $dat2,q8
vld1.32 {q8},[$key_],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesimc $dat2,$dat2
subs $cnt,$cnt,#2
aesd $dat0,q9
aesd $dat1,q9
aesd $dat2,q9
vld1.32 {q9},[$key_],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesimc $dat2,$dat2
b.gt .Loop3x_cbc_dec
aesd $dat0,q8
aesd $dat1,q8
aesd $dat2,q8
veor $tmp0,$ivec,$rndlast
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesimc $dat2,$dat2
veor $tmp1,$in0,$rndlast
aesd $dat0,q9
aesd $dat1,q9
aesd $dat2,q9
veor $tmp2,$in1,$rndlast
subs $len,$len,#0x30
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesimc $dat2,$dat2
vorr $ivec,$in2,$in2
mov.lo x6,$len // x6, $cnt, is zero at this point
aesd $dat0,q12
aesd $dat1,q12
aesd $dat2,q12
add $inp,$inp,x6 // $inp is adjusted in such way that
// at exit from the loop $dat1-$dat2
// are loaded with last "words"
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesimc $dat2,$dat2
mov $key_,$key
aesd $dat0,q13
aesd $dat1,q13
aesd $dat2,q13
vld1.8 {$in0},[$inp],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesimc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
aesd $dat0,q14
aesd $dat1,q14
aesd $dat2,q14
vld1.8 {$in2},[$inp],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
aesimc $dat2,$dat2
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
aesd $dat0,q15
aesd $dat1,q15
aesd $dat2,q15
add $cnt,$rounds,#2
veor $tmp0,$tmp0,$dat0
veor $tmp1,$tmp1,$dat1
veor $dat2,$dat2,$tmp2
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vorr $dat0,$in0,$in0
vst1.8 {$tmp0},[$out],#16
vorr $dat1,$in1,$in1
vst1.8 {$tmp1},[$out],#16
vst1.8 {$dat2},[$out],#16
vorr $dat2,$in2,$in2
b.hs .Loop3x_cbc_dec
cmn $len,#0x30
b.eq .Lcbc_done
nop
.Lcbc_dec_tail:
aesd $dat1,q8
aesd $dat2,q8
vld1.32 {q8},[$key_],#16
aesimc $dat1,$dat1
aesimc $dat2,$dat2
subs $cnt,$cnt,#2
aesd $dat1,q9
aesd $dat2,q9
vld1.32 {q9},[$key_],#16
aesimc $dat1,$dat1
aesimc $dat2,$dat2
b.gt .Lcbc_dec_tail
aesd $dat1,q8
aesd $dat2,q8
aesimc $dat1,$dat1
aesimc $dat2,$dat2
aesd $dat1,q9
aesd $dat2,q9
aesimc $dat1,$dat1
aesimc $dat2,$dat2
aesd $dat1,q12
aesd $dat2,q12
aesimc $dat1,$dat1
aesimc $dat2,$dat2
cmn $len,#0x20
aesd $dat1,q13
aesd $dat2,q13
aesimc $dat1,$dat1
aesimc $dat2,$dat2
veor $tmp1,$ivec,$rndlast
aesd $dat1,q14
aesd $dat2,q14
aesimc $dat1,$dat1
aesimc $dat2,$dat2
veor $tmp2,$in1,$rndlast
aesd $dat1,q15
aesd $dat2,q15
b.eq .Lcbc_dec_one
veor $tmp1,$tmp1,$dat1
veor $tmp2,$tmp2,$dat2
vorr $ivec,$in2,$in2
vst1.8 {$tmp1},[$out],#16
vst1.8 {$tmp2},[$out],#16
b .Lcbc_done
.Lcbc_dec_one:
veor $tmp1,$tmp1,$dat2
vorr $ivec,$in2,$in2
vst1.8 {$tmp1},[$out],#16
.Lcbc_done:
vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat,$tmp)=($dat0,$tmp0);
### q8-q15 preloaded key schedule
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r10,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg
___
$code.=<<___;
ldr $rounds,[$key,#240]
ldr $ctr, [$ivp, #12]
vld1.32 {$dat0},[$ivp]
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#4
mov $step,#16
cmp $len,#2
add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
sub $rounds,$rounds,#2
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
cclr $step,lo
#ifndef __ARMEB__
rev $ctr, $ctr
#endif
vorr $dat1,$dat0,$dat0
add $tctr1, $ctr, #1
vorr $dat2,$dat0,$dat0
add $ctr, $ctr, #2
vorr $ivec,$dat0,$dat0
rev $tctr1, $tctr1
vmov.32 ${dat1}[3],$tctr1
b.ls .Lctr32_tail
rev $tctr2, $ctr
sub $len,$len,#3 // bias
vmov.32 ${dat2}[3],$tctr2
b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
aese $dat0,q8
aese $dat1,q8
aese $dat2,q8
vld1.32 {q8},[$key_],#16
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aesmc $dat2,$dat2
subs $cnt,$cnt,#2
aese $dat0,q9
aese $dat1,q9
aese $dat2,q9
vld1.32 {q9},[$key_],#16
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aesmc $dat2,$dat2
b.gt .Loop3x_ctr32
aese $dat0,q8
aese $dat1,q8
aese $dat2,q8
mov $key_,$key
aesmc $tmp0,$dat0
vld1.8 {$in0},[$inp],#16
aesmc $tmp1,$dat1
aesmc $dat2,$dat2
vorr $dat0,$ivec,$ivec
aese $tmp0,q9
vld1.8 {$in1},[$inp],#16
aese $tmp1,q9
aese $dat2,q9
vorr $dat1,$ivec,$ivec
aesmc $tmp0,$tmp0
vld1.8 {$in2},[$inp],#16
aesmc $tmp1,$tmp1
aesmc $tmp2,$dat2
vorr $dat2,$ivec,$ivec
add $tctr0,$ctr,#1
aese $tmp0,q12
aese $tmp1,q12
aese $tmp2,q12
veor $in0,$in0,$rndlast
add $tctr1,$ctr,#2
aesmc $tmp0,$tmp0
aesmc $tmp1,$tmp1
aesmc $tmp2,$tmp2
veor $in1,$in1,$rndlast
add $ctr,$ctr,#3
aese $tmp0,q13
aese $tmp1,q13
aese $tmp2,q13
veor $in2,$in2,$rndlast
rev $tctr0,$tctr0
aesmc $tmp0,$tmp0
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
aesmc $tmp1,$tmp1
aesmc $tmp2,$tmp2
vmov.32 ${dat0}[3], $tctr0
rev $tctr1,$tctr1
aese $tmp0,q14
aese $tmp1,q14
aese $tmp2,q14
vmov.32 ${dat1}[3], $tctr1
rev $tctr2,$ctr
aesmc $tmp0,$tmp0
aesmc $tmp1,$tmp1
aesmc $tmp2,$tmp2
vmov.32 ${dat2}[3], $tctr2
subs $len,$len,#3
aese $tmp0,q15
aese $tmp1,q15
aese $tmp2,q15
mov $cnt,$rounds
veor $in0,$in0,$tmp0
veor $in1,$in1,$tmp1
veor $in2,$in2,$tmp2
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$in0},[$out],#16
vst1.8 {$in1},[$out],#16
vst1.8 {$in2},[$out],#16
b.hs .Loop3x_ctr32
adds $len,$len,#3
b.eq .Lctr32_done
cmp $len,#1
mov $step,#16
cclr $step,eq
.Lctr32_tail:
aese $dat0,q8
aese $dat1,q8
vld1.32 {q8},[$key_],#16
aesmc $dat0,$dat0
aesmc $dat1,$dat1
subs $cnt,$cnt,#2
aese $dat0,q9
aese $dat1,q9
vld1.32 {q9},[$key_],#16
aesmc $dat0,$dat0
aesmc $dat1,$dat1
b.gt .Lctr32_tail
aese $dat0,q8
aese $dat1,q8
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aese $dat0,q9
aese $dat1,q9
aesmc $dat0,$dat0
aesmc $dat1,$dat1
vld1.8 {$in0},[$inp],$step
aese $dat0,q12
aese $dat1,q12
vld1.8 {$in1},[$inp]
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aese $dat0,q13
aese $dat1,q13
aesmc $dat0,$dat0
aesmc $dat1,$dat1
aese $dat0,q14
aese $dat1,q14
veor $in0,$in0,$rndlast
aesmc $dat0,$dat0
aesmc $dat1,$dat1
veor $in1,$in1,$rndlast
aese $dat0,q15
aese $dat1,q15
cmp $len,#1
veor $in0,$in0,$dat0
veor $in1,$in1,$dat1
vst1.8 {$in0},[$out],#16
b.eq .Lctr32_done
vst1.8 {$in1},[$out]
.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) { ######## 64-bit code
my %opcode = (
"aesd" => 0x4e285800, "aese" => 0x4e284800,
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5),
$mnemonic,$arg;
};
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vext\.8/ext/o or
s/vrev32\.8/rev32/o or
s/vtst\.8/cmtst/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
# fix up remaining legacy suffixes
s/\.[ui]?8//o;
m/\],#8/o and s/\.16b/\.8b/go;
s/\.[ui]?32//o and s/\.16b/\.4s/go;
s/\.[ui]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
print $_,"\n";
}
} else { ######## 32-bit code
my %opcode = (
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<1) |(($2&8)<<2);
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
};
sub unvtbl {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT;


@ -62,11 +62,22 @@
# define __ARMEL__
# endif
# elif defined(__GNUC__)
# if defined(__aarch64__)
# define __ARM_ARCH__ 8
# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
# define __ARMEB__
# else
# define __ARMEL__
# endif
/* Why doesn't gcc define __ARM_ARCH__? Instead it defines a
* bunch of the macros below. See the all_architectures[] table in
* gcc/config/arm/arm.c. On a side note, it defines
* __ARMEL__/__ARMEB__ for little-/big-endian. */
# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
# elif defined(__ARM_ARCH)
# define __ARM_ARCH__ __ARM_ARCH
# elif defined(__ARM_ARCH_8A__)
# define __ARM_ARCH__ 8
# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
defined(__ARM_ARCH_7EM__)
# define __ARM_ARCH__ 7
@ -87,6 +98,10 @@
# endif
#endif
/* Even when building for 32-bit ARM, support for aarch64 crypto instructions
* will be included. */
#define __ARM_MAX_ARCH__ 8
#if !__ASSEMBLER__
/* OPENSSL_armcap_P contains flags describing the capabilities of the CPU and
@ -94,6 +109,8 @@
* |cpu.h|. */
extern uint32_t OPENSSL_armcap_P;
#endif /* !__ASSEMBLER__ */
/* ARMV7_NEON is true when a NEON unit is present in the current CPU. */
#define ARMV7_NEON (1 << 0)
@ -101,9 +118,19 @@ extern uint32_t OPENSSL_armcap_P;
* The Poly1305 NEON code is known to trigger bugs in the NEON units of some
* phones. If this bit isn't set then the Poly1305 NEON code won't be used.
* See https://code.google.com/p/chromium/issues/detail?id=341598. */
#define ARMV7_NEON_FUNCTIONAL (1 << 1)
#define ARMV7_NEON_FUNCTIONAL (1 << 10)
#endif /* !__ASSEMBLER__ */
/* ARMV8_AES indicates support for hardware AES instructions. */
#define ARMV8_AES (1 << 2)
/* ARMV8_SHA1 indicates support for hardware SHA-1 instructions. */
#define ARMV8_SHA1 (1 << 3)
/* ARMV8_SHA256 indicates support for hardware SHA-256 instructions. */
#define ARMV8_SHA256 (1 << 4)
/* ARMV8_PMULL indicates support for carryless multiplication. */
#define ARMV8_PMULL (1 << 5)
#endif /* OPENSSL_HEADER_THREAD_H */
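For orientation, a minimal sketch (not part of this change) of how C code elsewhere in the library consults these bits once OPENSSL_cpuid_setup() has filled in OPENSSL_armcap_P; the helper name has_hw_aes_gcm is made up for illustration.

#include <stdint.h>
#include "arm_arch.h"  /* OPENSSL_armcap_P and the ARMV8_* flags */

/* Hardware AES-GCM needs both the AES instructions and PMULL
 * (carryless multiplication), mirroring EVP_has_aes_hardware(). */
static int has_hw_aes_gcm(void) {
  return (OPENSSL_armcap_P & ARMV8_AES) != 0 &&
         (OPENSSL_armcap_P & ARMV8_PMULL) != 0;
}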


@ -263,7 +263,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
#ifdef BN_LLONG
BN_ULLONG t2;
#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(div_asm)
#if defined(BN_LLONG) && !defined(div_asm)
q = (BN_ULONG)(((((BN_ULLONG)n0) << BN_BITS2) | n1) / d0);
#else
q = div_asm(n0, n1, d0);


@ -130,40 +130,8 @@
BN_UMULT_LOHI(r0, r1, tmp, tmp); \
}
#elif defined(BN_UMULT_HIGH)
#define mul_add(r, a, w, c) \
{ \
BN_ULONG high, low, ret, tmp = (a); \
ret = (r); \
high = BN_UMULT_HIGH(w, tmp); \
ret += (c); \
low = (w) * tmp; \
(c) = (ret < (c)) ? 1 : 0; \
(c) += high; \
ret += low; \
(c) += (ret < low) ? 1 : 0; \
(r) = ret; \
}
#define mul(r, a, w, c) \
{ \
BN_ULONG high, low, ret, ta = (a); \
low = (w) * ta; \
high = BN_UMULT_HIGH(w, ta); \
ret = low + (c); \
(c) = high; \
(c) += (ret < low) ? 1 : 0; \
(r) = ret; \
}
#define sqr(r0, r1, a) \
{ \
BN_ULONG tmp = (a); \
(r0) = tmp * tmp; \
(r1) = BN_UMULT_HIGH(tmp, tmp); \
}
#else
/*************************************************************
* No long long type
*/
@ -424,7 +392,7 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
#if defined(BN_LLONG) && defined(BN_DIV2W)
#if defined(BN_LLONG)
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
@ -502,7 +470,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
return ret;
}
#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
#endif /* !defined(BN_LLONG) */
#ifdef BN_LLONG
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
@ -749,49 +717,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#elif defined(BN_UMULT_HIGH)
/* Keep in mind that additions to hi can not overflow, because
* the high word of a multiplication result cannot be all-ones. */
#define mul_add_c(a, b, c0, c1, c2) \
do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo = ta * tb; \
BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \
c0 += lo; \
hi += (c0 < lo) ? 1 : 0; \
c1 += hi; \
c2 += (c1 < hi) ? 1 : 0; \
} while (0)
#define mul_add_c2(a, b, c0, c1, c2) \
do { \
BN_ULONG ta = (a), tb = (b), tt; \
BN_ULONG lo = ta * tb; \
BN_ULONG hi = BN_UMULT_HIGH(ta, tb); \
c0 += lo; \
tt = hi + ((c0 < lo) ? 1 : 0); \
c1 += tt; \
c2 += (c1 < tt) ? 1 : 0; \
c0 += lo; \
hi += (c0 < lo) ? 1 : 0; \
c1 += hi; \
c2 += (c1 < hi) ? 1 : 0; \
} while (0)
#define sqr_add_c(a, i, c0, c1, c2) \
do { \
BN_ULONG ta = (a)[i]; \
BN_ULONG lo = ta * ta; \
BN_ULONG hi = BN_UMULT_HIGH(ta, ta); \
c0 += lo; \
hi += (c0 < lo) ? 1 : 0; \
c1 += hi; \
c2 += (c1 < hi) ? 1 : 0; \
} while (0)
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#else /* !BN_LLONG */
/* Keep in mind that additions to hi can not overflow, because


@ -142,8 +142,14 @@ BIGNUM *bn_expand(BIGNUM *bn, unsigned bits);
#if defined(OPENSSL_64_BIT)
#define BN_ULLONG unsigned long long
#if !defined(_MSC_VER)
/* MSVC doesn't support two-word integers on 64-bit. */
#define BN_LLONG __int128_t
#define BN_ULLONG __uint128_t
#endif
#define BN_BITS 128
#define BN_BITS2 64
#define BN_BYTES 8
#define BN_BITS4 32
#define BN_MASK (0xffffffffffffffffffffffffffffffffLL)
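Moving BN_LLONG/BN_ULLONG to a 128-bit type on 64-bit targets is what allows the hand-written BN_UMULT_HIGH macro blocks removed above to go away: the compiler can produce the high half of a 64x64-bit product directly. A rough sketch of the idea, with illustrative names rather than the library's macros, assuming a compiler that supports unsigned __int128:

#include <stdint.h>

/* With a 128-bit double-word type, a 64x64->128 multiply needs no
 * per-platform "high half" assembly. */
static void mul_64x64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi) {
  unsigned __int128 t = (unsigned __int128)a * b;
  *lo = (uint64_t)t;
  *hi = (uint64_t)(t >> 64);
}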
@ -160,9 +166,11 @@ BIGNUM *bn_expand(BIGNUM *bn, unsigned bits);
#elif defined(OPENSSL_32_BIT)
#define BN_ULLONG unsigned long long
#define BN_MASK (0xffffffffffffffffLL)
#define BN_LLONG int64_t
#define BN_ULLONG uint64_t
#define BN_MASK (0xffffffffffffffffLL)
#define BN_BITS 64
#define BN_BITS2 32
#define BN_BYTES 4
#define BN_BITS4 16
#define BN_MASK2 (0xffffffffL)
@ -188,6 +196,11 @@ BIGNUM *bn_expand(BIGNUM *bn, unsigned bits);
#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32) /* 32 */
#define BN_MONT_CTX_SET_SIZE_WORD (64) /* 32 */
#if defined(BN_LLONG)
#define Lw(t) (((BN_ULONG)(t))&BN_MASK2)
#define Hw(t) (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
#endif
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
@ -213,6 +226,8 @@ int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl);
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
#if !defined(BN_LLONG)
#define LBITS(a) ((a) & BN_MASK2l)
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
@ -243,6 +258,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
(h) = ht; \
}
#endif /* !defined(BN_LLONG) */
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
# if defined(__GNUC__) && __GNUC__ >= 2
# define BN_UMULT_HIGH(a,b) ({ \


@ -103,14 +103,33 @@ static char bsaes_capable(void) {
}
#endif
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
#elif !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
#include "../arm_arch.h"
#if __ARM_ARCH__ >= 7
#if defined(OPENSSL_ARM)
#define BSAES
static char bsaes_capable(void) {
return CRYPTO_is_NEON_capable();
}
#endif /* __ARM_ARCH__ >= 7 */
#endif
#define HWAES
static char hwaes_capable(void) {
return (OPENSSL_armcap_P & ARMV8_AES) != 0;
}
int aes_v8_set_encrypt_key(const uint8_t *user_key, const int bits,
AES_KEY *key);
int aes_v8_set_decrypt_key(const uint8_t *user_key, const int bits,
AES_KEY *key);
void aes_v8_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void aes_v8_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void aes_v8_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
const AES_KEY *key, uint8_t *ivec, const int enc);
void aes_v8_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, const uint8_t ivec[16]);
#endif /* OPENSSL_ARM */
#if defined(BSAES)
@ -174,6 +193,41 @@ void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
}
#endif
#if !defined(HWAES)
/* If HWAES isn't defined then we provide dummy functions for each of the hwaes
* functions. */
int hwaes_capable() {
return 0;
}
int aes_v8_set_encrypt_key(const uint8_t *user_key, int bits,
AES_KEY *key) {
abort();
}
int aes_v8_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key) {
abort();
}
void aes_v8_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
abort();
}
void aes_v8_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
abort();
}
void aes_v8_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
const AES_KEY *key, uint8_t *ivec, int enc) {
abort();
}
void aes_v8_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, const uint8_t ivec[16]) {
abort();
}
#endif
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86_64) || defined(OPENSSL_X86))
int aesni_set_encrypt_key(const uint8_t *userKey, int bits, AES_KEY *key);
@ -227,7 +281,14 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
mode = ctx->cipher->flags & EVP_CIPH_MODE_MASK;
if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) {
if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) {
if (hwaes_capable()) {
ret = aes_v8_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f)aes_v8_decrypt;
dat->stream.cbc = NULL;
if (mode == EVP_CIPH_CBC_MODE) {
dat->stream.cbc = (cbc128_f)aes_v8_cbc_encrypt;
}
} else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) {
ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f)AES_decrypt;
dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt;
@ -242,6 +303,15 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
dat->stream.cbc =
mode == EVP_CIPH_CBC_MODE ? (cbc128_f)AES_cbc_encrypt : NULL;
}
} else if (hwaes_capable()) {
ret = aes_v8_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f)aes_v8_encrypt;
dat->stream.cbc = NULL;
if (mode == EVP_CIPH_CBC_MODE) {
dat->stream.cbc = (cbc128_f)aes_v8_cbc_encrypt;
} else if (mode == EVP_CIPH_CTR_MODE) {
dat->stream.ctr = (ctr128_f)aes_v8_ctr32_encrypt_blocks;
}
} else if (bsaes_capable() && mode == EVP_CIPH_CTR_MODE) {
ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
dat->block = (block128_f)AES_encrypt;
@ -316,6 +386,12 @@ static int aes_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
static ctr128_f aes_gcm_set_key(AES_KEY *aes_key, GCM128_CONTEXT *gcm_ctx,
const uint8_t *key, size_t key_len) {
if (hwaes_capable()) {
aes_v8_set_encrypt_key(key, key_len * 8, aes_key);
CRYPTO_gcm128_init(gcm_ctx, aes_key, (block128_f)aes_v8_encrypt);
return (ctr128_f)aes_v8_ctr32_encrypt_blocks;
}
if (bsaes_capable()) {
AES_set_encrypt_key(key, key_len * 8, aes_key);
CRYPTO_gcm128_init(gcm_ctx, aes_key, (block128_f)AES_encrypt);
@ -1274,6 +1350,8 @@ const EVP_AEAD *EVP_aead_aes_256_key_wrap(void) { return &aead_aes_256_key_wrap;
int EVP_has_aes_hardware(void) {
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
return aesni_capable() && crypto_gcm_clmul_enabled();
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
return hwaes_capable() && (OPENSSL_armcap_P & ARMV8_PMULL);
#else
return 0;
#endif
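As a usage note (not part of this change), a caller could key off EVP_has_aes_hardware() to prefer AES-GCM only where it runs in hardware. The header chosen below and the ChaCha20-Poly1305 fallback are assumptions for illustration, not something this commit adds.

#include <openssl/aead.h>  /* assumed to declare the functions used below */

static const EVP_AEAD *pick_default_aead(void) {
  /* Prefer AES-128-GCM when AES and PMULL hardware are present,
   * otherwise fall back to ChaCha20-Poly1305. */
  return EVP_has_aes_hardware() ? EVP_aead_aes_128_gcm()
                                : EVP_aead_chacha20_poly1305();
}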


@ -56,18 +56,14 @@
#include <openssl/cpu.h>
#if defined(OPENSSL_ARM)
#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#include <stdio.h>
#include <inttypes.h>
#include <stdio.h>
#include <sys/auxv.h>
#include "arm_arch.h"
#if defined(__ARM_NEON__)
uint32_t OPENSSL_armcap_P = ARMV7_NEON | ARMV7_NEON_FUNCTIONAL;
#else
uint32_t OPENSSL_armcap_P = ARMV7_NEON_FUNCTIONAL;
#endif
char CRYPTO_is_NEON_capable(void) {
return (OPENSSL_armcap_P & ARMV7_NEON) != 0;
@ -94,4 +90,53 @@ void CRYPTO_set_NEON_functional(char neon_functional) {
}
}
#endif /* defined(OPENSSL_ARM) */
void OPENSSL_cpuid_setup(void) {
unsigned long hwcap = getauxval(AT_HWCAP);
#if defined(OPENSSL_ARM)
static const unsigned long kNEON = 1 << 12;
if ((hwcap & kNEON) == 0) {
return;
}
/* In 32-bit mode, the ARMv8 feature bits are in a different aux vector
* value. */
hwcap = getauxval(AT_HWCAP2);
/* See /usr/include/asm/hwcap.h on an ARM installation for the source of
* these values. */
static const unsigned long kAES = 1 << 0;
static const unsigned long kPMULL = 1 << 1;
static const unsigned long kSHA1 = 1 << 2;
static const unsigned long kSHA256 = 1 << 3;
#elif defined(OPENSSL_AARCH64)
/* See /usr/include/asm/hwcap.h on an aarch64 installation for the source of
* these values. */
static const unsigned long kNEON = 1 << 1;
static const unsigned long kAES = 1 << 3;
static const unsigned long kPMULL = 1 << 4;
static const unsigned long kSHA1 = 1 << 5;
static const unsigned long kSHA256 = 1 << 6;
if ((hwcap & kNEON) == 0) {
return;
}
#endif
OPENSSL_armcap_P |= ARMV7_NEON | ARMV7_NEON_FUNCTIONAL;
if (hwcap & kAES) {
OPENSSL_armcap_P |= ARMV8_AES;
}
if (hwcap & kPMULL) {
OPENSSL_armcap_P |= ARMV8_PMULL;
}
if (hwcap & kSHA1) {
OPENSSL_armcap_P |= ARMV8_SHA1;
}
if (hwcap & kSHA256) {
OPENSSL_armcap_P |= ARMV8_SHA256;
}
}
#endif /* defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) */
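For comparison, a standalone sketch (not part of this change) that reads the same aarch64 HWCAP bits that OPENSSL_cpuid_setup() inspects; the bit positions match the constants above and /usr/include/asm/hwcap.h on Linux.

#include <stdio.h>
#include <sys/auxv.h>

int main(void) {
  unsigned long hwcap = getauxval(AT_HWCAP);
  printf("ASIMD/NEON: %lu\n", (hwcap >> 1) & 1);
  printf("AES:        %lu\n", (hwcap >> 3) & 1);
  printf("PMULL:      %lu\n", (hwcap >> 4) & 1);
  printf("SHA1:       %lu\n", (hwcap >> 5) & 1);
  printf("SHA2:       %lu\n", (hwcap >> 6) & 1);
  return 0;
}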


@ -66,15 +66,6 @@
#include <stdio.h>
#include <inttypes.h>
/* This value must be explicitly initialised to zero in order to work around a
* bug in libtool or the linker on OS X.
*
* If not initialised then it becomes a "common symbol". When put into an
* archive, linking on OS X will fail to resolve common symbols. By
* initialising it to zero, it becomes a "data symbol", which isn't so
* affected. */
uint32_t OPENSSL_ia32cap_P[4] = {0};
/* OPENSSL_ia32_cpuid is defined in cpu-x86_64-asm.pl. */
extern uint64_t OPENSSL_ia32_cpuid(uint32_t*);


@ -16,10 +16,12 @@
#include "internal.h"
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
/* x86 and x86_64 need to record the result of a cpuid call for the asm to work
* correctly, unless compiled without asm code. */
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
/* x86, x86_64 and the ARMs need to record the result of a cpuid call for the
* asm to work correctly, unless compiled without asm code. */
#define NEED_CPUID
#else
@ -30,7 +32,39 @@
#define BORINGSSL_NO_STATIC_INITIALIZER
#endif
#endif /* !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64) */
#endif /* !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64 ||
OPENSSL_ARM || OPENSSL_AARCH64) */
/* The capability variables are defined in this file in order to work around a
* linker bug. When linking with a .a, if no symbols in a .o are referenced
* then the .o is discarded, even if it has constructor functions.
*
* This still means that any binaries that don't include some functionality
* that tests the capability values will still skip the constructor but, so
* far, the init constructor function only sets the capability variables. */
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
/* This value must be explicitly initialised to zero in order to work around a
* bug in libtool or the linker on OS X.
*
* If not initialised then it becomes a "common symbol". When put into an
* archive, linking on OS X will fail to resolve common symbols. By
* initialising it to zero, it becomes a "data symbol", which isn't so
* affected. */
uint32_t OPENSSL_ia32cap_P[4] = {0};
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#include "arm_arch.h"
#if defined(__ARM_NEON__)
uint32_t OPENSSL_armcap_P = ARMV7_NEON | ARMV7_NEON_FUNCTIONAL;
#else
uint32_t OPENSSL_armcap_P = ARMV7_NEON_FUNCTIONAL;
#endif
#endif
#if defined(OPENSSL_WINDOWS)
#define OPENSSL_CDECL __cdecl
@ -53,6 +87,8 @@ __declspec(allocate(".CRT$XCU")) void(*library_init_constructor)(void) =
* BORINGSSL_NO_STATIC_INITIALIZER isn't defined, this is set as a static
* initializer. Otherwise, it is called by CRYPTO_library_init. */
static void OPENSSL_CDECL do_library_init(void) {
/* WARNING: this function may only configure the capability variables. See the
* note above about the linker bug. */
#if defined(NEED_CPUID)
OPENSSL_cpuid_setup();
#endif


@ -144,7 +144,8 @@ struct st_CRYPTO_EX_DATA_IMPL {
#endif /* defined(_MSC_VER) */
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM) || \
defined(OPENSSL_AARCH64)
/* OPENSSL_cpuid_setup initializes OPENSSL_ia32cap_P. */
void OPENSSL_cpuid_setup(void);
#endif


@ -22,6 +22,15 @@ if (${ARCH} STREQUAL "arm")
MODES_ARCH_SOURCES
ghash-armv4.${ASM_EXT}
ghashv8-armx.${ASM_EXT}
)
endif()
if (${ARCH} STREQUAL "aarch64")
set(
MODES_ARCH_SOURCES
ghashv8-armx.${ASM_EXT}
)
endif()
@ -43,6 +52,7 @@ perlasm(aesni-gcm-x86_64.${ASM_EXT} asm/aesni-gcm-x86_64.pl)
perlasm(ghash-x86_64.${ASM_EXT} asm/ghash-x86_64.pl)
perlasm(ghash-x86.${ASM_EXT} asm/ghash-x86.pl)
perlasm(ghash-armv4.${ASM_EXT} asm/ghash-armv4.pl)
perlasm(ghashv8-armx.${ASM_EXT} asm/ghashv8-armx.pl)
add_executable(
gcm_test


@ -0,0 +1,241 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
# other assembly modules. Just like aesv8-armx.pl this module
# supports both AArch32 and AArch64 execution modes.
#
# Current performance in cycles per processed byte:
#
# PMULL[2] 32-bit NEON(*)
# Apple A7 1.76 5.62
# Cortex-A53 1.45 8.39
# Cortex-A57 2.22 7.61
#
# (*) presented for reference/comparison purposes;
$flavour = shift;
open STDOUT,">".shift;
$Xi="x0"; # argument block
$Htbl="x1";
$inp="x2";
$len="x3";
$inc="x12";
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
$code=<<___;
#include "arm_arch.h"
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
$code.=<<___;
.global gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
vld1.64 {$t1},[x1] @ load H
vmov.i8 $t0,#0xe1
vext.8 $IN,$t1,$t1,#8
vshl.i64 $t0,$t0,#57
vshr.u64 $t2,$t0,#63
vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01
vdup.32 $t1,${t1}[1]
vshr.u64 $t3,$IN,#63
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
vand $t3,$t3,$t0
vshl.i64 $IN,$IN,#1
vext.8 $t3,$t3,$t3,#8
vand $t0,$t0,$t1
vorr $IN,$IN,$t3 @ H<<<=1
veor $IN,$IN,$t0 @ twisted H
vst1.64 {$IN},[x0]
ret
.size gcm_init_v8,.-gcm_init_v8
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $t3,#0xe1
vld1.64 {$H},[$Htbl] @ load twisted H
vshl.u64 $t3,$t3,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $Hhl,$H,$H,#8
mov $len,#0
vext.8 $IN,$t1,$t1,#8
mov $inc,#0
veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
mov $inp,$Xi
b .Lgmult_v8
.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
subs $len,$len,#16
vmov.i8 $t3,#0xe1
mov $inc,#16
vld1.64 {$H},[$Htbl] @ load twisted H
cclr $inc,eq
vext.8 $Xl,$Xl,$Xl,#8
vshl.u64 $t3,$t3,#57
vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
vext.8 $Hhl,$H,$H,#8
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
vrev64.8 $t1,$t1
#endif
veor $Hhl,$Hhl,$H @ Karatsuba pre-processing
vext.8 $IN,$t1,$t1,#8
b .Loop_v8
.align 4
.Loop_v8:
vext.8 $t2,$Xl,$Xl,#8
veor $IN,$IN,$Xl @ inp^=Xi
veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi
.Lgmult_v8:
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
subs $len,$len,#16
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
cclr $inc,eq
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$t1},[$inp],$inc @ load [rotated] inp
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$t3 @ 1st phase
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
veor $Xl,$Xm,$t2
vext.8 $IN,$t1,$t1,#8
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$t3
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
b.hs .Loop_v8
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
___
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($flavour =~ /64/) { ######## 64-bit code
sub unvmov {
my $arg=shift;
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
}
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vmov\s+(.*)/unvmov($1)/geo or
s/vext\.8/ext/o or
s/vshr\.s/sshr\.s/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
# fix up remaining legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
print $_,"\n";
}
} else { ######## 32-bit code
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvpmullp64 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
$word |= 0x00010001 if ($mnemonic =~ "2");
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT; # enforce flush


@ -57,8 +57,9 @@
#include "../internal.h"
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
#define GHASH_ASM
#endif
@ -140,7 +141,7 @@ static void gcm_init_4bit(u128 Htable[16], uint64_t H[2]) {
#endif
}
#if !defined(GHASH_ASM)
#if !defined(GHASH_ASM) || defined(OPENSSL_AARCH64)
static const size_t rem_4bit[16] = {
PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
@ -346,15 +347,48 @@ void gcm_gmult_4bit_x86(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len);
#endif
#elif defined(OPENSSL_ARM)
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#include "../arm_arch.h"
#if __ARM_ARCH__ >= 7
#define GHASH_ASM_ARM
#define GCM_FUNCREF_4BIT
void gcm_init_neon(u128 Htable[16],const uint64_t Xi[2]);
static int pmull_capable() {
return (OPENSSL_armcap_P & ARMV8_PMULL) != 0;
}
void gcm_init_v8(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len);
#if defined(OPENSSL_ARM)
/* 32-bit ARM also has support for doing GCM with NEON instructions. */
static int neon_capable() {
return CRYPTO_is_NEON_capable();
}
void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len);
#else
/* AArch64 only has the ARMv8 versions of functions. */
static int neon_capable() {
return 0;
}
void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) {
abort();
}
void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) {
abort();
}
void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len) {
abort();
}
#endif
#endif
#endif
#endif
@ -433,7 +467,11 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block) {
ctx->ghash = gcm_ghash_4bit;
#endif
#elif defined(GHASH_ASM_ARM)
if (CRYPTO_is_NEON_capable()) {
if (pmull_capable()) {
gcm_init_v8(ctx->Htable, ctx->H.u);
ctx->gmult = gcm_gmult_v8;
ctx->ghash = gcm_ghash_v8;
} else if (neon_capable()) {
gcm_init_neon(ctx->Htable,ctx->H.u);
ctx->gmult = gcm_gmult_neon;
ctx->ghash = gcm_ghash_neon;
@ -443,9 +481,9 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block) {
ctx->ghash = gcm_ghash_4bit;
}
#else
gcm_init_4bit(ctx->Htable, ctx->H.u);
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
gcm_init_4bit(ctx->Htable, ctx->H.u);
#endif
}
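The selection above follows the library's usual pattern: probe the CPU once when the context is initialised and bind the fastest gmult/ghash pair to function pointers, so the per-block path pays no further capability checks. A stripped-down sketch of that pattern, with illustrative names only:

#include <stdint.h>

typedef void (*gmult_fn)(uint64_t Xi[2], const void *Htable);

struct ghash_dispatch {
  gmult_fn gmult;  /* chosen once at init time */
};

static void ghash_dispatch_init(struct ghash_dispatch *d, int have_pmull,
                                int have_neon, gmult_fn pmull_impl,
                                gmult_fn neon_impl, gmult_fn generic_impl) {
  if (have_pmull) {
    d->gmult = pmull_impl;    /* ARMv8 PMULL-based GHASH */
  } else if (have_neon) {
    d->gmult = neon_impl;     /* 32-bit NEON GHASH */
  } else {
    d->gmult = generic_impl;  /* portable 4-bit table version */
  }
}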

View File

@ -30,6 +30,16 @@ if (${ARCH} STREQUAL "arm")
)
endif()
if (${ARCH} STREQUAL "aarch64")
set(
SHA_ARCH_SOURCES
sha1-armv8.${ASM_EXT}
sha256-armv8.${ASM_EXT}
sha512-armv8.${ASM_EXT}
)
endif()
add_library(
sha
@ -51,3 +61,6 @@ perlasm(sha512-586.${ASM_EXT} asm/sha512-586.pl)
perlasm(sha1-armv4-large.${ASM_EXT} asm/sha1-armv4-large.pl)
perlasm(sha256-armv4.${ASM_EXT} asm/sha256-armv4.pl)
perlasm(sha512-armv4.${ASM_EXT} asm/sha512-armv4.pl)
perlasm(sha1-armv8.${ASM_EXT} asm/sha1-armv8.pl)
perlasm(sha256-armv8.${ASM_EXT} asm/sha512-armv8.pl sha256)
perlasm(sha512-armv8.${ASM_EXT} asm/sha512-armv8.pl sha512)


@ -0,0 +1,334 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# hardware-assisted software(*)
# Apple A7 2.31 4.13 (+14%)
# Cortex-A53 2.19 8.73 (+108%)
# Cortex-A57 2.35 7.88 (+74%)
#
# (*) Software results are presented mostly for reference purposes.
$flavour = shift;
open STDOUT,">".shift;
($ctx,$inp,$num)=("x0","x1","x2");
@Xw=map("w$_",(3..17,19));
@Xx=map("x$_",(3..17,19));
@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
($t0,$t1,$t2,$K)=map("w$_",(25..28));
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i<15 && !($i&1));
lsr @Xx[$i+1],@Xx[$i],#32
___
$code.=<<___ if ($i<14 && !($i&1));
ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
___
$code.=<<___ if ($i<14 && ($i&1));
#ifdef __ARMEB__
ror @Xx[$i+1],@Xx[$i+1],#32
#else
rev32 @Xx[$i+1],@Xx[$i+1]
#endif
___
$code.=<<___ if ($i<14);
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==19);
movz $K,#0xeba1
movk $K,#0x6ed9,lsl#16
___
$code.=<<___ if ($i>=14);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==59);
movz $K,#0xc1d6
movk $K,#0xca62,lsl#16
___
$code.=<<___;
orr $t0,$b,$c
and $t1,$b,$c
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
ror $t2,$a,#27
and $t0,$t0,$d
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $e,$e,$t2 // e+=rot(a,5)
orr $t0,$t0,$t1
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==39);
movz $K,#0xbcdc
movk $K,#0x8f1b,lsl#16
___
$code.=<<___ if ($i<78);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
$code.=<<___ if ($i==78);
ldp @Xw[1],@Xw[2],[$ctx]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==79);
ldp @Xw[3],@Xw[4],[$ctx,#8]
eor $t0,$d,$b
ror $t2,$a,#27
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
ldr @Xw[5],[$ctx,#16]
add $e,$e,$t0 // e+=F(b,c,d)
___
}
$code.=<<___;
#include "arm_arch.h"
.text
.globl sha1_block_data_order
.type sha1_block_data_order,%function
.align 6
sha1_block_data_order:
ldr x16,.LOPENSSL_armcap_P
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
tst w16,#ARMV8_SHA1
b.ne .Lv8_entry
stp x29,x30,[sp,#-96]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldr $E,[$ctx,#16]
.Loop:
ldr @Xx[0],[$inp],#64
movz $K,#0x7999
sub $num,$num,#1
movk $K,#0x5a82,lsl#16
#ifdef __ARMEB__
ror $Xx[0],@Xx[0],#32
#else
rev32 @Xx[0],@Xx[0]
#endif
add $E,$E,$K // warm it up
add $E,$E,@Xw[0]
___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $B,$B,@Xw[2]
add $C,$C,@Xw[3]
add $A,$A,@Xw[1]
add $D,$D,@Xw[4]
add $E,$E,@Xw[5]
stp $A,$B,[$ctx]
stp $C,$D,[$ctx,#8]
str $E,[$ctx,#16]
cbnz $num,.Loop
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
ldr x29,[sp],#96
ret
.size sha1_block_data_order,.-sha1_block_data_order
___
{{{
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
my @MSG=map("v$_.16b",(4..7));
my @Kxx=map("v$_.4s",(16..19));
my ($W0,$W1)=("v20.4s","v21.4s");
my $ABCD_SAVE="v22.16b";
$code.=<<___;
.type sha1_block_armv8,%function
.align 6
sha1_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
adr x4,.Lconst
eor $E,$E,$E
ld1.32 {$ABCD},[$ctx],#16
ld1.32 {$E}[0],[$ctx]
sub $ctx,$ctx,#16
ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
add.i32 $W0,@Kxx[0],@MSG[0]
rev32 @MSG[2],@MSG[2]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
add.i32 $W1,@Kxx[0],@MSG[1]
rev32 @MSG[3],@MSG[3]
sha1h $E1,$ABCD
sha1c $ABCD,$E,$W0 // 0
add.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1$f $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1p $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD // 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD // 19
sha1p $ABCD,$E1,$W1
add.i32 $E,$E,$E0
add.i32 $ABCD,$ABCD,$ABCD_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD},[$ctx],#16
st1.32 {$E}[0],[$ctx]
ldr x29,[sp],#16
ret
.size sha1_block_armv8,.-sha1_block_armv8
.align 6
.Lconst:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap_P,4,4
___
}}}
{ my %opcode = (
"sha1c" => 0x5e000000, "sha1p" => 0x5e001000,
"sha1m" => 0x5e002000, "sha1su0" => 0x5e003000,
"sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 );
sub unsha1 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
print $_,"\n";
}
close STDOUT;


@ -0,0 +1,419 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# SHA256-hw SHA256(*) SHA512
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
# Cortex-A53 2.38 15.6 (+110%) 10.1 (+190%(***))
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
# 10% (or by 1 cycle per round), but at the cost of 20% loss
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are an
# indication of some compiler "pathology", most notably code
# generated with -mgeneral-regs-only is significantly faster
# and lags behind assembly only by 50-90%.
$flavour=shift;
$output=shift;
if ($output =~ /512/) {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
} else {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
}
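
The @Sigma0/@Sigma1/@sigma0/@sigma1 arrays above are simply the rotate and shift amounts of the SHA-2 Sigma and sigma functions for whichever variant the output file name selects (a name containing "512" picks the 64-bit parameters). A minimal sketch of what the $SZ==4 values mean, written out as the standard FIPS 180-4 SHA-256 functions:

#include <stdint.h>

uint32_t ror32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

uint32_t Sigma0_256(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
uint32_t Sigma1_256(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }
uint32_t sigma0_256(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
uint32_t sigma1_256(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

The SHA-512 branch uses the same shapes with 64-bit rotates and the (28,34,39), (14,18,41), (1,8,7) and (19,61,6) amounts listed above.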
$func="sha${BITS}_block_data_order";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
$T0=@X[$i+3] if ($i<11);
$code.=<<___ if ($i<16);
#ifndef __ARMEB__
rev @X[$i],@X[$i] // $i
#endif
___
$code.=<<___ if ($i<13 && ($i&1));
ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___ if ($i==13);
ldp @X[14],@X[15],[$inp]
___
$code.=<<___ if ($i>=14);
ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___ if ($i>0 && $i<16);
add $a,$a,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=11);
str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-and-logical operations such as
# 'eor x,y,z,ror#n', they were found to hurt performance on Apple A7.
# The reason seems to be that a merged instruction requires even 'y'
# to be available earlier, so it is not necessarily the best choice
# on the critical path. On the other hand, Cortex-A5x cores handle
# merged instructions much better than disjoint rotate and logical
# operations; see the (**) footnote above.
$code.=<<___ if ($i<15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
and $t1,$f,$e
bic $t2,$g,$e
add $h,$h,@X[$i&15] // h+=X[i]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
ror $T0,$a,#$Sigma0[0]
add $h,$h,$t1 // h+=Ch(e,f,g)
eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
add $h,$h,$t0 // h+=Sigma1(e)
and $t3,$t3,$t2 // (b^c)&=(a^b)
add $d,$d,$h // d+=h
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
//add $h,$h,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
ror $T1,@X[($j+1)&15],#$sigma0[0]
and $t1,$f,$e
ror $T2,@X[($j+14)&15],#$sigma1[0]
bic $t2,$g,$e
ror $T0,$a,#$Sigma0[0]
add $h,$h,@X[$i&15] // h+=X[i]
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
eor $T0,$T0,$a,ror#$Sigma0[1]
add $h,$h,$t1 // h+=Ch(e,f,g)
and $t3,$t3,$t2 // (b^c)&=(a^b)
eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
add $h,$h,$t0 // h+=Sigma1(e)
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
add @X[$j],@X[$j],@X[($j+9)&15]
add $d,$d,$h // d+=h
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
add @X[$j],@X[$j],$T1
add $h,$h,$t1 // h+=Sigma0(a)
add @X[$j],@X[$j],$T2
___
($t2,$t3)=($t3,$t2);
}
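
BODY_00_xx interleaves loads, message scheduling and round arithmetic for the pipeline, but the quantities named in its comments ("h+=K[i]", "h+=X[i]", "h+=Ch(e,f,g)", "h+=Sigma1(e)", "d+=h", "h+=Maj(a,b,c)", "h+=Sigma0(a)") add up to one ordinary SHA-2 round. A minimal scalar sketch of that round for the 32-bit case, using the textbook Ch and Maj forms (the assembly computes Ch as (f&e)|(g&~e) and Maj as ((b^c)&(a^b))^b, which are equivalent):

#include <stdint.h>

static uint32_t ror32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
static uint32_t Ch(uint32_t e, uint32_t f, uint32_t g) { return (e & f) ^ (~e & g); }
static uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

/* One SHA-256 round over the state a..h held in s[0]..s[7]; Ki is the round
 * constant from the K256 table, Wi the scheduled message word X[i]. */
void sha256_round(uint32_t s[8], uint32_t Ki, uint32_t Wi) {
  uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
  uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
  uint32_t S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);   /* Sigma1(e) */
  uint32_t S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);   /* Sigma0(a) */
  uint32_t T1 = h + S1 + Ch(e, f, g) + Ki + Wi;
  uint32_t T2 = S0 + Maj(a, b, c);
  /* Rotate the working variables, as unshift(@V,pop(@V)) does symbolically
   * in the Perl driver loop. */
  s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
  s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + T2;
}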
$code.=<<___;
#include "arm_arch.h"
.text
.globl $func
.type $func,%function
.align 6
$func:
___
$code.=<<___ if ($SZ==4);
ldr x16,.LOPENSSL_armcap_P
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
tst w16,#ARMV8_SHA256
b.ne .Lv8_entry
___
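
The $SZ==4 block above is the run-time dispatch: it resolves OPENSSL_armcap_P through the PC-relative .LOPENSSL_armcap_P literal, tests the ARMV8_SHA256 bit from arm_arch.h and branches to the hardware routine at .Lv8_entry, otherwise falling through to the general-register code. Roughly, in C — a sketch only: the branch really lives inside the assembly routine, the prototype is simplified, and the flag value shown is a placeholder rather than the arm_arch.h definition:

#include <stddef.h>
#include <stdint.h>

extern uint32_t OPENSSL_armcap_P;             /* capability word read via .LOPENSSL_armcap_P */
#ifndef ARMV8_SHA256
#define ARMV8_SHA256 (1 << 4)                 /* placeholder value for this sketch */
#endif

void sha256_block_armv8(uint32_t *state, const uint8_t *in, size_t num);   /* .Lv8_entry path */
void sha256_block_scalar(uint32_t *state, const uint8_t *in, size_t num);  /* illustrative name */

void sha256_block_dispatch(uint32_t *state, const uint8_t *in, size_t num) {
  if (OPENSSL_armcap_P & ARMV8_SHA256) {      /* tst w16,#ARMV8_SHA256 ; b.ne .Lv8_entry */
    sha256_block_armv8(state, in, num);
    return;
  }
  sha256_block_scalar(state, in, num);
}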
$code.=<<___;
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#4*$SZ
ldp $A,$B,[$ctx] // load context
ldp $C,$D,[$ctx,#2*$SZ]
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adr $Ktbl,K$BITS
stp $ctx,$num,[x29,#96]
.Loop:
ldp @X[0],@X[1],[$inp],#2*$SZ
ldr $t2,[$Ktbl],#$SZ // *K++
eor $t3,$B,$C // magic seed
str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
cbnz $t2,.Loop_16_xx
ldp $ctx,$num,[x29,#96]
ldr $inp,[x29,#112]
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
ldp @X[0],@X[1],[$ctx]
ldp @X[2],@X[3],[$ctx,#2*$SZ]
add $inp,$inp,#14*$SZ // advance input pointer
ldp @X[4],@X[5],[$ctx,#4*$SZ]
add $A,$A,@X[0]
ldp @X[6],@X[7],[$ctx,#6*$SZ]
add $B,$B,@X[1]
add $C,$C,@X[2]
add $D,$D,@X[3]
stp $A,$B,[$ctx]
add $E,$E,@X[4]
add $F,$F,@X[5]
stp $C,$D,[$ctx,#2*$SZ]
add $G,$G,@X[6]
add $H,$H,@X[7]
cmp $inp,$num
stp $E,$F,[$ctx,#4*$SZ]
stp $G,$H,[$ctx,#6*$SZ]
b.ne .Loop
ldp x19,x20,[x29,#16]
add sp,sp,#4*$SZ
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
ret
.size $func,.-$func
.align 6
.type K$BITS,%object
K$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
___
$code.=<<___;
.size K$BITS,.-K$BITS
.align 3
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
.type sha256_block_armv8,%function
.align 6
sha256_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adr $Ktbl,K256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size sha256_block_armv8,.-sha256_block_armv8
___
}
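
Each iteration of the twelve-step Perl loop above emits the same pattern: load the next K words, add them to the oldest message block, run sha256su0, copy abcd aside, then run sha256h, sha256h2 and sha256su1. A minimal sketch of one such step using the ACLE vsha256* intrinsics — assuming a compiler targeting armv8-a+crypto; the function name and the caller-side rotation of msg[] are illustrative, and this is one schedule step, not a complete compression function:

#include <arm_neon.h>

/* msg[0..3] hold the four in-flight message blocks (@MSG), k the next four
 * round constants; the caller rotates msg[] afterwards, as the Perl
 * push/shift does. */
void quad_round(uint32x4_t *abcd, uint32x4_t *efgh,
                uint32x4_t msg[4], const uint32_t k[4]) {
  uint32x4_t wk = vaddq_u32(msg[0], vld1q_u32(k));    /* add.i32 $W0,$W0,@MSG[0] */
  uint32x4_t abcd_in = *abcd;                         /* orr $abcd,$ABCD,$ABCD   */
  msg[0] = vsha256su0q_u32(msg[0], msg[1]);           /* sha256su0               */
  *abcd = vsha256hq_u32(abcd_in, *efgh, wk);          /* sha256h                 */
  *efgh = vsha256h2q_u32(*efgh, abcd_in, wk);         /* sha256h2                */
  msg[0] = vsha256su1q_u32(msg[0], msg[2], msg[3]);   /* sha256su1               */
}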
$code.=<<___;
.comm OPENSSL_armcap_P,4,4
___
{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
print $_,"\n";
}
close STDOUT;

View File

@ -61,8 +61,9 @@
#include <openssl/mem.h>
-#if !defined(OPENSSL_NO_ASM) && \
-(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
+#if !defined(OPENSSL_NO_ASM) && \
+(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
#define SHA1_ASM
#endif

View File

@ -61,8 +61,9 @@
#include <openssl/mem.h>
-#if !defined(OPENSSL_NO_ASM) && \
-(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
+#if !defined(OPENSSL_NO_ASM) && \
+(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
#define SHA256_ASM
#endif

View File

@ -87,8 +87,9 @@
* apply a number of optimizations to mitigate potential performance
* penalties caused by previous design decision; */
-#if !defined(OPENSSL_NO_ASM) && \
-(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM))
+#if !defined(OPENSSL_NO_ASM) && \
+(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
#define SHA512_ASM
#endif

View File

@ -80,6 +80,9 @@ extern "C" {
#elif defined(__arm) || defined(__arm__) || defined(_M_ARM)
#define OPENSSL_32_BIT
#define OPENSSL_ARM
+#elif defined(__aarch64__)
+#define OPENSSL_64_BIT
+#define OPENSSL_AARCH64
#elif defined(__mips__)
#define OPENSSL_32_BIT
#define OPENSSL_MIPS

View File

@ -89,7 +89,7 @@ extern "C" {
extern uint32_t OPENSSL_ia32cap_P[4];
#endif
-#if defined(OPENSSL_ARM)
+#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
/* CRYPTO_is_NEON_capable returns true if the current CPU has a NEON unit. Note
* that |OPENSSL_armcap_P| also exists and contains the same information in a
* form that's easier for assembly to use. */
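
A minimal sketch of how a caller typically consumes the capability check declared above (the include path is assumed from this header's location; pick_impl is an illustrative function, not part of this change):

#include <openssl/cpu.h>  /* assumed location of the declaration shown above */

void pick_impl(void) {
  if (CRYPTO_is_NEON_capable()) {
    /* take the NEON / ARMv8 code path */
  } else {
    /* fall back to the generic C implementation */
  }
}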

View File

@ -206,14 +206,6 @@
*((c)++) = (uint8_t)(((l) >> 8) & 0xff), \
*((c)++) = (uint8_t)(((l)) & 0xff))
-#define l2n6(l, c) \
-(*((c)++) = (uint8_t)(((l) >> 40) & 0xff), \
-*((c)++) = (uint8_t)(((l) >> 32) & 0xff), \
-*((c)++) = (uint8_t)(((l) >> 24) & 0xff), \
-*((c)++) = (uint8_t)(((l) >> 16) & 0xff), \
-*((c)++) = (uint8_t)(((l) >> 8) & 0xff), \
-*((c)++) = (uint8_t)(((l)) & 0xff))
#define l2n8(l, c) \
(*((c)++) = (uint8_t)(((l) >> 56) & 0xff), \
*((c)++) = (uint8_t)(((l) >> 48) & 0xff), \
@ -224,11 +216,6 @@
*((c)++) = (uint8_t)(((l) >> 8) & 0xff), \
*((c)++) = (uint8_t)(((l)) & 0xff))
-#define n2l6(c, l) \
-(l = ((BN_ULLONG)(*((c)++))) << 40, l |= ((BN_ULLONG)(*((c)++))) << 32, \
-l |= ((BN_ULLONG)(*((c)++))) << 24, l |= ((BN_ULLONG)(*((c)++))) << 16, \
-l |= ((BN_ULLONG)(*((c)++))) << 8, l |= ((BN_ULLONG)(*((c)++))))
/* NOTE - c is not incremented as per l2c */
#define l2cn(l1, l2, c, n) \
{ \

View File

@ -12,7 +12,11 @@
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-#if defined(__linux__)
+#include <openssl/base.h>
+// This file isn't built on ARM or Aarch64 because we link statically in those
+// builds and trying to override malloc in a static link doesn't work.
+#if defined(__linux__) && !defined(OPENSSL_ARM) && !defined(OPENSSL_AARCH64)
#include <stdint.h>
#include <stdlib.h>
@ -123,4 +127,4 @@ void *realloc(void *ptr, size_t size) {
} // extern "C"
-#endif /* defined(linux) */
+#endif /* defined(linux) && !ARM && !AARCH64 */

View File

@ -0,0 +1,6 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)
set(CMAKE_SYSTEM_PROCESSOR "aarch64")
set(CMAKE_CXX_COMPILER "/opt/gcc-linaro-4.9-2014.11-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-g++")
set(CMAKE_C_COMPILER "/opt/gcc-linaro-4.9-2014.11-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-gcc")
set(CMAKE_EXE_LINKER_FLAGS "-static")
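
A brief usage note, with an illustrative path since the file's location is not shown in this view: a cross-compilation toolchain file like this one is passed to CMake at configure time, e.g. by running cmake from a separate build directory with -DCMAKE_TOOLCHAIN_FILE=/path/to/this/file, after which the normal build produces statically linked aarch64 binaries using the Linaro GCC configured above.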

View File

@ -1,6 +1,6 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)
set(CMAKE_SYSTEM_PROCESSOR "arm")
-set(CMAKE_CXX_COMPILER "/opt/gcc-linaro-arm-linux-gnueabihf-4.8-2013.10_linux/bin/arm-linux-gnueabihf-g++")
-set(CMAKE_C_COMPILER "/opt/gcc-linaro-arm-linux-gnueabihf-4.8-2013.10_linux/bin/arm-linux-gnueabihf-gcc")
+set(CMAKE_CXX_COMPILER "/opt/gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf/bin/arm-linux-gnueabihf-g++")
+set(CMAKE_C_COMPILER "/opt/gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc")
set(CMAKE_EXE_LINKER_FLAGS "-static")