Update several assembly files from upstream.

This change syncs several assembly files from upstream. The only meaningful
additions are more CFI directives.

Change-Id: I6aec50b6fddbea297b79bae22cfd68d5c115220f
Reviewed-on: https://boringssl-review.googlesource.com/30364
Reviewed-by: Adam Langley <agl@google.com>
Adam Langley, 2018-08-07 11:26:15 -07:00
commit 6410e18e91 (parent e27793940e)
4 changed files with 190 additions and 8 deletions
crypto/fipsmodule
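
Note on the new directives: in these perlasm sources, .cfi_push and
.cfi_cfa_expression are perlasm shorthand, not raw assembler directives; the
x86_64-xlate.pl translator expands them into standard DWARF call-frame
information when the final assembly is generated. A minimal sketch of the
expansion, assuming the first push after .cfi_startproc (where the CFA is
%rsp+8):

    push %rbx                   # perlasm source says:  .cfi_push %rbx
    .cfi_adjust_cfa_offset 8    # ...which expands to:  CFA moved by 8, and
    .cfi_offset %rbx,-16        # %rbx recorded 16 bytes below the CFA

.cfi_cfa_expression is handled similarly: it is compiled into a .cfi_escape
byte sequence encoding a DW_CFA_def_cfa_expression, which tells the unwinder
to recover the CFA by loading the saved %rsp from the frame and adding 8.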

crypto/fipsmodule/modes/asm/ghash-x86_64.pl
@@ -44,9 +44,8 @@
 # See ghash-x86.pl for background information and details about coding
 # techniques.
 #
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.

 # December 2012
 #
@@ -228,13 +227,21 @@ $code=<<___;
 .type gcm_gmult_4bit,\@function,2
 .align 16
 gcm_gmult_4bit:
+.cfi_startproc
 push %rbx
+.cfi_push %rbx
 push %rbp # %rbp and others are pushed exclusively in
+.cfi_push %rbp
 push %r12 # order to reuse Win64 exception handler...
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
 .Lgmult_prologue:

 movzb 15($Xi),$Zlo
@@ -246,10 +253,14 @@ $code.=<<___;
 mov $Zhi,($Xi)

 lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lgmult_epilogue:
 ret
+.cfi_endproc
 .size gcm_gmult_4bit,.-gcm_gmult_4bit
 ___
@@ -263,13 +274,21 @@ $code.=<<___;
 .type gcm_ghash_4bit,\@function,4
 .align 16
 gcm_ghash_4bit:
+.cfi_startproc
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
 .Lghash_prologue:
 mov $inp,%r14 # reassign couple of args
 mov $len,%r15
@@ -398,15 +417,24 @@ $code.=<<___;
 mov $Zhi,($Xi)

 lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea 0(%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lghash_epilogue:
 ret
+.cfi_endproc
 .size gcm_ghash_4bit,.-gcm_ghash_4bit
 ___

crypto/fipsmodule/rand/asm/rdrand-x86_64.pl
@@ -35,6 +35,7 @@ print<<___;
 .type CRYPTO_rdrand,\@function,1
 .align 16
 CRYPTO_rdrand:
+.cfi_startproc
 xorq %rax, %rax
 # This is rdrand %rcx. It sets rcx to a random value and sets the carry
 # flag on success.
@@ -43,6 +44,7 @@ CRYPTO_rdrand:
 adcq %rax, %rax
 movq %rcx, 0(%rdi)
 retq
+.cfi_endproc

 # CRYPTO_rdrand_multiple8_buf fills |len| bytes at |buf| with random data from
 # the hardware RNG. The |len| argument must be a multiple of eight. It returns
@@ -52,6 +54,7 @@ CRYPTO_rdrand:
 .type CRYPTO_rdrand_multiple8_buf,\@function,2
 .align 16
 CRYPTO_rdrand_multiple8_buf:
+.cfi_startproc
 test %rsi, %rsi
 jz .Lout
 movq \$8, %rdx
@@ -70,6 +73,7 @@ CRYPTO_rdrand_multiple8_buf:
 .Lerr:
 xorq %rax, %rax
 retq
+.cfi_endproc
 ___

 close STDOUT; # flush
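
The comments above capture the trick this file relies on: rdrand reports
success only through the carry flag, so the xorq/adcq pair turns that flag
into the function's return value. A minimal standalone sketch of the same
pattern, written with the rdrand mnemonic rather than the hand-encoded
instruction the comment describes:

    xorq %rax, %rax     # %rax = 0 (also clears CF)
    rdrand %rcx         # on success: CF=1 and %rcx is random; on failure: CF=0
    adcq %rax, %rax     # %rax = 0 + 0 + CF, i.e. 1 on success, 0 on failure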

crypto/fipsmodule/sha/asm/sha1-x86_64.pl
@@ -251,6 +251,7 @@ $code.=<<___;
 .type sha1_block_data_order,\@function,3
 .align 16
 sha1_block_data_order:
+.cfi_startproc
 leaq OPENSSL_ia32cap_P(%rip),%r10
 mov 0(%r10),%r9d
 mov 4(%r10),%r8d
@@ -280,17 +281,24 @@ $code.=<<___;
 .align 16
 .Lialu:
 mov %rsp,%rax
+.cfi_def_cfa_register %rax
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 mov %rdi,$ctx # reassigned argument
 sub \$`8+16*4`,%rsp
 mov %rsi,$inp # reassigned argument
 and \$-64,%rsp
 mov %rdx,$num # reassigned argument
 mov %rax,`16*4`(%rsp)
+.cfi_cfa_expression %rsp+64,deref,+8
 .Lprologue:

 mov 0($ctx),$A
@@ -324,14 +332,22 @@ $code.=<<___;
 jnz .Lloop

 mov `16*4`(%rsp),%rsi
+.cfi_def_cfa %rsi,8
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue:
 ret
+.cfi_endproc
 .size sha1_block_data_order,.-sha1_block_data_order
 ___
 if ($shaext) {{{
@@ -347,6 +363,7 @@ $code.=<<___;
 .align 32
 sha1_block_data_order_shaext:
 _shaext_shortcut:
+.cfi_startproc
 ___
 $code.=<<___ if ($win64);
 lea `-8-4*16`(%rsp),%rsp
@@ -444,6 +461,7 @@ $code.=<<___ if ($win64);
 .Lepilogue_shaext:
 ___
 $code.=<<___;
+.cfi_endproc
 ret
 .size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
 ___
@@ -479,12 +497,19 @@ $code.=<<___;
 .align 16
 sha1_block_data_order_ssse3:
 _ssse3_shortcut:
+.cfi_startproc
 mov %rsp,$fp # frame pointer
+.cfi_def_cfa_register $fp
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13 # redundant, done to share Win64 SE handler
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 lea `-64-($win64?6*16:0)`(%rsp),%rsp
 ___
 $code.=<<___ if ($win64);
@@ -912,13 +937,20 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 mov -40($fp),%r14
+.cfi_restore %r14
 mov -32($fp),%r13
+.cfi_restore %r13
 mov -24($fp),%r12
+.cfi_restore %r12
 mov -16($fp),%rbp
+.cfi_restore %rbp
 mov -8($fp),%rbx
+.cfi_restore %rbx
 lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_ssse3:
 ret
+.cfi_endproc
 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
 ___
@@ -939,12 +971,19 @@ $code.=<<___;
 .align 16
 sha1_block_data_order_avx:
 _avx_shortcut:
+.cfi_startproc
 mov %rsp,$fp
+.cfi_def_cfa_register $fp
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13 # redundant, done to share Win64 SE handler
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 lea `-64-($win64?6*16:0)`(%rsp),%rsp
 vzeroupper
 ___
@@ -997,7 +1036,7 @@ $code.=<<___;
 jmp .Loop_avx
 ___

-sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
+sub Xupdate_avx_16_31() # recall that $Xi starts with 4
 { use integer;
   my $body = shift;
   my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
@@ -1274,13 +1313,20 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 mov -40($fp),%r14
+.cfi_restore %r14
 mov -32($fp),%r13
+.cfi_restore %r13
 mov -24($fp),%r12
+.cfi_restore %r12
 mov -16($fp),%rbp
+.cfi_restore %rbp
 mov -8($fp),%rbx
+.cfi_restore %rbx
 lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_avx:
 ret
+.cfi_endproc
 .size sha1_block_data_order_avx,.-sha1_block_data_order_avx
 ___
@@ -1304,12 +1350,19 @@ $code.=<<___;
 .align 16
 sha1_block_data_order_avx2:
 _avx2_shortcut:
+.cfi_startproc
 mov %rsp,$fp
+.cfi_def_cfa_register $fp
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 vzeroupper
 ___
 $code.=<<___ if ($win64);
@@ -1751,13 +1804,20 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 mov -40($fp),%r14
+.cfi_restore %r14
 mov -32($fp),%r13
+.cfi_restore %r13
 mov -24($fp),%r12
+.cfi_restore %r12
 mov -16($fp),%rbp
+.cfi_restore %rbp
 mov -8($fp),%rbx
+.cfi_restore %rbx
 lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_avx2:
 ret
+.cfi_endproc
 .size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
 ___
 }
@@ -1917,9 +1977,9 @@ ssse3_handler:
 mov -40(%rax),%r14
 mov %rbx,144($context) # restore context->Rbx
 mov %rbp,160($context) # restore context->Rbp
-mov %r12,216($context) # restore cotnext->R12
-mov %r13,224($context) # restore cotnext->R13
-mov %r14,232($context) # restore cotnext->R14
+mov %r12,216($context) # restore context->R12
+mov %r13,224($context) # restore context->R13
+mov %r14,232($context) # restore context->R14

 .Lcommon_seh_tail:
 mov 8(%rax),%rdi

crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -170,7 +170,7 @@ $Tbl="%rbp";
 $_ctx="16*$SZ+0*8(%rsp)";
 $_inp="16*$SZ+1*8(%rsp)";
 $_end="16*$SZ+2*8(%rsp)";
-$_rsp="16*$SZ+3*8(%rsp)";
+$_rsp="`16*$SZ+3*8`(%rsp)";
 $framesz="16*$SZ+4*8";
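
The quoting change above supports the .cfi_cfa_expression lines added further
down: perlasm evaluates backquoted arithmetic when translating, so with the
backticks $_rsp reduces to a plain numeric stack offset that the
CFI-expression parser can read. A rough sketch of the evaluation (my
arithmetic, assuming SHA-256, i.e. $SZ==4):

    $_rsp="`16*$SZ+3*8`(%rsp)";   # heredoc interpolation: `16*4+3*8`(%rsp)
                                  # backquote evaluation:  88(%rsp)
    # so ".cfi_cfa_expression $_rsp,deref,+8" sees "88(%rsp),deref,+8"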
@@ -263,6 +263,7 @@ $code=<<___;
 .type $func,\@function,3
 .align 16
 $func:
+.cfi_startproc
 ___
 $code.=<<___ if ($SZ==4 || $avx);
 leaq OPENSSL_ia32cap_P(%rip),%r11
@@ -296,12 +297,19 @@ $code.=<<___ if ($SZ==4);
 ___
 $code.=<<___;
 mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 shl \$4,%rdx # num*16
 sub \$$framesz,%rsp
 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -310,6 +318,7 @@ $code.=<<___;
 mov $inp,$_inp # save inp, 2nd arh
 mov %rdx,$_end # save end pointer, "3rd" arg
 mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
 .Lprologue:

 mov $SZ*0($ctx),$A
@@ -376,15 +385,24 @@ $code.=<<___;
 jb .Lloop

 mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue:
 ret
+.cfi_endproc
 .size $func,.-$func
 ___
@@ -754,14 +772,22 @@ $code.=<<___;
 .type ${func}_ssse3,\@function,3
 .align 64
 ${func}_ssse3:
+.cfi_startproc
 .Lssse3_shortcut:
 mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 shl \$4,%rdx # num*16
 sub \$`$framesz+$win64*16*4`,%rsp
 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -770,6 +796,7 @@ ${func}_ssse3:
 mov $inp,$_inp # save inp, 2nd arh
 mov %rdx,$_end # save end pointer, "3rd" arg
 mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
 ___
 $code.=<<___ if ($win64);
 movaps %xmm6,16*$SZ+32(%rsp)
@@ -1068,6 +1095,7 @@ $code.=<<___;
 jb .Lloop_ssse3

 mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
 ___
 $code.=<<___ if ($win64);
 movaps 16*$SZ+32(%rsp),%xmm6
@@ -1077,14 +1105,22 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_ssse3:
 ret
+.cfi_endproc
 .size ${func}_ssse3,.-${func}_ssse3
 ___
 }
@@ -1098,14 +1134,22 @@ $code.=<<___;
 .type ${func}_xop,\@function,3
 .align 64
 ${func}_xop:
+.cfi_startproc
 .Lxop_shortcut:
 mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 shl \$4,%rdx # num*16
 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -1114,6 +1158,7 @@ ${func}_xop:
 mov $inp,$_inp # save inp, 2nd arh
 mov %rdx,$_end # save end pointer, "3rd" arg
 mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
 ___
 $code.=<<___ if ($win64);
 movaps %xmm6,16*$SZ+32(%rsp)
@@ -1440,6 +1485,7 @@ $code.=<<___;
 jb .Lloop_xop

 mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
 vzeroupper
 ___
 $code.=<<___ if ($win64);
@@ -1454,14 +1500,22 @@ $code.=<<___ if ($win64 && $SZ>4);
 ___
 $code.=<<___;
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_xop:
 ret
+.cfi_endproc
 .size ${func}_xop,.-${func}_xop
 ___
 }
@@ -1474,14 +1528,22 @@ $code.=<<___;
 .type ${func}_avx,\@function,3
 .align 64
 ${func}_avx:
+.cfi_startproc
 .Lavx_shortcut:
 mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 shl \$4,%rdx # num*16
 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -1490,6 +1552,7 @@ ${func}_avx:
 mov $inp,$_inp # save inp, 2nd arh
 mov %rdx,$_end # save end pointer, "3rd" arg
 mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
 ___
 $code.=<<___ if ($win64);
 movaps %xmm6,16*$SZ+32(%rsp)
@@ -1748,6 +1811,7 @@ $code.=<<___;
 jb .Lloop_avx

 mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
 vzeroupper
 ___
 $code.=<<___ if ($win64);
@@ -1762,14 +1826,22 @@ $code.=<<___ if ($win64 && $SZ>4);
 ___
 $code.=<<___;
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_avx:
 ret
+.cfi_endproc
 .size ${func}_avx,.-${func}_avx
 ___
@@ -1825,14 +1897,22 @@ $code.=<<___;
 .type ${func}_avx2,\@function,3
 .align 64
 ${func}_avx2:
+.cfi_startproc
 .Lavx2_shortcut:
 mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
 shl \$4,%rdx # num*16
 and \$-256*$SZ,%rsp # align stack frame
@@ -1842,6 +1922,7 @@ ${func}_avx2:
 mov $inp,$_inp # save inp, 2nd arh
 mov %rdx,$_end # save end pointer, "3rd" arg
 mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
 ___
 $code.=<<___ if ($win64);
 movaps %xmm6,16*$SZ+32(%rsp)
@@ -2122,6 +2203,7 @@ $code.=<<___;
 .Ldone_avx2:
 lea ($Tbl),%rsp
 mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
 vzeroupper
 ___
 $code.=<<___ if ($win64);
@@ -2136,14 +2218,22 @@ $code.=<<___ if ($win64 && $SZ>4);
 ___
 $code.=<<___;
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lepilogue_avx2:
 ret
+.cfi_endproc
 .size ${func}_avx2,.-${func}_avx2
 ___
 }}