Use C instead of assembly fallback code in GCM on X86_64.

This will ensure that this code is tested in CI and is being compiled
by MSVC; previously this C code wasn't being tested at all because all
platforms we use for testing were taking other code paths.
This commit is contained in:
Brian Smith 2019-01-18 12:23:36 -10:00
parent 225f6b0c3a
commit 0cd9bf6f64
2 changed files with 1 additions and 231 deletions

View File

@ -222,220 +222,6 @@ ___
$code=<<___;
.text
.extern GFp_ia32cap_P
.globl GFp_gcm_gmult_4bit
.type GFp_gcm_gmult_4bit,\@function,2
.align 16
GFp_gcm_gmult_4bit:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp # %rbp and others are pushed exclusively in
.cfi_push %rbp
push %r12 # order to reuse Win64 exception handler...
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$280,%rsp
.cfi_adjust_cfa_offset 280
.Lgmult_prologue:
movzb 15($Xi),$Zlo
lea .Lrem_4bit(%rip),$rem_4bit
___
&loop ($Xi);
$code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
lea 280+48(%rsp),%rsi
.cfi_def_cfa %rsi,8
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lgmult_epilogue:
ret
.cfi_endproc
.size GFp_gcm_gmult_4bit,.-GFp_gcm_gmult_4bit
___
# per-function register layout
$inp="%rdx";
$len="%rcx";
$rem_8bit=$rem_4bit;
$code.=<<___;
.globl GFp_gcm_ghash_4bit
.type GFp_gcm_ghash_4bit,\@function,4
.align 16
GFp_gcm_ghash_4bit:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$280,%rsp
.cfi_adjust_cfa_offset 280
.Lghash_prologue:
mov $inp,%r14 # reassign couple of args
mov $len,%r15
___
{ my $inp="%r14";
my $dat="%edx";
my $len="%r15";
my @nhi=("%ebx","%ecx");
my @rem=("%r12","%r13");
my $Hshr4="%rbp";
&sub ($Htbl,-128); # size optimization
&lea ($Hshr4,"16+128(%rsp)");
{ my @lo =($nlo,$nhi);
my @hi =($Zlo,$Zhi);
&xor ($dat,$dat);
for ($i=0,$j=-2;$i<18;$i++,$j++) {
&mov ("$j(%rsp)",&LB($dat)) if ($i>1);
&or ($lo[0],$tmp) if ($i>1);
&mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
&shr ($lo[1],4) if ($i>0 && $i<17);
&mov ($tmp,$hi[1]) if ($i>0 && $i<17);
&shr ($hi[1],4) if ($i>0 && $i<17);
&mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
&mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
&shl (&LB($dat),4) if ($i>0 && $i<17);
&mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
&mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
&shl ($tmp,60) if ($i>0 && $i<17);
push (@lo,shift(@lo));
push (@hi,shift(@hi));
}
}
&add ($Htbl,-128);
&mov ($Zlo,"8($Xi)");
&mov ($Zhi,"0($Xi)");
&add ($len,$inp); # pointer to the end of data
&lea ($rem_8bit,".Lrem_8bit(%rip)");
&jmp (".Louter_loop");
$code.=".align 16\n.Louter_loop:\n";
&xor ($Zhi,"($inp)");
&mov ("%rdx","8($inp)");
&lea ($inp,"16($inp)");
&xor ("%rdx",$Zlo);
&mov ("($Xi)",$Zhi);
&mov ("8($Xi)","%rdx");
&shr ("%rdx",32);
&xor ($nlo,$nlo);
&rol ($dat,8);
&mov (&LB($nlo),&LB($dat));
&movz ($nhi[0],&LB($dat));
&shl (&LB($nlo),4);
&shr ($nhi[0],4);
for ($j=11,$i=0;$i<15;$i++) {
&rol ($dat,8);
&xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
&xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
&mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
&mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
&mov (&LB($nlo),&LB($dat));
&xor ($Zlo,$tmp) if ($i>0);
&movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
&movz ($nhi[1],&LB($dat));
&shl (&LB($nlo),4);
&movzb ($rem[0],"(%rsp,$nhi[0])");
&shr ($nhi[1],4) if ($i<14);
&and ($nhi[1],0xf0) if ($i==14);
&shl ($rem[1],48) if ($i>0);
&xor ($rem[0],$Zlo);
&mov ($tmp,$Zhi);
&xor ($Zhi,$rem[1]) if ($i>0);
&shr ($Zlo,8);
&movz ($rem[0],&LB($rem[0]));
&mov ($dat,"$j($Xi)") if (--$j%4==0);
&shr ($Zhi,8);
&xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
&shl ($tmp,56);
&xor ($Zhi,"($Hshr4,$nhi[0],8)");
unshift (@nhi,pop(@nhi)); # "rotate" registers
unshift (@rem,pop(@rem));
}
&movzw ($rem[1],"($rem_8bit,$rem[1],2)");
&xor ($Zlo,"8($Htbl,$nlo)");
&xor ($Zhi,"($Htbl,$nlo)");
&shl ($rem[1],48);
&xor ($Zlo,$tmp);
&xor ($Zhi,$rem[1]);
&movz ($rem[0],&LB($Zlo));
&shr ($Zlo,4);
&mov ($tmp,$Zhi);
&shl (&LB($rem[0]),4);
&shr ($Zhi,4);
&xor ($Zlo,"8($Htbl,$nhi[0])");
&movzw ($rem[0],"($rem_8bit,$rem[0],2)");
&shl ($tmp,60);
&xor ($Zhi,"($Htbl,$nhi[0])");
&xor ($Zlo,$tmp);
&shl ($rem[0],48);
&bswap ($Zlo);
&xor ($Zhi,$rem[0]);
&bswap ($Zhi);
&cmp ($inp,$len);
&jb (".Louter_loop");
}
$code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
lea 280+48(%rsp),%rsi
.cfi_def_cfa %rsi,8
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea 0(%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lghash_epilogue:
ret
.cfi_endproc
.size GFp_gcm_ghash_4bit,.-GFp_gcm_ghash_4bit
___
######################################################################
@ -1721,14 +1507,6 @@ se_handler:
.section .pdata
.align 4
.rva .LSEH_begin_GFp_gcm_gmult_4bit
.rva .LSEH_end_GFp_gcm_gmult_4bit
.rva .LSEH_info_GFp_gcm_gmult_4bit
.rva .LSEH_begin_GFp_gcm_ghash_4bit
.rva .LSEH_end_GFp_gcm_ghash_4bit
.rva .LSEH_info_GFp_gcm_ghash_4bit
.rva .LSEH_begin_GFp_gcm_init_clmul
.rva .LSEH_end_GFp_gcm_init_clmul
.rva .LSEH_info_GFp_gcm_init_clmul
@ -1749,14 +1527,6 @@ ___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_GFp_gcm_gmult_4bit:
.byte 9,0,0,0
.rva se_handler
.rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
.LSEH_info_GFp_gcm_ghash_4bit:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
.LSEH_info_GFp_gcm_init_clmul:
.byte 0x01,0x08,0x03,0x00
.byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6

View File

@ -124,7 +124,7 @@ static void gcm_init_4bit(u128 Htable[16], const uint64_t H[2]) {
#endif
}
#if defined(OPENSSL_AARCH64) || defined(OPENSSL_PPC64LE)
#if defined(OPENSSL_AARCH64) || defined(OPENSSL_PPC64LE) || defined(OPENSSL_X86_64)
static const size_t rem_4bit[16] = {
PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),