Merge BoringSSL through af561c221d3af70bd0aa48024db4f1fcf1988eef.

Brian Smith, 2022-10-31 15:28:01 -07:00, committed by GitHub
commit feeca747e2
6 changed files with 385 additions and 23 deletions


@@ -1,5 +1,5 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
@@ -17,23 +17,31 @@
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
# Initial version was developed in tight cooperation with Ard Biesheuvel
# of Linaro from bits-n-pieces from other assembly modules. Just like
# aesv8-armx.pl this module supports both AArch32 and AArch64 execution modes.
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
# PMULL[2] 32-bit NEON(*)
# Apple A7 0.92 5.62
# Cortex-A53 1.01 8.39
# Cortex-A57 1.17 7.61
# Denver 0.71 6.02
# Mongoose 1.10 8.06
# Kryo 1.16 8.00
# 64-bit PMULL 32-bit PMULL 32-bit NEON(*)
# Apple A7 0.58 0.92 5.62
# Cortex-A53 0.85 1.01 8.39
# Cortex-A57 0.73 1.17 7.61
# Denver 0.51 0.65 6.02
# Mongoose 0.65 1.10 8.06
# Kryo 0.76 1.16 8.00
#
# (*) presented for reference/comparison purposes;
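For context on the "4x aggregated reduction" described above: GHASH is a polynomial evaluation over GF(2^128), and aggregation replaces the per-block Horner step (multiply by H, reduce, repeat) with one multiply per block against a precomputed power of H plus a single reduction per group of four. A minimal software model of that identity, assuming an illustrative gf128_mul helper that uses the field polynomial in plain little-endian bit order rather than GHASH's reflected encoding:

// Illustrative GF(2^128) multiplication modulo x^128 + x^7 + x^2 + x + 1, with
// bit i of the u128 holding the coefficient of x^i. GHASH's real encoding is
// bit-reflected, but that does not affect the aggregation identity below.
fn gf128_mul(mut a: u128, b: u128) -> u128 {
    let mut r = 0u128;
    for i in 0..128 {
        if (b >> i) & 1 == 1 {
            r ^= a;
        }
        let carry = a >> 127;
        a <<= 1;
        if carry == 1 {
            a ^= 0x87; // x^128 == x^7 + x^2 + x + 1 modulo the field polynomial
        }
    }
    r
}

// One block at a time (Horner form): X <- (X ^ I[i]) * H, reducing every block.
fn ghash_1x(mut x: u128, h: u128, blocks: &[u128]) -> u128 {
    for &b in blocks {
        x = gf128_mul(x ^ b, h);
    }
    x
}

// 4x aggregated form: (X ^ I0)*H^4 ^ I1*H^3 ^ I2*H^2 ^ I3*H equals four Horner
// steps, so the assembly can accumulate four unreduced products and pay for
// the modular reduction once per 64 bytes of input.
fn ghash_4x(x: u128, h: u128, quads: &[[u128; 4]]) -> u128 {
    let h2 = gf128_mul(h, h);
    let h3 = gf128_mul(h2, h);
    let h4 = gf128_mul(h2, h2);
    quads.iter().fold(x, |acc, q| {
        gf128_mul(acc ^ q[0], h4)
            ^ gf128_mul(q[1], h3)
            ^ gf128_mul(q[2], h2)
            ^ gf128_mul(q[3], h)
    })
}

The per-block work is still one multiplication, but the reduction is amortized across four blocks and the four products become independent, which is the effect the November 2017 note above attributes the 20-70% improvement to.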
@@ -62,6 +70,7 @@ my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
$code=<<___;
#include <ring-core/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
@@ -129,8 +138,56 @@ gcm_init_clmul:
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl-$H2},[x0] @ store Htable[1..2]
vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
$code.=<<___;
@ calculate H^3 and H^4
vpmull.p64 $Xl,$H, $H2
vpmull.p64 $Yl,$H2,$H2
vpmull2.p64 $Xh,$H, $H2
vpmull2.p64 $Yh,$H2,$H2
vpmull.p64 $Xm,$t0,$t1
vpmull.p64 $Ym,$t1,$t1
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H, $Xl,$t2 @ H^3
veor $H2,$Yl,$t3 @ H^4
vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing
vext.8 $t1,$H2,$H2,#8
veor $t0,$t0,$H
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H-$H2},[x0] @ store Htable[3..5]
___
}
$code.=<<___;
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
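The "Karatsuba pre-processing" comments above refer to storing H_hi ^ H_lo in Htable alongside each power of H, so the middle Karatsuba term of every 128x128 carry-less multiply needs only one PMULL per block. A sketch of the split those packed values serve, with a software clmul64 standing in for PMULL (this is not the real Htable encoding, which also "twists" H by the reduction constant):

// Software stand-in for the 64x64-bit PMULL carry-less multiply.
fn clmul64(a: u64, b: u64) -> u128 {
    let mut r = 0u128;
    for i in 0..64 {
        if (b >> i) & 1 == 1 {
            r ^= (a as u128) << i;
        }
    }
    r
}

// Karatsuba 128x128 carry-less multiply: three clmul64 calls instead of four.
// The (ah ^ al) operand is what the assembly precomputes and stores in Htable
// next to each power of H ("pack Karatsuba pre-processed"), so only the
// message-side XOR remains in the per-block path.
fn clmul128_karatsuba(a: u128, b: u128) -> (u128, u128) {
    let (ah, al) = ((a >> 64) as u64, a as u64);
    let (bh, bl) = ((b >> 64) as u64, b as u64);
    let lo = clmul64(al, bl);                      // roughly Xl in the assembly
    let hi = clmul64(ah, bh);                      // roughly Xh
    let mid = clmul64(ah ^ al, bh ^ bl) ^ lo ^ hi; // roughly Xm after post-processing
    // 256-bit product hi*x^128 ^ mid*x^64 ^ lo, returned as (high, low) halves.
    (hi ^ (mid >> 64), lo ^ (mid << 64))
}

Karatsuba trades the fourth 64x64 multiply for a handful of XORs; folding the H-side XOR into key setup keeps the per-block cost at three PMULLs plus one message-side XOR.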
@@ -202,6 +259,10 @@ $code.=<<___;
gcm_ghash_clmul:
AARCH64_VALID_CALL_TARGET
___
$code.=<<___ if ($flavour =~ /64/);
cmp $len,#64
b.hs .Lgcm_ghash_v8_4x
___
$code.=<<___ if ($flavour !~ /64/);
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
___
@@ -349,10 +410,301 @@ $code.=<<___;
ret
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
if ($flavour =~ /64/) { # 4x subroutine
my ($I0,$j1,$j2,$j3,
$I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
$code.=<<___;
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
vmov.i8 $xC2,#0xe1
vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
vld1.64 {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vext.8 $I3,$j3,$j3,#8
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
vpmull2.p64 $Yh,$H,$I3
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
subs $len,$len,#128
b.lo .Ltail4x
b .Loop4x
.align 4
.Loop4x:
veor $t0,$I0,$Xl
vld1.64 {$I0-$j3},[$inp],#64
vext.8 $IN,$t0,$t0,#8
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vext.8 $I3,$j3,$j3,#8
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
vext.8 $I2,$j2,$j2,#8
veor $Xm,$Xm,$Ym
vext.8 $I1,$j1,$j1,#8
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
veor $Xm,$Xm,$t1
vpmull2.p64 $Yh,$H,$I3
veor $Xm,$Xm,$t2
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
veor $Xl,$Xm,$t2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
veor $t2,$t2,$Xh
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Xl,$Xl,$t2
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
vext.8 $Xl,$Xl,$Xl,#8
veor $Ym,$Ym,$j1
subs $len,$len,#64
b.hs .Loop4x
.Ltail4x:
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
adds $len,$len,#64
b.eq .Ldone4x
cmp $len,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j2},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
veor $j2,$j2,$I2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
vpmull2.p64 $Yh,$H,$I2
vpmull.p64 $Ym,$Hhl,$j2
veor $Xl,$Xl,$t2
vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
veor $j1,$j1,$I1
vext.8 $Xl,$Xl,$Xl,#8
vpmull2.p64 $I1,$H2,$I1
veor $t0,$I0,$Xl
vpmull2.p64 $j1,$Hhl,$j1
vext.8 $IN,$t0,$t0,#8
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H3,$IN
vpmull.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Ltwo:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j1},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
veor $j1,$j1,$I1
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull2.p64 $Yh,$H,$I1
vpmull.p64 $Ym,$Hhl,$j1
vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H2,$IN
vpmull2.p64 $Xm,$Hhl,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Lone:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H,$IN
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H,$IN
vpmull.p64 $Xm,$Hhl,$t0
.Ldone4x:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___
}
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
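Taken together, the new path works as follows: gcm_ghash_clmul branches to .Lgcm_ghash_v8_4x whenever at least 64 bytes remain, .Loop4x then consumes four blocks per iteration with a single reduction, and .Lthree/.Ltwo/.Lone fold a leftover 1-3 blocks in against the matching lower powers of H. A rough control-flow model, reusing the illustrative gf128_mul from the first sketch and ignoring the byte reversal and register allocation of the real code:

// Rough model only: the real routine works on byte strings in byte-reversed
// NEON registers; here each 16-byte block is simply a u128.
fn ghash_v8_4x_model(mut x: u128, h: u128, blocks: &[u128]) -> u128 {
    let h2 = gf128_mul(h, h);
    let h3 = gf128_mul(h2, h);
    let h4 = gf128_mul(h2, h2); // the powers gcm_init_clmul now precomputes
    let mut i = 0;
    // .Loop4x: four blocks per trip, a single reduction each time around.
    while blocks.len() - i >= 4 {
        x = gf128_mul(x ^ blocks[i], h4)
            ^ gf128_mul(blocks[i + 1], h3)
            ^ gf128_mul(blocks[i + 2], h2)
            ^ gf128_mul(blocks[i + 3], h);
        i += 4;
    }
    // .Lthree / .Ltwo / .Lone: leftover blocks are folded in with H^3, H^2, H^1.
    let tail = &blocks[i..];
    match tail.len() {
        3 => gf128_mul(x ^ tail[0], h3) ^ gf128_mul(tail[1], h2) ^ gf128_mul(tail[2], h),
        2 => gf128_mul(x ^ tail[0], h2) ^ gf128_mul(tail[1], h),
        1 => gf128_mul(x ^ tail[0], h),
        _ => x,
    }
}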
if ($flavour =~ /64/) { ######## 64-bit code
@@ -360,7 +712,8 @@ if ($flavour =~ /64/) { ######## 64-bit code
my $arg=shift;
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
$3<8?$3:$3+8,($4 eq "lo")?0:1;
}
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or


@@ -1,5 +1,5 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
@@ -27,6 +27,7 @@
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
# Kryo 1.92 17.4 (+30%) 11.2 (+8%)
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
@@ -35,7 +36,7 @@
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
# generated with -mgeneral-regs-only is significanty faster
# generated with -mgeneral-regs-only is significantly faster
# and the gap is only 40-90%.
$output=pop;
@@ -89,7 +90,7 @@ my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
$T0=@X[$i+3] if ($i<11);
$code.=<<___ if ($i<16);
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev @X[$i],@X[$i] // $i
#endif
___
@@ -449,12 +450,15 @@ close SELF;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\`([^\`]*)\`/eval($1)/ge;
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
s/\.[ui]?8(\s)/$1/;
s/\.\w?32\b// and s/\.16b/\.4s/g;
m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
print $_,"\n";
}


@@ -1136,7 +1136,7 @@ my $endbranch = sub {
########################################################################
{
my $comment = "#";
my $comment = "//";
$comment = ";" if ($masm || $nasm);
print <<___;
$comment This file is generated from a similarly-named Perl script in the BoringSSL


@@ -275,7 +275,7 @@ sub ::asciz
sub ::asm_finish
{ &file_end();
my $comment = "#";
my $comment = "//";
$comment = ";" if ($win32);
print <<___;
$comment This file is generated from a similarly-named Perl script in the BoringSSL


@@ -138,7 +138,7 @@ fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) {
}
pub(super) fn init(xi: [u64; 2]) -> super::u128 {
// We implement GHASH in terms of POLYVAL, as described in RFC8452. This
// We implement GHASH in terms of POLYVAL, as described in RFC 8452. This
// avoids a shift by 1 in the multiplication, needed to account for bit
// reversal losing a bit after multiplication, that is,
// rev128(X) * rev128(Y) = rev255(X*Y).
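The cited identity can be checked directly: bit-reversing both 128-bit operands of a carry-less multiply bit-reverses their 255-bit product, so a natural-order multiplier fed GHASH's reflected operands comes out one bit short of a 256-bit result and would need the shift by 1 the comment mentions; converting H once at init time (RFC 8452's mulX mapping) absorbs that shift into key setup. A small self-contained check, with helper names invented for the sketch:

// Carry-less multiply of two 128-bit polynomials into a 256-bit product,
// returned as (high, low) u128 halves. Illustrative software only.
fn clmul128(a: u128, b: u128) -> (u128, u128) {
    let (mut hi, mut lo) = (0u128, 0u128);
    for i in 0..128 {
        if (b >> i) & 1 == 1 {
            lo ^= a << i;
            if i > 0 {
                hi ^= a >> (128 - i);
            }
        }
    }
    (hi, lo)
}

fn rev128(x: u128) -> u128 {
    x.reverse_bits()
}

// Reverse a 255-bit value held in a 256-bit (high, low) pair: bit k -> bit 254-k.
fn rev255(hi: u128, lo: u128) -> (u128, u128) {
    let (rhi, rlo) = (lo.reverse_bits(), hi.reverse_bits()); // full 256-bit reversal
    (rhi >> 1, (rlo >> 1) | (rhi << 127))                    // then shift right by one
}

fn main() {
    let x: u128 = 0x0123_4567_89ab_cdef_0011_2233_4455_6677;
    let y: u128 = 0xfedc_ba98_7654_3210_8899_aabb_ccdd_eeff;
    let (ph, pl) = clmul128(x, y);
    // rev128(X) * rev128(Y) == rev255(X * Y)
    assert_eq!(clmul128(rev128(x), rev128(y)), rev255(ph, pl));
}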


@@ -10,6 +10,7 @@ Input = "Sample message for keylen<blocklen"
Key = 000102030405060708090A0B0C0D0E0F10111213
Output = 4C99FF0CB1B31BD33F8431DBAF4D17FCD356A807
# This is actually keylen>blocklen, but the NIST test vectors have a misleading input.
HMAC = SHA1
Input = "Sample message for keylen=blocklen"
Key = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F60616263
@@ -25,6 +26,7 @@ Input = "Sample message for keylen<blocklen"
Key = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B
Output = E3D249A8CFB67EF8B7A169E9A0A599714A2CECBA65999A51BEB8FBBE
# This is actually keylen>blocklen, but the NIST test vectors have a misleading input.
HMAC = SHA224
Input = "Sample message for keylen=blocklen"
Key = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F60616263
@@ -40,6 +42,7 @@ Input = "Sample message for keylen<blocklen"
Key = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F
Output = A28CF43130EE696A98F14A37678B56BCFCBDD9E5CF69717FECF5480F0EBDF790
# This is actually keylen>blocklen, but the NIST test vectors have a misleading input.
HMAC = SHA256
Input = "Sample message for keylen=blocklen"
Key = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F60616263
@@ -55,6 +58,7 @@ Input = "Sample message for keylen<blocklen"
Key = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F
Output = 6EB242BDBB582CA17BEBFA481B1E23211464D2B7F8C20B9FF2201637B93646AF5AE9AC316E98DB45D9CAE773675EEED0
# This is actually keylen>blocklen, but the NIST test vectors have a misleading input.
HMAC = SHA384
Input = "Sample message for keylen=blocklen"
Key = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F808182838485868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9FA0A1A2A3A4A5A6A7A8A9AAABACADAEAFB0B1B2B3B4B5B6B7B8B9BABBBCBDBEBFC0C1C2C3C4C5C6C7
@@ -70,6 +74,7 @@ Input = "Sample message for keylen<blocklen"
Key = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F
Output = FD44C18BDA0BB0A6CE0E82B031BF2818F6539BD56EC00BDC10A8A2D730B3634DE2545D639B0F2CF710D0692C72A1896F1F211C2B922D1A96C392E07E7EA9FEDC
# This is actually keylen>blocklen, but the NIST test vectors have a misleading input.
HMAC = SHA512
Input = "Sample message for keylen=blocklen"
Key = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F606162636465666768696A6B6C6D6E6F707172737475767778797A7B7C7D7E7F808182838485868788898A8B8C8D8E8F909192939495969798999A9B9C9D9E9FA0A1A2A3A4A5A6A7A8A9AAABACADAEAFB0B1B2B3B4B5B6B7B8B9BABBBCBDBEBFC0C1C2C3C4C5C6C7
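The note repeated in this file ("this is actually keylen>blocklen") refers to HMAC's standard key handling: a key longer than the hash's block size is first replaced by its digest before zero-padding, so the NIST labels based on the literal key length are misleading. A minimal sketch of that normalization step per RFC 2104; the Digest trait here is a placeholder for illustration, not this crate's API:

// Hypothetical stand-in for a concrete hash function's interface.
trait Digest {
    const BLOCK_LEN: usize;
    fn hash(data: &[u8]) -> Vec<u8>;
}

fn normalize_hmac_key<D: Digest>(key: &[u8]) -> Vec<u8> {
    let mut k = if key.len() > D::BLOCK_LEN {
        D::hash(key) // keylen > blocklen: replace the key with its digest
    } else {
        key.to_vec()
    };
    k.resize(D::BLOCK_LEN, 0); // then zero-pad to exactly one block
    k
}

For example, the SHA-1 vector above labeled "keylen=blocklen" actually has a 100-byte key against SHA-1's 64-byte block, so the key is hashed down to 20 bytes before the usual ipad/opad processing.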