libiberty, ld: Use x86 HW optimized sha1

The following patch attempts to use the x86 SHA ISA, if available, to
speed up sha1 build-id processing (by about 2.5x in my testing, in my
case on an AMD Ryzen 5 3600) while producing the same result.
I believe AArch64 has similar HW acceleration for SHA1, perhaps it
could be added similarly.

Note, it seems lld uses BLAKE3 rather than md5/sha1.  I think it would be
a bad idea to lie to users: if they choose --build-id=sha1, we should
be using SHA1, not some other checksum, but perhaps we could add some other
--build-id= styles and perhaps make one of the new ones the default.

Tested on x86_64-linux, both on Intel i9-7960X (which doesn't have
sha_ni ISA support) without/with the patch and on AMD Ryzen 5 3600
(which does have it) without/with the patch.

2023-11-28  Jakub Jelinek  <jakub@redhat.com>

include/
	* sha1.h (sha1_process_bytes_fn): New typedef.
	(sha1_choose_process_bytes): Declare.
libiberty/
	* configure.ac (HAVE_X86_SHA1_HW_SUPPORT): New check.
	* sha1.c: If HAVE_X86_SHA1_HW_SUPPORT is defined, include x86intrin.h
	and cpuid.h.
	(sha1_hw_process_bytes, sha1_hw_process_block,
	sha1_choose_process_bytes): New functions.
	* config.in: Regenerated.
	* configure: Regenerated.
ld/
	* ldbuildid.c (generate_build_id): Use sha1_choose_process_bytes ()
	instead of &sha1_process_bytes.
This commit is contained in:
Jakub Jelinek
2023-11-28 13:29:58 +01:00
parent e5f1ee1832
commit 4a50820ee8
9 changed files with 435 additions and 1 deletions
+5
View File
@@ -1,3 +1,8 @@
2023-11-28 Jakub Jelinek <jakub@redhat.com>
* sha1.h (sha1_process_bytes_fn): New typedef.
(sha1_choose_process_bytes): Declare.
2023-11-10 Simon Marchi <simon.marchi@efficios.com>
* elf/amdgpu.h (EF_AMDGPU_MACH_AMDGCN_GFX1100,
+7
View File
@@ -108,6 +108,13 @@ extern void sha1_process_block (const void *buffer, size_t len,
extern void sha1_process_bytes (const void *buffer, size_t len,
struct sha1_ctx *ctx);
/* Pointer to a function with the same signature as sha1_process_bytes:
it takes a buffer, the number of bytes in it and the SHA1 context to
update. */
typedef void (*sha1_process_bytes_fn) (const void *, size_t,
struct sha1_ctx *);
/* Return sha1_process_bytes or some hardware optimized version thereof
depending on current CPU. The returned pointer is meant to be called
in place of sha1_process_bytes. */
extern sha1_process_bytes_fn sha1_choose_process_bytes (void);
/* Process the remaining bytes in the buffer and put result from CTX
in first 20 bytes following RESBUF. The result is always in little
endian byte order, so that a byte-wise output yields to the wanted
+5
View File
@@ -1,3 +1,8 @@
2023-11-28 Jakub Jelinek <jakub@redhat.com>
* ldbuildid.c (generate_build_id): Use sha1_choose_process_bytes ()
instead of &sha1_process_bytes.
2023-11-28 Nick Clifton <nickc@redhat.com>
* po/ro.po: New Romanian translation.
+2 -1
View File
@@ -114,7 +114,8 @@ generate_build_id (bfd *abfd,
struct sha1_ctx ctx;
sha1_init_ctx (&ctx);
if (!(*checksum_contents) (abfd, (sum_fn) &sha1_process_bytes, &ctx))
if (!(*checksum_contents) (abfd, (sum_fn) sha1_choose_process_bytes (),
&ctx))
return false;
sha1_finish_ctx (&ctx, id_bits);
}
+10
View File
@@ -1,3 +1,13 @@
2023-11-28 Jakub Jelinek <jakub@redhat.com>
* configure.ac (HAVE_X86_SHA1_HW_SUPPORT): New check.
* sha1.c: If HAVE_X86_SHA1_HW_SUPPORT is defined, include x86intrin.h
and cpuid.h.
(sha1_hw_process_bytes, sha1_hw_process_block,
sha1_choose_process_bytes): New functions.
* config.in: Regenerated.
* configure: Regenerated.
2023-06-15 Marek Polacek <polacek@redhat.com>
* configure.ac: Also set shared when enable_host_pie.
+3
View File
@@ -432,6 +432,9 @@
/* Define to 1 if `vfork' works. */
#undef HAVE_WORKING_VFORK
/* Define if you have x86 SHA1 HW acceleration support. */
#undef HAVE_X86_SHA1_HW_SUPPORT
/* Define to 1 if you have the `_doprnt' function. */
#undef HAVE__DOPRNT
+58
View File
@@ -7544,6 +7544,64 @@ case "${host}" in
esac
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SHA1 HW acceleration support" >&5
$as_echo_n "checking for SHA1 HW acceleration support... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <x86intrin.h>
#include <cpuid.h>
__attribute__((__target__ ("sse4.1,sha")))
void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1)
{
__m128i abcd = _mm_loadu_si128 ((const __m128i *) buf);
__m128i e0 = _mm_set_epi32 (e, 0, 0, 0);
abcd = _mm_shuffle_epi32 (abcd, 0x1b);
const __m128i shuf_mask = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
abcd = _mm_shuffle_epi8 (abcd, shuf_mask);
e0 = _mm_sha1nexte_epu32 (e0, msg1);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
msg0 = _mm_sha1msg2_epu32 (msg0, msg1);
msg0 = _mm_xor_si128 (msg0, msg1);
e0 = _mm_add_epi32 (e0, msg0);
e0 = abcd;
_mm_storeu_si128 (buf, abcd);
e = _mm_extract_epi32 (e0, 3);
}
int bar (void)
{
unsigned int eax, ebx, ecx, edx;
if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
&& (ebx & bit_SHA) != 0
&& __get_cpuid (1, &eax, &ebx, &ecx, &edx)
&& (ecx & bit_SSE4_1) != 0)
return 1;
return 0;
}
int
main ()
{
bar ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: x86 SHA1" >&5
$as_echo "x86 SHA1" >&6; }
$as_echo "#define HAVE_X86_SHA1_HW_SUPPORT 1" >>confdefs.h
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+40
View File
@@ -740,6 +740,46 @@ case "${host}" in
esac
AC_SUBST(pexecute)
# Check whether the compiler can build code using the x86 SHA and SSE4.1
# intrinsics together with the cpuid helpers that the hardware accelerated
# SHA1 implementation relies on.  This is a compile-only test; whether the
# CPU really supports the instructions is decided at run time.
AC_MSG_CHECKING([for SHA1 HW acceleration support])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <x86intrin.h>
#include <cpuid.h>
__attribute__((__target__ ("sse4.1,sha")))
void foo (__m128i *buf, unsigned int e, __m128i msg0, __m128i msg1)
{
__m128i abcd = _mm_loadu_si128 ((const __m128i *) buf);
__m128i e0 = _mm_set_epi32 (e, 0, 0, 0);
abcd = _mm_shuffle_epi32 (abcd, 0x1b);
const __m128i shuf_mask = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
abcd = _mm_shuffle_epi8 (abcd, shuf_mask);
e0 = _mm_sha1nexte_epu32 (e0, msg1);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
msg0 = _mm_sha1msg2_epu32 (msg0, msg1);
msg0 = _mm_xor_si128 (msg0, msg1);
e0 = _mm_add_epi32 (e0, msg0);
e0 = abcd;
_mm_storeu_si128 (buf, abcd);
e = _mm_extract_epi32 (e0, 3);
}
int bar (void)
{
unsigned int eax, ebx, ecx, edx;
if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
&& (ebx & bit_SHA) != 0
&& __get_cpuid (1, &eax, &ebx, &ecx, &edx)
&& (ecx & bit_SSE4_1) != 0)
return 1;
return 0;
}
]], [[bar ();]])],
[AC_MSG_RESULT([x86 SHA1])
AC_DEFINE(HAVE_X86_SHA1_HW_SUPPORT, 1,
[Define if you have x86 SHA1 HW acceleration support.])],
[AC_MSG_RESULT([no])])
libiberty_AC_FUNC_STRNCMP
# Install a library built with a cross compiler in $(tooldir) rather
+305
View File
@@ -29,6 +29,11 @@
#include <stddef.h>
#include <string.h>
#ifdef HAVE_X86_SHA1_HW_SUPPORT
# include <x86intrin.h>
# include <cpuid.h>
#endif
#if USE_UNLOCKED_IO
# include "unlocked-io.h"
#endif
@@ -412,3 +417,303 @@ sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
e = ctx->E += e;
}
}
#if defined(HAVE_X86_SHA1_HW_SUPPORT)
/* Block routine used by sha1_hw_process_bytes; defined further below.  */
static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *);

/* Hardware accelerated counterpart of sha1_process_bytes.

   Feed LEN bytes starting at BUFFER into CTX.  Whole 64-byte blocks are
   handed to sha1_hw_process_block; whatever does not fill a block is
   kept in CTX->buffer (its size tracked by CTX->buflen) for a later
   call.  */
static void
sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
  const char *src = (const char *) buffer;
  char *pending = (char *) ctx->buffer;

  /* Bytes left over from an earlier call come first: append as much of
     the new input as fits behind them (up to 128 bytes in total).  */
  if (ctx->buflen != 0)
    {
      size_t held = ctx->buflen;
      size_t room = 128 - held;
      size_t take = len < room ? len : room;

      memcpy (pending + held, src, take);
      ctx->buflen += take;

      if (ctx->buflen > 64)
	{
	  sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx);
	  ctx->buflen &= 63;

	  /* Slide the unprocessed tail to the front.  Source and
	     destination of this copy cannot overlap.  */
	  memcpy (ctx->buffer, pending + ((held + take) & ~63), ctx->buflen);
	}

      src += take;
      len -= take;
    }

  /* Hash the complete blocks that are available in the input.  */
  if (len >= 64)
    {
#if !_STRING_ARCH_unaligned
# define alignof(type) offsetof (struct { char c; type x; }, x)
# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
      if (UNALIGNED_P (src))
	{
	  /* Misaligned input is bounced through the context buffer one
	     block at a time.  The loop stops while 64 bytes may still be
	     pending; those are dealt with by the code further down.  */
	  while (len > 64)
	    {
	      sha1_hw_process_block (memcpy (ctx->buffer, src, 64), 64, ctx);
	      src += 64;
	      len -= 64;
	    }
	}
      else
#endif
	{
	  size_t whole = len & ~63;

	  sha1_hw_process_block (src, whole, ctx);
	  src += whole;
	  len &= 63;
	}
    }

  /* Stash the remaining input in the context buffer.  */
  if (len > 0)
    {
      size_t held = ctx->buflen;

      memcpy (pending + held, src, len);
      held += len;

      if (held >= 64)
	{
	  sha1_hw_process_block (ctx->buffer, 64, ctx);
	  held -= 64;
	  memmove (ctx->buffer, &ctx->buffer[16], held);
	}

      ctx->buflen = held;
    }
}
/* Process LEN bytes of BUFFER, accumulating context into CTX.
Using CPU specific intrinsics.
The main loop consumes 64 bytes (four 16-byte vectors) per iteration
and runs while the read pointer is below BUFFER + LEN, so LEN is
expected to be a multiple of 64; every caller in this file passes
either 64 or a length masked with ~63. BUFFER needs no particular
alignment because only unaligned vector loads are used. */
#ifdef HAVE_X86_SHA1_HW_SUPPORT
__attribute__((__target__ ("sse4.1,sha")))
#endif
static void
sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
#ifdef HAVE_X86_SHA1_HW_SUPPORT
/* Implemented from
https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html */
const __m128i *words = (const __m128i *) buffer;
const __m128i *endp = (const __m128i *) ((const char *) buffer + len);
__m128i abcd, abcd_save, e0, e0_save, e1, msg0, msg1, msg2, msg3;
/* Shuffle control that reverses the order of all 16 bytes of a vector;
applied to every message vector right after it is loaded. */
const __m128i shuf_mask
= _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
/* Compile-time assertion: the array size becomes -1 (an error) unless
A, B, C and D are laid out back to back in struct sha1_ctx, which the
single 128-bit load and store of &ctx->A below rely on. */
char check[((offsetof (struct sha1_ctx, B)
== offsetof (struct sha1_ctx, A) + sizeof (ctx->A))
&& (offsetof (struct sha1_ctx, C)
== offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A))
&& (offsetof (struct sha1_ctx, D)
== offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A)))
? 1 : -1];
/* First increment the byte count. SHA-1 (FIPS 180, RFC 3174) limits
the length of the message to less than 2^64 bits. Here we only
compute the number of bytes. Do a double word increment. */
ctx->total[0] += len;
ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);
/* Reference CHECK so that it is not an unused variable. */
(void) &check[0];
/* Load A..D with one vector load and put E into the highest 32-bit
lane of its own vector; the shuffle reverses the order of the four
32-bit lanes of ABCD. */
abcd = _mm_loadu_si128 ((const __m128i *) &ctx->A);
e0 = _mm_set_epi32 (ctx->E, 0, 0, 0);
abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
/* One iteration per 64-byte block. Each group below performs four
rounds; the numbers in the comments are the round indices. The last
argument of _mm_sha1rnds4_epu32 (0..3) changes every 20 rounds. */
while (words < endp)
{
/* Remember the incoming state; it is added back in at the end of the
block. */
abcd_save = abcd;
e0_save = e0;
/* 0..3 */
msg0 = _mm_loadu_si128 (words);
msg0 = _mm_shuffle_epi8 (msg0, shuf_mask);
e0 = _mm_add_epi32 (e0, msg0);
e1 = abcd;
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
/* 4..7 */
msg1 = _mm_loadu_si128 (words + 1);
msg1 = _mm_shuffle_epi8 (msg1, shuf_mask);
e1 = _mm_sha1nexte_epu32 (e1, msg1);
e0 = abcd;
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
/* 8..11 */
msg2 = _mm_loadu_si128 (words + 2);
msg2 = _mm_shuffle_epi8 (msg2, shuf_mask);
e0 = _mm_sha1nexte_epu32 (e0, msg2);
e1 = abcd;
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
msg0 = _mm_xor_si128 (msg0, msg2);
/* 12..15 */
msg3 = _mm_loadu_si128 (words + 3);
msg3 = _mm_shuffle_epi8 (msg3, shuf_mask);
e1 = _mm_sha1nexte_epu32 (e1, msg3);
e0 = abcd;
msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
msg1 = _mm_xor_si128 (msg1, msg3);
/* 16..19 */
e0 = _mm_sha1nexte_epu32 (e0, msg0);
e1 = abcd;
msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
msg2 = _mm_xor_si128 (msg2, msg0);
/* 20..23 */
e1 = _mm_sha1nexte_epu32 (e1, msg1);
e0 = abcd;
msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
msg3 = _mm_xor_si128 (msg3, msg1);
/* 24..27 */
e0 = _mm_sha1nexte_epu32 (e0, msg2);
e1 = abcd;
msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
msg0 = _mm_xor_si128 (msg0, msg2);
/* 28..31 */
e1 = _mm_sha1nexte_epu32 (e1, msg3);
e0 = abcd;
msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
msg1 = _mm_xor_si128 (msg1, msg3);
/* 32..35 */
e0 = _mm_sha1nexte_epu32 (e0, msg0);
e1 = abcd;
msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
msg2 = _mm_xor_si128 (msg2, msg0);
/* 36..39 */
e1 = _mm_sha1nexte_epu32 (e1, msg1);
e0 = abcd;
msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
msg3 = _mm_xor_si128 (msg3, msg1);
/* 40..43 */
e0 = _mm_sha1nexte_epu32 (e0, msg2);
e1 = abcd;
msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
msg0 = _mm_xor_si128 (msg0, msg2);
/* 44..47 */
e1 = _mm_sha1nexte_epu32 (e1, msg3);
e0 = abcd;
msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
msg1 = _mm_xor_si128 (msg1, msg3);
/* 48..51 */
e0 = _mm_sha1nexte_epu32 (e0, msg0);
e1 = abcd;
msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
msg2 = _mm_xor_si128 (msg2, msg0);
/* 52..55 */
e1 = _mm_sha1nexte_epu32 (e1, msg1);
e0 = abcd;
msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
msg3 = _mm_xor_si128 (msg3, msg1);
/* 56..59 */
e0 = _mm_sha1nexte_epu32 (e0, msg2);
e1 = abcd;
msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
msg0 = _mm_xor_si128 (msg0, msg2);
/* 60..63 */
e1 = _mm_sha1nexte_epu32 (e1, msg3);
e0 = abcd;
msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
msg1 = _mm_xor_si128 (msg1, msg3);
/* 64..67 */
e0 = _mm_sha1nexte_epu32 (e0, msg0);
e1 = abcd;
msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
msg2 = _mm_xor_si128 (msg2, msg0);
/* 68..71 */
e1 = _mm_sha1nexte_epu32 (e1, msg1);
e0 = abcd;
msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
msg3 = _mm_xor_si128 (msg3, msg1);
/* 72..75 */
e0 = _mm_sha1nexte_epu32 (e0, msg2);
e1 = abcd;
msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
/* 76..79 */
e1 = _mm_sha1nexte_epu32 (e1, msg3);
e0 = abcd;
abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
/* Finalize: fold the state saved at the top of the loop back in, then
advance to the next 64-byte block. */
e0 = _mm_sha1nexte_epu32 (e0, e0_save);
abcd = _mm_add_epi32 (abcd, abcd_save);
words = words + 4;
}
/* Undo the lane reversal done before the loop and write the state back:
A..D with a single store, E from the highest 32-bit lane of E0. */
abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
_mm_storeu_si128 ((__m128i *) &ctx->A, abcd);
ctx->E = _mm_extract_epi32 (e0, 3);
#endif
}
#endif
/* Pick the routine to use for feeding bytes into a SHA1 context.

   The generic sha1_process_bytes is the default.  When x86 SHA1 support
   was compiled in, CPUID is queried, and sha1_hw_process_bytes is
   returned instead if the CPU reports both the SHA feature bit (leaf 7,
   subleaf 0, EBX) and the SSE4.1 feature bit (leaf 1, ECX).  */
sha1_process_bytes_fn
sha1_choose_process_bytes (void)
{
  sha1_process_bytes_fn chosen = sha1_process_bytes;

#ifdef HAVE_X86_SHA1_HW_SUPPORT
  unsigned int eax, ebx, ecx, edx;

  /* Leaf 7 must be available and advertise the SHA extensions.  */
  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
      || (ebx & bit_SHA) == 0)
    return chosen;

  /* The accelerated code is also built for SSE4.1, so require it too.  */
  if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)
      || (ecx & bit_SSE4_1) == 0)
    return chosen;

  chosen = sha1_hw_process_bytes;
#endif

  return chosen;
}