Implement high-level AES-GCM logic in Rust.

Brian Smith 2018-12-04 15:21:26 -10:00
parent e79ab7b532
commit b989d3343d
25 changed files with 963 additions and 993 deletions

View File

@ -65,7 +65,6 @@ include = [
"crypto/fipsmodule/bn/montgomery.c",
"crypto/fipsmodule/bn/montgomery_inv.c",
"crypto/fipsmodule/cipher/e_aes.c",
"crypto/fipsmodule/cipher/internal.h",
"crypto/fipsmodule/ec/asm/ecp_nistz256-armv4.pl",
"crypto/fipsmodule/ec/asm/ecp_nistz256-armv8.pl",
"crypto/fipsmodule/ec/asm/ecp_nistz256-x86.pl",
@ -113,13 +112,16 @@ include = [
"include/GFp/cpu.h",
"include/GFp/mem.h",
"include/GFp/type_check.h",
"src/aead/aes.rs",
"src/aead/aes_gcm.rs",
"src/aead/aes_tests.txt",
"src/aead/chacha.rs",
"src/aead/chacha_tests.txt",
"src/aead/chacha20_poly1305.rs",
"src/aead/chacha20_poly1305_openssh.rs",
"src/aead/gcm.rs",
"src/aead/mod.rs",
"src/aead/nonce.rs",
"src/aead/poly1305.rs",
"src/aead/poly1305_test.txt",
"src/agreement.rs",
@ -191,6 +193,7 @@ include = [
"src/hmac_generate_serializable_tests.txt",
"src/lib.rs",
"src/limb.rs",
"src/endian.rs",
"src/pbkdf2.rs",
"src/pkcs8.rs",
"src/polyfill.rs",

View File

@ -51,6 +51,7 @@ const NEVER: &str = "Don't ever build this file.";
#[cfg_attr(rustfmt, rustfmt_skip)]
const RING_SRCS: &[(&[&str], &str)] = &[
(&[], "crypto/block.c"),
(&[], "crypto/fipsmodule/bn/generic.c"),
(&[], "crypto/fipsmodule/bn/montgomery.c"),
(&[], "crypto/fipsmodule/bn/montgomery_inv.c"),
@ -129,12 +130,12 @@ const RING_TEST_SRCS: &[&str] = &[("crypto/constant_time_test.c")];
const RING_INCLUDES: &[&str] =
&["crypto/fipsmodule/aes/internal.h",
"crypto/fipsmodule/bn/internal.h",
"crypto/fipsmodule/cipher/internal.h",
"crypto/fipsmodule/ec/ecp_nistz256_table.inl",
"crypto/fipsmodule/ec/ecp_nistz384.inl",
"crypto/fipsmodule/ec/ecp_nistz.h",
"crypto/fipsmodule/ec/ecp_nistz384.h",
"crypto/fipsmodule/ec/ecp_nistz256.h",
"crypto/block.h",
"crypto/internal.h",
"crypto/limbs/limbs.h",
"crypto/limbs/limbs.inl",

24
crypto/block.c Normal file
View File

@ -0,0 +1,24 @@
// Copyright 2018 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include "internal.h"
#include "block.h"
// Prevent missing prototypes warnings.
void GFp_block128_xor_assign(Block *r, Block a);
void GFp_block128_xor_assign(Block *r, Block a) {
r->subblocks[0] ^= a.subblocks[0];
r->subblocks[1] ^= a.subblocks[1];
}
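block.rs in this commit exposes this helper as `Block::bitxor_assign` through FFI. The operation itself, as a pure-Rust sketch for illustration (names ours):

fn block_xor_assign(r: &mut [u64; 2], a: [u64; 2]) {
    // XOR each 64-bit subblock, exactly as the C helper above does.
    r[0] ^= a[0];
    r[1] ^= a[1];
}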

24
crypto/block.h Normal file
View File

@ -0,0 +1,24 @@
// Copyright 2018 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include "internal.h"
#ifndef RING_HEADER_CRYPTO_INTERNAL_H
#define RING_HEADER_CRYPTO_INTERNAL_H
typedef struct Block {
uint64_t subblocks[2];
} Block;
#endif

View File

@ -53,6 +53,9 @@
#include "internal.h"
#include "../modes/internal.h"
int GFp_aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
AES_KEY *aeskey);
void GFp_aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
// Te0[x] = S [x].[02, 01, 01, 03];
// Te1[x] = S [x].[03, 02, 01, 01];

View File

@ -935,7 +935,7 @@ my $const = "r6"; # shared with _bsaes_encrypt8_alt
my $keysched = "sp";
$code.=<<___;
.extern GFp_AES_encrypt
.extern GFp_aes_nohw_encrypt
.global GFp_bsaes_ctr32_encrypt_blocks
.type GFp_bsaes_ctr32_encrypt_blocks,%function
.align 5
@ -1141,7 +1141,7 @@ GFp_bsaes_ctr32_encrypt_blocks:
mov r1, sp @ output on the stack
mov r2, r7 @ key
bl GFp_AES_encrypt
bl GFp_aes_nohw_encrypt
vld1.8 {@XMM[0]}, [r4]! @ load input
vld1.8 {@XMM[1]}, [sp] @ load encrypted counter

View File

@ -15,13 +15,7 @@
#ifndef OPENSSL_HEADER_AES_INTERNAL_H
#define OPENSSL_HEADER_AES_INTERNAL_H
#include <stdlib.h>
#include <GFp/cpu.h>
int GFp_aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
AES_KEY *aeskey);
void GFp_aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
#include "../../internal.h"
static inline int hwaes_capable(void) {
#if defined(OPENSSL_X86_64) || defined(OPENSSL_X86)
@ -33,10 +27,4 @@ static inline int hwaes_capable(void) {
#endif
}
int GFp_aes_hw_set_encrypt_key(const uint8_t *user_key, unsigned bits,
AES_KEY *key);
void GFp_aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void GFp_aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, const uint8_t ivec[16]);
#endif // OPENSSL_HEADER_AES_INTERNAL_H

View File

@ -46,247 +46,34 @@
* OF THE POSSIBILITY OF SUCH DAMAGE.
* ==================================================================== */
#if defined(_MSC_VER)
#pragma warning(push, 3)
#endif
#include <string.h>
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
#include <GFp/aes.h>
#include <GFp/cpu.h>
#include "internal.h"
#include "../../internal.h"
#include "../aes/internal.h"
#include "../modes/internal.h"
#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#include <GFp/arm_arch.h>
#endif
int GFp_aes_hw_capable(void);
#define EVP_AEAD_AES_GCM_NONCE_LEN 12
#define EVP_AEAD_AES_GCM_TAG_LEN 16
// Declarations for extern functions only called by Rust code, to avoid
// -Wmissing-prototypes warnings.
int GFp_aes_gcm_init(uint8_t *ctx_buf, size_t ctx_buf_len, const uint8_t *key,
size_t key_len);
int GFp_aes_gcm_open(const uint8_t *ctx_buf, uint8_t *out, size_t in_out_len,
uint8_t tag_out[EVP_AEAD_AES_GCM_TAG_LEN],
const uint8_t nonce[EVP_AEAD_AES_GCM_NONCE_LEN],
const uint8_t *in, const uint8_t *ad, size_t ad_len);
int GFp_aes_gcm_seal(const uint8_t *ctx_buf, uint8_t *in_out, size_t in_out_len,
uint8_t tag_out[EVP_AEAD_AES_GCM_TAG_LEN],
const uint8_t nonce[EVP_AEAD_AES_GCM_NONCE_LEN],
const uint8_t *ad, size_t ad_len);
void GFp_AES_set_encrypt_key(const uint8_t *user_key, unsigned bits,
AES_KEY *key);
void GFp_AES_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
int GFp_aes_hw_capable(void) {
return hwaes_capable();
}
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86_64) || defined(OPENSSL_X86))
#define VPAES
static char vpaes_capable(void) {
int GFp_vpaes_capable(void);
int GFp_vpaes_capable(void) {
return (GFp_ia32cap_P[1] & (1 << (41 - 32))) != 0;
}
#endif
#if !defined(OPENSSL_NO_ASM) && \
defined(OPENSSL_ARM) && __ARM_MAX_ARCH__ >= 7
#define BSAES
static char bsaes_capable(void) {
int GFp_bsaes_capable(void);
int GFp_bsaes_capable(void) {
return GFp_is_NEON_capable();
}
#endif
#if defined(BSAES)
// On platforms where BSAES gets defined (just above), these functions are
// provided by asm.
void GFp_bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, const uint8_t ivec[16]);
#endif
static void aes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
size_t len, const AES_KEY *key,
const uint8_t ivec[16]);
#if defined(VPAES)
// On platforms where VPAES gets defined (just above), these functions are
// provided by asm.
int GFp_vpaes_set_encrypt_key(const uint8_t *userKey, unsigned bits,
AES_KEY *key);
void GFp_vpaes_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
#endif
// |bits| must be 128 or 256. 192-bit keys are not supported.
void GFp_AES_set_encrypt_key(const uint8_t *user_key, unsigned bits,
AES_KEY *key) {
// Keep this in sync with |gcm128_init_gmult_ghash| and |aes_ctr|.
assert(user_key != NULL);
assert(key != NULL);
assert(bits == 128 || bits == 256);
if (hwaes_capable()) {
(void) GFp_aes_hw_set_encrypt_key(user_key, bits, key);
return;
}
#if defined(VPAES)
#if defined(BSAES)
#error "BSAES and VPAES are enabled at the same time, unexpectedly."
#endif
if (vpaes_capable()) {
(void) GFp_vpaes_set_encrypt_key(user_key, bits, key);
return;
}
#endif
(void) GFp_aes_nohw_set_encrypt_key(user_key, bits, key);
}
static aes_block_f aes_block(void) {
// Keep this in sync with |GFp_AES_set_encrypt_key| and |aes_ctr|.
if (hwaes_capable()) {
return GFp_aes_hw_encrypt;
}
#if defined(VPAES)
#if defined(BSAES)
#error "BSAES and VPAES are enabled at the same time, unexpectedly."
#endif
if (vpaes_capable()) {
return GFp_vpaes_encrypt;
}
#endif
return GFp_aes_nohw_encrypt;
}
void GFp_AES_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
(aes_block())(in, out, key);
}
static aes_ctr_f aes_ctr(void) {
// Keep this in sync with |GFp_AES_set_encrypt_key| and |aes_block|.
if (hwaes_capable()) {
return GFp_aes_hw_ctr32_encrypt_blocks;
}
#if defined(BSAES)
if (bsaes_capable()) {
return GFp_bsaes_ctr32_encrypt_blocks;
}
#endif
return aes_ctr32_encrypt_blocks;
}
static void aes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
size_t blocks, const AES_KEY *key,
const uint8_t ivec[16]) {
alignas(16) uint8_t counter_plaintext[16];
memcpy(counter_plaintext, ivec, 16);
uint32_t counter = from_be_u32_ptr(&counter_plaintext[12]);
aes_block_f block = aes_block();
for (size_t current_block = 0; current_block < blocks; ++current_block) {
alignas(16) uint8_t counter_ciphertext[16];
block(counter_plaintext, counter_ciphertext, key);
for (size_t i = 0; i < 16; ++i) {
out[i] = in[i] ^ counter_ciphertext[i];
}
// The caller must ensure the counter won't wrap around.
++counter;
assert(counter != 0);
to_be_u32_ptr(&counter_plaintext[12], counter);
out += 16;
in += 16;
}
}
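For readers following the port, a minimal Rust sketch of the same fallback loop (names ours; `encrypt_block` stands in for whichever AES implementation was selected):

fn ctr32_xor_in_place(
    mut counter: [u8; 16],
    in_out: &mut [u8],
    encrypt_block: impl Fn([u8; 16]) -> [u8; 16],
) {
    assert_eq!(in_out.len() % 16, 0);
    for chunk in in_out.chunks_mut(16) {
        // Encrypt the counter block and XOR the keystream into the data.
        let keystream = encrypt_block(counter);
        for (b, k) in chunk.iter_mut().zip(keystream.iter()) {
            *b ^= *k;
        }
        // Only the low 32 bits increment, big-endian; the caller must
        // ensure the counter doesn't wrap.
        let ctr = u32::from_be_bytes([counter[12], counter[13], counter[14], counter[15]])
            .wrapping_add(1);
        counter[12..].copy_from_slice(&ctr.to_be_bytes());
    }
}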
int GFp_aes_gcm_init(uint8_t *ctx_buf, size_t ctx_buf_len, const uint8_t *key,
size_t key_len) {
alignas(16) AES_KEY ks;
assert(ctx_buf_len >= sizeof(ks) + GCM128_SERIALIZED_LEN);
if (ctx_buf_len < sizeof(ks) + GCM128_SERIALIZED_LEN) {
return 0;
}
if (key_len != (128 / 8) && key_len != (256 / 8)) {
return 0; // Invalid key length
}
GFp_AES_set_encrypt_key(key, (unsigned)key_len * 8, &ks);
GFp_gcm128_init_serialized(ctx_buf + sizeof(ks), &ks, aes_block());
memcpy(ctx_buf, &ks, sizeof(ks));
return 1;
}
static int gfp_aes_gcm_init_and_aad(GCM128_CONTEXT *gcm, AES_KEY *ks,
const uint8_t *ctx_buf, const uint8_t nonce[],
const uint8_t ad[], size_t ad_len) {
assert(ad != NULL || ad_len == 0);
memcpy(ks, ctx_buf, sizeof(*ks));
GFp_gcm128_init(gcm, ks, aes_block(), ctx_buf + sizeof(*ks),
nonce);
return GFp_gcm128_aad(gcm, ad, ad_len);
}
int GFp_aes_gcm_seal(const uint8_t *ctx_buf, uint8_t *in_out, size_t in_out_len,
uint8_t tag_out[EVP_AEAD_AES_GCM_TAG_LEN],
const uint8_t nonce[EVP_AEAD_AES_GCM_NONCE_LEN],
const uint8_t *ad, size_t ad_len) {
assert(in_out != NULL || in_out_len == 0);
assert(ad != NULL || ad_len == 0);
GCM128_CONTEXT gcm;
alignas(16) AES_KEY ks;
if (!gfp_aes_gcm_init_and_aad(&gcm, &ks, ctx_buf, nonce, ad, ad_len)) {
return 0;
}
if (in_out_len > 0) {
aes_ctr_f ctr = aes_ctr();
if (!GFp_gcm128_encrypt_ctr32(&gcm, &ks, in_out, in_out, in_out_len,
ctr)) {
return 0;
}
}
GFp_gcm128_tag(&gcm, tag_out);
return 1;
}
int GFp_aes_gcm_open(const uint8_t *ctx_buf, uint8_t *out, size_t in_out_len,
uint8_t tag_out[EVP_AEAD_AES_GCM_TAG_LEN],
const uint8_t nonce[EVP_AEAD_AES_GCM_NONCE_LEN],
const uint8_t *in, const uint8_t *ad, size_t ad_len) {
assert(out != NULL || in_out_len == 0);
assert(aead_check_alias(in, in_out_len, out));
assert(in != NULL || in_out_len == 0);
assert(ad != NULL || ad_len == 0);
GCM128_CONTEXT gcm;
alignas(16) AES_KEY ks;
if (!gfp_aes_gcm_init_and_aad(&gcm, &ks, ctx_buf, nonce, ad, ad_len)) {
return 0;
}
if (in_out_len > 0) {
aes_ctr_f ctr = aes_ctr();
if (!GFp_gcm128_decrypt_ctr32(&gcm, &ks, in, out, in_out_len, ctr)) {
return 0;
}
}
GFp_gcm128_tag(&gcm, tag_out);
return 1;
}

View File

@ -1,104 +0,0 @@
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
* This package is an SSL implementation written
* by Eric Young (eay@cryptsoft.com).
* The implementation was written so as to conform with Netscapes SSL.
*
* This library is free for commercial and non-commercial use as long as
* the following conditions are aheared to. The following conditions
* apply to all code found in this distribution, be it the RC4, RSA,
* lhash, DES, etc., code; not just the SSL code. The SSL documentation
* included with this distribution is covered by the same copyright terms
* except that the holder is Tim Hudson (tjh@cryptsoft.com).
*
* Copyright remains Eric Young's, and as such any Copyright notices in
* the code are not to be removed.
* If this package is used in a product, Eric Young should be given attribution
* as the author of the parts of the library used.
* This can be in the form of a textual message at program startup or
* in documentation (online or textual) provided with the package.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* "This product includes cryptographic software written by
* Eric Young (eay@cryptsoft.com)"
* The word 'cryptographic' can be left out if the rouines from the library
* being used are not cryptographic related :-).
* 4. If you include any Windows specific code (or a derivative thereof) from
* the apps directory (application code) you must include an acknowledgement:
* "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
*
* THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* The licence and distribution terms for any publically available version or
* derivative of this code cannot be changed. i.e. this code cannot simply be
* copied and put under another distribution licence
* [including the GNU Public Licence.] */
#ifndef OPENSSL_HEADER_CIPHER_INTERNAL_H
#define OPENSSL_HEADER_CIPHER_INTERNAL_H
#include <GFp/base.h>
#include <GFp/aes.h>
#include "../../internal.h"
#include "../modes/internal.h"
// Preconditions for AEAD implementation methods.
// aead_check_alias returns 0 if |out| points within the buffer determined by
// |in| and |in_len| and 1 otherwise.
//
// When processing, there's only an issue if |out| points within in[:in_len]
// and isn't equal to |in|. If that's the case then writing the output will
// stomp input that hasn't been read yet.
//
// This function checks for that case.
static inline int aead_check_alias(const uint8_t *in, size_t in_len,
const uint8_t *out) {
if (out <= in) {
return 1;
} else if (in + in_len <= out) {
return 1;
}
return 0;
}
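The same precondition expressed in Rust, as a sketch (names ours): writing `out` forward cannot clobber unread input when `out` starts at or before `in`, or lies entirely past it.

fn check_alias(input: *const u8, in_len: usize, out: *const u8) -> bool {
    let (inp, outp) = (input as usize, out as usize);
    outp <= inp || inp + in_len <= outp
}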
// TODO: This isn't used yet, but it will probably be used soon, once
// AES-GCM-SIV is integrated into *ring*.
//
#if 0
// aes_ctr_set_key initialises |*aes_key| using |key_bytes| bytes from |key|,
// where |key_bytes| must either be 16, 24 or 32. If not NULL, |*out_block| is
// set to a function that encrypts single blocks. If not NULL, |*gcm_ctx| is
// initialised to do GHASH with the given key. It returns a function for
// optimised CTR-mode, or NULL if CTR-mode should be built using
// |*out_block|.
ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_CONTEXT *gcm_ctx,
block128_f *out_block, const uint8_t *key,
size_t key_bytes);
#endif
#endif // OPENSSL_HEADER_CIPHER_INTERNAL_H

View File

@ -54,14 +54,11 @@
#include "internal.h"
#include "../../internal.h"
#include "../aes/internal.h"
#include "../../block.h"
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) || \
defined(OPENSSL_PPC64LE))
#define GHASH_ASM
#endif
void GFp_gcm128_ghash(GCM128_CONTEXT *ctx, const uint8_t input[], size_t input_len);
void GFp_gcm128_gmult(GCM128_CONTEXT *ctx);
int GFp_gcm_clmul_enabled(void);
#define PACK(s) ((size_t)(s) << (sizeof(size_t) * 8 - 16))
#define REDUCE1BIT(V) \
@ -77,10 +74,6 @@
} \
} while (0)
// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
// bits of a |size_t|.
static const size_t kSizeTWithoutLower4Bits = (size_t) -16;
static void gcm_init_4bit(u128 Htable[16], const uint64_t H[2]) {
u128 V;
@ -110,7 +103,7 @@ static void gcm_init_4bit(u128 Htable[16], const uint64_t H[2]) {
Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
#if defined(GHASH_ASM) && defined(OPENSSL_ARM)
#if defined(OPENSSL_ARM)
// ARM assembler expects specific dword order in Htable.
{
int j;
@ -131,7 +124,7 @@ static void gcm_init_4bit(u128 Htable[16], const uint64_t H[2]) {
#endif
}
#if !defined(GHASH_ASM) || defined(OPENSSL_AARCH64) || defined(OPENSSL_PPC64LE)
#if defined(OPENSSL_AARCH64) || defined(OPENSSL_PPC64LE)
static const size_t rem_4bit[16] = {
PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
@ -248,26 +241,19 @@ static void GFp_gcm_ghash_4bit(uint8_t Xi[16], const u128 Htable[16],
Xi[1] = from_be_u64(Z.lo);
} while (inp += 16, len -= 16);
}
#else // GHASH_ASM
#else
void GFp_gcm_gmult_4bit(uint8_t Xi[16], const u128 Htable[16]);
void GFp_gcm_ghash_4bit(uint8_t Xi[16], const u128 Htable[16],
const uint8_t *inp, size_t len);
#endif
#define GCM_MUL(ctx, Xi) GFp_gcm_gmult_4bit((ctx)->Xi, (ctx)->Htable)
#if defined(GHASH_ASM)
#define GHASH(ctx, in, len) GFp_gcm_ghash_4bit((ctx)->Xi, (ctx)->Htable, in, len)
// GHASH_CHUNK is a "stride parameter" meant to mitigate cache thrashing:
// the idea is to hash data while it's still in the L1 cache after the
// encryption pass...
#define GHASH_CHUNK (3 * 1024)
#endif
#if defined(GHASH_ASM)
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
#define GCM_FUNCREF_4BIT
void GFp_gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]);
void GFp_gcm_gmult_clmul(uint8_t Xi[16], const u128 Htable[16]);
void GFp_gcm_ghash_clmul(uint8_t Xi[16], const u128 Htable[16],
@ -278,11 +264,11 @@ void GFp_gcm_ghash_clmul(uint8_t Xi[16], const u128 Htable[16],
void GFp_gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
void GFp_gcm_ghash_avx(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
size_t len);
#define AESNI_GCM
size_t GFp_aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const void *key, uint8_t ivec[16], uint8_t Xi[16]);
size_t GFp_aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const void *key, uint8_t ivec[16], uint8_t Xi[16]);
int GFp_aesni_gcm_capable(void);
int GFp_aesni_gcm_capable(void) {
return ((GFp_ia32cap_P[1] >> 22) & 0x41) == 0x41; // AVX+MOVBE
}
#endif
#if defined(OPENSSL_X86)
@ -298,7 +284,6 @@ void GFp_gcm_ghash_4bit_mmx(uint8_t Xi[16], const u128 Htable[16],
#if __ARM_MAX_ARCH__ >= 8
#define ARM_PMULL_ASM
#define GCM_FUNCREF_4BIT
void GFp_gcm_init_v8(u128 Htable[16], const uint64_t Xi[2]);
void GFp_gcm_gmult_v8(uint8_t Xi[16], const u128 Htable[16]);
void GFp_gcm_ghash_v8(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
@ -306,7 +291,6 @@ void GFp_gcm_ghash_v8(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
#endif
#if defined(OPENSSL_ARM) && __ARM_MAX_ARCH__ >= 7
#define GCM_FUNCREF_4BIT
// 32-bit ARM also has support for doing GCM with NEON instructions.
void GFp_gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]);
void GFp_gcm_gmult_neon(uint8_t Xi[16], const u128 Htable[16]);
@ -316,56 +300,29 @@ void GFp_gcm_ghash_neon(uint8_t Xi[16], const u128 Htable[16],
#elif defined(OPENSSL_PPC64LE)
#define GHASH_ASM_PPC64LE
#define GCM_FUNCREF_4BIT
void GFp_gcm_init_p8(u128 Htable[16], const uint64_t Xi[2]);
void GFp_gcm_gmult_p8(uint64_t Xi[2], const u128 Htable[16]);
void GFp_gcm_ghash_p8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len);
#endif // Platform
#endif // GHASH_ASM
void GFp_gcm128_init_htable(GCM128_KEY *r, Block h_block);
#ifdef GCM_FUNCREF_4BIT
#undef GCM_MUL
#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi, (ctx)->Htable)
#ifdef GHASH
#undef GHASH
#define GHASH(ctx, in, len) (*gcm_ghash_p)((ctx)->Xi, (ctx)->Htable, in, len)
#endif
#endif
static void gcm128_init_htable(u128 Htable[GCM128_HTABLE_LEN],
const uint64_t H[2]);
void GFp_gcm128_init_serialized(
uint8_t serialized_ctx[GCM128_SERIALIZED_LEN], const AES_KEY *key,
aes_block_f block) {
static const alignas(16) uint8_t ZEROS[16] = { 0 };
uint8_t H_be[16];
(*block)(ZEROS, H_be, key);
void GFp_gcm128_init_htable(GCM128_KEY *r, Block h_block) {
// H is stored in host byte order
alignas(16) uint64_t H[2];
H[0] = from_be_u64_ptr(H_be);
H[1] = from_be_u64_ptr(H_be + 8);
H[0] = from_be_u64(h_block.subblocks[0]);
H[1] = from_be_u64(h_block.subblocks[1]);
alignas(16) u128 Htable[GCM128_HTABLE_LEN];
gcm128_init_htable(Htable, H);
u128 *Htable = r->Htable;
OPENSSL_COMPILE_ASSERT(sizeof(Htable) == GCM128_SERIALIZED_LEN,
GCM128_SERIALIZED_LEN_is_wrong);
memcpy(serialized_ctx, Htable, GCM128_SERIALIZED_LEN);
}
static void gcm128_init_htable(u128 Htable[GCM128_HTABLE_LEN],
const uint64_t H[2]) {
// Keep in sync with |gcm128_init_gmult_ghash|.
#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
if (GFp_gcm_clmul_enabled()) {
#if defined(GHASH_ASM_X86_64)
if (((GFp_ia32cap_P[1] >> 22) & 0x41) == 0x41) { // AVX+MOVBE
if (GFp_aesni_gcm_capable()) {
GFp_gcm_init_avx(Htable, H);
return;
}
@ -396,296 +353,89 @@ static void gcm128_init_htable(u128 Htable[GCM128_HTABLE_LEN],
gcm_init_4bit(Htable, H);
}
static void gcm128_init_gmult_ghash(GCM128_CONTEXT *ctx) {
void GFp_gcm128_gmult(GCM128_CONTEXT *ctx) {
// Keep in sync with |gcm128_ghash|, |gcm128_init_htable| and |GFp_AES_set_encrypt_key|.
#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
if (GFp_gcm_clmul_enabled()) {
// GFp_gcm_gmult_avx2 was an alias for GFp_gcm_gmult_clmul, so there's no
// need for the x86-64 MOVBE+AVX2 handling here; apparently
// GFp_gcm_gmult_clmul doesn't need it.
GFp_gcm_gmult_clmul(ctx->Xi, ctx->key.Htable);
return;
}
#endif
#if defined(ARM_PMULL_ASM)
if (GFp_is_ARMv8_PMULL_capable()) {
GFp_gcm_gmult_v8(ctx->Xi, ctx->key.Htable);
return;
}
#endif
#if defined(OPENSSL_ARM)
if (GFp_is_NEON_capable()) {
GFp_gcm_gmult_neon(ctx->Xi, ctx->key.Htable);
return;
}
#endif
#if defined(GHASH_ASM_PPC64LE)
if (GFp_is_PPC64LE_vcrypto_capable()) {
GFp_gcm_gmult_p8(ctx->Xi, ctx->key.Htable);
return;
}
#endif
#if defined(GHASH_ASM_X86)
GFp_gcm_gmult_4bit_mmx(ctx->Xi, ctx->key.Htable);
#else
GFp_gcm_gmult_4bit(ctx->Xi, ctx->key.Htable);
#endif
}
void GFp_gcm128_ghash(GCM128_CONTEXT *ctx, const uint8_t input[], size_t input_len) {
assert(input_len % 16 == 0);
// Keep in sync with |gcm128_init_htable| and |GFp_AES_set_encrypt_key|.
#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
if (GFp_gcm_clmul_enabled()) {
#if defined(GHASH_ASM_X86_64)
if (((GFp_ia32cap_P[1] >> 22) & 0x41) == 0x41) { // AVX+MOVBE
ctx->gmult = GFp_gcm_gmult_clmul;
ctx->ghash = GFp_gcm_ghash_avx;
ctx->use_aesni_gcm_crypt = hwaes_capable() ? 1 : 0;
GFp_gcm_ghash_avx(ctx->Xi, ctx->key.Htable, input, input_len);
return;
}
#endif
ctx->gmult = GFp_gcm_gmult_clmul;
ctx->ghash = GFp_gcm_ghash_clmul;
GFp_gcm_ghash_clmul(ctx->Xi, ctx->key.Htable, input, input_len);
return;
}
#endif
#if defined(ARM_PMULL_ASM)
if (GFp_is_ARMv8_PMULL_capable()) {
ctx->gmult = GFp_gcm_gmult_v8;
ctx->ghash = GFp_gcm_ghash_v8;
GFp_gcm_ghash_v8(ctx->Xi, ctx->key.Htable, input, input_len);
return;
}
#endif
#if defined(OPENSSL_ARM)
if (GFp_is_NEON_capable()) {
ctx->gmult = GFp_gcm_gmult_neon;
ctx->ghash = GFp_gcm_ghash_neon;
GFp_gcm_ghash_neon(ctx->Xi, ctx->key.Htable, input, input_len);
return;
}
#endif
#if defined(GHASH_ASM_PPC64LE)
if (GFp_is_PPC64LE_vcrypto_capable()) {
ctx->gmult = GFp_gcm_gmult_p8;
ctx->ghash = GFp_gcm_ghash_p8;
GFp_gcm_ghash_p8(ctx->Xi, ctx->key.Htable, input, input_len);
return;
}
#endif
#if defined(GHASH_ASM_X86)
ctx->gmult = GFp_gcm_gmult_4bit_mmx;
ctx->ghash = GFp_gcm_ghash_4bit_mmx;
GFp_gcm_ghash_4bit_mmx(ctx->Xi, ctx->key.Htable, input, input_len);
#else
ctx->gmult = GFp_gcm_gmult_4bit;
ctx->ghash = GFp_gcm_ghash_4bit;
GFp_gcm_ghash_4bit(ctx->Xi, ctx->key.Htable, input, input_len);
#endif
}
void GFp_gcm128_init(GCM128_CONTEXT *ctx, const AES_KEY *key,
aes_block_f block,
const uint8_t serialized_ctx[GCM128_SERIALIZED_LEN],
const uint8_t *iv) {
uint32_t ctr = 1;
memset(ctx, 0, sizeof(*ctx));
memcpy(ctx->Yi, iv, 12);
to_be_u32_ptr(ctx->Yi + 12, ctr);
(block)(ctx->Yi, ctx->EK0, key);
++ctr;
to_be_u32_ptr(ctx->Yi + 12, ctr);
OPENSSL_COMPILE_ASSERT(sizeof(ctx->Htable) == GCM128_SERIALIZED_LEN,
GCM128_SERIALIZED_LEN_is_wrong);
memcpy(ctx->Htable, serialized_ctx, GCM128_SERIALIZED_LEN);
ctx->block = block;
gcm128_init_gmult_ghash(ctx);
}
int GFp_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
assert(ctx->len.u[0] == 0);
assert(ctx->len.u[1] == 0);
#ifdef GCM_FUNCREF_4BIT
gmult_func gcm_gmult_p = ctx->gmult;
#endif
ctx->len.u[0] = len;
if (ctx->len.u[0] > (UINT64_C(1) << 61)) {
return 0;
}
if (len > 0) {
for (;;) {
for (size_t i = 0; i < 16 && i < len; ++i) {
ctx->Xi[i] ^= aad[i];
}
GCM_MUL(ctx, Xi);
if (len <= 16) {
break;
}
aad += 16;
len -= 16;
}
}
return 1;
}
int GFp_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
const uint8_t *in, uint8_t *out, size_t len,
aes_ctr_f stream) {
assert(ctx->len.u[1] == 0);
unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
gmult_func gcm_gmult_p = ctx->gmult;
#ifdef GHASH
ghash_func gcm_ghash_p = ctx->ghash;
#endif
#endif
ctx->len.u[1] = len;
if (ctx->len.u[1] > ((UINT64_C(1) << 36) - 32)) {
return 0;
}
#if defined(AESNI_GCM)
if (ctx->use_aesni_gcm_crypt) {
// |aesni_gcm_encrypt| may not process all the input given to it. It may
// not process *any* of its input if it is deemed too small.
size_t bulk = GFp_aesni_gcm_encrypt(in, out, len, key, ctx->Yi, ctx->Xi);
in += bulk;
out += bulk;
len -= bulk;
}
#endif
ctr = from_be_u32_ptr(ctx->Yi + 12);
#if defined(GHASH)
while (len >= GHASH_CHUNK) {
(*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi);
ctr += GHASH_CHUNK / 16;
to_be_u32_ptr(ctx->Yi + 12, ctr);
GHASH(ctx, out, GHASH_CHUNK);
out += GHASH_CHUNK;
in += GHASH_CHUNK;
len -= GHASH_CHUNK;
}
#endif
size_t i = len & kSizeTWithoutLower4Bits;
if (i != 0) {
size_t j = i / 16;
(*stream)(in, out, j, key, ctx->Yi);
ctr += (unsigned int)j;
to_be_u32_ptr(ctx->Yi + 12, ctr);
in += i;
len -= i;
#if defined(GHASH)
GHASH(ctx, out, i);
out += i;
#else
while (j--) {
for (i = 0; i < 16; ++i) {
ctx->Xi[i] ^= out[i];
}
GCM_MUL(ctx, Xi);
out += 16;
}
#endif
}
if (len) {
(*ctx->block)(ctx->Yi, ctx->EKi, key);
++ctr;
to_be_u32_ptr(ctx->Yi + 12, ctr);
size_t n = 0;
while (len--) {
ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n];
++n;
}
GCM_MUL(ctx, Xi);
}
return 1;
}
int GFp_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
const uint8_t *in, uint8_t *out, size_t len,
aes_ctr_f stream) {
assert(ctx->len.u[1] == 0);
unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
gmult_func gcm_gmult_p = ctx->gmult;
#ifdef GHASH
ghash_func gcm_ghash_p = ctx->ghash;
#endif
#endif
ctx->len.u[1] = len;
if (ctx->len.u[1] > ((UINT64_C(1) << 36) - 32)) {
return 0;
}
#if defined(AESNI_GCM)
if (ctx->use_aesni_gcm_crypt) {
// |aesni_gcm_decrypt| may not process all the input given to it. It may
// not process *any* of its input if it is deemed too small.
size_t bulk = GFp_aesni_gcm_decrypt(in, out, len, key, ctx->Yi, ctx->Xi);
in += bulk;
out += bulk;
len -= bulk;
}
#endif
ctr = from_be_u32_ptr(ctx->Yi + 12);
#if defined(GHASH)
while (len >= GHASH_CHUNK) {
GHASH(ctx, in, GHASH_CHUNK);
(*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi);
ctr += GHASH_CHUNK / 16;
to_be_u32_ptr(ctx->Yi + 12, ctr);
out += GHASH_CHUNK;
in += GHASH_CHUNK;
len -= GHASH_CHUNK;
}
#endif
size_t i = len & kSizeTWithoutLower4Bits;
if (i != 0) {
size_t j = i / 16;
#if defined(GHASH)
GHASH(ctx, in, i);
#else
while (j--) {
size_t k;
for (k = 0; k < 16; ++k) {
ctx->Xi[k] ^= in[k];
}
GCM_MUL(ctx, Xi);
in += 16;
}
j = i / 16;
in -= i;
#endif
(*stream)(in, out, j, key, ctx->Yi);
ctr += (unsigned int)j;
to_be_u32_ptr(ctx->Yi + 12, ctr);
out += i;
in += i;
len -= i;
}
if (len) {
(*ctx->block)(ctx->Yi, ctx->EKi, key);
++ctr;
to_be_u32_ptr(ctx->Yi + 12, ctr);
size_t n = 0;
while (len--) {
uint8_t c = in[n];
ctx->Xi[n] ^= c;
out[n] = c ^ ctx->EKi[n];
++n;
}
GCM_MUL(ctx, Xi);
}
return 1;
}
void GFp_gcm128_tag(GCM128_CONTEXT *ctx, uint8_t tag[16]) {
uint64_t alen = ctx->len.u[0] << 3;
uint64_t clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
gmult_func gcm_gmult_p = ctx->gmult;
#endif
uint8_t a_c_len[16];
to_be_u64_ptr(a_c_len, alen);
to_be_u64_ptr(a_c_len + 8, clen);
for (size_t i = 0; i < 16; ++i) {
ctx->Xi[i] ^= a_c_len[i];
}
GCM_MUL(ctx, Xi);
for (size_t i = 0; i < 16; ++i) {
tag[i] = ctx->Xi[i] ^ ctx->EK0[i];
}
}
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
int GFp_gcm_clmul_enabled(void) {
#ifdef GHASH_ASM
return GFp_ia32cap_P[0] & (1 << 24) && // check FXSR bit
GFp_ia32cap_P[1] & (1 << 1); // check PCLMULQDQ bit
#else
return 0;
#endif
}
#endif

View File

@ -49,125 +49,27 @@
#ifndef OPENSSL_HEADER_MODES_INTERNAL_H
#define OPENSSL_HEADER_MODES_INTERNAL_H
#include <GFp/aes.h>
#include "../../internal.h"
// aes_block_f is a pointer to |GFp_AES_encrypt| or a variant thereof.
typedef void (*aes_block_f)(const uint8_t in[16], uint8_t out[16],
const AES_KEY *key);
int GFp_aes_block_is_aesni_encrypt(aes_block_f aes_block);
// GCM definitions
typedef struct { uint64_t hi,lo; } u128;
#define GCM128_HTABLE_LEN 16
#define GCM128_SERIALIZED_LEN (GCM128_HTABLE_LEN * 16)
// Keep in sync with GCM128_KEY in gcm.rs.
typedef struct {
alignas(16) u128 Htable[GCM128_HTABLE_LEN];
} GCM128_KEY;
/* gmult_func multiplies |Xi| by the GCM key and writes the result back to
* |Xi|. */
typedef void (*gmult_func)(uint8_t Xi[16], const u128 Htable[GCM128_HTABLE_LEN]);
/* ghash_func repeatedly multiplies |Xi| by the GCM key and adds in blocks from
* |inp|. The result is written back to |Xi| and the |len| argument must be a
* multiple of 16. */
typedef void (*ghash_func)(uint8_t Xi[16], const u128 Htable[GCM128_HTABLE_LEN],
const uint8_t *inp, size_t len);
// This differs from OpenSSL's |gcm128_context| in that it does not have the
// |key| pointer, in order to make it |memcpy|-friendly. See GFp/modes.h
// for more info.
struct gcm128_context {
// The following six names follow the names in the GCM specification.
alignas(16) uint8_t Yi[16];
alignas(16) uint8_t EKi[16];
alignas(16) uint8_t EK0[16];
alignas(16) struct {
uint64_t u[2];
} len;
alignas(16) uint8_t Xi[16];
alignas(16) struct {
uint64_t u[2];
} H_unused;
// The relative positions of Xi, H, and the pre-computed Htable are used by
// some assembler modules, i.e. don't change the order!
u128 Htable[GCM128_HTABLE_LEN];
gmult_func gmult;
ghash_func ghash;
aes_block_f block;
// use_aesni_gcm_crypt is true if this context should use the assembly
// functions |aesni_gcm_encrypt| and |aesni_gcm_decrypt| to process data.
unsigned use_aesni_gcm_crypt:1;
};
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
// GFp_gcm_clmul_enabled returns one if the CLMUL implementation of GCM is
// used.
int GFp_gcm_clmul_enabled(void);
#endif
// CTR.
// aes_ctr_f is the type of a function that performs CTR-mode encryption with
// AES.
typedef void (*aes_ctr_f)(const uint8_t *in, uint8_t *out, size_t blocks,
const AES_KEY *key, const uint8_t ivec[16]);
// GCM.
//
// This API differs from the OpenSSL API slightly. The |GCM128_CONTEXT| does
// not have a |key| pointer that points to the key as OpenSSL's version does.
// Instead, every function takes a |key| parameter. This way |GCM128_CONTEXT|
// can be safely copied.
typedef struct gcm128_context GCM128_CONTEXT;
OPENSSL_EXPORT void GFp_gcm128_init_serialized(
uint8_t serialized_ctx[GCM128_SERIALIZED_LEN], const AES_KEY *key,
aes_block_f block);
OPENSSL_EXPORT void GFp_gcm128_init(
GCM128_CONTEXT *ctx, const AES_KEY *key, aes_block_f block,
const uint8_t serialized_ctx[GCM128_SERIALIZED_LEN], const uint8_t *iv);
// GFp_gcm128_aad sets the authenticated data for an instance of GCM. This must
// be called before any data is encrypted. It returns one on success and zero
// otherwise.
OPENSSL_EXPORT int GFp_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad,
size_t len);
// GFp_gcm128_encrypt_ctr32 encrypts |len| bytes from |in| to |out| using a CTR
// function that only handles the bottom 32 bits of the nonce, like
// |GFp_ctr128_encrypt_ctr32|. The |key| must be the same key that was passed
// to |GFp_gcm128_init|. It returns one on success and zero otherwise.
OPENSSL_EXPORT int GFp_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
const AES_KEY *key,
const uint8_t *in, uint8_t *out,
size_t len, aes_ctr_f stream);
// GFp_gcm128_decrypt_ctr32 decrypts |len| bytes from |in| to |out| using a CTR
// function that only handles the bottom 32 bits of the nonce, like
// |GFp_ctr128_encrypt_ctr32|. The |key| must be the same key that was passed
// to |GFp_gcm128_init|. It returns one on success and zero otherwise.
OPENSSL_EXPORT int GFp_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
const AES_KEY *key,
const uint8_t *in, uint8_t *out,
size_t len, aes_ctr_f stream);
// GFp_gcm128_tag calculates the authenticator and copies it into |tag|.
OPENSSL_EXPORT void GFp_gcm128_tag(GCM128_CONTEXT *ctx, uint8_t tag[16]);
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
void GFp_aesni_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
size_t blocks, const AES_KEY *key,
const uint8_t *ivec);
#endif
// Keep in sync with GCM128_CONTEXT in gcm.rs.
typedef struct {
// The relative positions of Xi, H, and the pre-computed Htable are used by
// some assembler modules, i.e. don't change the order!
alignas(16) uint8_t Xi[16];
struct {
uint64_t u[2];
} H_unused;
GCM128_KEY key;
} GCM128_CONTEXT;
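The Rust mirror lives in the new gcm.rs (truncated below), so it isn't visible here; a sketch of what a hand-synchronized `#[repr(C)]` layout plausibly looks like (names ours; only the field order and alignment matter to the asm):

#[repr(C)]
struct U128 { hi: u64, lo: u64 }

#[repr(C, align(16))]
struct Gcm128Key { htable: [U128; 16] } // GCM128_HTABLE_LEN

#[repr(C, align(16))]
struct Gcm128Context {
    xi: [u8; 16],       // GHASH accumulator; the asm expects it at offset 0
    h_unused: [u64; 2], // keeps `key` at the offset the asm expects
    key: Gcm128Key,
}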
#endif // OPENSSL_HEADER_MODES_INTERNAL_H

View File

@ -289,32 +289,6 @@ static inline uint32_t from_be_u32_ptr(const uint8_t *data) {
}
// from_be_u64_ptr returns the 64-bit big-endian-encoded value at |data|.
static inline uint64_t from_be_u64_ptr(const uint8_t *data) {
#if defined(__clang__) || defined(_MSC_VER)
// XXX: Unlike GCC, Clang doesn't optimize compliant access to unaligned data
// well. See https://llvm.org/bugs/show_bug.cgi?id=20605,
// https://llvm.org/bugs/show_bug.cgi?id=17603,
// http://blog.regehr.org/archives/702, and
// http://blog.regehr.org/archives/1055. MSVC seems to have similar problems.
uint64_t value;
memcpy(&value, data, sizeof(value));
#if OPENSSL_ENDIAN != OPENSSL_BIG_ENDIAN
value = bswap_u64(value);
#endif
return value;
#else
return ((uint64_t)data[0] << 56) |
((uint64_t)data[1] << 48) |
((uint64_t)data[2] << 40) |
((uint64_t)data[3] << 32) |
((uint64_t)data[4] << 24) |
((uint64_t)data[5] << 16) |
((uint64_t)data[6] << 8) |
((uint64_t)data[7]);
#endif
}
// to_be_u32_ptr writes the value |x| to the location |out| in big-endian
// order.
static inline void to_be_u32_ptr(uint8_t *out, uint32_t value) {

View File

@ -54,14 +54,9 @@
// Raw AES functions.
#define AES_ENCRYPT 1
#define AES_DECRYPT 0
// AES_MAXNR is the maximum number of AES rounds.
#define AES_MAXNR 14
#define AES_BLOCK_SIZE 16
// aes_key_st should be an opaque type, but EVP requires that the size be
// known.
struct aes_key_st {

View File

@ -27,7 +27,7 @@
use self::{
block::{Block, BLOCK_LEN},
nonce::{Counter, Iv, NonceRef},
nonce::NonceRef,
};
use crate::{
constant_time, cpu, error,
@ -139,7 +139,7 @@ pub fn open_in_place<'a>(
let (in_out, received_tag) =
ciphertext_and_tag_modified_in_place.split_at_mut(in_prefix_len + ciphertext_len);
let Tag(calculated_tag) =
(key.key.algorithm.open)(&key.key.inner, nonce, &ad, in_prefix_len, in_out)?;
(key.key.algorithm.open)(&key.key.inner, nonce, &ad, in_prefix_len, in_out);
if constant_time::verify_slices_are_equal(calculated_tag.as_ref(), received_tag).is_err() {
// Zero out the plaintext so that it isn't accidentally leaked or used
// after verification fails. It would be safest if we could check the
@ -222,7 +222,7 @@ pub fn seal_in_place(
let (in_out, tag_out) = in_out.split_at_mut(in_out_len);
let tag_out: &mut [u8; TAG_LEN] = tag_out.try_into_()?;
let Tag(tag) = (key.key.algorithm.seal)(&key.key.inner, nonce, ad, in_out)?;
let Tag(tag) = (key.key.algorithm.seal)(&key.key.inner, nonce, ad, in_out);
tag_out.copy_from_slice(tag.as_ref());
Ok(in_out_len + TAG_LEN)
@ -246,11 +246,6 @@ enum KeyInner {
impl Key {
fn new(algorithm: &'static Algorithm, key_bytes: &[u8]) -> Result<Self, error::Unspecified> {
cpu::cache_detected_features();
if key_bytes.len() != algorithm.key_len() {
return Err(error::Unspecified);
}
Ok(Key {
inner: (algorithm.init)(key_bytes)?,
algorithm,
@ -271,19 +266,14 @@ impl Key {
pub struct Algorithm {
init: fn(key: &[u8]) -> Result<KeyInner, error::Unspecified>,
seal: fn(
key: &KeyInner,
nonce: NonceRef,
ad: &[u8],
in_out: &mut [u8],
) -> Result<Tag, error::Unspecified>,
seal: fn(key: &KeyInner, nonce: NonceRef, ad: &[u8], in_out: &mut [u8]) -> Tag,
open: fn(
ctx: &KeyInner,
nonce: NonceRef,
ad: &[u8],
in_prefix_len: usize,
in_out: &mut [u8],
) -> Result<Tag, error::Unspecified>,
) -> Tag,
key_len: usize,
id: AlgorithmID,
@ -361,15 +351,19 @@ fn check_per_nonce_max_bytes(alg: &Algorithm, in_out_len: usize) -> Result<(), e
Ok(())
}
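// `Direction` records whether `in_out` holds its input shifted
// `in_prefix_len` bytes forward (opening) or in place from offset 0
// (sealing).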
#[derive(Clone, Copy)]
enum Direction {
Opening { in_prefix_len: usize },
Sealing,
}
mod aes;
mod aes_gcm;
mod block;
mod chacha;
mod chacha20_poly1305;
pub mod chacha20_poly1305_openssh;
mod gcm;
mod nonce;
mod poly1305;
mod shift;

311
src/aead/aes.rs Normal file
View File

@ -0,0 +1,311 @@
// Copyright 2018 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use super::{
nonce::{self, Iv},
shift, Block, Direction, BLOCK_LEN,
};
use crate::{bits::BitLength, c, endian::*, error, polyfill};
pub struct Key(AES_KEY);
impl Key {
#[inline]
pub fn new(bytes: &[u8], variant: Variant) -> Result<Self, error::Unspecified> {
let key_bits = match variant {
Variant::AES_128 => BitLength::from_usize_bits(128),
Variant::AES_256 => BitLength::from_usize_bits(256),
};
if BitLength::from_usize_bytes(bytes.len())? != key_bits {
return Err(error::Unspecified);
}
let mut key = AES_KEY {
rd_key: [0u32; 4 * (MAX_ROUNDS + 1)],
rounds: 0,
};
match detect_implementation() {
Implementation::HWAES => {
extern "C" {
fn GFp_aes_hw_set_encrypt_key(
user_key: *const u8, bits: c::uint, key: &mut AES_KEY,
) -> ZeroMeansSuccess;
}
Result::from(unsafe {
GFp_aes_hw_set_encrypt_key(
bytes.as_ptr(),
key_bits.as_usize_bits() as c::uint,
&mut key,
)
})?;
},
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
Implementation::VPAES => {
extern "C" {
fn GFp_vpaes_set_encrypt_key(
user_key: *const u8, bits: c::uint, key: &mut AES_KEY,
) -> ZeroMeansSuccess;
}
Result::from(unsafe {
GFp_vpaes_set_encrypt_key(
bytes.as_ptr(),
key_bits.as_usize_bits() as c::uint,
&mut key,
)
})?;
},
_ => {
extern "C" {
fn GFp_aes_nohw_set_encrypt_key(
user_key: *const u8, bits: c::uint, key: &mut AES_KEY,
) -> ZeroMeansSuccess;
}
Result::from(unsafe {
GFp_aes_nohw_set_encrypt_key(
bytes.as_ptr(),
key_bits.as_usize_bits() as c::uint,
&mut key,
)
})?;
},
};
Ok(Key(key))
}
#[inline]
pub fn encrypt_block(&self, mut a: Block) -> Block {
let aliasing_const: *const Block = &a;
let aliasing_mut: *mut Block = &mut a;
match detect_implementation() {
Implementation::HWAES => {
extern "C" {
fn GFp_aes_hw_encrypt(a: *const Block, r: *mut Block, key: &AES_KEY);
}
unsafe {
GFp_aes_hw_encrypt(aliasing_const, aliasing_mut, &self.0);
}
},
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
Implementation::VPAES => {
extern "C" {
fn GFp_vpaes_encrypt(a: *const Block, r: *mut Block, key: &AES_KEY);
}
unsafe {
GFp_vpaes_encrypt(aliasing_const, aliasing_mut, &self.0);
}
},
_ => {
extern "C" {
fn GFp_aes_nohw_encrypt(a: *const Block, r: *mut Block, key: &AES_KEY);
}
unsafe {
GFp_aes_nohw_encrypt(aliasing_const, aliasing_mut, &self.0);
}
},
}
a
}
#[inline]
pub fn encrypt_iv_xor_block(&self, iv: Iv, input: Block) -> Block {
let mut output = self.encrypt_block(iv.into_block_less_safe());
output.bitxor_assign(input);
output
}
#[inline]
pub(super) fn ctr32_encrypt_blocks(
&self, in_out: &mut [u8], direction: Direction, ctr: &mut Counter,
) {
let output: *mut u8 = in_out.as_mut_ptr();
let in_prefix_len = match direction {
Direction::Opening { in_prefix_len } => in_prefix_len,
Direction::Sealing => 0,
};
let input: *const u8 = in_out[in_prefix_len..].as_ptr();
let in_out_len = in_out.len().checked_sub(in_prefix_len).unwrap();
assert_eq!(in_out_len % BLOCK_LEN, 0);
let blocks = in_out_len / BLOCK_LEN;
let blocks_u32 = blocks as u32;
assert_eq!(blocks, polyfill::usize_from_u32(blocks_u32));
match detect_implementation() {
Implementation::HWAES => {
extern "C" {
fn GFp_aes_hw_ctr32_encrypt_blocks(
input: *const u8, output: *mut u8, blocks: c::size_t, key: &AES_KEY,
ivec: &Counter,
);
}
unsafe {
GFp_aes_hw_ctr32_encrypt_blocks(input, output, blocks, &self.0, ctr);
}
ctr.increment_by_less_safe(blocks_u32);
},
#[cfg(target_arch = "arm")]
Implementation::BSAES => {
extern "C" {
fn GFp_bsaes_ctr32_encrypt_blocks(
input: *const u8, output: *mut u8, blocks: c::size_t, key: &AES_KEY,
ivec: &Counter,
);
}
unsafe {
GFp_bsaes_ctr32_encrypt_blocks(input, output, blocks, &self.0, ctr);
}
ctr.increment_by_less_safe(blocks_u32);
},
_ => {
shift::shift_full_blocks(in_out, in_prefix_len, |input| {
self.encrypt_iv_xor_block(ctr.increment(), Block::from(input))
});
},
}
}
#[cfg(target_arch = "x86_64")]
#[must_use]
pub fn is_aes_hw(&self) -> bool {
match detect_implementation() {
Implementation::HWAES => true,
_ => false,
}
}
#[cfg(target_arch = "x86_64")]
#[must_use]
pub(super) fn inner_less_safe(&self) -> &AES_KEY { &self.0 }
}
// Keep this in sync with AES_KEY in aes.h.
#[repr(C)]
pub(super) struct AES_KEY {
pub rd_key: [u32; 4 * (MAX_ROUNDS + 1)],
pub rounds: c::uint,
}
// Keep this in sync with `AES_MAXNR` in aes.h.
const MAX_ROUNDS: usize = 14;
pub enum Variant {
AES_128,
AES_256,
}
pub type Counter = nonce::Counter<BigEndian<u32>>;
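The `BigEndian<u32>` parameter encodes GCM's counter-block layout. A standalone sketch of that layout (function name ours):

// 96-bit nonce || 32-bit big-endian block counter. Counter value 1 is
// reserved for encrypting the tag (the `tag_iv` in aes_gcm.rs below);
// bulk data starts at 2.
fn gcm_counter_block(nonce: &[u8; 12], ctr: u32) -> [u8; 16] {
    let mut block = [0u8; 16];
    block[..12].copy_from_slice(nonce);
    block[12..].copy_from_slice(&ctr.to_be_bytes());
    block
}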
#[repr(C)] // Only so `Key` can be `#[repr(C)]`
#[derive(Clone, Copy)]
pub enum Implementation {
HWAES = 1,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
VPAES = 2,
#[cfg(target_arch = "arm")]
BSAES = 3,
Fallback = 4,
}
fn detect_implementation() -> Implementation {
extern "C" {
fn GFp_aes_hw_capable() -> c::int;
}
if unsafe { GFp_aes_hw_capable() } != 0 {
return Implementation::HWAES;
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
extern "C" {
fn GFp_vpaes_capable() -> c::int;
}
if unsafe { GFp_vpaes_capable() } != 0 {
return Implementation::VPAES;
}
}
#[cfg(target_arch = "arm")]
{
extern "C" {
fn GFp_bsaes_capable() -> c::int;
}
if unsafe { GFp_bsaes_capable() } != 0 {
return Implementation::BSAES;
}
}
Implementation::Fallback
}
#[must_use]
#[repr(transparent)]
pub struct ZeroMeansSuccess(c::int);
impl From<ZeroMeansSuccess> for Result<(), error::Unspecified> {
fn from(ZeroMeansSuccess(value): ZeroMeansSuccess) -> Self {
if value == 0 {
Ok(())
} else {
Err(error::Unspecified)
}
}
}
#[cfg(test)]
mod tests {
use super::{super::BLOCK_LEN, *};
use crate::{polyfill::convert::*, test};
#[test]
pub fn test_aes() {
test::from_file("src/aead/aes_tests.txt", |section, test_case| {
assert_eq!(section, "");
let key = consume_key(test_case, "Key");
let input = test_case.consume_bytes("Input");
let input: &[u8; BLOCK_LEN] = input.as_slice().try_into_()?;
let expected_output = test_case.consume_bytes("Output");
let block = Block::from(input);
let output = key.encrypt_block(block);
assert_eq!(output.as_ref(), &expected_output[..]);
Ok(())
})
}
fn consume_key(test_case: &mut test::TestCase, name: &str) -> Key {
let key = test_case.consume_bytes(name);
let variant = match key.len() {
16 => Variant::AES_128,
32 => Variant::AES_256,
_ => unreachable!(),
};
Key::new(&key[..], variant).unwrap()
}
}

View File

@ -12,11 +12,13 @@
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use super::{Block, NonceRef, Tag, BLOCK_LEN};
use crate::{aead, bssl, c, error};
#[repr(align(16))]
pub struct Key([u8; AES_KEY_CTX_BUF_LEN]);
use super::{
aes::{self, Counter},
gcm,
nonce::NonceRef,
shift, Block, Direction, Tag, BLOCK_LEN,
};
use crate::{aead, endian::*, error, polyfill};
/// AES-128 in GCM mode with 128-bit tags and 96-bit nonces.
///
@ -24,8 +26,8 @@ pub struct Key([u8; AES_KEY_CTX_BUF_LEN]);
///
/// Go analog: [`crypto.aes`](https://golang.org/pkg/crypto/aes/)
pub static AES_128_GCM: aead::Algorithm = aead::Algorithm {
key_len: AES_128_KEY_LEN,
init: aes_gcm_init,
key_len: 16,
init: init_128,
seal: aes_gcm_seal,
open: aes_gcm_open,
id: aead::AlgorithmID::AES_128_GCM,
@ -38,163 +40,202 @@ pub static AES_128_GCM: aead::Algorithm = aead::Algorithm {
///
/// Go analog: [`crypto.aes`](https://golang.org/pkg/crypto/aes/)
pub static AES_256_GCM: aead::Algorithm = aead::Algorithm {
key_len: AES_256_KEY_LEN,
init: aes_gcm_init,
key_len: 32,
init: init_256,
seal: aes_gcm_seal,
open: aes_gcm_open,
id: aead::AlgorithmID::AES_256_GCM,
max_input_len: AES_GCM_MAX_INPUT_LEN,
};
fn aes_gcm_init(key: &[u8]) -> Result<super::KeyInner, error::Unspecified> {
let mut r = Key([0u8; AES_KEY_CTX_BUF_LEN]);
Result::from(unsafe {
GFp_aes_gcm_init(r.0.as_mut_ptr(), r.0.len(), key.as_ptr(), key.len())
})?;
Ok(super::KeyInner::AesGcm(r))
pub struct Key {
gcm_key: gcm::Key, // First because it has a large alignment requirement.
aes_key: aes::Key,
}
fn aes_gcm_seal(
key: &super::KeyInner, nonce: NonceRef, ad: &[u8], in_out: &mut [u8],
) -> Result<Tag, error::Unspecified> {
let ctx = match key {
super::KeyInner::AesGcm(Key(ctx)) => ctx,
_ => unreachable!(),
};
let mut tag = Tag(Block::zero());
Result::from(unsafe {
GFp_aes_gcm_seal(
ctx.as_ptr(),
in_out.as_mut_ptr(),
in_out.len(),
&mut tag,
nonce,
ad.as_ptr(),
ad.len(),
)
})?;
Ok(tag)
fn init_128(key: &[u8]) -> Result<aead::KeyInner, error::Unspecified> {
init(key, aes::Variant::AES_128)
}
fn init_256(key: &[u8]) -> Result<aead::KeyInner, error::Unspecified> {
init(key, aes::Variant::AES_256)
}
fn init(key: &[u8], variant: aes::Variant) -> Result<aead::KeyInner, error::Unspecified> {
let aes_key = aes::Key::new(key, variant)?;
let gcm_key = gcm::Key::new(aes_key.encrypt_block(Block::zero()));
Ok(aead::KeyInner::AesGcm(Key { aes_key, gcm_key }))
}
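// 3 KiB per chunk, mirroring GHASH_CHUNK in gcm.c: hash data while it is
// still in the L1 cache after the encryption pass.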
const CHUNK_BLOCKS: usize = 3 * 1024 / 16;
fn aes_gcm_seal(key: &aead::KeyInner, nonce: NonceRef, ad: &[u8], in_out: &mut [u8]) -> Tag {
aead(key, nonce, ad, in_out, Direction::Sealing)
}
fn aes_gcm_open(
key: &super::KeyInner, nonce: NonceRef, ad: &[u8], in_prefix_len: usize, in_out: &mut [u8],
) -> Result<Tag, error::Unspecified> {
let ctx = match key {
super::KeyInner::AesGcm(Key(ctx)) => ctx,
_ => unreachable!(),
};
let mut tag = Tag(Block::zero());
Result::from(unsafe {
GFp_aes_gcm_open(
ctx.as_ptr(),
in_out.as_mut_ptr(),
in_out.len() - in_prefix_len,
&mut tag,
nonce,
in_out[in_prefix_len..].as_ptr(),
ad.as_ptr(),
ad.len(),
)
})?;
Ok(tag)
key: &aead::KeyInner, nonce: NonceRef, ad: &[u8], in_prefix_len: usize, in_out: &mut [u8],
) -> Tag {
aead(key, nonce, ad, in_out, Direction::Opening { in_prefix_len })
}
const AES_128_KEY_LEN: usize = 128 / 8;
const AES_256_KEY_LEN: usize = 32; // 256 / 8
#[inline(always)] // Avoid branching on `direction`.
fn aead(
key: &aead::KeyInner, nonce: NonceRef, aad: &[u8], in_out: &mut [u8], direction: Direction,
) -> Tag {
let Key { aes_key, gcm_key } = match key {
aead::KeyInner::AesGcm(key) => key,
_ => unimplemented!(),
};
const AES_KEY_CTX_BUF_LEN: usize = AES_KEY_BUF_LEN + GCM128_SERIALIZED_LEN;
let mut ctr = Counter::one(nonce);
let tag_iv = ctr.increment();
// Keep this in sync with `AES_KEY` in aes.h.
const AES_KEY_BUF_LEN: usize = (4 * 4 * (AES_MAX_ROUNDS + 1)) + 8;
let mut gcm_ctx = gcm::Context::new(gcm_key, aad);
let in_prefix_len = match direction {
Direction::Opening { in_prefix_len } => in_prefix_len,
Direction::Sealing => 0,
};
let total_in_out_len = in_out.len() - in_prefix_len;
let in_out = integrated_aes_gcm(aes_key, &mut gcm_ctx, in_out, &mut ctr, direction);
let in_out_len = in_out.len() - in_prefix_len;
// Process any (remaining) whole blocks.
let whole_len = in_out_len - (in_out_len % BLOCK_LEN);
{
let mut chunk_len = CHUNK_BLOCKS * BLOCK_LEN;
let mut output = 0;
let mut input = in_prefix_len;
loop {
if whole_len - output < chunk_len {
chunk_len = whole_len - output;
}
if chunk_len == 0 {
break;
}
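// GHASH always runs over the ciphertext: when opening, hash the input
// before decrypting it in place; when sealing, hash the output after
// encrypting.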
if let Direction::Opening { .. } = direction {
gcm_ctx.update_blocks(&in_out[input..][..chunk_len]);
}
aes_key.ctr32_encrypt_blocks(
&mut in_out[output..][..(chunk_len + in_prefix_len)],
direction,
&mut ctr,
);
if let Direction::Sealing = direction {
gcm_ctx.update_blocks(&in_out[output..][..chunk_len]);
}
output += chunk_len;
input += chunk_len;
}
}
// Process any remaining partial block.
let remainder = &mut in_out[whole_len..];
shift::shift_partial((in_prefix_len, remainder), |remainder| {
let mut input = Block::zero();
input.partial_copy_from(remainder);
if let Direction::Opening { .. } = direction {
gcm_ctx.update_block(input);
}
let mut output = aes_key.encrypt_iv_xor_block(ctr.into(), input);
if let Direction::Sealing = direction {
polyfill::slice::fill(&mut output.as_mut()[remainder.len()..], 0);
gcm_ctx.update_block(output);
}
output
});
// Authenticate the final block containing the input lengths.
let aad_bits = polyfill::u64_from_usize(aad.len()) << 3;
let ciphertext_bits = polyfill::u64_from_usize(total_in_out_len) << 3;
gcm_ctx.update_block(Block::from_u64_be(
BigEndian::from(aad_bits),
BigEndian::from(ciphertext_bits),
));
// Finalize the tag and return it.
gcm_ctx.pre_finish(|pre_tag| {
let block = tag_iv.into_block_less_safe();
let mut tag = aes_key.encrypt_block(block);
tag.bitxor_assign(pre_tag);
Tag(tag)
})
}
// Returns the data that wasn't processed.
#[cfg(target_arch = "x86_64")]
#[inline] // Optimize out the match on `direction`.
fn integrated_aes_gcm<'a>(
aes_key: &aes::Key, gcm_ctx: &mut gcm::Context, in_out: &'a mut [u8], ctr: &mut Counter,
direction: Direction,
) -> &'a mut [u8] {
use crate::c;
if !aes_key.is_aes_hw() || !gcm_ctx.is_avx2() {
return in_out;
}
let processed = match direction {
Direction::Opening { in_prefix_len } => {
extern "C" {
fn GFp_aesni_gcm_decrypt(
input: *const u8, output: *mut u8, len: c::size_t, key: &aes::AES_KEY,
ivec: &mut Counter, gcm: &mut gcm::Context,
) -> c::size_t;
}
unsafe {
GFp_aesni_gcm_decrypt(
in_out[in_prefix_len..].as_ptr(),
in_out.as_mut_ptr(),
in_out.len() - in_prefix_len,
aes_key.inner_less_safe(),
ctr,
gcm_ctx,
)
}
},
Direction::Sealing => {
extern "C" {
fn GFp_aesni_gcm_encrypt(
input: *const u8, output: *mut u8, len: c::size_t, key: &aes::AES_KEY,
ivec: &mut Counter, gcm: &mut gcm::Context,
) -> c::size_t;
}
unsafe {
GFp_aesni_gcm_encrypt(
in_out.as_ptr(),
in_out.as_mut_ptr(),
in_out.len(),
aes_key.inner_less_safe(),
ctr,
gcm_ctx,
)
}
},
};
&mut in_out[processed..]
}
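// The stitched AES-NI/AVX asm may decline part or all of the input (it
// only kicks in for sufficiently large inputs), so `aead` always runs its
// portable chunk loop over whatever this returns.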
#[cfg(not(target_arch = "x86_64"))]
#[inline]
fn integrated_aes_gcm<'a>(
_: &aes::Key, _: &mut gcm::Context, in_out: &'a mut [u8], _: &mut Counter, _: Direction,
) -> &'a mut [u8] {
in_out // This doesn't process any of the input so it all remains.
}
const AES_GCM_MAX_INPUT_LEN: u64 = super::max_input_len(BLOCK_LEN, 2);
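A worked check of this bound (ours): NIST SP800-38D's per-invocation limit of 2^39 - 256 bits is exactly the full 32-bit counter space minus two blocks, which is presumably what the `2` in `max_input_len(BLOCK_LEN, 2)` accounts for.

#[test]
fn nist_limit_in_blocks() {
    // (2^39 - 256) bits = (2^36 - 32) bytes = (2^32 - 2) * 16 bytes.
    assert_eq!((1u64 << 36) - 32, ((1u64 << 32) - 2) * 16);
}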
// Keep this in sync with `AES_MAXNR` in aes.h.
const AES_MAX_ROUNDS: usize = 14;
// Keep this in sync with `GCM128_SERIALIZED_LEN` in gcm.h.
// TODO: test.
// TODO: some implementations of GCM don't require the buffer to be this big.
// We should shrink it down on those platforms since this is still huge.
const GCM128_SERIALIZED_LEN: usize = 16 * 16;
extern "C" {
fn GFp_aes_gcm_init(
ctx_buf: *mut u8, ctx_buf_len: c::size_t, key: *const u8, key_len: c::size_t,
) -> bssl::Result;
fn GFp_aes_gcm_seal(
ctx_buf: *const u8, in_out: *mut u8, in_out_len: c::size_t, tag_out: &mut Tag,
nonce: NonceRef, ad: *const u8, ad_len: c::size_t,
) -> bssl::Result;
fn GFp_aes_gcm_open(
ctx_buf: *const u8, out: *mut u8, in_out_len: c::size_t, tag_out: &mut Tag,
nonce: NonceRef, in_: *const u8, ad: *const u8, ad_len: c::size_t,
) -> bssl::Result;
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{bits::BitLength, c, polyfill::convert::*, test};
#[test]
pub fn test_aes() {
test::from_file("src/aead/aes_tests.txt", |section, test_case| {
assert_eq!(section, "");
let key = test_case.consume_bytes("Key");
let input = test_case.consume_bytes("Input");
let input: &[u8; AES_BLOCK_SIZE] = input.as_slice().try_into_().unwrap();
let expected_output = test_case.consume_bytes("Output");
// Key setup.
let key_bits = BitLength::from_usize_bytes(key.len()).unwrap();
assert!(
key_bits == BitLength::from_usize_bits(128)
|| key_bits == BitLength::from_usize_bits(256)
);
let key_bits = key_bits.as_usize_bits() as c::uint;
let mut aes_key = AES_KEY {
rd_key: [0u32; 4 * (AES_MAX_ROUNDS + 1)],
rounds: 0,
};
unsafe {
GFp_AES_set_encrypt_key(key.as_ptr(), key_bits, &mut aes_key);
}
// Test encryption into a separate buffer.
let mut output_buf = [0u8; AES_BLOCK_SIZE];
unsafe {
GFp_AES_encrypt(input.as_ptr(), output_buf.as_mut_ptr(), &aes_key);
}
assert_eq!(&output_buf[..], &expected_output[..]);
// Test in-place encryption.
output_buf.copy_from_slice(&input[..]);
unsafe {
GFp_AES_encrypt(output_buf.as_ptr(), output_buf.as_mut_ptr(), &aes_key);
}
assert_eq!(&output_buf[..], &expected_output[..]);
Ok(())
})
}
const AES_BLOCK_SIZE: usize = 16;
// Keep this in sync with AES_KEY in aes.h.
#[repr(C)]
pub struct AES_KEY {
pub rd_key: [u32; 4 * (AES_MAX_ROUNDS + 1)],
pub rounds: c::uint,
}
extern "C" {
fn GFp_AES_set_encrypt_key(key: *const u8, bits: c::uint, aes_key: *mut AES_KEY);
fn GFp_AES_encrypt(in_: *const u8, out: *mut u8, key: *const AES_KEY);
}
#[test]
fn max_input_len_test() {
// [NIST SP800-38D] Section 5.2.1.1. Note that [RFC 5116 Section 5.1] and
@ -206,7 +247,13 @@ mod tests {
// [RFC 5116 Section 5.2]: https://tools.ietf.org/html/rfc5116#section-5.2
const NIST_SP800_38D_MAX_BITS: u64 = (1u64 << 39) - 256;
assert_eq!(NIST_SP800_38D_MAX_BITS, 549_755_813_632u64);
assert_eq!(AES_128_GCM.max_input_len * 8, NIST_SP800_38D_MAX_BITS);
assert_eq!(AES_256_GCM.max_input_len * 8, NIST_SP800_38D_MAX_BITS);
assert_eq!(
super::AES_128_GCM.max_input_len * 8,
NIST_SP800_38D_MAX_BITS
);
assert_eq!(
super::AES_256_GCM.max_input_len * 8,
NIST_SP800_38D_MAX_BITS
);
}
}

View File

@ -39,11 +39,30 @@ impl Block {
}
}
#[inline]
pub fn from_u64_be(first: BigEndian<u64>, second: BigEndian<u64>) -> Self {
Self {
subblocks: [unsafe { core::mem::transmute(first) }, unsafe {
core::mem::transmute(second)
}],
}
}
/// Replaces the first `a.len()` bytes of the block's value with `a`,
/// leaving the rest of the block unchanged. Panics if `a` is larger
/// than a block.
#[inline]
pub fn partial_copy_from(&mut self, a: &[u8]) { self.as_mut()[..a.len()].copy_from_slice(a); }
#[inline]
pub fn bitxor_assign(&mut self, a: Block) {
extern "C" {
fn GFp_block128_xor_assign(r: &mut Block, a: Block);
}
unsafe {
GFp_block128_xor_assign(self, a);
}
}
}
impl<'a> From<&'a [u8; BLOCK_LEN]> for Block {
@ -71,3 +90,30 @@ impl From_<&mut [Block; 2]> for &mut [u8; 2 * BLOCK_LEN] {
#[inline]
fn from_(bytes: &mut [Block; 2]) -> Self { unsafe { core::mem::transmute(bytes) } }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bitxor_assign() {
const ONES: u64 = -1i64 as u64;
const TEST_CASES: &[([u64; 2], [u64; 2], [u64; 2])] = &[
([0, 0], [0, 0], [0, 0]),
([0, 0], [ONES, ONES], [ONES, ONES]),
([0, ONES], [ONES, 0], [ONES, ONES]),
([ONES, 0], [0, ONES], [ONES, ONES]),
([ONES, ONES], [ONES, ONES], [0, 0]),
];
for (expected_result, a, b) in TEST_CASES {
let mut r = Block::from_u64_le(a[0].into(), a[1].into());
r.bitxor_assign(Block::from_u64_le(b[0].into(), b[1].into()));
assert_eq!(*expected_result, r.subblocks);
// XOR is symmetric.
let mut r = Block::from_u64_le(b[0].into(), b[1].into());
r.bitxor_assign(Block::from_u64_le(a[0].into(), a[1].into()));
assert_eq!(*expected_result, r.subblocks);
}
}
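// A companion check (sketch) for `partial_copy_from`: bytes past the
// copied prefix stay zero, which is how GCM pads a final partial block.
#[test]
fn test_partial_copy_from() {
    let mut block = Block::zero();
    block.partial_copy_from(&[0x01, 0x02, 0x03]);
    let bytes: &[u8; BLOCK_LEN] = block.as_ref();
    assert_eq!(&bytes[..3], &[0x01, 0x02, 0x03][..]);
    assert_eq!(&bytes[3..], &[0u8; BLOCK_LEN - 3][..]);
}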
}

View File

@ -13,8 +13,11 @@
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use super::{Block, Counter, Iv, BLOCK_LEN};
use crate::{c, polyfill::convert::*};
use super::{
nonce::{self, Iv},
Block, BLOCK_LEN,
};
use crate::{c, endian::*, polyfill::convert::*};
use core;
#[repr(C)]
@ -24,6 +27,8 @@ impl<'a> From<&'a [u8; KEY_LEN]> for Key {
fn from(value: &[u8; KEY_LEN]) -> Self { Key(<[Block; KEY_BLOCKS]>::from_(value)) }
}
pub type Counter = nonce::Counter<LittleEndian<u32>>;
#[inline] // Optimize away match on `iv`.
pub fn chacha20_xor_in_place(key: &Key, iv: CounterOrIv, in_out: &mut [u8]) {
unsafe {

View File

@ -12,7 +12,11 @@
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use super::{chacha, poly1305, Block, Counter, Direction, Iv, NonceRef, Tag, BLOCK_LEN};
use super::{
chacha::{self, Counter},
nonce::{Iv, NonceRef},
poly1305, Block, Direction, Tag, BLOCK_LEN,
};
use crate::{
aead,
endian::*,
@ -42,20 +46,14 @@ fn chacha20_poly1305_init(key: &[u8]) -> Result<aead::KeyInner, error::Unspecifi
fn chacha20_poly1305_seal(
key: &aead::KeyInner, nonce: NonceRef, ad: &[u8], in_out: &mut [u8],
) -> Result<Tag, error::Unspecified> {
Ok(aead(key, nonce, ad, in_out, Direction::Sealing))
) -> Tag {
aead(key, nonce, ad, in_out, Direction::Sealing)
}
fn chacha20_poly1305_open(
key: &aead::KeyInner, nonce: NonceRef, ad: &[u8], in_prefix_len: usize, in_out: &mut [u8],
) -> Result<Tag, error::Unspecified> {
Ok(aead(
key,
nonce,
ad,
in_out,
Direction::Opening { in_prefix_len },
))
) -> Tag {
aead(key, nonce, ad, in_out, Direction::Opening { in_prefix_len })
}
pub type Key = chacha::Key;

View File

@ -32,7 +32,7 @@
use super::{
chacha::{self, *},
chacha20_poly1305::derive_poly1305_key,
poly1305, Counter, NonceRef, Tag,
poly1305, NonceRef, Tag,
};
use crate::{constant_time, endian::*, error, polyfill::convert::*};

106
src/aead/gcm.rs Normal file
View File

@ -0,0 +1,106 @@
// Copyright 2018 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use super::{Block, BLOCK_LEN};
use crate::c;
#[repr(transparent)]
pub struct Key(GCM128_KEY);
impl Key {
/// `h_be` is the GHASH key H (the block cipher applied to the all-zero
/// block), in big-endian byte order.
pub fn new(h_be: Block) -> Self {
extern "C" {
fn GFp_gcm128_init_htable(gcm_key: &mut GCM128_KEY, h_block: Block);
}
let mut r = Key(GCM128_KEY([Block::zero(); GCM128_HTABLE_LEN]));
unsafe {
GFp_gcm128_init_htable(&mut r.0, h_be);
}
r
}
}
#[repr(transparent)]
pub struct Context(GCM128_CONTEXT);
impl Context {
pub fn new(key: &Key, aad: &[u8]) -> Self {
let mut ctx = Context(GCM128_CONTEXT {
Xi: Block::zero(),
H_unused: Block::zero(),
key: key.0.clone(),
});
for ad in aad.chunks(BLOCK_LEN) {
let mut block = Block::zero();
block.partial_copy_from(ad);
ctx.update_block(block);
}
ctx
}
pub fn update_blocks(&mut self, input: &[u8]) {
debug_assert!(input.len() > 0);
debug_assert_eq!(input.len() % BLOCK_LEN, 0);
extern "C" {
fn GFp_gcm128_ghash(ctx: &mut Context, input: *const u8, input_len: c::size_t);
}
unsafe {
GFp_gcm128_ghash(self, input.as_ptr(), input.len());
}
}
pub fn update_block(&mut self, a: Block) {
extern "C" {
fn GFp_gcm128_gmult(ctx: &mut Context);
}
self.0.Xi.bitxor_assign(a);
unsafe {
GFp_gcm128_gmult(self);
}
}
pub(super) fn pre_finish<F>(self, f: F) -> super::Tag
where
F: FnOnce(Block) -> super::Tag,
{
f(self.0.Xi)
}
#[cfg(target_arch = "x86_64")]
pub(super) fn is_avx2(&self) -> bool {
extern "C" {
fn GFp_aesni_gcm_capable() -> c::int;
}
1 == unsafe { GFp_aesni_gcm_capable() }
}
}
// Keep in sync with `GCM128_KEY` in modes/internal.h.
#[derive(Clone)]
#[repr(C, align(16))]
struct GCM128_KEY([Block; GCM128_HTABLE_LEN]);
const GCM128_HTABLE_LEN: usize = 16;
// Keep in sync with `GCM128_CONTEXT` in modes/internal.h.
#[repr(C, align(16))]
struct GCM128_CONTEXT {
Xi: Block,
H_unused: Block,
key: GCM128_KEY,
}
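// How the pieces above compose (a sketch, not this file's API): GHASH runs
// over the zero-padded AAD (absorbed by `Context::new`), the zero-padded
// ciphertext, and a final lengths block holding the big-endian bit lengths.
// The caller supplies `finish`, which turns Xi into the tag (in AES-GCM,
// XOR with the encrypted pre-counter block).
#[allow(dead_code)]
fn ghash_sketch<F>(key: &Key, aad: &[u8], ciphertext: &[u8], finish: F) -> super::Tag
where
    F: FnOnce(Block) -> super::Tag,
{
    let mut ctx = Context::new(key, aad);
    // Whole blocks go through the bulk GHASH path.
    let whole_len = ciphertext.len() - (ciphertext.len() % BLOCK_LEN);
    let (whole, partial) = ciphertext.split_at(whole_len);
    if !whole.is_empty() {
        ctx.update_blocks(whole);
    }
    // Any sub-block tail is zero-padded into a single block.
    if !partial.is_empty() {
        let mut block = Block::zero();
        block.partial_copy_from(partial);
        ctx.update_block(block);
    }
    // The lengths block: AAD bit length first, then ciphertext bit length.
    let lengths = Block::from_u64_be(
        ((aad.len() as u64) * 8).into(),
        ((ciphertext.len() as u64) * 8).into(),
    );
    ctx.update_block(lengths);
    ctx.pre_finish(finish)
}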

View File

@ -25,6 +25,7 @@
use super::Block;
use crate::{endian::*, error, polyfill::convert::*};
use core::marker::PhantomData;
/// A nonce.
///
@ -48,13 +49,21 @@ pub const NONCE_LEN: usize = 96 / 8;
///
/// Intentionally not `Clone` to ensure counters aren't forked.
#[repr(C)]
pub union Counter {
pub union Counter<U32: Layout<u32>>
where
u32: From<U32>,
{
block: Block,
u32s: [LittleEndian<u32>; 4],
u32s: [U32; 4],
encoding: PhantomData<U32>,
}
impl Counter {
impl<U32: Layout<u32>> Counter<U32>
where
u32: From<U32>,
{
pub fn zero(nonce: NonceRef) -> Self { Self::new(nonce, 0) }
pub fn one(nonce: NonceRef) -> Self { Self::new(nonce, 1) }
// Used by `zero()` and by the tests.
#[cfg(test)]
@ -67,28 +76,28 @@ impl Counter {
block: Block::zero(),
};
let block = unsafe { &mut r.block };
block.as_mut()[4..].copy_from_slice(nonce);
let u32s = unsafe { &mut r.u32s };
u32s[0] = initial_counter.into();
block.as_mut()[U32::NONCE_BYTE_INDEX..][..NONCE_LEN].copy_from_slice(nonce);
r.increment_by_less_safe(initial_counter);
r
}
/// XXX: The caller is responsible for ensuring that the counter doesn't
/// wrap around to zero.
#[inline]
pub fn increment(&mut self) -> Iv {
let block = unsafe { &self.block };
let r = Iv(block.clone());
let ctr = unsafe { &mut self.u32s[0] };
let new_value = u32::from(*ctr) + 1;
*ctr = new_value.into();
self.increment_by_less_safe(1);
r
}
}
impl Into<Iv> for Counter {
fn into(self) -> Iv { Iv(unsafe { self.block }) }
#[inline]
pub fn increment_by_less_safe(&mut self, increment_by: u32) {
let u32s = unsafe { &mut self.u32s };
let value = &mut u32s[U32::COUNTER_U32_INDEX];
*value = (u32::from(*value) + increment_by).into();
}
}
/// The IV for a single block encryption.
@ -96,3 +105,41 @@ impl Into<Iv> for Counter {
/// Intentionally not `Clone` to ensure each is used only once.
#[repr(C)]
pub struct Iv(Block);
impl<U32: Layout<u32>> From<Counter<U32>> for Iv
where
u32: From<U32>,
{
fn from(counter: Counter<U32>) -> Self { Iv(unsafe { counter.block }) }
}
impl Iv {
#[inline]
pub fn into_block_less_safe(self) -> Block { self.0 }
}
pub trait Layout<T>: Encoding<T>
where
T: From<Self>,
{
const COUNTER_U32_INDEX: usize;
const NONCE_BYTE_INDEX: usize;
}
impl<T> Layout<T> for BigEndian<T>
where
BigEndian<T>: Encoding<T>,
T: Copy + From<Self>,
{
const COUNTER_U32_INDEX: usize = 3;
const NONCE_BYTE_INDEX: usize = 0;
}
impl<T> Layout<T> for LittleEndian<T>
where
LittleEndian<T>: Encoding<T>,
T: Copy + From<Self>,
{
const COUNTER_U32_INDEX: usize = 0;
const NONCE_BYTE_INDEX: usize = 4;
}
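// Concretely: with BigEndian<u32> (the AES-GCM layout), COUNTER_U32_INDEX = 3
// and NONCE_BYTE_INDEX = 0, so the block is
//     [ nonce bytes 0..12 | counter, big-endian ]
// while with LittleEndian<u32> (the ChaCha20 layout), COUNTER_U32_INDEX = 0
// and NONCE_BYTE_INDEX = 4:
//     [ counter, little-endian | nonce bytes 0..12 ]
// In both cases the 96-bit nonce and 32-bit counter exactly fill one
// 128-bit block.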

49
src/aead/shift.rs Normal file
View File

@ -0,0 +1,49 @@
// Copyright 2018 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use super::block::{Block, BLOCK_LEN};
use crate::polyfill::convert::*;
/// Shifts each full block of `in_out[in_prefix_len..]` toward the front of
/// `in_out`, replacing it with `transform` applied to the original block.
/// `in_out.len() - in_prefix_len` must be a whole number of blocks.
pub fn shift_full_blocks<F>(in_out: &mut [u8], in_prefix_len: usize, mut transform: F)
where
F: FnMut(&[u8; BLOCK_LEN]) -> Block,
{
let in_out_len = in_out.len().checked_sub(in_prefix_len).unwrap();
for i in (0..in_out_len).step_by(BLOCK_LEN) {
let block = {
let input =
<&[u8; BLOCK_LEN]>::try_from_(&in_out[(in_prefix_len + i)..][..BLOCK_LEN]).unwrap();
transform(input)
};
let output = <&mut [u8; BLOCK_LEN]>::try_from_(&mut in_out[i..][..BLOCK_LEN]).unwrap();
*output = *block.as_ref();
}
}
/// Like `shift_full_blocks`, but for the final sub-block tail (if any) of
/// `in_out[in_prefix_len..]`; the transformed tail is written to the front
/// of `in_out`.
pub fn shift_partial<F>((in_prefix_len, in_out): (usize, &mut [u8]), transform: F)
where
F: FnOnce(&[u8]) -> Block,
{
let (block, in_out_len) = {
let input = &in_out[in_prefix_len..];
let in_out_len = input.len();
if in_out_len == 0 {
return;
}
debug_assert!(in_out_len < BLOCK_LEN);
(transform(input), in_out_len)
};
in_out[..in_out_len].copy_from_slice(&block.as_ref()[..in_out_len]);
}
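// A usage sketch: "decrypt" in place while shifting the result to the front
// of the buffer, as the AES-GCM open path does when the ciphertext starts
// `in_prefix_len` bytes into the buffer. The keystream here is a stand-in
// fixed byte, not a real cipher; `shift_partial` handles any sub-block tail
// the same way.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn shift_demo() {
        // An 8-byte prefix followed by two full blocks of "ciphertext".
        let mut in_out = [0u8; 8 + 2 * BLOCK_LEN];
        for (i, byte) in in_out[8..].iter_mut().enumerate() {
            *byte = (i as u8) ^ 0x55;
        }
        shift_full_blocks(&mut in_out, 8, |input| {
            let mut block = Block::from(input);
            for byte in block.as_mut().iter_mut() {
                *byte ^= 0x55; // stand-in for a real keystream block
            }
            block
        });
        // The "plaintext" now occupies the front of the buffer.
        for (i, byte) in in_out[..2 * BLOCK_LEN].iter().enumerate() {
            assert_eq!(*byte, i as u8);
        }
    }
}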

View File

@ -1,11 +1,17 @@
use crate::private;
use core::num::Wrapping;
pub trait Encoding<T>: From<T> + Sized + private::Sealed {
pub trait Encoding<T>: Copy + From<T> + Sized + private::Sealed
where
T: From<Self>,
{
const ZERO: Self;
}
pub fn as_bytes<E: Encoding<T>, T>(x: &[E]) -> &[u8] {
pub fn as_bytes<E: Encoding<T>, T>(x: &[E]) -> &[u8]
where
T: From<E>,
{
unsafe {
core::slice::from_raw_parts(x.as_ptr() as *const u8, x.len() * core::mem::size_of::<E>())
}
@ -58,3 +64,14 @@ impl_endian!(BigEndian, u32, to_be, from_be);
impl_endian!(BigEndian, u64, to_be, from_be);
impl_endian!(LittleEndian, u32, to_le, from_le);
impl_endian!(LittleEndian, u64, to_le, from_le);
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_big_endian() {
let x = BigEndian::from(1u32);
assert_eq!(u32::from(x), 1);
}
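// A companion check (sketch) for `as_bytes`: a big-endian u32 serializes
// high byte first regardless of host endianness.
#[test]
fn test_as_bytes_big_endian() {
    let xs = [BigEndian::from(1u32)];
    assert_eq!(as_bytes(&xs), &[0u8, 0, 0, 1][..]);
}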
}

View File

@ -23,6 +23,9 @@ pub mod convert;
#[inline(always)]
pub const fn u64_from_usize(x: usize) -> u64 { x as u64 }
#[inline(always)]
pub fn usize_from_u32(x: u32) -> usize { x as usize }
/// `core::num::Wrapping` doesn't support `rotate_left`.
/// There is no usable trait for `rotate_left`, so this polyfill just
/// hard-codes u32. https://github.com/rust-lang/rust/issues/32463