From 86493ca4bee20d2b9945c8cfd7345f8e0a9125f8 Mon Sep 17 00:00:00 2001
From: Paul Dicker <pitdicker@gmail.com>
Date: Fri, 13 Apr 2018 13:57:22 +0200
Subject: [PATCH] Optimize `fill_bytes_via`

---
 rand_core/src/impls.rs | 57 +++++++++++++++++++++---------------------
 rand_core/src/lib.rs   |  5 ++--
 src/jitter.rs          |  2 +-
 src/mock.rs            |  2 +-
 src/prng/xorshift.rs   |  4 ++-
 5 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index 645dc8f9..530a2ed7 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -37,35 +37,34 @@ pub fn next_u64_via_u32<R: RngCore + ?Sized>(rng: &mut R) -> u64 {
     (y << 32) | x
 }
 
-macro_rules! fill_bytes_via {
-    ($rng:ident, $next_u:ident, $BYTES:expr, $dest:ident) => {{
-        let mut left = $dest;
-        while left.len() >= $BYTES {
-            let (l, r) = {left}.split_at_mut($BYTES);
-            left = r;
-            let chunk: [u8; $BYTES] = unsafe {
-                transmute($rng.$next_u().to_le())
-            };
-            l.copy_from_slice(&chunk);
-        }
-        let n = left.len();
-        if n > 0 {
-            let chunk: [u8; $BYTES] = unsafe {
-                transmute($rng.$next_u().to_le())
-            };
-            left.copy_from_slice(&chunk[..n]);
-        }
-    }}
-}
-
-/// Implement `fill_bytes` via `next_u32`, little-endian order.
-pub fn fill_bytes_via_u32<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
-    fill_bytes_via!(rng, next_u32, 4, dest)
-}
-
-/// Implement `fill_bytes` via `next_u64`, little-endian order.
-pub fn fill_bytes_via_u64<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
-    fill_bytes_via!(rng, next_u64, 8, dest)
+/// Implement `fill_bytes` via `next_u64` and `next_u32`, little-endian order.
+///
+/// The fastest way to fill a slice is usually to work with integers for as
+/// long as possible. That is why this function mostly uses `next_u64`; only
+/// when 4 or fewer bytes remain at the end of the slice does it call
+/// `next_u32` once instead.
+pub fn fill_bytes_via_next<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
+    let mut left = dest;
+    while left.len() >= 8 {
+        let (l, r) = {left}.split_at_mut(8);
+        left = r;
+        let chunk: [u8; 8] = unsafe {
+            transmute(rng.next_u64().to_le())
+        };
+        l.copy_from_slice(&chunk);
+    }
+    let n = left.len();
+    if n > 4 {
+        let chunk: [u8; 8] = unsafe {
+            transmute(rng.next_u64().to_le())
+        };
+        left.copy_from_slice(&chunk[..n]);
+    } else if n > 0 {
+        let chunk: [u8; 4] = unsafe {
+            transmute(rng.next_u32().to_le())
+        };
+        left.copy_from_slice(&chunk[..n]);
+    }
 }
 
 macro_rules! impl_uint_from_fill {
diff --git a/rand_core/src/lib.rs b/rand_core/src/lib.rs
index 924d44ef..7ac0686d 100644
--- a/rand_core/src/lib.rs
+++ b/rand_core/src/lib.rs
@@ -121,7 +121,7 @@ pub mod le;
 ///     }
 ///     
 ///     fn fill_bytes(&mut self, dest: &mut [u8]) {
-///         impls::fill_bytes_via_u64(self, dest)
+///         impls::fill_bytes_via_next(self, dest)
 ///     }
 ///     
 ///     fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
@@ -160,8 +160,7 @@ pub trait RngCore {
     ///
     /// RNGs must implement at least one method from this trait directly. In
     /// the case this method is not implemented directly, it can be implemented
-    /// [via `next_u32`](../rand_core/impls/fn.fill_bytes_via_u32.html) or
-    /// [via `next_u64`](../rand_core/impls/fn.fill_bytes_via_u64.html) or
+    /// [via `next_u*`](../rand_core/impls/fn.fill_bytes_via_next.html) or
     /// via `try_fill_bytes`; if this generator can fail the implementation
     /// must choose how best to handle errors here (e.g. panic with a
     /// descriptive message or log a warning and retry a few times).
diff --git a/src/jitter.rs b/src/jitter.rs
index 719afa3a..5811479e 100644
--- a/src/jitter.rs
+++ b/src/jitter.rs
@@ -804,7 +804,7 @@ impl RngCore for JitterRng {
         //
         // This is done especially for wrappers that implement `next_u32`
         // themselves via `fill_bytes`.
-        impls::fill_bytes_via_u32(self, dest)
+        impls::fill_bytes_via_next(self, dest)
     }
 
     fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
diff --git a/src/mock.rs b/src/mock.rs
index 5c73594f..090258ef 100644
--- a/src/mock.rs
+++ b/src/mock.rs
@@ -52,7 +52,7 @@ impl RngCore for StepRng {
     }
 
     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        impls::fill_bytes_via_u64(self, dest);
+        impls::fill_bytes_via_next(self, dest);
     }
 
     fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
diff --git a/src/prng/xorshift.rs b/src/prng/xorshift.rs
index 9f7a3c88..9fac6e33 100644
--- a/src/prng/xorshift.rs
+++ b/src/prng/xorshift.rs
@@ -71,12 +71,14 @@ impl RngCore for XorShiftRng {
         self.w.0
     }
 
+    #[inline]
     fn next_u64(&mut self) -> u64 {
         impls::next_u64_via_u32(self)
     }
 
+    #[inline]
     fn fill_bytes(&mut self, dest: &mut [u8]) {
-        impls::fill_bytes_via_u32(self, dest)
+        impls::fill_bytes_via_next(self, dest)
     }
 
     fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {