Optimize intrinsics on wasm32
Profiling a recent demo I was playing with on `wasm32-unknown-unknown` pointed me to the surprising result that 15% of the execution time was in the `sqrt` intrinsic (there's a lot of math here). Upon investigation I remembered that wasm (unconditionally) has a native `f32.sqrt` instruction! I was then confused to find that a simple `f.sqrt()` does codegen to `f32.sqrt` in Rust, but I later realized that the intrinsic implementations in this library often call other intrinsics internally. That means that the real intrinsic here, `acos`, internally called `sqrt` at some point but wasn't using the optimized implementation!

To help fix this situation, this PR provides the infrastructure for optimized implementations (via code generation) to be used for each intrinsic. I've gone through the various math instructions that wasm has available and updated each of the corresponding intrinsic implementations in this crate to optionally use the LLVM intrinsic version, which is known to unconditionally compile down to a single instruction on wasm (unlike on an arbitrary platform, where we don't know what it will compile down to!).

To do this I created a new macro to wrap the invocation of LLVM intrinsics. Invoking LLVM intrinsics is turned off by default, through a new and on-by-default feature, `stable`. When the `stable` feature is disabled, however, the wasm target specifically will enable usage of the LLVM intrinsics. I've additionally added a CI builder which should verify that these continue to build on Travis.

After this I intend to update the submodule in the `compiler-builtins` repository so we can pull in the optimized implementations there; `compiler-builtins` naturally won't set `feature = "stable"` when compiling, so all the intrinsics should get compiled in by default. After a further update of the `libcompiler_builtins` submodule in rust-lang/rust we should be good to go!
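For a concrete picture of the mechanism, here's roughly what the `llvm_intrinsically_optimized!` invocation at the top of `sqrt` expands to — a sketch inferred from the macro definition in the diff below, not the literal compiler output:

```rust
// Expansion sketch of the fast path at the top of `sqrt`. The whole block
// exists only when the `stable` feature is off *and* the target is wasm32;
// in every other configuration it compiles to nothing and the software
// implementation below it runs as before.
#[cfg(all(not(feature = "stable"), target_arch = "wasm32"))]
{
    if true { // thwart the dead code lint
        return if x < 0.0 {
            f64::NAN
        } else {
            // known to lower to a single `f64.sqrt` instruction on wasm32
            unsafe { ::core::intrinsics::sqrtf64(x) }
        };
    }
}
```

Consumers opt in by turning off default features, e.g. `cargo build --no-default-features --target wasm32-unknown-unknown`, which is exactly what the new CI builder exercises.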
This commit is contained in: parent 8e857c72cf, commit 284f2d007c
```diff
@@ -29,6 +29,13 @@ matrix:
     - env: TARGET=cargo-fmt
       rust: beta
 
+    - env: TARGET=wasm32-unknown-unknown
+      rust: nightly
+      install: rustup target add $TARGET
+      script:
+        - cargo build --target $TARGET
+        - cargo build --no-default-features --target $TARGET
+
 before_install: set -e
 
 install:
```
```diff
@@ -24,3 +24,7 @@ members = [
 
 [dev-dependencies]
 shared = { path = "shared" }
+
+[features]
+default = ['stable']
+stable = []
```
```diff
@@ -11,6 +11,10 @@
 
 #![deny(warnings)]
 #![no_std]
+#![cfg_attr(
+    all(target_arch = "wasm32", not(feature = "stable")),
+    feature(core_intrinsics)
+)]
 
 mod math;
 
```
```diff
@@ -4,6 +4,14 @@ const TOINT: f64 = 1. / f64::EPSILON;
 
 #[inline]
 pub fn ceil(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.ceil` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::ceilf64(x) }
+        }
+    }
     let u: u64 = x.to_bits();
     let e: i64 = (u >> 52 & 0x7ff) as i64;
     let y: f64;
```
```diff
@@ -2,6 +2,14 @@ use core::f32;
 
 #[inline]
 pub fn ceilf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.ceil` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::ceilf32(x) }
+        }
+    }
     let mut ui = x.to_bits();
     let e = (((ui >> 23) & 0xff) - 0x7f) as i32;
 
```
```diff
@@ -2,5 +2,13 @@ use core::u64;
 
 #[inline]
 pub fn fabs(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.abs` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::fabsf64(x) }
+        }
+    }
     f64::from_bits(x.to_bits() & (u64::MAX / 2))
 }
```
```diff
@@ -1,4 +1,12 @@
 #[inline]
 pub fn fabsf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.abs` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::fabsf32(x) }
+        }
+    }
     f32::from_bits(x.to_bits() & 0x7fffffff)
 }
```
```diff
@@ -4,6 +4,14 @@ const TOINT: f64 = 1. / f64::EPSILON;
 
 #[inline]
 pub fn floor(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.floor` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::floorf64(x) }
+        }
+    }
     let ui = x.to_bits();
     let e = ((ui >> 52) & 0x7ff) as i32;
 
```
```diff
@@ -2,6 +2,14 @@ use core::f32;
 
 #[inline]
 pub fn floorf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.floor` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::floorf32(x) }
+        }
+    }
     let mut ui = x.to_bits();
     let e = (((ui >> 23) & 0xff) - 0x7f) as i32;
 
```
```diff
@@ -58,6 +58,17 @@ macro_rules! i {
     };
 }
 
+macro_rules! llvm_intrinsically_optimized {
+    (#[cfg($($clause:tt)*)] $e:expr) => {
+        #[cfg(all(not(feature = "stable"), $($clause)*))]
+        {
+            if true { // thwart the dead code lint
+                $e
+            }
+        }
+    };
+}
+
 // Public modules
 mod acos;
 mod acosf;
```
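An aside, not part of the commit: the `if true` wrapper is needed because the macro body is an unconditional early `return`, so everything after the macro invocation would otherwise be provably unreachable, and `#![deny(warnings)]` in `src/lib.rs` would turn the resulting `unreachable_code` warning into a build error. The lint does not evaluate the constant condition, while the optimizer still sees it as always-taken. A minimal illustration with hypothetical functions `foo` and `bar` (not from the commit):

```rust
fn foo(x: f64) -> f64 {
    return x;
    x + 1.0 // warning: unreachable expression
}

fn bar(x: f64) -> f64 {
    if true {
        return x;
    }
    x + 1.0 // no warning; still optimized away in practice
}
```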
```diff
@@ -82,6 +82,18 @@ const TINY: f64 = 1.0e-300;
 
 #[inline]
 pub fn sqrt(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.sqrt` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return if x < 0.0 {
+                f64::NAN
+            } else {
+                unsafe { ::core::intrinsics::sqrtf64(x) }
+            }
+        }
+    }
     let mut z: f64;
     let sign: u32 = 0x80000000;
     let mut ix0: i32;
```
```diff
@@ -17,6 +17,18 @@ const TINY: f32 = 1.0e-30;
 
 #[inline]
 pub fn sqrtf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.sqrt` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return if x < 0.0 {
+                ::core::f32::NAN
+            } else {
+                unsafe { ::core::intrinsics::sqrtf32(x) }
+            }
+        }
+    }
     let mut z: f32;
     let sign: i32 = 0x80000000u32 as i32;
     let mut ix: i32;
```
```diff
@@ -2,6 +2,14 @@ use core::f64;
 
 #[inline]
 pub fn trunc(x: f64) -> f64 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f64.trunc` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::truncf64(x) }
+        }
+    }
     let x1p120 = f64::from_bits(0x4770000000000000); // 0x1p120f === 2 ^ 120
 
     let mut i: u64 = x.to_bits();
```
```diff
@@ -2,6 +2,14 @@ use core::f32;
 
 #[inline]
 pub fn truncf(x: f32) -> f32 {
+    // On wasm32 we know that LLVM's intrinsic will compile to an optimized
+    // `f32.trunc` native instruction, so we can leverage this for both code size
+    // and speed.
+    llvm_intrinsically_optimized! {
+        #[cfg(target_arch = "wasm32")] {
+            return unsafe { ::core::intrinsics::truncf32(x) }
+        }
+    }
     let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120
 
     let mut i: u32 = x.to_bits();
```