[libc] Add Timing Utils for AMDGPU (#96828)

PR for adding AMDGPU timing utils for benchmarking.

I was not able to test this code since I do not have an AMD GPU, but I
was able to successfully compile this code using
-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_TEST_ARCHITECTURE=gfx90a
-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_LOADER_EXECUTABLE=echo
-DRUNTIMES_amdgcn_amd-amdhsa_LIBC_GPU_TARGET_ARCHITECTURE=gfx90a to
force the code to compile without having an AMD gpu on my machine.

@jhuber6
This commit is contained in:
jameshu15869 2024-07-10 16:04:56 -05:00 committed by GitHub
parent dd3aa5eb70
commit eb66e31bc2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 120 additions and 1 deletions

View File

@ -0,0 +1,7 @@
add_header_library(
amdgpu_timing
HDRS
timing.h
DEPENDS
libc.src.__support.common
)

View File

@ -0,0 +1,112 @@
//===------------- AMDGPU implementation of timing utils --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/common.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
#include <stdint.h>
// AMDGPU does not support input register constraints for i1 and i8, so we must
// cast them to uint16_t's before loading them into registers.
#define FORCE_TO_REGISTER(TYPE, VARIABLE) \
if constexpr (cpp::is_same_v<TYPE, char> || cpp::is_same_v<TYPE, bool>) \
asm("" ::"v"(static_cast<uint16_t>(VARIABLE))); \
else \
asm("" ::"v"(VARIABLE))
namespace LIBC_NAMESPACE {
// Returns the overhead associated with calling the profiling region. This
// allows us to substract the constant-time overhead from the latency to
// obtain a true result. This can vary with system load.
[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
uint32_t result = 0.0;
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
asm("" ::"s"(start));
uint64_t stop = gpu::processor_clock();
return stop - start;
}
// Profile a simple function and obtain its latency in clock cycles on the
// system. This function cannot be inlined or else it will disturb the very
// delicate balance of hard-coded dependencies.
template <typename F, typename T>
[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
// We need to store the input somewhere to guarantee that the compiler
// will not constant propagate it and remove the profiling region.
volatile T storage = t;
T arg = storage;
FORCE_TO_REGISTER(T, arg);
// The AMDGPU architecture needs to wait on pending results.
gpu::memory_fence();
// Get the current timestamp from the clock.
uint64_t start = gpu::processor_clock();
// This forces the compiler to load the input argument and run the clock
// cycle counter before the profiling region.
FORCE_TO_REGISTER(T, arg);
asm("" ::"s"(start));
// Run the function under test and return its value.
auto result = f(arg);
// This inline assembly performs a no-op which forces the result to both
// be used and prevents us from exiting this region before it's complete.
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
// Obtain the current timestamp after running the calculation and force
// ordering.
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
gpu::memory_fence();
// Return the time elapsed.
return stop - start;
}
template <typename F, typename T1, typename T2>
[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
volatile T1 storage1 = t1;
volatile T2 storage2 = t2;
T1 arg1 = storage1;
T2 arg2 = storage2;
FORCE_TO_REGISTER(T1, arg1);
FORCE_TO_REGISTER(T2, arg2);
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
FORCE_TO_REGISTER(T1, arg1);
FORCE_TO_REGISTER(T2, arg2);
asm("" ::"s"(start));
auto result = f(arg1, arg2);
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
gpu::memory_fence();
return stop - start;
}
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU

View File

@ -12,7 +12,7 @@
#include "src/__support/macros/properties/architectures.h"
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
#error "amdgpu not yet supported"
#include "amdgpu/timing.h"
#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
#include "nvptx/timing.h"
#else