nvme: dma cache flush, better performance (played with timer)

Mark Poliakov 2025-02-09 16:52:36 +02:00
parent a5e479007f
commit 40574c60f0
13 changed files with 179 additions and 64 deletions

Cargo.lock generated

@@ -2725,6 +2725,7 @@ dependencies = [
"bytemuck",
"device-api",
"futures-util",
"kernel-arch",
"kernel-fs",
"libk",
"libk-mm",


@@ -6,7 +6,10 @@ extern crate alloc;
use core::sync::atomic::{AtomicUsize, Ordering};
use aarch64_cpu::registers::{DAIF, MPIDR_EL1, TPIDR_EL1};
use aarch64_cpu::{
asm::barrier,
registers::{DAIF, MPIDR_EL1, TPIDR_EL1},
};
use alloc::{boxed::Box, sync::Arc, vec::Vec};
use device_api::interrupt::LocalInterruptController;
use kernel_arch_interface::{
@@ -134,4 +137,30 @@ impl Architecture for ArchitectureImpl {
fn cpu_enabled_features<S: Scheduler>(_cpu: &CpuImpl<Self, S>) -> Option<&Self::CpuFeatures> {
None
}
// Cache/barrier operation
fn load_barrier() {
barrier::dmb(barrier::ISHLD);
}
fn store_barrier() {
barrier::dmb(barrier::ISHST);
}
fn memory_barrier() {
barrier::dsb(barrier::SY);
}
fn flush_virtual_range(range: core::ops::Range<usize>) {
// TODO cache line assumed to be 64 bytes
const CLSIZE: usize = 64;
let start = range.start & !(CLSIZE - 1);
let end = (range.end + (CLSIZE - 1)) & !(CLSIZE - 1);
for line in (start..end).step_by(CLSIZE) {
// Clean+invalidate to PoC: `dc ivac` alone would discard dirty
// lines instead of writing them back before DMA
unsafe {
core::arch::asm!("dc civac, {address}", address = in(reg) line);
}
}
// Complete the cache maintenance before any following device access
barrier::dsb(barrier::SY);
}
}
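
A possible follow-up for the cache-line TODO above (a sketch, not part of this commit): AArch64 reports the line size in CTR_EL0, so it need not be assumed:

    fn dcache_line_size() -> usize {
        let ctr: u64;
        // CTR_EL0.DminLine (bits [19:16]) is log2 of the smallest data
        // cache line size, expressed in 4-byte words
        unsafe { core::arch::asm!("mrs {0}, ctr_el0", out(reg) ctr) };
        4usize << ((ctr >> 16) & 0xF)
    }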


@@ -2,6 +2,8 @@
#![feature(step_trait, const_trait_impl, never_type, decl_macro)]
#![allow(clippy::new_without_default)]
use core::ops::Range;
use alloc::vec::Vec;
use cpu::{CpuData, CpuFeatureSet, CpuImpl, IpiQueue};
use device_api::interrupt::LocalInterruptController;
@@ -80,4 +82,16 @@ pub trait Architecture: Sized + 'static {
fn cpu_enabled_features<S: Scheduler>(cpu: &CpuImpl<Self, S>) -> Option<&Self::CpuFeatures> {
None
}
// Cache/barrier operations
/// Orders all prior loads before subsequent memory accesses.
fn load_barrier();
/// Orders all prior stores before subsequent stores.
fn store_barrier();
/// Full memory barrier; the default composes the store and load barriers.
fn memory_barrier() {
Self::store_barrier();
Self::load_barrier();
}
/// Flushes/invalidates a range of virtual memory from the CPU's data cache.
fn flush_virtual_range(range: Range<usize>);
}
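
Illustrative only (the helper below is hypothetical, not part of the trait): a typical caller of these hooks flushes the shared memory, orders the stores, and only then rings the device doorbell:

    fn publish_tail<A: Architecture>(desc: core::ops::Range<usize>, doorbell: *mut u32, tail: u32) {
        // Push the descriptor bytes to memory and keep them ahead of the MMIO write
        A::flush_virtual_range(desc);
        A::store_barrier();
        unsafe { doorbell.write_volatile(tail) };
    }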


@@ -3,7 +3,10 @@
extern crate alloc;
use core::sync::atomic::{AtomicUsize, Ordering};
use core::{
ops::Range,
sync::atomic::{AtomicUsize, Ordering},
};
use alloc::{boxed::Box, collections::btree_map::BTreeMap, vec::Vec};
use device_api::interrupt::LocalInterruptController;
@@ -163,4 +166,21 @@ impl Architecture for ArchitectureImpl {
fn idle_task() -> extern "C" fn(usize) -> ! {
idle_task
}
// Cache/barrier operation
fn load_barrier() {
// Order prior loads before subsequent memory accesses
unsafe { core::arch::asm!("fence r, rw") };
}
fn store_barrier() {
// Order prior stores before subsequent stores (e.g. a doorbell write)
unsafe { core::arch::asm!("fence w, w") };
}
fn memory_barrier() {
unsafe { core::arch::asm!("fence rw, rw") };
}
fn flush_virtual_range(_range: Range<usize>) {
// TODO requires Zicbom cache-block ops (cbo.flush); no-op until then
}
}
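
A sketch of how the RISC-V TODO could be filled in, assuming the Zicbom extension and a 64-byte cache block (real code would take the block size from the devicetree):

    fn flush_virtual_range(range: Range<usize>) {
        const CBSIZE: usize = 64; // assumed Zicbom cache-block size
        let start = range.start & !(CBSIZE - 1);
        let end = (range.end + (CBSIZE - 1)) & !(CBSIZE - 1);
        for line in (start..end).step_by(CBSIZE) {
            // cbo.flush cleans and invalidates one cache block
            unsafe {
                core::arch::asm!(
                    ".option push",
                    ".option arch, +zicbom",
                    "cbo.flush ({0})",
                    ".option pop",
                    in(reg) line,
                );
            }
        }
    }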


@@ -5,7 +5,7 @@
extern crate alloc;
use core::{
ops::DerefMut,
ops::{DerefMut, Range},
sync::atomic::{AtomicUsize, Ordering},
};
@@ -195,4 +195,30 @@ impl Architecture for ArchitectureImpl {
fn cpu_available_features<S: Scheduler>(cpu: &CpuImpl<Self, S>) -> Option<&Self::CpuFeatures> {
Some(&cpu.available_features)
}
// Cache/barrier
fn load_barrier() {
unsafe { core::arch::x86_64::_mm_lfence() };
}
fn store_barrier() {
unsafe { core::arch::x86_64::_mm_sfence() };
}
fn memory_barrier() {
unsafe { core::arch::x86_64::_mm_mfence() };
}
fn flush_virtual_range(range: Range<usize>) {
// TODO I assume 64-byte cache line on all CPUs
// TODO clflush instruction may not be available, test for it
const CLSIZE: usize = 64;
let start = range.start & !(CLSIZE - 1);
let end = (range.end + (CLSIZE - 1)) & !(CLSIZE - 1);
for line in (start..end).step_by(CLSIZE) {
unsafe { core::arch::x86_64::_mm_clflush(line as _) };
}
// clflush is only ordered by a fence; ensure the flushes complete
unsafe { core::arch::x86_64::_mm_mfence() };
}
}
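
The clflush TODO could be addressed with a CPUID probe (a sketch, not part of this commit): leaf 01h reports CLFLUSH support in EDX bit 19 and the flush line size in EBX bits [15:8], in 8-byte units:

    fn clflush_line_size() -> Option<usize> {
        // CPUID.01h: EDX bit 19 = CLFSH, EBX[15:8] = line size in quadwords
        let r = unsafe { core::arch::x86_64::__cpuid(1) };
        ((r.edx & (1 << 19)) != 0).then(|| (((r.ebx >> 8) & 0xFF) as usize) * 8)
    }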


@@ -10,6 +10,7 @@ libk-util.workspace = true
libk-mm.workspace = true
libk.workspace = true
device-api = { workspace = true, features = ["derive"] }
kernel-arch.workspace = true
ygg_driver_pci = { path = "../../bus/pci" }
kernel-fs = { path = "../../fs/kernel-fs" }


@@ -95,28 +95,21 @@ impl BlockDevice for NvmeNamespace {
if position % self.block_size() as u64 != 0 {
return Err(Error::InvalidOperation);
}
if buffer.len() % self.block_size() != 0 {
if buffer.len() % self.block_size() != 0 || buffer.is_empty() {
return Err(Error::InvalidOperation);
}
let lba = position / self.block_size() as u64;
let lba_count = buffer.len() / self.block_size();
let lba_count = buffer.len().div_ceil(self.block_size());
if lba + lba_count as u64 > self.block_count() {
return Err(Error::InvalidOperation);
}
let result = self
.controller
.perform_io(
self.nsid,
lba,
lba_count,
buffer.bus_address(),
buffer.len(),
IoDirection::Read,
)
.perform_read(self.nsid, lba, lba_count, buffer)
.await;
log::info!("read #{lba}, {lba_count} blocks -> {result:?} @ {buffer:p}");
log::trace!(target: "io", "read #{lba}, {lba_count} blocks -> {result:?}");
result.map_err(NvmeError::into)
}
@@ -125,34 +118,21 @@
if position % self.block_size() as u64 != 0 {
return Err(Error::InvalidOperation);
}
if buffer.len() % self.block_size() != 0 {
if buffer.len() % self.block_size() != 0 || buffer.is_empty() {
return Err(Error::InvalidOperation);
}
let lba = position / self.block_size() as u64;
let lba_count = buffer.len() / self.block_size();
let lba_count = buffer.len().div_ceil(self.block_size());
if lba + lba_count as u64 > self.block_count() {
return Err(Error::InvalidOperation);
}
// TODO ArchitectureImpl::flush_data_cache()
#[cfg(target_arch = "x86_64")]
unsafe {
core::arch::asm!("wbinvd");
}
let result = self
.controller
.perform_io(
self.nsid,
lba,
lba_count,
buffer.bus_address(),
buffer.len(),
IoDirection::Write,
)
.perform_write(self.nsid, lba, lba_count, buffer)
.await;
log::info!(target: "io", "write -> #{lba}, {lba_count} blocks -> {result:?} @ {buffer:p}");
log::trace!(target: "io", "write -> #{lba}, {lba_count} blocks -> {result:?}");
result.map_err(NvmeError::into)
}


@@ -7,7 +7,7 @@
extern crate alloc;
use core::{
mem::size_of,
mem::{size_of, MaybeUninit},
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};
@@ -20,9 +20,10 @@ use device_api::{
interrupt::{InterruptAffinity, InterruptHandler, IrqVector},
};
use drive::NvmeNamespace;
use kernel_arch::{Architecture, ArchitectureImpl};
use libk::{
device::manager::probe_partitions,
dma::BusAddress,
dma::{BusAddress, DmaSlice, DmaSliceMut},
fs::devfs,
task::{cpu_count, cpu_index, runtime},
};
@@ -229,42 +230,54 @@ impl NvmeController {
Ok(())
}
pub async fn perform_io(
pub async fn perform_read(
&self,
nsid: u32,
lba: u64,
lba_count: usize,
buffer_address: BusAddress,
transfer_size: usize,
direction: IoDirection,
buffer: DmaSliceMut<'_, MaybeUninit<u8>>,
) -> Result<(), NvmeError> {
let prp_list = PrpList::from_buffer(&*self.dma, buffer_address, transfer_size)?;
let _guard = IrqGuard::acquire();
let prp_list = PrpList::from_buffer(&*self.dma, buffer.bus_address(), buffer.len())?;
let cpu_index = cpu_index();
let ioq = &self.ioqs.get()[cpu_index as usize];
let cmd_id = ioq.submit(
IoRead {
nsid,
lba,
count: lba_count as _,
},
&prp_list,
true,
)?;
ioq.wait_for_completion(cmd_id, ()).await?;
let cmd_id = match direction {
IoDirection::Read => ioq.submit(
IoRead {
nsid,
lba,
count: lba_count as _,
},
&prp_list,
true,
)?,
IoDirection::Write => ioq.submit(
IoWrite {
nsid,
lba,
count: lba_count as _,
},
&prp_list,
true,
)?,
};
ArchitectureImpl::memory_barrier();
Ok(())
}
pub async fn perform_write(
&self,
nsid: u32,
lba: u64,
lba_count: usize,
buffer: DmaSlice<'_, u8>,
) -> Result<(), NvmeError> {
buffer.cache_flush_all();
ArchitectureImpl::store_barrier();
let prp_list = PrpList::from_buffer(&*self.dma, buffer.bus_address(), buffer.len())?;
let cpu_index = cpu_index();
let ioq = &self.ioqs.get()[cpu_index as usize];
let cmd_id = ioq.submit(
IoWrite {
nsid,
lba,
count: lba_count as _,
},
&prp_list,
true,
)?;
ioq.wait_for_completion(cmd_id, ()).await?;
Ok(())


@@ -3,6 +3,7 @@ use core::{future::poll_fn, mem::size_of, ptr::null_mut, task::Poll};
use alloc::collections::{BTreeMap, BTreeSet};
use bytemuck::{Pod, Zeroable};
use device_api::dma::DmaAllocator;
use kernel_arch::{Architecture, ArchitectureImpl};
use libk::dma::{BusAddress, DmaBuffer};
use libk_mm::address::AsPhysicalAddress;
use libk_util::{sync::IrqSafeSpinlock, waker::QueueWaker};
@@ -254,6 +255,8 @@ impl<T> Queue<T> {
self.tail = new_tail;
if !self.tail_doorbell.is_null() {
self.data.cache_flush_element(self.tail);
ArchitectureImpl::store_barrier();
unsafe {
self.tail_doorbell
.write_volatile(self.tail.try_into().unwrap());


@@ -20,7 +20,7 @@ use core::{
};
use address::Virtualize;
use kernel_arch::mem::PhysicalMemoryAllocator;
use kernel_arch::{mem::PhysicalMemoryAllocator, Architecture, ArchitectureImpl};
use libk_mm_interface::{
address::{AsPhysicalAddress, PhysicalAddress},
table::{MapAttributes, TableAllocator},
@@ -485,3 +485,11 @@
&mut self.data
}
}
/// Flushes/invalidates the data cache lines backing a single value.
pub fn flush_cache_data<T>(data: *const T) {
ArchitectureImpl::flush_virtual_range(data.addr()..data.addr() + size_of::<T>());
}
/// Flushes/invalidates the data cache lines backing a slice.
pub fn flush_cache_data_slice<T>(data: *const [T]) {
ArchitectureImpl::flush_virtual_range(data.addr()..data.addr() + size_of::<T>() * data.len());
}


@@ -25,7 +25,7 @@ mod sink;
pub use panic::{panic_log, PanicLoggerSink};
pub use ring::add_kernel_log_file;
pub use sink::{add_early_sink, add_serial_sink, add_sink, DebugSink};
pub use sink::{add_early_sink, add_serial_sink, add_sink, disable_early_sinks, DebugSink};
static DEBUG_LOCK: IrqSafeSpinlock<()> = IrqSafeSpinlock::new(());


@@ -213,6 +213,18 @@ impl<T> DmaBuffer<[T]> {
range,
}
}
/// Flushes the cache lines backing the element at `index`.
pub fn cache_flush_element(&self, index: usize) {
libk_mm::flush_cache_data(&raw const self[index]);
}
/// Flushes the cache lines backing the elements in `range`.
pub fn cache_flush_range(&self, range: Range<usize>) {
libk_mm::flush_cache_data_slice(&raw const self[range]);
}
/// Flushes the cache lines backing the whole buffer.
pub fn cache_flush_all(&self) {
libk_mm::flush_cache_data_slice(&raw const self[..]);
}
}
impl<T: ?Sized> DmaBuffer<T> {
@@ -308,6 +320,10 @@ impl<'a, T> DmaSlice<'a, T> {
pub fn into_parts(self) -> (&'a DmaBuffer<[T]>, Range<usize>) {
(self.buffer, self.range)
}
/// Flushes the cache lines backing this slice.
pub fn cache_flush_all(&self) {
self.buffer.cache_flush_range(self.range.clone());
}
}
impl<T> Deref for DmaSlice<'_, T> {
@@ -333,6 +349,10 @@ impl<'a, T> DmaSliceMut<'a, T> {
pub fn into_parts(self) -> (&'a mut DmaBuffer<[T]>, Range<usize>) {
(self.buffer, self.range)
}
/// Flushes the cache lines backing this slice.
pub fn cache_flush_all(&self) {
self.buffer.cache_flush_range(self.range.clone());
}
}
impl<T> Deref for DmaSliceMut<'_, T> {


@@ -31,7 +31,7 @@ use super::{
MAX_MSI_VECTORS,
};
const TIMER_INTERVAL: u32 = 150000;
const TIMER_INTERVAL: u32 = 15000;
/// When initialized, contains the Local APIC ID of the bootstrap processor
pub static BSP_APIC_ID: OneTimeInit<u32> = OneTimeInit::new();
@@ -329,7 +329,7 @@ impl LocalApic {
regs.TaskPriorityRegister.set(0);
// Enable timer
regs.TimerDivideConfig.set(0x3);
regs.TimerDivideConfig.set(0x2);
regs.TimerInitCount.set(TIMER_INTERVAL);
// Configure local interrupt vectors
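
Net effect of the timer tweak, assuming the standard divide-configuration encodings (0b011 = divide by 16, 0b010 = divide by 8): the tick period shrinks from 150000 × 16 = 2,400,000 bus clocks to 15000 × 8 = 120,000, so the local timer now fires about 20× as often.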