nvme: dma cache flush, better performance (played with timer)
This commit is contained in:
parent
a5e479007f
commit
40574c60f0
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -2725,6 +2725,7 @@ dependencies = [
|
||||
"bytemuck",
|
||||
"device-api",
|
||||
"futures-util",
|
||||
"kernel-arch",
|
||||
"kernel-fs",
|
||||
"libk",
|
||||
"libk-mm",
|
||||
|
@ -6,7 +6,10 @@ extern crate alloc;
|
||||
|
||||
use core::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use aarch64_cpu::registers::{DAIF, MPIDR_EL1, TPIDR_EL1};
|
||||
use aarch64_cpu::{
|
||||
asm::barrier,
|
||||
registers::{DAIF, MPIDR_EL1, TPIDR_EL1},
|
||||
};
|
||||
use alloc::{boxed::Box, sync::Arc, vec::Vec};
|
||||
use device_api::interrupt::LocalInterruptController;
|
||||
use kernel_arch_interface::{
|
||||
@ -134,4 +137,30 @@ impl Architecture for ArchitectureImpl {
|
||||
fn cpu_enabled_features<S: Scheduler>(_cpu: &CpuImpl<Self, S>) -> Option<&Self::CpuFeatures> {
|
||||
None
|
||||
}
|
||||
|
||||
// Cache/barrier operation
|
||||
fn load_barrier() {
|
||||
barrier::dmb(barrier::ISHLD);
|
||||
}
|
||||
|
||||
fn store_barrier() {
|
||||
barrier::dmb(barrier::ISHST);
|
||||
}
|
||||
|
||||
fn memory_barrier() {
|
||||
barrier::dsb(barrier::SY);
|
||||
}
|
||||
|
||||
fn flush_virtual_range(range: core::ops::Range<usize>) {
|
||||
// TODO cache line assumed to be 64 bytes
|
||||
const CLSIZE: usize = 64;
|
||||
let start = range.start & !(CLSIZE - 1);
|
||||
let end = (range.end + (CLSIZE - 1)) & !(CLSIZE - 1);
|
||||
|
||||
for line in (start..end).step_by(CLSIZE) {
|
||||
unsafe {
|
||||
core::arch::asm!("dc ivac, {address}", address = in(reg) line);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,8 @@
|
||||
#![feature(step_trait, const_trait_impl, never_type, decl_macro)]
|
||||
#![allow(clippy::new_without_default)]
|
||||
|
||||
use core::ops::Range;
|
||||
|
||||
use alloc::vec::Vec;
|
||||
use cpu::{CpuData, CpuFeatureSet, CpuImpl, IpiQueue};
|
||||
use device_api::interrupt::LocalInterruptController;
|
||||
@ -80,4 +82,16 @@ pub trait Architecture: Sized + 'static {
|
||||
fn cpu_enabled_features<S: Scheduler>(cpu: &CpuImpl<Self, S>) -> Option<&Self::CpuFeatures> {
|
||||
None
|
||||
}
|
||||
|
||||
// Cache/barrier operation
|
||||
|
||||
/// Issues the architecture-specific load (read) barrier.
fn load_barrier();
|
||||
/// Issues the architecture-specific store (write) barrier.
fn store_barrier();
|
||||
/// Issues a barrier covering both stores and loads.
///
/// The default implementation simply calls [`Self::store_barrier`] followed
/// by [`Self::load_barrier`]; architectures may override it with a single
/// dedicated instruction.
fn memory_barrier() {
|
||||
Self::store_barrier();
|
||||
Self::load_barrier();
|
||||
}
|
||||
|
||||
/// Flushes/invalidates a range of virtual memory from the CPU's data cache.
///
/// `range` is a half-open range of virtual addresses (`start..end`).
|
||||
fn flush_virtual_range(range: Range<usize>);
|
||||
}
|
||||
|
@ -3,7 +3,10 @@
|
||||
|
||||
extern crate alloc;
|
||||
|
||||
use core::sync::atomic::{AtomicUsize, Ordering};
|
||||
use core::{
|
||||
ops::Range,
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
};
|
||||
|
||||
use alloc::{boxed::Box, collections::btree_map::BTreeMap, vec::Vec};
|
||||
use device_api::interrupt::LocalInterruptController;
|
||||
@ -163,4 +166,21 @@ impl Architecture for ArchitectureImpl {
|
||||
fn idle_task() -> extern "C" fn(usize) -> ! {
|
||||
idle_task
|
||||
}
|
||||
|
||||
// Cache/barrier operation
|
||||
/// Load barrier for RISC-V.
///
/// Orders every earlier read (memory `r` and device input `i`) before all
/// later accesses. The previous `fence r, w` only ordered earlier loads
/// against later *stores*, so a later load could still be satisfied before an
/// earlier one, which is exactly what a load barrier has to prevent. The
/// successor set still contains `w`, so nothing the old fence ordered is lost.
fn load_barrier() {
    // SAFETY: `fence` only constrains memory ordering; it touches no
    // registers, memory contents or the stack.
    unsafe { core::arch::asm!("fence ir, iorw", options(nostack, preserves_flags)) };
}
|
||||
|
||||
/// Store barrier for RISC-V.
///
/// Orders every earlier write (memory `w` and device output `o`) before all
/// later accesses. The previous `fence w, r` only ordered earlier stores
/// against later *loads*; it did not order them against later stores or
/// against an MMIO doorbell write, which is what callers use this barrier
/// for. The successor set still contains `r`, so nothing the old fence ordered
/// is lost.
fn store_barrier() {
    // SAFETY: `fence` only constrains memory ordering; it touches no
    // registers, memory contents or the stack.
    unsafe { core::arch::asm!("fence ow, iorw", options(nostack, preserves_flags)) };
}
|
||||
|
||||
/// Full barrier for RISC-V.
///
/// Orders all earlier memory and device accesses before all later ones.
/// The previous `fence rw, rw` left device I/O (`i`/`o`) out of both sets, so
/// MMIO accesses were not ordered against normal memory accesses.
fn memory_barrier() {
    // SAFETY: `fence` only constrains memory ordering; it touches no
    // registers, memory contents or the stack.
    unsafe { core::arch::asm!("fence iorw, iorw", options(nostack, preserves_flags)) };
}
|
||||
|
||||
/// Data cache flush for `_range`; not implemented on this architecture yet.
///
/// The body is empty, so calling this performs no cache maintenance.
// NOTE(review): callers that rely on this for DMA coherency get no flush here — confirm
// whether the target platforms are cache-coherent for device access.
fn flush_virtual_range(_range: Range<usize>) {
|
||||
// TODO
|
||||
}
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
extern crate alloc;
|
||||
|
||||
use core::{
|
||||
ops::DerefMut,
|
||||
ops::{DerefMut, Range},
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
};
|
||||
|
||||
@ -195,4 +195,30 @@ impl Architecture for ArchitectureImpl {
|
||||
fn cpu_available_features<S: Scheduler>(cpu: &CpuImpl<Self, S>) -> Option<&Self::CpuFeatures> {
|
||||
Some(&cpu.available_features)
|
||||
}
|
||||
|
||||
// Cache/barrier
|
||||
|
||||
/// Load barrier for x86-64: executes `lfence`.
fn load_barrier() {
    use core::arch::x86_64::_mm_lfence;

    // SAFETY: `lfence` has no operands and no effect other than ordering.
    unsafe { _mm_lfence() }
}
|
||||
|
||||
/// Store barrier for x86-64: executes `sfence`.
fn store_barrier() {
    use core::arch::x86_64::_mm_sfence;

    // SAFETY: `sfence` has no operands and no effect other than ordering.
    unsafe { _mm_sfence() }
}
|
||||
|
||||
/// Full barrier for x86-64: executes `mfence`.
fn memory_barrier() {
    use core::arch::x86_64::_mm_mfence;

    // SAFETY: `mfence` has no operands and no effect other than ordering.
    unsafe { _mm_mfence() }
}
|
||||
|
||||
/// Flushes every data cache line overlapping `range` using `clflush`.
///
/// `range` is a half-open span of virtual addresses; its start is rounded
/// down and its end rounded up to a cache-line boundary before flushing.
fn flush_virtual_range(range: Range<usize>) {
    // TODO I assume 64-byte cache line on all CPUs
    // TODO clflush instruction may not be available, test for it
    const CLSIZE: usize = 64;
    const LINE_MASK: usize = !(CLSIZE - 1);

    let limit = (range.end + (CLSIZE - 1)) & LINE_MASK;
    let mut line = range.start & LINE_MASK;

    while line < limit {
        // SAFETY: `clflush` writes back and evicts a line without changing
        // memory contents; the caller must pass mapped addresses.
        unsafe { core::arch::x86_64::_mm_clflush(line as *const u8) };
        line += CLSIZE;
    }
}
|
||||
}
|
||||
|
@ -10,6 +10,7 @@ libk-util.workspace = true
|
||||
libk-mm.workspace = true
|
||||
libk.workspace = true
|
||||
device-api = { workspace = true, features = ["derive"] }
|
||||
kernel-arch.workspace = true
|
||||
|
||||
ygg_driver_pci = { path = "../../bus/pci" }
|
||||
kernel-fs = { path = "../../fs/kernel-fs" }
|
||||
|
@ -95,28 +95,21 @@ impl BlockDevice for NvmeNamespace {
|
||||
if position % self.block_size() as u64 != 0 {
|
||||
return Err(Error::InvalidOperation);
|
||||
}
|
||||
if buffer.len() % self.block_size() != 0 {
|
||||
if buffer.len() % self.block_size() != 0 || buffer.is_empty() {
|
||||
return Err(Error::InvalidOperation);
|
||||
}
|
||||
let lba = position / self.block_size() as u64;
|
||||
let lba_count = buffer.len() / self.block_size();
|
||||
let lba_count = buffer.len().div_ceil(self.block_size());
|
||||
if lba + lba_count as u64 > self.block_count() {
|
||||
return Err(Error::InvalidOperation);
|
||||
}
|
||||
|
||||
let result = self
|
||||
.controller
|
||||
.perform_io(
|
||||
self.nsid,
|
||||
lba,
|
||||
lba_count,
|
||||
buffer.bus_address(),
|
||||
buffer.len(),
|
||||
IoDirection::Read,
|
||||
)
|
||||
.perform_read(self.nsid, lba, lba_count, buffer)
|
||||
.await;
|
||||
|
||||
log::info!("read #{lba}, {lba_count} blocks -> {result:?} @ {buffer:p}");
|
||||
log::trace!(target: "io", "read #{lba}, {lba_count} blocks -> {result:?}");
|
||||
|
||||
result.map_err(NvmeError::into)
|
||||
}
|
||||
@ -125,34 +118,21 @@ impl BlockDevice for NvmeNamespace {
|
||||
if position % self.block_size() as u64 != 0 {
|
||||
return Err(Error::InvalidOperation);
|
||||
}
|
||||
if buffer.len() % self.block_size() != 0 {
|
||||
if buffer.len() % self.block_size() != 0 || buffer.is_empty() {
|
||||
return Err(Error::InvalidOperation);
|
||||
}
|
||||
let lba = position / self.block_size() as u64;
|
||||
let lba_count = buffer.len() / self.block_size();
|
||||
let lba_count = buffer.len().div_ceil(self.block_size());
|
||||
if lba + lba_count as u64 > self.block_count() {
|
||||
return Err(Error::InvalidOperation);
|
||||
}
|
||||
|
||||
// TODO ArchitectureImpl::flush_data_cache()
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
unsafe {
|
||||
core::arch::asm!("wbinvd");
|
||||
}
|
||||
|
||||
let result = self
|
||||
.controller
|
||||
.perform_io(
|
||||
self.nsid,
|
||||
lba,
|
||||
lba_count,
|
||||
buffer.bus_address(),
|
||||
buffer.len(),
|
||||
IoDirection::Write,
|
||||
)
|
||||
.perform_write(self.nsid, lba, lba_count, buffer)
|
||||
.await;
|
||||
|
||||
log::info!(target: "io", "write -> #{lba}, {lba_count} blocks -> {result:?} @ {buffer:p}");
|
||||
log::trace!(target: "io", "write -> #{lba}, {lba_count} blocks -> {result:?}");
|
||||
|
||||
result.map_err(NvmeError::into)
|
||||
}
|
||||
|
@ -7,7 +7,7 @@
|
||||
extern crate alloc;
|
||||
|
||||
use core::{
|
||||
mem::size_of,
|
||||
mem::{size_of, MaybeUninit},
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
time::Duration,
|
||||
};
|
||||
@ -20,9 +20,10 @@ use device_api::{
|
||||
interrupt::{InterruptAffinity, InterruptHandler, IrqVector},
|
||||
};
|
||||
use drive::NvmeNamespace;
|
||||
use kernel_arch::{Architecture, ArchitectureImpl};
|
||||
use libk::{
|
||||
device::manager::probe_partitions,
|
||||
dma::BusAddress,
|
||||
dma::{BusAddress, DmaSlice, DmaSliceMut},
|
||||
fs::devfs,
|
||||
task::{cpu_count, cpu_index, runtime},
|
||||
};
|
||||
@ -229,42 +230,54 @@ impl NvmeController {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn perform_io(
|
||||
pub async fn perform_read(
|
||||
&self,
|
||||
nsid: u32,
|
||||
lba: u64,
|
||||
lba_count: usize,
|
||||
buffer_address: BusAddress,
|
||||
transfer_size: usize,
|
||||
direction: IoDirection,
|
||||
buffer: DmaSliceMut<'_, MaybeUninit<u8>>,
|
||||
) -> Result<(), NvmeError> {
|
||||
let prp_list = PrpList::from_buffer(&*self.dma, buffer_address, transfer_size)?;
|
||||
|
||||
let _guard = IrqGuard::acquire();
|
||||
let prp_list = PrpList::from_buffer(&*self.dma, buffer.bus_address(), buffer.len())?;
|
||||
let cpu_index = cpu_index();
|
||||
let ioq = &self.ioqs.get()[cpu_index as usize];
|
||||
let cmd_id = ioq.submit(
|
||||
IoRead {
|
||||
nsid,
|
||||
lba,
|
||||
count: lba_count as _,
|
||||
},
|
||||
&prp_list,
|
||||
true,
|
||||
)?;
|
||||
ioq.wait_for_completion(cmd_id, ()).await?;
|
||||
|
||||
let cmd_id = match direction {
|
||||
IoDirection::Read => ioq.submit(
|
||||
IoRead {
|
||||
nsid,
|
||||
lba,
|
||||
count: lba_count as _,
|
||||
},
|
||||
&prp_list,
|
||||
true,
|
||||
)?,
|
||||
IoDirection::Write => ioq.submit(
|
||||
IoWrite {
|
||||
nsid,
|
||||
lba,
|
||||
count: lba_count as _,
|
||||
},
|
||||
&prp_list,
|
||||
true,
|
||||
)?,
|
||||
};
|
||||
ArchitectureImpl::memory_barrier();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn perform_write(
|
||||
&self,
|
||||
nsid: u32,
|
||||
lba: u64,
|
||||
lba_count: usize,
|
||||
buffer: DmaSlice<'_, u8>,
|
||||
) -> Result<(), NvmeError> {
|
||||
buffer.cache_flush_all();
|
||||
ArchitectureImpl::store_barrier();
|
||||
|
||||
let prp_list = PrpList::from_buffer(&*self.dma, buffer.bus_address(), buffer.len())?;
|
||||
let cpu_index = cpu_index();
|
||||
let ioq = &self.ioqs.get()[cpu_index as usize];
|
||||
let cmd_id = ioq.submit(
|
||||
IoWrite {
|
||||
nsid,
|
||||
lba,
|
||||
count: lba_count as _,
|
||||
},
|
||||
&prp_list,
|
||||
true,
|
||||
)?;
|
||||
ioq.wait_for_completion(cmd_id, ()).await?;
|
||||
|
||||
Ok(())
|
||||
|
@ -3,6 +3,7 @@ use core::{future::poll_fn, mem::size_of, ptr::null_mut, task::Poll};
|
||||
use alloc::collections::{BTreeMap, BTreeSet};
|
||||
use bytemuck::{Pod, Zeroable};
|
||||
use device_api::dma::DmaAllocator;
|
||||
use kernel_arch::{Architecture, ArchitectureImpl};
|
||||
use libk::dma::{BusAddress, DmaBuffer};
|
||||
use libk_mm::address::AsPhysicalAddress;
|
||||
use libk_util::{sync::IrqSafeSpinlock, waker::QueueWaker};
|
||||
@ -254,6 +255,8 @@ impl<T> Queue<T> {
|
||||
self.tail = new_tail;
|
||||
|
||||
if !self.tail_doorbell.is_null() {
|
||||
self.data.cache_flush_element(self.tail);
|
||||
ArchitectureImpl::store_barrier();
|
||||
unsafe {
|
||||
self.tail_doorbell
|
||||
.write_volatile(self.tail.try_into().unwrap());
|
||||
|
@ -20,7 +20,7 @@ use core::{
|
||||
};
|
||||
|
||||
use address::Virtualize;
|
||||
use kernel_arch::mem::PhysicalMemoryAllocator;
|
||||
use kernel_arch::{mem::PhysicalMemoryAllocator, Architecture, ArchitectureImpl};
|
||||
use libk_mm_interface::{
|
||||
address::{AsPhysicalAddress, PhysicalAddress},
|
||||
table::{MapAttributes, TableAllocator},
|
||||
@ -485,3 +485,11 @@ impl<T> DerefMut for PageSlice<T> {
|
||||
&mut self.data
|
||||
}
|
||||
}
|
||||
|
||||
pub fn flush_cache_data<T>(data: *const T) {
|
||||
ArchitectureImpl::flush_virtual_range(data.addr()..data.addr() + size_of::<T>());
|
||||
}
|
||||
|
||||
pub fn flush_cache_data_slice<T>(data: *const [T]) {
|
||||
ArchitectureImpl::flush_virtual_range(data.addr()..data.addr() + size_of::<T>() * data.len());
|
||||
}
|
||||
|
@ -25,7 +25,7 @@ mod sink;
|
||||
|
||||
pub use panic::{panic_log, PanicLoggerSink};
|
||||
pub use ring::add_kernel_log_file;
|
||||
pub use sink::{add_early_sink, add_serial_sink, add_sink, DebugSink};
|
||||
pub use sink::{add_early_sink, add_serial_sink, add_sink, disable_early_sinks, DebugSink};
|
||||
|
||||
static DEBUG_LOCK: IrqSafeSpinlock<()> = IrqSafeSpinlock::new(());
|
||||
|
||||
|
@ -213,6 +213,18 @@ impl<T> DmaBuffer<[T]> {
|
||||
range,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cache_flush_element(&self, index: usize) {
|
||||
libk_mm::flush_cache_data(&raw const self[index]);
|
||||
}
|
||||
|
||||
pub fn cache_flush_range(&self, range: Range<usize>) {
|
||||
libk_mm::flush_cache_data_slice(&raw const self[range]);
|
||||
}
|
||||
|
||||
pub fn cache_flush_all(&self) {
|
||||
libk_mm::flush_cache_data_slice(&raw const self[..]);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: ?Sized> DmaBuffer<T> {
|
||||
@ -308,6 +320,10 @@ impl<'a, T> DmaSlice<'a, T> {
|
||||
pub fn into_parts(self) -> (&'a DmaBuffer<[T]>, Range<usize>) {
|
||||
(self.buffer, self.range)
|
||||
}
|
||||
|
||||
pub fn cache_flush_all(&self) {
|
||||
self.buffer.cache_flush_range(self.range.clone());
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Deref for DmaSlice<'_, T> {
|
||||
@ -333,6 +349,10 @@ impl<'a, T> DmaSliceMut<'a, T> {
|
||||
pub fn into_parts(self) -> (&'a mut DmaBuffer<[T]>, Range<usize>) {
|
||||
(self.buffer, self.range)
|
||||
}
|
||||
|
||||
pub fn cache_flush_all(&self) {
|
||||
self.buffer.cache_flush_range(self.range.clone());
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Deref for DmaSliceMut<'_, T> {
|
||||
|
@ -31,7 +31,7 @@ use super::{
|
||||
MAX_MSI_VECTORS,
|
||||
};
|
||||
|
||||
const TIMER_INTERVAL: u32 = 150000;
|
||||
const TIMER_INTERVAL: u32 = 15000;
|
||||
|
||||
/// When initialized, contains the Local APIC ID of the bootstrap processor
|
||||
pub static BSP_APIC_ID: OneTimeInit<u32> = OneTimeInit::new();
|
||||
@ -329,7 +329,7 @@ impl LocalApic {
|
||||
regs.TaskPriorityRegister.set(0);
|
||||
|
||||
// Enable timer
|
||||
regs.TimerDivideConfig.set(0x3);
|
||||
regs.TimerDivideConfig.set(0x2);
|
||||
regs.TimerInitCount.set(TIMER_INTERVAL);
|
||||
|
||||
// Configure local interrupt vectors
|
||||
|
Loading…
x
Reference in New Issue
Block a user