#![feature(const_trait_impl, let_chains, if_let_guard, maybe_uninit_slice)]
#![allow(missing_docs)]
#![no_std]
// TODO
#![allow(unused)]

extern crate alloc;

use core::{
    mem::size_of,
    sync::atomic::{AtomicUsize, Ordering},
    time::Duration,
};

use alloc::{collections::BTreeMap, format, sync::Arc, vec::Vec};
use command::{IdentifyActiveNamespaceIdListRequest, IdentifyControllerRequest};
use device_api::{
    device::Device,
    interrupt::{InterruptAffinity, InterruptHandler},
};
use drive::NvmeNamespace;
use libk::{
    device::manager::probe_partitions,
    fs::devfs,
    task::{cpu_count, cpu_index, runtime},
};
use libk_mm::{address::PhysicalAddress, device::DeviceMemoryIo, L3_PAGE_SIZE};
use libk_util::{
    sync::{IrqGuard, IrqSafeSpinlock},
    OneTimeInit,
};
use queue::PrpList;
use tock_registers::{
    interfaces::{ReadWriteable, Readable, Writeable},
    register_bitfields, register_structs,
    registers::{ReadOnly, ReadWrite, WriteOnly},
};
use ygg_driver_pci::{
    device::{PciDeviceInfo, PreferredInterruptMode},
    PciCommandRegister, PciConfigurationSpace,
};
use yggdrasil_abi::{error::Error, io::FileMode};

use crate::{
    command::{IoRead, IoWrite},
    queue::{CompletionQueueEntry, SubmissionQueueEntry},
};

use self::{
    command::{CreateIoCompletionQueue, CreateIoSubmissionQueue, SetFeatureRequest},
    error::NvmeError,
    queue::QueuePair,
};

mod command;
mod drive;
mod error;
mod queue;

pub const MAX_PAGES_PER_REQUEST: usize = 256;
// Use host page size
pub const PAGE_SIZE: usize = L3_PAGE_SIZE;

register_bitfields! {
    u32,
    CC [
        IOCQES OFFSET(20) NUMBITS(4) [],
        IOSQES OFFSET(16) NUMBITS(4) [],
        AMS OFFSET(11) NUMBITS(3) [],
        MPS OFFSET(7) NUMBITS(4) [],
        CSS OFFSET(4) NUMBITS(3) [
            NvmCommandSet = 0
        ],
        ENABLE OFFSET(0) NUMBITS(1) [],
    ],
    CSTS [
        CFS OFFSET(1) NUMBITS(1) [],
        RDY OFFSET(0) NUMBITS(1) [],
    ],
    AQA [
        /// Admin Completion Queue Size in entries - 1
        ACQS OFFSET(16) NUMBITS(12) [],
        /// Admin Submission Queue Size in entries - 1
        ASQS OFFSET(0) NUMBITS(12) [],
    ]
}

register_bitfields! {
    u64,
    CAP [
        /// Maximum Queue Entries Supported - 1, i.e. 0 means a maximum queue length of 1,
        /// 1 means 2, etc.
        MQES OFFSET(0) NUMBITS(16) [],
        /// Timeout. Represents the worst-case time the host software should wait for CSTS.RDY to
        /// change its state, in 500ms units.
        TO OFFSET(24) NUMBITS(8) [],
        /// Doorbell stride. Stride in bytes = pow(2, 2 + DSTRD).
        DSTRD OFFSET(32) NUMBITS(4) [],
        /// NVM Subsystem Reset Supported (see NVMe BS Section 3.7.1)
        NSSRS OFFSET(36) NUMBITS(1) [],
        /// Controller supports one or more I/O command sets
        CSS_IO_COMMANDS OFFSET(43) NUMBITS(1) [],
        /// Controller only supports admin commands and no I/O commands
        CSS_ADMIN_ONLY OFFSET(44) NUMBITS(1) [],
        /// Memory page size minimum (bytes = pow(2, 12 + MPSMIN))
        MPSMIN OFFSET(48) NUMBITS(4) [],
        /// Memory page size maximum (bytes = pow(2, 12 + MPSMAX))
        MPSMAX OFFSET(52) NUMBITS(4) [],
    ]
}
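// Controller register block at the base of BAR0 (NVMe Base Specification, "Controller
// Registers"). The queue doorbells start at offset 0x1000 and are computed dynamically in
// `Regs::doorbell_ptr` instead of being declared here, since their stride depends on
// CAP.DSTRD.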
register_structs! {
    #[allow(non_snake_case)]
    Regs {
        (0x00 => CAP: ReadOnly<u64, CAP::Register>),
        (0x08 => VS: ReadOnly<u32>),
        (0x0C => INTMS: WriteOnly<u32>),
        (0x10 => INTMC: WriteOnly<u32>),
        (0x14 => CC: ReadWrite<u32, CC::Register>),
        (0x18 => _0),
        (0x1C => CSTS: ReadOnly<u32, CSTS::Register>),
        (0x20 => _1),
        (0x24 => AQA: ReadWrite<u32, AQA::Register>),
        (0x28 => ASQ: ReadWrite<u64>),
        (0x30 => ACQ: ReadWrite<u64>),
        (0x38 => _2),
        (0x2000 => @END),
    }
}

pub struct NvmeController {
    regs: IrqSafeSpinlock<DeviceMemoryIo<'static, Regs>>,
    admin_q: OneTimeInit<QueuePair>,
    ioqs: OneTimeInit<Vec<QueuePair>>,
    io_queue_count: AtomicUsize,
    drive_table: IrqSafeSpinlock<BTreeMap<u32, Arc<NvmeNamespace>>>,
    controller_id: OneTimeInit<u32>,

    pci: PciDeviceInfo,

    doorbell_shift: usize,
    min_page_size: usize,
}

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum IoDirection {
    Read,
    Write,
}

impl Regs {
    unsafe fn doorbell_ptr(&self, shift: usize, completion: bool, queue_index: usize) -> *mut u32 {
        // Doorbells live at BAR0 + 0x1000. Doorbell 2y is SQyTDBL, doorbell 2y + 1 is
        // CQyHDBL, each `4 << CAP.DSTRD` bytes apart. `shift` is CAP.DSTRD + 1, so the
        // stride in bytes is `1 << (shift + 1)`.
        let doorbell_base = (self as *const Regs as *mut Regs).addr() + 0x1000;
        let offset = ((queue_index << 1) + completion as usize) << (shift + 1);
        (doorbell_base + offset) as *mut u32
    }
}

impl NvmeController {
    const ADMIN_QUEUE_SIZE: usize = 32;
    const IO_QUEUE_SIZE: usize = 32;

    async fn create_queues(&self) -> Result<(), NvmeError> {
        let admin_q = self.admin_q.get();
        let io_queue_count = self.io_queue_count.load(Ordering::Acquire);

        log::info!(
            "Creating {} queue pairs for nvme{}",
            io_queue_count,
            self.controller_id.get()
        );

        // Request the desired number of I/O queue pairs from the controller
        admin_q
            .request_no_data(SetFeatureRequest::NumberOfQueues(
                io_queue_count as _,
                io_queue_count as _,
            ))
            .await?;

        let mut queues = Vec::new();
        for i in 1..=io_queue_count {
            let id = i as u32;

            let (sq_doorbell, cq_doorbell) = unsafe { self.doorbell_pair(i) };
            let queue = QueuePair::new(id, i, Self::IO_QUEUE_SIZE, sq_doorbell, cq_doorbell)
                .map_err(NvmeError::MemoryError)?;

            admin_q
                .request_no_data(CreateIoCompletionQueue {
                    id,
                    vector: id,
                    size: Self::IO_QUEUE_SIZE,
                    data: queue.cq_physical_pointer(),
                })
                .await?;

            admin_q
                .request_no_data(CreateIoSubmissionQueue {
                    id,
                    cq_id: id,
                    size: Self::IO_QUEUE_SIZE,
                    data: queue.sq_physical_pointer(),
                })
                .await?;

            queues.push(queue);
        }

        self.ioqs.init(queues);
        Ok(())
    }

    async fn late_init(self: Arc<Self>) -> Result<(), NvmeError> {
        register_nvme_controller(self.clone());

        let io_queue_count = cpu_count();
        self.io_queue_count.store(io_queue_count, Ordering::Release);

        {
            // Vector 0 serves the admin queue, vectors 1..=io_queue_count serve the
            // per-CPU I/O queues
            let range = self
                .pci
                .map_interrupt_multiple(0..io_queue_count + 1, InterruptAffinity::Any, self.clone())
                .unwrap();

            // TODO handle different MSI range allocations
            for (i, msi) in range.iter().enumerate() {
                assert_eq!(i, msi.vector);
            }
        }

        let admin_q = self.admin_q.get();

        // Identify the controller
        let identify = admin_q.request(IdentifyControllerRequest).await?;

        let max_transfer_size = if identify.mdts == 0 {
            // Pick some sane default value
            256 * self.min_page_size
        } else {
            (1 << identify.mdts) * self.min_page_size
        };

        self.create_queues().await?;

        // Identify namespaces
        self.enumerate_namespaces(max_transfer_size).await?;

        Ok(())
    }

    async fn enumerate_namespaces(
        self: &Arc<Self>,
        max_transfer_size: usize,
    ) -> Result<(), NvmeError> {
        let admin_q = self.admin_q.get();

        let namespaces = admin_q
            .request(IdentifyActiveNamespaceIdListRequest { start_id: 0 })
            .await?;

        // The list is zero-terminated
        let count = namespaces.entries.iter().position(|&x| x == 0).unwrap();
        let list = &namespaces.entries[..count];

        for &nsid in list {
            match NvmeNamespace::create(self.clone(), nsid, max_transfer_size).await {
                Ok(drive) => {
                    self.drive_table.lock().insert(nsid, drive);
                }
                Err(error) => {
                    log::warn!("Could not create nvme drive, nsid={}: {:?}", nsid, error);
                }
            }
        }

        Ok(())
    }
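    /// Submits a read or write command to the I/O queue pair assigned to the current CPU
    /// and awaits its completion.
    ///
    /// `lba_count` is the transfer length in logical blocks; `transfer_size` is the same
    /// length in bytes and is used to build the PRP list describing `buffer_address`.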
    pub async fn perform_io(
        &self,
        nsid: u32,
        lba: u64,
        lba_count: usize,
        buffer_address: PhysicalAddress,
        transfer_size: usize,
        direction: IoDirection,
    ) -> Result<(), NvmeError> {
        let prp_list = PrpList::from_buffer(buffer_address, transfer_size)?;

        // Disable IRQs so the task is not migrated off this CPU while picking its queue
        let _guard = IrqGuard::acquire();
        let cpu_index = cpu_index();
        let ioq = &self.ioqs.get()[cpu_index as usize];

        let cmd_id = match direction {
            IoDirection::Read => ioq.submit(
                IoRead {
                    nsid,
                    lba,
                    count: lba_count as _,
                },
                &prp_list,
                true,
            )?,
            IoDirection::Write => ioq.submit(
                IoWrite {
                    nsid,
                    lba,
                    count: lba_count as _,
                },
                &prp_list,
                true,
            )?,
        };

        ioq.wait_for_completion(cmd_id, ()).await?;

        Ok(())
    }

    unsafe fn doorbell_pair(&self, idx: usize) -> (*mut u32, *mut u32) {
        let regs = self.regs.lock();
        let sq_ptr = regs.doorbell_ptr(self.doorbell_shift, false, idx);
        let cq_ptr = regs.doorbell_ptr(self.doorbell_shift, true, idx);
        (sq_ptr, cq_ptr)
    }
}

impl InterruptHandler for NvmeController {
    fn handle_irq(self: Arc<Self>, vector: Option<usize>) -> bool {
        let vector = vector.expect("Only MSI-X interrupts are supported");

        if vector == 0 {
            // Vector 0 is the admin queue's interrupt
            self.admin_q.get().process_completions() != 0
        } else if vector <= self.io_queue_count.load(Ordering::Acquire)
            && let Some(ioqs) = self.ioqs.try_get()
        {
            // Vectors 1..=N are the per-CPU I/O queue interrupts
            ioqs[vector - 1].process_completions() != 0
        } else {
            false
        }
    }
}

impl Device for NvmeController {
    unsafe fn init(self: Arc<Self>) -> Result<(), Error> {
        let regs = self.regs.lock();

        let timeout = Duration::from_millis(regs.CAP.read(CAP::TO) * 500);
        log::debug!("Worst-case timeout: {:?}", timeout);

        // Wait for CSTS.RDY to clear, since the controller was disabled during probe
        while regs.CSTS.matches_all(CSTS::RDY::SET) {
            core::hint::spin_loop();
        }

        if Self::ADMIN_QUEUE_SIZE as u64 > regs.CAP.read(CAP::MQES) + 1 {
            todo!(
                "queue_slots too big, max = {}",
                regs.CAP.read(CAP::MQES) + 1
            );
        }

        // Setup the admin queue (index 0)
        let admin_sq_doorbell = unsafe { regs.doorbell_ptr(self.doorbell_shift, false, 0) };
        let admin_cq_doorbell = unsafe { regs.doorbell_ptr(self.doorbell_shift, true, 0) };
        log::debug!("sq_doorbell for adminq = {:p}", admin_sq_doorbell);

        let admin_q = QueuePair::new(
            0,
            0,
            Self::ADMIN_QUEUE_SIZE,
            admin_sq_doorbell,
            admin_cq_doorbell,
        )
        .unwrap();

        regs.AQA.modify(
            AQA::ASQS.val(Self::ADMIN_QUEUE_SIZE as u32 - 1)
                + AQA::ACQS.val(Self::ADMIN_QUEUE_SIZE as u32 - 1),
        );
        regs.ASQ.set(admin_q.sq_physical_pointer().into());
        regs.ACQ.set(admin_q.cq_physical_pointer().into());

        // Configure the controller
        const IOSQES: u32 = size_of::<SubmissionQueueEntry>().ilog2();
        const IOCQES: u32 = size_of::<CompletionQueueEntry>().ilog2();

        regs.CC.modify(
            CC::IOCQES.val(IOCQES)
                + CC::IOSQES.val(IOSQES)
                + CC::MPS.val(0)
                + CC::CSS::NvmCommandSet,
        );

        // Enable the controller
        regs.CC.modify(CC::ENABLE::SET);

        log::debug!("Reset the controller");

        while !regs.CSTS.matches_any(&[CSTS::RDY::SET, CSTS::CFS::SET]) {
            core::hint::spin_loop();
        }

        if regs.CSTS.matches_all(CSTS::CFS::SET) {
            todo!("CFS set after reset!");
        }

        self.admin_q.init(admin_q);

        // Schedule late_init task
        runtime::spawn(self.clone().late_init())?;

        Ok(())
    }

    fn display_name(&self) -> &str {
        "NVM Express Controller"
    }
}

// TODO
unsafe impl Sync for NvmeController {}

static NVME_CONTROLLERS: IrqSafeSpinlock<Vec<Arc<NvmeController>>> =
    IrqSafeSpinlock::new(Vec::new());
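/// Creates an NVMe controller instance from a PCI device.
///
/// Maps BAR0, sets up interrupts, enables memory space access and bus mastering, and
/// disables the controller; the rest of the initialization happens in [Device::init] and
/// the `late_init` task it spawns.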
pub fn probe(info: &PciDeviceInfo) -> Result<Arc<dyn Device>, Error> {
    let bar0 = info
        .config_space
        .bar(0)
        .unwrap()
        .as_memory()
        .expect("Expected a memory BAR0");

    info.init_interrupts(PreferredInterruptMode::Msi(true))?;

    // Enable memory space access and bus mastering, disable legacy interrupts and I/O space
    let mut cmd = PciCommandRegister::from_bits_retain(info.config_space.command());
    cmd &= !(PciCommandRegister::DISABLE_INTERRUPTS | PciCommandRegister::ENABLE_IO);
    cmd |= PciCommandRegister::ENABLE_MEMORY | PciCommandRegister::BUS_MASTER;
    info.config_space.set_command(cmd.bits());

    let regs = unsafe { DeviceMemoryIo::<Regs>::map(bar0, Default::default()) }?;

    // Disable the controller
    regs.CC.modify(CC::ENABLE::CLEAR);

    let doorbell_shift = regs.CAP.read(CAP::DSTRD) as usize + 1;
    let min_page_size = 1 << (regs.CAP.read(CAP::MPSMIN) + 12);

    if min_page_size > PAGE_SIZE {
        log::error!(
            "Cannot support NVMe HC: min page size ({min_page_size}) > host page size ({PAGE_SIZE})"
        );
        return Err(Error::InvalidArgument);
    }

    let device = NvmeController {
        regs: IrqSafeSpinlock::new(regs),
        admin_q: OneTimeInit::new(),
        ioqs: OneTimeInit::new(),
        drive_table: IrqSafeSpinlock::new(BTreeMap::new()),
        controller_id: OneTimeInit::new(),
        pci: info.clone(),

        io_queue_count: AtomicUsize::new(1),
        doorbell_shift,
        min_page_size,
    };

    Ok(Arc::new(device))
}

pub fn register_nvme_controller(controller: Arc<NvmeController>) {
    let mut list = NVME_CONTROLLERS.lock();
    let id = list.len();
    list.push(controller.clone());
    controller.controller_id.init(id as u32);
}

pub fn register_nvme_namespace(namespace: Arc<NvmeNamespace>, probe: bool) {
    let name = format!("nvme{}n{}", namespace.controller_id(), namespace.id());
    log::info!("Register NVMe namespace: {name}");

    devfs::add_named_block_device(namespace.clone(), name.clone(), FileMode::new(0o600)).ok();

    if probe {
        runtime::spawn(async move {
            let name = name;
            log::info!("Probing partitions for {name}");
            probe_partitions(namespace, |index, partition| {
                let partition_name = format!("{name}p{}", index + 1);
                devfs::add_named_block_device(
                    Arc::new(partition),
                    partition_name,
                    FileMode::new(0o600),
                )
                .ok();
            })
            .await
            .inspect_err(|error| log::error!("{name}: partition probe failed: {error:?}"))
        })
        .ok();
    }
}