From 5de0d191c18f6e22bd7d962108cd240449d3bc6e Mon Sep 17 00:00:00 2001 From: Chen Chengjun Date: Wed, 11 Feb 2026 09:26:01 +0000 Subject: [PATCH] Refactor page cache implementation --- kernel/src/fs/utils/page_cache.rs | 707 +++++++++------------------ kernel/src/vm/vmo/mod.rs | 784 +++++++++++++++++++----------- kernel/src/vm/vmo/page_cache.rs | 72 +++ 3 files changed, 803 insertions(+), 760 deletions(-) create mode 100644 kernel/src/vm/vmo/page_cache.rs diff --git a/kernel/src/fs/utils/page_cache.rs b/kernel/src/fs/utils/page_cache.rs index b752f91b9..eb572e2de 100644 --- a/kernel/src/fs/utils/page_cache.rs +++ b/kernel/src/fs/utils/page_cache.rs @@ -1,505 +1,133 @@ // SPDX-License-Identifier: MPL-2.0 -#![expect(dead_code)] - use core::{ - ops::Range, - sync::atomic::{AtomicU8, Ordering}, + ops::{Deref, Range}, + sync::atomic::{AtomicBool, AtomicU8, Ordering}, }; -use align_ext::AlignExt; use aster_block::bio::{BioStatus, BioWaiter}; -use lru::LruCache; use ostd::{ impl_untyped_frame_meta_for, - mm::{Frame, FrameAllocOptions, UFrame, VmIoFill}, + mm::{Frame, FrameAllocOptions, HasPaddr}, + sync::WaitQueue, }; -use crate::{ - prelude::*, - vm::vmo::{Pager, Vmo, VmoFlags, VmoOptions, get_page_idx_range}, -}; +use crate::{prelude::*, vm::vmo::Vmo}; -pub struct PageCache { - pages: Arc, - manager: Arc, -} +/// The page cache type. +/// +/// The page cache is implemented using a [`Vmo`]. Typically, a page cache for +/// a disk-based file system (e.g., ext2, exfat) is a **disk-backed VMO**, which +/// is associated with a [`PageCacheBackend`] that provides I/O operations to read +/// from and write to the underlying block device. In contrast, for purely in-memory +/// file systems (e.g., ramfs), the page cache is an **anonymous VMO** — it has no +/// backend and its pages exist only in RAM. +pub type PageCache = Arc; -impl PageCache { - /// Creates an empty size page cache associated with a new backend. - pub fn new(backend: Weak) -> Result { - let manager = Arc::new(PageCacheManager::new(backend)); - let pages = VmoOptions::new(0) - .flags(VmoFlags::RESIZABLE) - .pager(manager.clone()) - .alloc()?; - Ok(Self { pages, manager }) - } +/// A trait for page cache operations. +/// +/// The page cache serves as an in-memory buffer between the file system and +/// block devices, caching frequently accessed file data to improve performance. +pub trait PageCacheOps { + /// Creates a new page cache with the specified capacity. + fn with_capacity(capacity: usize, backend: Weak) -> Result>; - /// Creates a page cache associated with an existing backend. + /// Resizes the page cache to the target size. /// - /// The `capacity` is the initial cache size required by the backend. - /// This size usually corresponds to the size of the backend. - pub fn with_capacity(capacity: usize, backend: Weak) -> Result { - let manager = Arc::new(PageCacheManager::new(backend)); - let pages = VmoOptions::new(capacity) - .flags(VmoFlags::RESIZABLE) - .pager(manager.clone()) - .alloc()?; - Ok(Self { pages, manager }) - } - - /// Returns the Vmo object. - pub fn pages(&self) -> &Arc { - &self.pages - } - - /// Evict the data within a specified range from the page cache and persist - /// them to the backend. - pub fn evict_range(&self, range: Range) -> Result<()> { - self.manager.evict_range(range) - } - - /// Evict the data within a specified range from the page cache without persisting - /// them to the backend. 
- pub fn discard_range(&self, range: Range) { - self.manager.discard_range(range) - } - - /// Returns the backend. - pub fn backend(&self) -> Arc { - self.manager.backend() - } - - /// Resizes the current page cache to a target size. - pub fn resize(&self, new_size: usize) -> Result<()> { - // If the new size is smaller and not page-aligned, - // first zero the gap between the new size and the - // next page boundary (or the old size), if such a gap exists. - let old_size = self.pages.size(); - if old_size > new_size && !new_size.is_multiple_of(PAGE_SIZE) { - let gap_size = old_size.min(new_size.align_up(PAGE_SIZE)) - new_size; - if gap_size > 0 { - self.fill_zeros(new_size..new_size + gap_size)?; - } - } - self.pages.resize(new_size) - } - - /// Fill the specified range with zeros in the page cache. - pub fn fill_zeros(&self, range: Range) -> Result<()> { - if range.is_empty() { - return Ok(()); - } - let (start, end) = (range.start, range.end); - - // Write zeros to the first partial page if any - let first_page_end = start.align_up(PAGE_SIZE); - if first_page_end > start { - let zero_len = first_page_end.min(end) - start; - self.pages().fill_zeros(start, zero_len)?; - } - - // Write zeros to the last partial page if any - let last_page_start = end.align_down(PAGE_SIZE); - if last_page_start < end && last_page_start >= start { - let zero_len = end - last_page_start; - self.pages().fill_zeros(last_page_start, zero_len)?; - } - - for offset in (first_page_end..last_page_start).step_by(PAGE_SIZE) { - self.pages().fill_zeros(offset, PAGE_SIZE)?; - } - Ok(()) - } -} - -impl Drop for PageCache { - fn drop(&mut self) { - // TODO: - // The default destruction procedure exhibits slow performance. - // In contrast, resizing the `VMO` to zero greatly accelerates the process. - // We need to find out the underlying cause of this discrepancy. - let _ = self.pages.resize(0); - } -} - -impl Debug for PageCache { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - f.debug_struct("PageCache") - .field("size", &self.pages.size()) - .field("manager", &self.manager) - .finish() - } -} - -struct ReadaheadWindow { - /// The window. - window: Range, - /// Look ahead position in the current window, where the readahead is triggered. - /// TODO: We set the `lookahead_index` to the start of the window for now. - /// This should be adjustable by the user. - lookahead_index: usize, -} - -impl ReadaheadWindow { - pub fn new(window: Range) -> Self { - let lookahead_index = window.start; - Self { - window, - lookahead_index, - } - } - - /// Gets the next readahead window. - /// Most of the time, we push the window forward and double its size. + /// The `new_size` will be rounded up to page boundaries. If the new size is smaller + /// than the current size, pages that fall entirely within the truncated range will be + /// decommitted (freed). For the page that is only partially truncated (i.e., the page + /// containing the new boundary), the truncated portion will be filled with zeros instead. /// - /// The `max_size` is the maximum size of the window. - /// The `max_page` is the total page number of the file, and the window should not - /// exceed the scope of the file. 
- pub fn next(&self, max_size: usize, max_page: usize) -> Self { - let new_start = self.window.end; - let cur_size = self.window.end - self.window.start; - let new_size = (cur_size * 2).min(max_size).min(max_page - new_start); - Self { - window: new_start..(new_start + new_size), - lookahead_index: new_start, - } - } + /// The `old_size` represents the actual used range of the page cache (i.e., the logical + /// size of the cached content), which may differ from the total capacity of the page cache. + /// It is used to determine the boundary of the previously valid data so that only the + /// discarded logical range (from `new_size` to `old_size`) within a partially truncated + /// page needs to be zero-filled. + fn resize(&self, new_size: usize, old_size: usize) -> Result<()>; - pub fn lookahead_index(&self) -> usize { - self.lookahead_index - } + /// Flushes the dirty pages in the specified range to the backend storage. + /// + /// This operation ensures that any modifications made to the pages within the given + /// range are persisted to the underlying storage device or file system. + /// + /// If the given range exceeds the current size of the page cache, only the pages within + /// the valid range will be flushed. + fn flush_range(&self, range: Range) -> Result<()>; - pub fn readahead_index(&self) -> usize { - self.window.end - } + /// Discards the pages within the specified range from the page cache. + /// + /// This operation will first **flush** the dirty pages in the range to the backend storage, + /// ensuring that any modifications are persisted. After flushing, the pages are removed + /// from the page cache. This is useful for invalidating cached data that is no longer needed + /// or has become stale. + fn discard_range(&self, range: Range) -> Result<()>; - pub fn readahead_range(&self) -> Range { - self.window.clone() - } -} - -struct ReadaheadState { - /// Current readahead window. - ra_window: Option, - /// Maximum window size. - max_size: usize, - /// The last page visited, used to determine sequential I/O. - prev_page: Option, - /// Readahead requests waiter. - waiter: BioWaiter, -} - -impl ReadaheadState { - const INIT_WINDOW_SIZE: usize = 4; - const DEFAULT_MAX_SIZE: usize = 32; - - pub fn new() -> Self { - Self { - ra_window: None, - max_size: Self::DEFAULT_MAX_SIZE, - prev_page: None, - waiter: BioWaiter::new(), - } - } - - /// Sets the maximum readahead window size. - pub fn set_max_window_size(&mut self, size: usize) { - self.max_size = size; - } - - fn is_sequential(&self, idx: usize) -> bool { - if let Some(prev) = self.prev_page { - idx == prev || idx == prev + 1 - } else { - false - } - } - - /// The number of bio requests in waiter. - /// This number will be zero if there are no previous readahead. - pub fn request_number(&self) -> usize { - self.waiter.nreqs() - } - - /// Checks for the previous readahead. - /// Returns true if the previous readahead has been completed. - pub fn prev_readahead_is_completed(&self) -> bool { - let nreqs = self.request_number(); - if nreqs == 0 { - return false; - } - - for i in 0..nreqs { - if self.waiter.status(i) == BioStatus::Submit { - return false; - } - } - true - } - - /// Waits for the previous readahead. 
- pub fn wait_for_prev_readahead( - &mut self, - pages: &mut MutexGuard>, - ) -> Result<()> { - if matches!(self.waiter.wait(), Some(BioStatus::Complete)) { - let Some(window) = &self.ra_window else { - return_errno!(Errno::EINVAL) - }; - for idx in window.readahead_range() { - if let Some(page) = pages.get_mut(&idx) { - page.store_state(PageState::UpToDate); - } - } - self.waiter.clear(); - } else { - return_errno!(Errno::EIO) - } - - Ok(()) - } - - /// Determines whether a new readahead should be performed. - /// We only consider readahead for sequential I/O now. - /// There should be at most one in-progress readahead. - pub fn should_readahead(&self, idx: usize, max_page: usize) -> bool { - if self.request_number() == 0 && self.is_sequential(idx) { - if let Some(cur_window) = &self.ra_window { - let trigger_readahead = - idx == cur_window.lookahead_index() || idx == cur_window.readahead_index(); - let next_window_exist = cur_window.readahead_range().end < max_page; - trigger_readahead && next_window_exist - } else { - let new_window_start = idx + 1; - new_window_start < max_page - } - } else { - false - } - } - - /// Setup the new readahead window. - pub fn setup_window(&mut self, idx: usize, max_page: usize) { - let new_window = if let Some(cur_window) = &self.ra_window { - cur_window.next(self.max_size, max_page) - } else { - let start_idx = idx + 1; - let init_size = Self::INIT_WINDOW_SIZE.min(self.max_size); - let end_idx = (start_idx + init_size).min(max_page); - ReadaheadWindow::new(start_idx..end_idx) - }; - self.ra_window = Some(new_window); - } - - /// Conducts the new readahead. - /// Sends the relevant read request and sets the relevant page in the page cache to `Uninit`. - pub fn conduct_readahead( - &mut self, - pages: &mut MutexGuard>, - backend: Arc, - ) -> Result<()> { - let Some(window) = &self.ra_window else { - return_errno!(Errno::EINVAL) - }; - for async_idx in window.readahead_range() { - let mut async_page = CachePage::alloc_uninit()?; - let pg_waiter = backend.read_page_async(async_idx, &async_page)?; - if pg_waiter.nreqs() > 0 { - self.waiter.concat(pg_waiter); - } else { - // Some backends (e.g. RamFs) do not issue requests, but fill the page directly. - async_page.store_state(PageState::UpToDate); - } - pages.put(async_idx, async_page); - } - Ok(()) - } - - /// Sets the last page visited. - pub fn set_prev_page(&mut self, idx: usize) { - self.prev_page = Some(idx); - } -} - -struct PageCacheManager { - pages: Mutex>, - backend: Weak, - ra_state: Mutex, -} - -impl PageCacheManager { - pub fn new(backend: Weak) -> Self { - Self { - pages: Mutex::new(LruCache::unbounded()), - backend, - ra_state: Mutex::new(ReadaheadState::new()), - } - } - - pub fn backend(&self) -> Arc { - self.backend.upgrade().unwrap() - } - - // Discard pages without writing them back to disk. 
- pub fn discard_range(&self, range: Range) { - let page_idx_range = get_page_idx_range(&range); - let mut pages = self.pages.lock(); - for idx in page_idx_range { - pages.pop(&idx); - } - } - - pub fn evict_range(&self, range: Range) -> Result<()> { - let page_idx_range = get_page_idx_range(&range); - - let mut bio_waiter = BioWaiter::new(); - let mut pages = self.pages.lock(); - let backend = self.backend(); - let backend_npages = backend.npages(); - for idx in page_idx_range.start..page_idx_range.end { - if let Some(page) = pages.peek(&idx) - && page.load_state() == PageState::Dirty - && idx < backend_npages - { - let waiter = backend.write_page_async(idx, page)?; - bio_waiter.concat(waiter); - } - } - - if !matches!(bio_waiter.wait(), Some(BioStatus::Complete)) { - // Do not allow partial failure - return_errno!(Errno::EIO); - } - - for (_, page) in pages - .iter_mut() - .filter(|(idx, _)| page_idx_range.contains(*idx)) - { - page.store_state(PageState::UpToDate); - } - Ok(()) - } - - fn ondemand_readahead(&self, idx: usize) -> Result { - let mut pages = self.pages.lock(); - let mut ra_state = self.ra_state.lock(); - let backend = self.backend(); - // Checks for the previous readahead. - if ra_state.prev_readahead_is_completed() { - ra_state.wait_for_prev_readahead(&mut pages)?; - } - // There are three possible conditions that could be encountered upon reaching here. - // 1. The requested page is ready for read in page cache. - // 2. The requested page is in previous readahead range, not ready for now. - // 3. The requested page is on disk, need a sync read operation here. - let frame = if let Some(page) = pages.get(&idx) { - // Cond 1 & 2. - if let PageState::Uninit = page.load_state() { - // Cond 2: We should wait for the previous readahead. - // If there is no previous readahead, an error must have occurred somewhere. - assert!(ra_state.request_number() != 0); - ra_state.wait_for_prev_readahead(&mut pages)?; - pages.get(&idx).unwrap().clone() - } else { - // Cond 1. - page.clone() - } - } else { - // Cond 3. - // Conducts the sync read operation. - let page = if idx < backend.npages() { - let mut page = CachePage::alloc_uninit()?; - backend.read_page(idx, &page)?; - page.store_state(PageState::UpToDate); - page - } else { - CachePage::alloc_zero(PageState::Uninit)? 
- }; - let frame = page.clone(); - pages.put(idx, page); - frame - }; - if ra_state.should_readahead(idx, backend.npages()) { - ra_state.setup_window(idx, backend.npages()); - ra_state.conduct_readahead(&mut pages, backend)?; - } - ra_state.set_prev_page(idx); - Ok(frame.into()) - } -} - -impl Debug for PageCacheManager { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - f.debug_struct("PageCacheManager") - .field("pages", &self.pages.lock()) - .finish() - } -} - -impl Pager for PageCacheManager { - fn commit_page(&self, idx: usize) -> Result { - self.ondemand_readahead(idx) - } - - fn update_page(&self, idx: usize) -> Result<()> { - let mut pages = self.pages.lock(); - if let Some(page) = pages.get_mut(&idx) { - page.store_state(PageState::Dirty); - } else { - warn!("The page {} is not in page cache", idx); - } - - Ok(()) - } - - fn decommit_page(&self, idx: usize) -> Result<()> { - let page_result = self.pages.lock().pop(&idx); - if let Some(page) = page_result - && let PageState::Dirty = page.load_state() - { - let Some(backend) = self.backend.upgrade() else { - return Ok(()); - }; - if idx < backend.npages() { - backend.write_page(idx, &page)?; - } - } - - Ok(()) - } - - fn commit_overwrite(&self, idx: usize) -> Result { - if let Some(page) = self.pages.lock().get(&idx) { - return Ok(page.clone().into()); - } - - let page = CachePage::alloc_uninit()?; - Ok(self.pages.lock().get_or_insert(idx, || page).clone().into()) - } + /// Fills the specified range of the page cache with zeros. + fn fill_zeros(&self, range: Range) -> Result<()>; } /// A page in the page cache. pub type CachePage = Frame; +const PAGE_WAIT_QUEUE_MASK: usize = 0xff; +const PAGE_WAIT_QUEUE_NUM: usize = PAGE_WAIT_QUEUE_MASK + 1; + +/// Global array of wait queues for page cache operations. +/// +/// Each wait queue in this array handles wait/wake operations for a subset of cache pages. +/// The queue for a specific page is selected using: `PAGE_WAIT_QUEUES[page.paddr() & PAGE_WAIT_QUEUE_MASK]`. +/// +/// This approach avoids the overhead of per-page wait queues while still providing +/// reasonable concurrency through hashing. +static PAGE_WAIT_QUEUES: [WaitQueue; PAGE_WAIT_QUEUE_NUM] = + [const { WaitQueue::new() }; PAGE_WAIT_QUEUE_NUM]; + /// Metadata for a page in the page cache. #[derive(Debug)] pub struct CachePageMeta { - pub state: AtomicPageState, + /// The current state of the page (uninit, up-to-date, or dirty). + state: AtomicPageState, + /// This bit acts as a mutex for the corresponding page. + /// + /// When this bit is set, the holder has the exclusive right to perform critical + /// state transitions (e.g., preparing for I/O). + lock: AtomicBool, // TODO: Add a reverse mapping from the page to VMO for eviction. } +impl Default for CachePageMeta { + fn default() -> Self { + Self { + state: AtomicPageState::new(PageState::Uninit), + lock: AtomicBool::new(false), + } + } +} + impl_untyped_frame_meta_for!(CachePageMeta); -pub trait CachePageExt { +pub trait CachePageExt: Sized { /// Gets the metadata associated with the cache page. fn metadata(&self) -> &CachePageMeta; + /// Gets the wait queue associated with the cache page. + fn wait_queue(&self) -> &'static WaitQueue; + + /// Tries to lock the cache page. + fn try_lock(&self) -> Option; + + /// Locks the cache page, blocking until the lock is acquired. + fn lock(self) -> LockedCachePage; + + /// Ensures the page is initialized, calling `init_fn` if necessary. 
+ fn ensure_init(&self, init_fn: impl Fn(LockedCachePage) -> Result<()>) -> Result<()>; + /// Allocates a new cache page which content and state are uninitialized. fn alloc_uninit() -> Result { let meta = CachePageMeta { state: AtomicPageState::new(PageState::Uninit), + lock: AtomicBool::new(false), }; let page = FrameAllocOptions::new() .zeroed(false) @@ -508,9 +136,10 @@ pub trait CachePageExt { } /// Allocates a new zeroed cache page with the wanted state. - fn alloc_zero(state: PageState) -> Result { + fn alloc_zero() -> Result { let meta = CachePageMeta { - state: AtomicPageState::new(state), + state: AtomicPageState::new(PageState::UpToDate), + lock: AtomicBool::new(false), }; let page = FrameAllocOptions::new() .zeroed(true) @@ -518,14 +147,19 @@ pub trait CachePageExt { Ok(page) } - /// Loads the current state of the cache page. - fn load_state(&self) -> PageState { - self.metadata().state.load(Ordering::Relaxed) + fn is_uninit(&self) -> bool { + matches!( + self.metadata().state.load(Ordering::Acquire), + PageState::Uninit + ) } - /// Stores a new state for the cache page. - fn store_state(&mut self, new_state: PageState) { - self.metadata().state.store(new_state, Ordering::Relaxed); + /// Checks if the page is dirty. + fn is_dirty(&self) -> bool { + matches!( + self.metadata().state.load(Ordering::Acquire), + PageState::Dirty + ) } } @@ -533,8 +167,124 @@ impl CachePageExt for CachePage { fn metadata(&self) -> &CachePageMeta { self.meta() } + + fn wait_queue(&self) -> &'static WaitQueue { + &PAGE_WAIT_QUEUES[self.paddr() & PAGE_WAIT_QUEUE_MASK] + } + + fn try_lock(&self) -> Option { + let wait_queue = self.wait_queue(); + self.metadata() + .lock + .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed) + .is_ok() + .then(|| LockedCachePage::new(self.clone(), wait_queue)) + } + + fn lock(self) -> LockedCachePage { + let wait_queue = self.wait_queue(); + self.wait_queue().wait_until(|| { + self.metadata() + .lock + .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed) + .ok() + }); + LockedCachePage::new(self, wait_queue) + } + + fn ensure_init(&self, init_fn: impl Fn(LockedCachePage) -> Result<()>) -> Result<()> { + // Fast path: if the page is already initialized, return immediately without waiting. + if !self.is_uninit() { + return Ok(()); + } + + let lock_page = self.clone().lock(); + // Check again after acquiring the lock to avoid duplicate initialization. + if !lock_page.is_uninit() { + return Ok(()); + } + + init_fn(lock_page) + } } +/// A locked cache page. +/// +/// The locked page has the exclusive right to perform critical +/// state transitions (e.g., preparing for I/O). +pub struct LockedCachePage { + page: Option, + wait_queue: &'static WaitQueue, +} + +impl Debug for LockedCachePage { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_struct("LockedCachePage") + .field("page", &self.page) + .finish() + } +} + +impl LockedCachePage { + fn new(page: CachePage, wait_queue: &'static WaitQueue) -> Self { + Self { + page: Some(page), + wait_queue, + } + } + + /// Unlocks the page and returns the underlying cache page. + pub fn unlock(mut self) -> CachePage { + let page = self.page.take().expect("page already taken"); + page.metadata().lock.store(false, Ordering::Release); + self.wait_queue.wake_all(); + page + } + + fn page(&self) -> &CachePage { + self.page.as_ref().expect("page already taken") + } + + /// Marks the page as up-to-date. 
+ /// + /// This indicates that the page's contents are synchronized with disk + /// and can be safely read. + pub fn set_up_to_date(&self) { + self.page() + .metadata() + .state + .store(PageState::UpToDate, Ordering::Relaxed); + } + + /// Marks the page as dirty. + /// + /// This indicates that the page has been modified and needs to be + /// written back to disk eventually. + pub fn set_dirty(&self) { + self.metadata() + .state + .store(PageState::Dirty, Ordering::Relaxed); + } +} + +impl Deref for LockedCachePage { + type Target = CachePage; + + fn deref(&self) -> &Self::Target { + self.page.as_ref().expect("page already taken") + } +} + +impl Drop for LockedCachePage { + fn drop(&mut self) { + if let Some(page) = &self.page { + page.metadata().lock.store(false, Ordering::Release); + self.wait_queue.wake_all(); + } + } +} + +/// The state of a page in the page cache. #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[repr(u8)] pub enum PageState { @@ -580,25 +330,26 @@ impl AtomicPageState { /// This trait represents the backend for the page cache. pub trait PageCacheBackend: Sync + Send { /// Reads a page from the backend asynchronously. - fn read_page_async(&self, idx: usize, frame: &CachePage) -> Result; + fn read_page_async(&self, idx: usize, frame: LockedCachePage) -> Result; /// Writes a page to the backend asynchronously. - fn write_page_async(&self, idx: usize, frame: &CachePage) -> Result; + fn write_page_async(&self, idx: usize, frame: LockedCachePage) -> Result; /// Returns the number of pages in the backend. fn npages(&self) -> usize; } impl dyn PageCacheBackend { /// Reads a page from the backend synchronously. - fn read_page(&self, idx: usize, frame: &CachePage) -> Result<()> { - let waiter = self.read_page_async(idx, frame)?; + pub fn read_page(&self, idx: usize, page: LockedCachePage) -> Result<()> { + let waiter = self.read_page_async(idx, page)?; match waiter.wait() { Some(BioStatus::Complete) => Ok(()), _ => return_errno!(Errno::EIO), } } + /// Writes a page to the backend synchronously. - fn write_page(&self, idx: usize, frame: &CachePage) -> Result<()> { - let waiter = self.write_page_async(idx, frame)?; + pub fn write_page(&self, idx: usize, page: LockedCachePage) -> Result<()> { + let waiter = self.write_page_async(idx, page)?; match waiter.wait() { Some(BioStatus::Complete) => Ok(()), _ => return_errno!(Errno::EIO), diff --git a/kernel/src/vm/vmo/mod.rs b/kernel/src/vm/vmo/mod.rs index 731335e67..215b0f2d6 100644 --- a/kernel/src/vm/vmo/mod.rs +++ b/kernel/src/vm/vmo/mod.rs @@ -6,34 +6,45 @@ //! Virtual Memory Objects (VMOs). use core::{ - ops::Range, + cell::RefCell, + ops::{Deref, Range}, sync::atomic::{AtomicIsize, AtomicUsize, Ordering}, }; use align_ext::AlignExt; use ostd::{ - mm::{ - FrameAllocOptions, UFrame, VmIo, VmIoFill, VmReader, VmWriter, io_util::HasVmReaderWriter, - }, + mm::{VmIo, VmIoFill, VmReader, VmWriter, io_util::HasVmReaderWriter}, task::disable_preempt, }; use xarray::{Cursor, LockedXArray, XArray}; -use crate::prelude::*; +use crate::{ + fs::utils::{CachePage, CachePageExt, LockedCachePage, PageCacheBackend}, + prelude::*, +}; mod options; -mod pager; +mod page_cache; pub use options::VmoOptions; -pub use pager::Pager; -/// Virtual Memory Objects (VMOs) are a type of capability that represents a -/// range of memory pages. +/// Virtual Memory Objects (VMOs) represent contiguous ranges of virtual memory pages. /// -/// Broadly speaking, there are two types of VMO: -/// 1. 
File-backed VMO: the VMO backed by a file and resides in the page cache, -/// which includes a [`Pager`] to provide it with actual pages. -/// 2. Anonymous VMO: the VMO without a file backup, which does not have a `Pager`. +/// VMOs serve as the fundamental building blocks for memory management in Asterinas, +/// providing a unified interface for both anonymous (RAM-backed) and disk-backed memory. +/// +/// # Types of VMOs +/// +/// There are two primary types of VMOs, distinguished by their backing storage: +/// +/// 1. **Anonymous VMO**: Backed purely by RAM with no persistent storage. Pages are +/// initially zero-filled and exist only in memory. These are typically used for +/// heap allocations, anonymous mappings, and stack memory. +/// +/// 2. **Disk-backed VMO**: Associated with a disk-backed file through a [`PageCacheBackend`]. +/// Pages are lazily loaded from the file on first access and can be written back +/// to storage. These VMOs integrate with the page cache for efficient file I/O +/// and memory-mapped files. /// /// # Features /// @@ -44,10 +55,25 @@ pub use pager::Pager; /// * **Device driver support.** If specified upon creation, VMOs will be /// backed by physically contiguous memory pages starting at a target address. /// * **File system support.** By default, a VMO's memory pages are initially -/// all zeros. But if a VMO is attached to a pager (`Pager`) upon creation, -/// then its memory pages will be populated by the pager. -/// With this pager mechanism, file systems can easily implement page caches -/// with VMOs by attaching the VMOs to pagers backed by inodes. +/// all zeros. But if a VMO is attached to a backend ([`PageCacheBackend`]) upon creation, +/// then its memory pages will be populated by the backend. +/// With this backend mechanism, file systems can easily implement page caches +/// with VMOs by attaching the VMOs to backends backed by inodes. +/// +/// # Concurrency Guarantees +/// +/// A `Vmo` guarantees the correctness of each [`CachePage`]'s [`PageState`] +/// transitions (e.g., `Uninit` → `UpToDate` → `Dirty`). These transitions are +/// performed atomically under the page lock, ensuring that concurrent readers +/// and writers always observe a consistent page state. +/// +/// However, a `Vmo` does **not** guarantee atomicity of the page *contents* +/// with respect to concurrent reads and writes. In particular, when a page is +/// mapped into user-space address space, the kernel cannot prevent data races +/// between concurrent user-space memory accesses and kernel-side I/O operations +/// (e.g., `read`/`write` system calls or page fault handling). Callers that +/// require stronger consistency guarantees must provide their own +/// synchronization (e.g., file locks or application-level mutexes). /// /// # Examples /// @@ -56,16 +82,21 @@ pub use pager::Pager; /// # Implementation /// /// `Vmo` provides high-level APIs for address space management by wrapping -/// around its low-level counterpart [`ostd::mm::UFrame`]. -/// Compared with `UFrame`, +/// around its low-level counterpart [`CachePage`]. Compared with [`CachePage`], /// `Vmo` is easier to use (by offering more powerful APIs) and /// harder to misuse (thanks to its nature of being capability). +/// +/// [`PageState`]: crate::fs::utils::PageState pub struct Vmo { - pager: Option>, + /// The backend that provides disk I/O operations, if any. + /// + /// Using `Weak` here is to avoid circular references in exfat file systems. 
+ /// We should avoid the circular reference by design, and then we can change this to `Arc`. + backend: Option>, /// Flags flags: VmoFlags, /// The virtual pages where the VMO resides. - pages: XArray, + pages: XArray, /// The size of the VMO. /// /// Note: This size may not necessarily match the size of the `pages`, but it is @@ -84,6 +115,7 @@ pub struct Vmo { impl Debug for Vmo { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("Vmo") + .field("has_backend", &self.backend.is_some()) .field("flags", &self.flags) .field("size", &self.size) .field("writable_mapping_status", &self.writable_mapping_status) @@ -111,11 +143,22 @@ bitflags! { /// The error type used for commit operations of [`Vmo`]. #[derive(Debug)] pub enum VmoCommitError { - /// Represents a general error raised during the commit operation. + /// A general error occurred during the commit operation. Err(Error), - /// Represents that the commit operation need to do I/O operation on the - /// wrapped index. + /// The commit operation requires an I/O operation to read the page + /// from the backend. + /// + /// The wrapped value is the page index. NeedIo(usize), + /// Failed to lock the page because it's currently locked by another thread. + /// + /// Contains the page index and the page itself. + LockPageFailed(usize, CachePage), + /// The page exists but is not yet initialized. + /// + /// The caller should wait for initialization to complete. + /// Contains the page index and the page. + WaitUntilInit(usize, CachePage), } impl From for VmoCommitError { @@ -130,84 +173,54 @@ impl From for VmoCommitError { } } -bitflags! { - /// Commit Flags. - pub struct CommitFlags: u8 { - /// Set this flag if the page will be completely overwritten. - /// This flag contains the WILL_WRITE flag. - const WILL_OVERWRITE = 1; - } -} - -impl CommitFlags { - pub fn will_overwrite(&self) -> bool { - self.contains(Self::WILL_OVERWRITE) - } -} - impl Vmo { - /// Prepares a new `UFrame` for the target index in pages, returns this new frame. + /// Converts this VMO to a disk-backed VMO wrapper if it has a backend. /// - /// This operation may involve I/O operations if the VMO is backed by a pager. - fn prepare_page(&self, page_idx: usize, commit_flags: CommitFlags) -> Result { - match &self.pager { - None => Ok(FrameAllocOptions::new().alloc_frame()?.into()), - Some(pager) => { - if commit_flags.will_overwrite() { - pager.commit_overwrite(page_idx) - } else { - pager.commit_page(page_idx) - } - } - } + /// Returns `None` if this is an anonymous VMO. + pub fn as_disk_backed(&self) -> Option> { + self.backend.as_ref().and_then(|weak_backend| { + weak_backend + .upgrade() + .map(|backend| DiskBackedVmo { vmo: self, backend }) + }) } /// Commits a page at a specific page index. /// /// This method may involve I/O operations if the VMO needs to fetch a page from /// the underlying page cache. 
- pub fn commit_on(&self, page_idx: usize, commit_flags: CommitFlags) -> Result { - let new_page = self.prepare_page(page_idx, commit_flags)?; + pub fn commit_on(&self, page_idx: usize) -> Result { + self.commit_on_internal(page_idx, false) + } + fn commit_on_internal(&self, page_idx: usize, will_overwrite: bool) -> Result { let mut locked_pages = self.pages.lock(); if page_idx * PAGE_SIZE > self.size() { return_errno_with_message!(Errno::EINVAL, "the offset is outside the VMO"); } - let mut cursor = locked_pages.cursor_mut(page_idx as u64); - if let Some(page) = cursor.load() { - return Ok(page.clone()); + if let Some(disk_backed) = self.as_disk_backed() { + disk_backed.commit_on(locked_pages, page_idx, will_overwrite) + } else { + let mut cursor = locked_pages.cursor_mut(page_idx as u64); + if let Some(page) = cursor.load() { + return Ok(page.clone()); + } + + let new_page = CachePage::alloc_zero()?; + cursor.store(new_page.clone()); + + Ok(new_page) } - - cursor.store(new_page.clone()); - Ok(new_page) - } - - fn try_commit_with_cursor( - &self, - cursor: &mut Cursor<'_, UFrame>, - ) -> core::result::Result { - if let Some(committed_page) = cursor.load() { - return Ok(committed_page.clone()); - } - - if let Some(pager) = &self.pager { - // FIXME: Here `Vmo` treat all instructions in `pager` as I/O instructions - // since it needs to take the inner `Mutex` lock and users also cannot hold a - // `SpinLock` to do such instructions. This workaround may introduce some performance - // issues. In the future we should solve the redundancy of `Vmo` and the pagecache - // make sure return such error when really needing I/Os. - return Err(VmoCommitError::NeedIo(cursor.index() as usize)); - } - - let frame = self.commit_on(cursor.index() as usize, CommitFlags::empty())?; - Ok(frame) } /// Commits the page corresponding to the target offset in the VMO. /// /// If the commit operation needs to perform I/O, it will return a [`VmoCommitError::NeedIo`]. - pub fn try_commit_page(&self, offset: usize) -> core::result::Result { + pub fn try_commit_page( + &self, + offset: usize, + ) -> core::result::Result { let page_idx = offset / PAGE_SIZE; if offset >= self.size() { return Err(VmoCommitError::Err(Error::with_message( @@ -218,7 +231,30 @@ impl Vmo { let guard = disable_preempt(); let mut cursor = self.pages.cursor(&guard, page_idx as u64); - self.try_commit_with_cursor(&mut cursor) + self.try_commit_with_cursor(&mut cursor, false) + .map(|(_, page)| page) + } + + fn try_commit_with_cursor( + &self, + cursor: &mut Cursor<'_, CachePage>, + will_overwrite: bool, + ) -> core::result::Result<(usize, CachePage), VmoCommitError> { + if let Some(disk_backed) = self.as_disk_backed() { + if let Some((index, page)) = + disk_backed.try_commit_with_cursor(cursor, will_overwrite)? + { + return Ok((index, page)); + } + } else if let Some(page) = cursor.load() { + let index = cursor.index() as usize; + return Ok((index, page.clone())); + } + + // Need to commit. Only Anonymous VMOs can reach here, because disk-backed VMOs will return + // `Err` if the page is not loaded. + let index = cursor.index() as usize; + Ok((index, self.commit_on_internal(index, will_overwrite)?)) } /// Traverses the indices within a specified range of a VMO sequentially. 
@@ -230,11 +266,25 @@ impl Vmo { pub fn try_operate_on_range( &self, range: &Range, - mut operate: F, + operate: F, ) -> core::result::Result<(), VmoCommitError> where F: FnMut( - &mut dyn FnMut() -> core::result::Result, + &mut dyn FnMut() -> core::result::Result<(usize, CachePage), VmoCommitError>, + ) -> core::result::Result<(), VmoCommitError>, + { + self.try_operate_on_range_internal(range, operate, false) + } + + fn try_operate_on_range_internal( + &self, + range: &Range, + mut operate: F, + will_overwrite: bool, + ) -> core::result::Result<(), VmoCommitError> + where + F: FnMut( + &mut dyn FnMut() -> core::result::Result<(usize, CachePage), VmoCommitError>, ) -> core::result::Result<(), VmoCommitError>, { if range.end > self.size() { @@ -248,224 +298,231 @@ impl Vmo { let guard = disable_preempt(); let mut cursor = self.pages.cursor(&guard, page_idx_range.start as u64); for page_idx in page_idx_range { - let mut commit_fn = || self.try_commit_with_cursor(&mut cursor); + let mut commit_fn = || self.try_commit_with_cursor(&mut cursor, will_overwrite); operate(&mut commit_fn)?; cursor.next(); } Ok(()) } - /// Traverses the indices within a specified range of a VMO sequentially. - /// - /// For each index position, you have the option to commit the page as well as - /// perform other operations. - /// - /// This method may involve I/O operations if the VMO needs to fetch a page from - /// the underlying page cache. - fn operate_on_range( - &self, - mut range: Range, - mut operate: F, - commit_flags: CommitFlags, - ) -> Result<()> - where - F: FnMut( - &mut dyn FnMut() -> core::result::Result, - ) -> core::result::Result<(), VmoCommitError>, - { - 'retry: loop { - let res = self.try_operate_on_range(&range, &mut operate); - match res { - Ok(_) => return Ok(()), - Err(VmoCommitError::Err(e)) => return Err(e), - Err(VmoCommitError::NeedIo(index)) => { - self.commit_on(index, commit_flags)?; - range.start = index * PAGE_SIZE; - continue 'retry; - } - } - } - } - - /// Decommits a range of pages in the VMO. - /// - /// The range must be within the size of the VMO. - /// - /// The start and end addresses will be rounded down and up to page boundaries. - pub fn decommit(&self, range: Range) -> Result<()> { - let locked_pages = self.pages.lock(); - if range.end > self.size() { - return_errno_with_message!(Errno::EINVAL, "operated range exceeds the vmo size"); - } - - self.decommit_pages(locked_pages, range)?; - Ok(()) - } - - /// Reads the specified amount of buffer content starting from the target offset in the VMO. - pub fn read(&self, offset: usize, writer: &mut VmWriter) -> Result<()> { - let read_len = writer.avail().min(self.size().saturating_sub(offset)); - let read_range = offset..(offset + read_len); - let mut read_offset = offset % PAGE_SIZE; - - let read = - move |commit_fn: &mut dyn FnMut() -> core::result::Result| { - let frame = commit_fn()?; - frame - .reader() - .skip(read_offset) - .read_fallible(writer) - .map_err(|e| VmoCommitError::from(e.0))?; - read_offset = 0; - Ok(()) - }; - - self.operate_on_range(read_range, read, CommitFlags::empty()) - } - - /// Writes the specified amount of buffer content starting from the target offset in the VMO. 
- pub fn write(&self, offset: usize, reader: &mut VmReader) -> Result<()> { - let write_len = reader.remain(); - let write_range = offset..(offset + write_len); - let mut write_offset = offset % PAGE_SIZE; - let mut write = - move |commit_fn: &mut dyn FnMut() -> core::result::Result| { - let frame = commit_fn()?; - frame - .writer() - .skip(write_offset) - .write_fallible(reader) - .map_err(|e| VmoCommitError::from(e.0))?; - write_offset = 0; - Ok(()) - }; - - if write_range.len() < PAGE_SIZE { - self.operate_on_range(write_range.clone(), write, CommitFlags::empty())?; - } else { - let temp = write_range.start + PAGE_SIZE - 1; - let up_align_start = temp - temp % PAGE_SIZE; - let down_align_end = write_range.end - write_range.end % PAGE_SIZE; - if write_range.start != up_align_start { - let head_range = write_range.start..up_align_start; - self.operate_on_range(head_range, &mut write, CommitFlags::empty())?; - } - if up_align_start != down_align_end { - let mid_range = up_align_start..down_align_end; - self.operate_on_range(mid_range, &mut write, CommitFlags::WILL_OVERWRITE)?; - } - if down_align_end != write_range.end { - let tail_range = down_align_end..write_range.end; - self.operate_on_range(tail_range, &mut write, CommitFlags::empty())?; - } - } - - if let Some(pager) = &self.pager { - let page_idx_range = get_page_idx_range(&write_range); - for page_idx in page_idx_range { - pager.update_page(page_idx)?; - } - } - Ok(()) - } - - /// Clears the target range in current VMO by writing zeros. - pub fn clear(&self, range: Range) -> Result<()> { - let buffer = vec![0u8; range.end - range.start]; - let mut reader = VmReader::from(buffer.as_slice()).to_fallible(); - self.write(range.start, &mut reader)?; - Ok(()) - } - /// Returns the size of current VMO. pub fn size(&self) -> usize { self.size.load(Ordering::Acquire) } - /// Resizes current VMO to target size. - /// - /// The VMO must be resizable. - /// - /// The new size will be rounded up to page boundaries. - pub fn resize(&self, new_size: usize) -> Result<()> { - assert!(self.flags.contains(VmoFlags::RESIZABLE)); - let new_size = new_size.align_up(PAGE_SIZE); - - let locked_pages = self.pages.lock(); - - let old_size = self.size(); - if new_size == old_size { - return Ok(()); - } - - self.size.store(new_size, Ordering::Release); - - if new_size < old_size { - self.decommit_pages(locked_pages, new_size..old_size)?; - } - - Ok(()) - } - - fn decommit_pages( - &self, - mut locked_pages: LockedXArray, - range: Range, - ) -> Result<()> { - let page_idx_range = get_page_idx_range(&range); - let mut cursor = locked_pages.cursor_mut(page_idx_range.start as u64); - - let Some(pager) = &self.pager else { - cursor.remove(); - while let Some(page_idx) = cursor.next_present() - && page_idx < page_idx_range.end as u64 - { - cursor.remove(); - } - return Ok(()); - }; - - let mut removed_page_idx = Vec::new(); - if cursor.remove().is_some() { - removed_page_idx.push(page_idx_range.start); - } - while let Some(page_idx) = cursor.next_present() - && page_idx < page_idx_range.end as u64 - { - removed_page_idx.push(page_idx as usize); - cursor.remove(); - } - - drop(locked_pages); - - for page_idx in removed_page_idx { - pager.decommit_page(page_idx)?; - } - - Ok(()) - } - /// Returns the flags of current VMO. pub fn flags(&self) -> VmoFlags { self.flags } - /// Replaces the page at the `page_idx` in the VMO with the input `page`. 
- fn replace(&self, page: UFrame, page_idx: usize) -> Result<()> { - let mut locked_pages = self.pages.lock(); - if page_idx >= self.size() / PAGE_SIZE { - return_errno_with_message!(Errno::EINVAL, "the page index is outside of the vmo"); + /// Returns the status of writable mappings of the VMO. + pub fn writable_mapping_status(&self) -> &WritableMappingStatus { + // Currently, only VMOs used by `MemfdInode` (anonymous) track writable mapping status. + // Disk-backed VMOs do not use this field. + debug_assert!(!self.is_disk_backed()); + &self.writable_mapping_status + } + + fn decommit_pages( + &self, + mut locked_pages: LockedXArray, + range: Range, + ) -> Result<()> { + let page_idx_range = get_page_idx_range(&range); + let mut cursor = locked_pages.cursor_mut(page_idx_range.start as u64); + + loop { + cursor.remove(); + let page_idx = cursor.next_present(); + if page_idx.is_none_or(|idx| idx >= page_idx_range.end as u64) { + break; + } } - locked_pages.store(page_idx as u64, page); Ok(()) } - /// Returns the status of writable mappings of the VMO. - pub fn writable_mapping_status(&self) -> &WritableMappingStatus { - // Only writable file-backed mappings may need to be tracked. - debug_assert!(self.pager.is_some()); - &self.writable_mapping_status + /// Returns whether this VMO is disk-backed. + fn is_disk_backed(&self) -> bool { + self.backend.is_some() + } +} + +impl Vmo { + /// Reads the specified amount of buffer content starting from the target offset in the VMO. + pub fn read(&self, offset: usize, writer: &mut VmWriter) -> Result<()> { + let read_len = writer.avail().min(self.size().saturating_sub(offset)); + let mut read_range = offset..(offset + read_len); + let mut read_offset = offset % PAGE_SIZE; + + let mut read = move |commit_fn: &mut dyn FnMut() -> core::result::Result< + (usize, CachePage), + VmoCommitError, + >| { + let (_, page) = commit_fn()?; + page.reader() + .skip(read_offset) + .read_fallible(writer) + .map_err(|e| VmoCommitError::from(e.0))?; + read_offset = 0; + Ok(()) + }; + + 'retry: loop { + let res = self.try_operate_on_range(&read_range, &mut read); + match res { + Ok(_) => return Ok(()), + Err(VmoCommitError::Err(e)) => return Err(e), + Err(VmoCommitError::NeedIo(index)) => { + self.commit_on(index)?; + read_range.start = index * PAGE_SIZE; + continue 'retry; + } + Err(VmoCommitError::WaitUntilInit(index, cache_page)) => { + cache_page.ensure_init(|locked_page| { + self.as_disk_backed() + .unwrap() + .backend + .read_page(index, locked_page) + })?; + read_range.start = index * PAGE_SIZE; + continue 'retry; + } + _ => unreachable!(), + } + } + } + + /// Writes the specified amount of buffer content starting from the target offset in the VMO. 
+ pub fn write(&self, offset: usize, reader: &mut VmReader) -> Result<()> { + let write_len = reader.remain(); + let write_range = offset..(offset + write_len); + let mut write_offset = offset % PAGE_SIZE; + + if !self.is_disk_backed() { + // Fast path for anonymous VMOs + let write = move |commit_fn: &mut dyn FnMut() -> core::result::Result< + (usize, CachePage), + VmoCommitError, + >| { + let (_, page) = commit_fn()?; + page.writer() + .skip(write_offset) + .write_fallible(reader) + .map_err(|e| VmoCommitError::from(e.0))?; + write_offset = 0; + Ok(()) + }; + + return self.write_on_range( + write_range.clone(), + write, + Option:: Result<()>>::None, + false, + ); + } + + // Slow path for disk-backed VMOs (with dirty tracking) + let reader = RefCell::new(reader); + let write_offset = RefCell::new(write_offset); + let mut write = |commit_fn: &mut dyn FnMut() -> core::result::Result< + (usize, CachePage), + VmoCommitError, + >| { + let (index, page) = commit_fn()?; + let locked_page = page + .try_lock() + .ok_or_else(|| VmoCommitError::LockPageFailed(index, page))?; + locked_page.set_dirty(); + locked_page + .writer() + .skip(*write_offset.borrow()) + .write_fallible(&mut reader.borrow_mut()) + .map_err(|e| VmoCommitError::from(e.0))?; + *write_offset.borrow_mut() = 0; + Ok(()) + }; + + let mut fallback_write = |locked_page: &LockedCachePage| { + locked_page.set_dirty(); + locked_page + .writer() + .skip(*write_offset.borrow()) + .write_fallible(&mut reader.borrow_mut())?; + *write_offset.borrow_mut() = 0; + Ok(()) + }; + + if write_range.len() < PAGE_SIZE { + self.write_on_range(write_range.clone(), write, Some(fallback_write), false)?; + } else { + // Split into head (unaligned), middle (aligned), and tail (unaligned) + let temp = write_range.start + PAGE_SIZE - 1; + let up_align_start = temp - temp % PAGE_SIZE; + let down_align_end = write_range.end - write_range.end % PAGE_SIZE; + + if write_range.start != up_align_start { + let head_range = write_range.start..up_align_start; + self.write_on_range(head_range, &mut write, Some(&mut fallback_write), false)?; + } + if up_align_start != down_align_end { + // Middle part is page-aligned and will be completely overwritten + let mid_range = up_align_start..down_align_end; + self.write_on_range(mid_range, &mut write, Some(&mut fallback_write), true)?; + } + if down_align_end != write_range.end { + let tail_range = down_align_end..write_range.end; + self.write_on_range(tail_range, &mut write, Some(&mut fallback_write), false)?; + } + } + + Ok(()) + } + + fn write_on_range( + &self, + mut range: Range, + mut operate: F1, + mut fallback: Option, + will_overwrite: bool, + ) -> Result<()> + where + F1: FnMut( + &mut dyn FnMut() -> core::result::Result<(usize, CachePage), VmoCommitError>, + ) -> core::result::Result<(), VmoCommitError>, + F2: FnMut(&LockedCachePage) -> Result<()>, + { + 'retry: loop { + let res = self.try_operate_on_range_internal(&range, &mut operate, will_overwrite); + match res { + Ok(_) => return Ok(()), + Err(VmoCommitError::Err(e)) => return Err(e), + Err(VmoCommitError::NeedIo(index)) => { + self.commit_on_internal(index, will_overwrite)?; + range.start = index * PAGE_SIZE; + continue 'retry; + } + Err(VmoCommitError::WaitUntilInit(index, cache_page)) => { + cache_page.ensure_init(|locked_page| { + self.as_disk_backed() + .unwrap() + .backend + .read_page(index, locked_page) + })?; + range.start = index * PAGE_SIZE; + continue 'retry; + } + Err(VmoCommitError::LockPageFailed(index, cache_page)) => { + let Some(fallback) = &mut 
fallback else { + unreachable!() + }; + let locked_page = cache_page.lock(); + fallback(&locked_page)?; + range.start = (index + 1) * PAGE_SIZE; + continue 'retry; + } + } + } } } @@ -498,6 +555,169 @@ impl VmIoFill for Vmo { } } +/// A wrapper around a disk-backed VMO that provides specialized operations. +/// +/// This structure is created by calling [`Vmo::as_disk_backed()`] and provides +/// access to disk-backed specific functionality like reading from storage and +/// managing dirty pages. +pub struct DiskBackedVmo<'a> { + vmo: &'a Vmo, + backend: Arc, +} + +impl<'a> DiskBackedVmo<'a> { + /// Commits a page at the given index for a disk-backed VMO. + fn commit_on( + &self, + mut locked_pages: LockedXArray<'_, CachePage>, + page_idx: usize, + will_overwrite: bool, + ) -> Result { + let mut cursor = locked_pages.cursor_mut(page_idx as u64); + if let Some(page) = cursor.load() { + let page = page.clone(); + if self.backend.npages() > page_idx { + drop(locked_pages); + if !will_overwrite { + page.ensure_init(|locked_page| self.backend.read_page(page_idx, locked_page))?; + } + } + + return Ok(page); + }; + + // Page is within the file bounds - need to read from backend + if self.backend.npages() > page_idx { + let new_page = CachePage::alloc_uninit()?; + // Acquiring the lock from a new page must succeed. + let locked_page = new_page.try_lock().unwrap(); + + cursor.store(locked_page.clone()); + + drop(locked_pages); + + if will_overwrite { + // Page will be completely overwritten, no need to read + Ok(locked_page.unlock()) + } else { + // Read the page from backend storage + self.backend.read_page(page_idx, locked_page)?; + Ok(new_page) + } + } else { + // Page is beyond file bounds - treat as hole (zero-filled) + let new_page = CachePage::alloc_zero()?; + cursor.store(new_page.clone()); + + Ok(new_page) + } + } + + /// Attempts to commit a page using a cursor, without blocking on I/O. + fn try_commit_with_cursor( + &self, + cursor: &mut Cursor<'_, CachePage>, + will_overwrite: bool, + ) -> core::result::Result, VmoCommitError> { + let page_idx = cursor.index() as usize; + + let Some(page) = cursor.load() else { + return Err(VmoCommitError::NeedIo(page_idx)); + }; + + // If page is within file bounds, check if it's initialized + if !will_overwrite && self.backend.npages() > page_idx && page.is_uninit() { + return Err(VmoCommitError::WaitUntilInit(page_idx, page.clone())); + } + + Ok(Some((page_idx, page.clone()))) + } + + /// Collects dirty pages in the specified byte range. + /// + /// If `remove` is `true`, the pages will be removed from the XArray while + /// being collected. Otherwise, the pages are only read. 
+    fn collect_dirty_pages(
+        &self,
+        range: &Range<usize>,
+        remove: bool,
+    ) -> Result<Vec<(usize, CachePage)>> {
+        let mut locked_pages = self.vmo.pages.lock();
+        if range.start > self.size() {
+            return Ok(Vec::new());
+        }
+
+        let page_idx_range = get_page_idx_range(range);
+        let npages = self.backend.npages();
+        if page_idx_range.start >= npages {
+            return Ok(Vec::new());
+        }
+
+        let mut dirty_pages = Vec::new();
+
+        if remove {
+            let mut cursor = locked_pages.cursor_mut(page_idx_range.start as u64);
+            if let Some(page) = cursor.remove()
+                && page.is_dirty()
+            {
+                dirty_pages.push((page_idx_range.start, page.clone()));
+            }
+
+            while let Some(page_idx) = cursor.next_present() {
+                let page_idx = page_idx as usize;
+                if page_idx >= page_idx_range.end || page_idx >= npages {
+                    break;
+                }
+
+                let page = cursor.remove().unwrap();
+                if page.is_dirty() {
+                    dirty_pages.push((page_idx, page.clone()));
+                }
+            }
+        } else {
+            let mut cursor = locked_pages.cursor(page_idx_range.start as u64);
+            if let Some(page) = cursor.load()
+                && page.is_dirty()
+            {
+                dirty_pages.push((page_idx_range.start, page.clone()));
+            }
+
+            while let Some(page_idx) = cursor.next_present() {
+                let page_idx = page_idx as usize;
+                if page_idx >= page_idx_range.end || page_idx >= npages {
+                    break;
+                }
+
+                let page = cursor.load().unwrap();
+                if page.is_dirty() {
+                    dirty_pages.push((page_idx, page.clone()));
+                }
+            }
+        }
+
+        Ok(dirty_pages)
+    }
+
+    /// Writes back a collection of dirty pages to the backend storage.
+    fn write_back_pages(&self, dirty_pages: Vec<(usize, CachePage)>) -> Result<()> {
+        for (page_idx, page) in dirty_pages {
+            let locked_page = page.lock();
+            if locked_page.is_dirty() {
+                self.backend.write_page(page_idx, locked_page)?;
+            }
+        }
+        Ok(())
+    }
+}
+
+impl Deref for DiskBackedVmo<'_> {
+    type Target = Vmo;
+
+    fn deref(&self) -> &Self::Target {
+        self.vmo
+    }
+}
+
 /// Gets the page index range that contains the offset range of VMO.
 pub fn get_page_idx_range(vmo_offset_range: &Range<usize>) -> Range<usize> {
     let start = vmo_offset_range.start.align_down(PAGE_SIZE);
diff --git a/kernel/src/vm/vmo/page_cache.rs b/kernel/src/vm/vmo/page_cache.rs
new file mode 100644
index 000000000..5d79942ec
--- /dev/null
+++ b/kernel/src/vm/vmo/page_cache.rs
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: MPL-2.0
+
+use core::{ops::Range, sync::atomic::Ordering};
+
+use align_ext::AlignExt;
+use ostd::mm::VmIoFill;
+
+use crate::{
+    fs::utils::{PageCacheBackend, PageCacheOps},
+    prelude::*,
+    vm::vmo::{Vmo, VmoFlags, VmoOptions},
+};
+
+impl PageCacheOps for Vmo {
+    fn with_capacity(capacity: usize, backend: Weak<dyn PageCacheBackend>) -> Result<Arc<Self>> {
+        VmoOptions::new(capacity)
+            .flags(VmoFlags::RESIZABLE)
+            .backend(backend)
+            .alloc()
+    }
+
+    // TODO: This method also needs to unmap the decommitted pages from the page tables.
+    fn resize(&self, new_size: usize, old_size: usize) -> Result<()> {
+        assert!(self.flags.contains(VmoFlags::RESIZABLE));
+
+        if new_size < old_size && !new_size.is_multiple_of(PAGE_SIZE) {
+            let fill_zero_end = old_size.min(new_size.align_up(PAGE_SIZE));
+            PageCacheOps::fill_zeros(self, new_size..fill_zero_end)?;
+        }
+
+        let new_size = new_size.align_up(PAGE_SIZE);
+
+        let locked_pages = self.pages.lock();
+
+        let old_size = self.size();
+        if new_size == old_size {
+            return Ok(());
+        }
+
+        self.size.store(new_size, Ordering::Release);
+
+        if new_size < old_size {
+            self.decommit_pages(locked_pages, new_size..old_size)?;
+        }
+
+        Ok(())
+    }
+
+    fn flush_range(&self, range: Range<usize>) -> Result<()> {
+        let Some(vmo) = self.as_disk_backed() else {
+            return Ok(());
+        };
+
+        let dirty_pages = vmo.collect_dirty_pages(&range, false)?;
+        vmo.write_back_pages(dirty_pages)
+    }
+
+    // TODO: This method also needs to unmap the discarded pages from the page tables.
+    fn discard_range(&self, range: Range<usize>) -> Result<()> {
+        let Some(vmo) = self.as_disk_backed() else {
+            return Ok(());
+        };
+
+        let dirty_pages = vmo.collect_dirty_pages(&range, true)?;
+        vmo.write_back_pages(dirty_pages)
+    }
+
+    fn fill_zeros(&self, range: Range<usize>) -> Result<()> {
+        VmIoFill::fill_zeros(self, range.start, range.end - range.start)?;
+        Ok(())
+    }
+}
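
As a usage sketch for the API introduced above: a minimal, hypothetical `PageCacheBackend` plus a driver function. `ZeroBackend`, `demo`, and the exact import paths are invented for illustration; the trait signatures, the empty-`BioWaiter` convention for backends that complete synchronously, and the rule that dropping a `LockedCachePage` releases the per-page lock are taken from the patch itself. That the backend is the party responsible for the `Uninit -> UpToDate` (and `Dirty -> UpToDate`) transitions is an assumption of this sketch.

use aster_block::bio::BioWaiter;
use ostd::mm::{io_util::HasVmReaderWriter, VmReader};

use crate::{
    fs::utils::{LockedCachePage, PageCache, PageCacheBackend, PageCacheOps},
    prelude::*,
    vm::vmo::Vmo,
};

/// A toy backend: every page reads back as zeros and write-backs are discarded.
struct ZeroBackend {
    npages: usize,
}

impl PageCacheBackend for ZeroBackend {
    fn read_page_async(&self, _idx: usize, page: LockedCachePage) -> Result<BioWaiter> {
        // Cache pages are allocated uninitialized, so fill the page before
        // declaring it up to date.
        let zeros = [0u8; PAGE_SIZE];
        page.writer()
            .write_fallible(&mut VmReader::from(&zeros[..]).to_fallible())?;
        page.set_up_to_date();
        // Dropping the `LockedCachePage` releases the per-page lock and wakes waiters.
        drop(page);
        // No bio was submitted, so hand back an empty waiter (RamFs-style behavior).
        Ok(BioWaiter::new())
    }

    fn write_page_async(&self, _idx: usize, page: LockedCachePage) -> Result<BioWaiter> {
        // Nothing to persist; assume the backend performs the Dirty -> UpToDate
        // transition once the data is safely "on disk".
        page.set_up_to_date();
        drop(page);
        Ok(BioWaiter::new())
    }

    fn npages(&self) -> usize {
        self.npages
    }
}

fn demo() -> Result<()> {
    // In a real file system the inode would implement `PageCacheBackend` and own
    // the page cache; a standalone backend keeps the example short. The VMO holds
    // only a `Weak` reference, so the caller must keep the `Arc` alive.
    let backend = Arc::new(ZeroBackend { npages: 16 });
    let weak: Weak<dyn PageCacheBackend> = Arc::downgrade(&backend);

    // The page cache is just a resizable, disk-backed VMO attached to the backend.
    let cache: PageCache = Vmo::with_capacity(16 * PAGE_SIZE, weak)?;

    // Persist any dirty pages, then shrink the cache: the partially truncated
    // page is zero-filled and fully truncated pages are decommitted.
    cache.flush_range(0..cache.size())?;
    cache.resize(8 * PAGE_SIZE, 16 * PAGE_SIZE)?;
    Ok(())
}

In a real disk-based file system, `read_page_async`/`write_page_async` would submit bios to the block device and return the resulting `BioWaiter` instead of completing in place.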
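The per-page lock bit and the hashed wait-queue array (`PAGE_WAIT_QUEUES`) can also be illustrated in isolation. The sketch below is plain `std` Rust, not kernel code: `WaitQueue` here is a simplified stand-in for `ostd::sync::WaitQueue`, and the names are invented. In the kernel the hash key is the page's physical address, which is page-aligned, so the sketch shifts out the low 12 bits before masking (they carry no information for page-aligned keys).

use std::sync::{
    atomic::{AtomicBool, Ordering},
    Condvar, Mutex,
};

const QUEUE_MASK: usize = 0xff;
const QUEUE_NUM: usize = QUEUE_MASK + 1;

/// A simplified wait queue: a condition variable plus a dummy mutex.
struct WaitQueue {
    mutex: Mutex<()>,
    condvar: Condvar,
}

impl WaitQueue {
    const fn new() -> Self {
        Self {
            mutex: Mutex::new(()),
            condvar: Condvar::new(),
        }
    }

    fn wait_until(&self, mut cond: impl FnMut() -> bool) {
        let mut guard = self.mutex.lock().unwrap();
        while !cond() {
            guard = self.condvar.wait(guard).unwrap();
        }
    }

    fn wake_all(&self) {
        // Taking the mutex before notifying closes the race with `wait_until`.
        let _guard = self.mutex.lock().unwrap();
        self.condvar.notify_all();
    }
}

/// One shared array of queues serves every page; no per-page queue is allocated.
static WAIT_QUEUES: [WaitQueue; QUEUE_NUM] = [const { WaitQueue::new() }; QUEUE_NUM];

struct Page {
    locked: AtomicBool,
}

impl Page {
    fn wait_queue(&self) -> &'static WaitQueue {
        // Hash the page's address into one of the shared queues.
        &WAIT_QUEUES[((self as *const Page as usize) >> 12) & QUEUE_MASK]
    }

    fn try_lock(&self) -> bool {
        self.locked
            .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
            .is_ok()
    }

    fn lock(&self) {
        self.wait_queue().wait_until(|| self.try_lock());
    }

    fn unlock(&self) {
        self.locked.store(false, Ordering::Release);
        self.wait_queue().wake_all();
    }
}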
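The head/middle/tail split used by the disk-backed write path (so that only the page-aligned middle is committed with `will_overwrite` and never triggers a read-modify-write) boils down to a little alignment arithmetic. A self-contained version, assuming a 4 KiB page size (`split_write_range` is an invented helper name):

use core::ops::Range;

const PAGE_SIZE: usize = 4096;

/// Splits a byte range into (head, middle, tail); empty sub-ranges become `None`.
/// Only the middle part is page-aligned and may be committed as a full overwrite.
fn split_write_range(
    range: Range<usize>,
) -> (Option<Range<usize>>, Option<Range<usize>>, Option<Range<usize>>) {
    if range.len() < PAGE_SIZE {
        // Too small to contain a whole page; treat everything as "head".
        return (Some(range), None, None);
    }
    let up_align_start = range.start.next_multiple_of(PAGE_SIZE);
    let down_align_end = range.end - range.end % PAGE_SIZE;
    let head = (range.start != up_align_start).then(|| range.start..up_align_start);
    let middle = (up_align_start != down_align_end).then(|| up_align_start..down_align_end);
    let tail = (down_align_end != range.end).then(|| down_align_end..range.end);
    (head, middle, tail)
}

// For example, splitting 1000..13000 yields head = 1000..4096,
// middle = 4096..12288 (two whole pages), and tail = 12288..13000.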