// SPDX-License-Identifier: MPL-2.0
//! The page table cursor for mapping and querying over the page table.
//!
//! # The page table lock protocol
//!
//! We provide a fine-grained ranged mutual-exclusive lock protocol to allow
//! concurrent accesses to non-overlapping virtual ranges in the page table.
//!
//! [`CursorMut::new`] will lock a range in the virtual address space, and all
//! the operations on the range with the cursor will be atomic, like a transaction.
//!
//! The guarantee of the lock protocol is that, if two cursors' ranges overlap,
//! all of one cursor's operations must finish before any of the other's
//! begin. The order depends on the scheduling of the threads. If a cursor
//! is ordered after another cursor, it will see all the changes made by the
//! previous cursor.
//!
//! The implementation of the lock protocol resembles two-phase locking (2PL).
//! [`CursorMut::new`] accepts an address range, which indicates the page table
//! entries that may be visited by this cursor. Then, [`CursorMut::new`] finds
//! an intermediate page table (not necessarily the last-level or the top-
//! level) which represents an address range that fully contains the whole
//! specified address range. Then it locks all the nodes in the sub-tree rooted
//! at the intermediate page table node, with a pre-order DFS order. The cursor
//! will only be able to access the page table entries in the locked range.
//! Upon destruction, the cursor will release the locks in the reverse order of
//! acquisition.
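//!
//! A minimal usage sketch (illustrative only; `ExampleConfig` is a
//! hypothetical [`PageTableConfig`], and the page table `pt` and the
//! atomic-mode guard `guard` are assumed to be set up elsewhere):
//!
//! ```ignore
//! // Cursors over non-overlapping ranges may proceed concurrently, while
//! // cursors over overlapping ranges are serialized by the lock protocol.
//! let mut low = CursorMut::<ExampleConfig>::new(&pt, guard, &(0x0..0x10_0000)).unwrap();
//! let mut high = CursorMut::<ExampleConfig>::new(&pt, guard, &(0x10_0000..0x20_0000)).unwrap();
//! // All of `low`'s operations appear to `high` as a single transaction,
//! // and vice versa.
//! ```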
mod locking;
use core::{fmt::Debug, marker::PhantomData, mem::ManuallyDrop, ops::Range};
use align_ext::AlignExt;
use super::{
Entry, PageTable, PageTableConfig, PageTableError, PageTableGuard, PagingConstsTrait,
PagingLevel, PteState, PteStateRef, page_size, pte_index,
};
use crate::{
mm::{
PageProperty, Vaddr,
page_table::{PageTableNode, is_valid_range},
},
sync::RcuDrop,
task::atomic_mode::InAtomicMode,
};
/// The cursor for traversal over the page table.
///
/// A slot is a PTE at any level, which corresponds to a virtual
/// memory range whose size is the page size of that level.
///
/// A cursor is able to move to the next slot, to read page properties,
/// and even to jump to a virtual address directly.
#[derive(Debug)]
pub(crate) struct Cursor<'rcu, C: PageTableConfig> {
/// The current path of the cursor.
///
/// The level 1 page table lock guard is at index 0, and the level N page
/// table lock guard is at index N - 1.
path: [Option<PageTableGuard<'rcu, C>>; MAX_NR_LEVELS],
/// The cursor should be used in an RCU read-side critical section.
rcu_guard: &'rcu dyn InAtomicMode,
/// The level of the page table that the cursor currently points to.
level: PagingLevel,
/// The top-most level that the cursor is allowed to access.
///
/// From `level` to `guard_level`, the nodes are held in `path`.
guard_level: PagingLevel,
/// The virtual address that the cursor currently points to.
va: Vaddr,
/// The virtual address range that is locked.
barrier_va: Range<Vaddr>,
_phantom: PhantomData<&'rcu PageTable<C>>,
}
/// The maximum value of `PagingConstsTrait::NR_LEVELS`.
const MAX_NR_LEVELS: usize = 4;
/// A fragment of a page table that can be taken out of the page table.
#[derive(Debug)]
#[must_use]
pub(crate) enum PageTableFrag<C: PageTableConfig> {
/// A mapped page table item.
Mapped { va: Vaddr, item: RcuDrop<C::Item> },
/// A sub-tree of a page table that is taken out of the page table.
///
/// The caller is responsible for dropping it after TLB coherence.
StrayPageTable {
pt: RcuDrop<PageTableNode<C>>,
va: Vaddr,
len: usize,
num_frames: usize,
},
}
impl<'rcu, C: PageTableConfig> Cursor<'rcu, C> {
/// Creates a cursor claiming exclusive access over the given range.
///
/// The cursor created will only be able to query or jump within the given
/// range. Out-of-bounds accesses will result in panics or returned errors,
/// depending on the access method.
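///
/// A sketch of creating a read-only cursor (illustrative; `pt`, `guard` and
/// the hypothetical `ExampleConfig` are assumed to be set up elsewhere):
///
/// ```ignore
/// // Both bounds must be aligned to `C::BASE_PAGE_SIZE` and lie within the
/// // valid virtual address range; otherwise an error is returned.
/// let mut cursor = Cursor::<ExampleConfig>::new(&pt, guard, &(start..end))?;
/// ```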
pub fn new(
pt: &'rcu PageTable<C>,
guard: &'rcu dyn InAtomicMode,
va: &Range<Vaddr>,
) -> Result<Self, PageTableError> {
if !is_valid_range::<C>(va) || va.is_empty() {
return Err(PageTableError::InvalidVaddrRange(va.start, va.end));
}
if !va.start.is_multiple_of(C::BASE_PAGE_SIZE) || !va.end.is_multiple_of(C::BASE_PAGE_SIZE)
{
return Err(PageTableError::UnalignedVaddr);
}
const { assert!(C::NR_LEVELS as usize <= MAX_NR_LEVELS) };
Ok(locking::lock_range(pt, guard, va))
}
/// Gets the current virtual address.
pub fn virt_addr(&self) -> Vaddr {
self.va
}
/// Queries the mapping at the current virtual address.
///
/// If the cursor is pointing to a valid virtual address that is locked,
/// it will return the virtual address range and the item at that slot.
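///
/// A sketch of inspecting the current slot (illustrative; the cursor is
/// assumed to have been created over a locked range):
///
/// ```ignore
/// let (va_range, item) = cursor.query()?;
/// match item {
///     Some(item_ref) => { /* `va_range` is mapped; inspect `item_ref` */ }
///     None => { /* nothing is mapped at this slot */ }
/// }
/// ```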
pub fn query(&mut self) -> Result<PagesState<'rcu, C>, PageTableError> {
if self.va >= self.barrier_va.end {
return Err(PageTableError::InvalidVaddr(self.va));
}
let rcu_guard = self.rcu_guard;
loop {
let cur_entry = self.cur_entry();
let item = match cur_entry.to_ref() {
PteStateRef::PageTable(pt) => {
// SAFETY: The `pt` must be locked and no other guards exist.
let guard = unsafe { pt.make_guard_unchecked(rcu_guard) };
self.push_level(guard);
continue;
}
PteStateRef::Absent => None,
PteStateRef::Mapped(item) => Some(item),
};
return Ok((self.cur_va_range(), item));
}
}
/// Moves the cursor forward to the next mapped virtual address.
///
/// If there is a mapped virtual address following the current address within
/// the next `len` bytes, it will return that mapped address. In this case, the
/// cursor will stop at the mapped address.
///
/// Otherwise, it will return `None`, and the cursor may stop at any
/// address after `len` bytes.
///
/// # Panics
///
/// Panics if:
/// - the length is longer than the remaining range of the cursor;
/// - the length is not page-aligned.
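///
/// A sketch of enumerating all mapped slots in the locked range
/// (illustrative; `end` is the page-aligned end of the locked range):
///
/// ```ignore
/// while let Some(_va) = cursor.find_next(end - cursor.virt_addr()) {
///     let (range, item) = cursor.query()?;
///     // ... use `range` and `item` ...
///     if range.end >= end {
///         break;
///     }
///     cursor.jump(range.end)?;
/// }
/// ```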
pub fn find_next(&mut self, len: usize) -> Option<Vaddr> {
self.find_next_impl(len, false, false)
}
/// Moves the cursor forward to the next fragment in the range.
///
/// See [`Self::find_next`] for more details. Other than the semantics
/// provided by [`Self::find_next`], this method also supports finding non-
/// leaf entries and splitting huge pages if necessary.
///
/// `find_unmap_subtree` specifies whether the cursor should stop at the
/// highest possible level for unmapping. If `false`, the cursor will only
/// stop at leaf entries.
///
/// `split_huge` specifies whether the cursor should split huge pages when
/// it finds a huge page that is mapped over the required range (`len`).
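///
/// For instance, [`CursorMut::take_next`] calls this with
/// `find_unmap_subtree = true` and `split_huge = true`, while
/// [`CursorMut::protect_next`] passes `find_unmap_subtree = false` and
/// `split_huge = true`.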
fn find_next_impl(
&mut self,
len: usize,
find_unmap_subtree: bool,
split_huge: bool,
) -> Option<Vaddr> {
assert_eq!(len % C::BASE_PAGE_SIZE, 0);
let end = self.va + len;
assert!(end <= self.barrier_va.end);
debug_assert_eq!(end % C::BASE_PAGE_SIZE, 0);
let rcu_guard = self.rcu_guard;
while self.va < end {
let cur_va = self.va;
let cur_va_range = self.cur_va_range();
let cur_entry_fits_range = cur_va == cur_va_range.start && cur_va_range.end <= end;
let mut cur_entry = self.cur_entry();
match cur_entry.to_ref() {
PteStateRef::PageTable(pt) => {
if find_unmap_subtree
&& cur_entry_fits_range
&& (C::TOP_LEVEL_CAN_UNMAP || self.level != C::NR_LEVELS)
{
return Some(cur_va);
}
// SAFETY: The `pt` must be locked and no other guards exist.
let pt_guard = unsafe { pt.make_guard_unchecked(rcu_guard) };
// If there are no mapped PTEs in the next level, we can
// skip it to save time.
if pt_guard.nr_children() != 0 {
self.push_level(pt_guard);
} else {
let _ = ManuallyDrop::new(pt_guard);
self.move_forward();
}
continue;
}
PteStateRef::Absent => {
self.move_forward();
continue;
}
PteStateRef::Mapped(_) => {
if cur_entry_fits_range || !split_huge {
return Some(cur_va);
}
let split_child = cur_entry
.split_if_mapped_huge(rcu_guard)
.expect("The entry must be a huge page");
self.push_level(split_child);
continue;
}
}
}
None
}
/// Jumps to the given virtual address.
///
/// If the target address is out of the range, this method will return `Err`.
///
/// # Panics
///
/// This method panics if the address has bad alignment.
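///
/// A sketch (illustrative; `va` must be aligned to `C::BASE_PAGE_SIZE`):
///
/// ```ignore
/// if cursor.jump(va).is_err() {
///     // `va` lies outside the range that this cursor has locked.
/// }
/// ```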
pub fn jump(&mut self, va: Vaddr) -> Result<(), PageTableError> {
assert!(va.is_multiple_of(C::BASE_PAGE_SIZE));
if !self.barrier_va.contains(&va) {
return Err(PageTableError::InvalidVaddr(va));
}
// FIXME: Maintain the `self.barrier_va.contains(self.va)` invariant:
// <https://github.com/asterinas/asterinas/pull/2613>.
if self.va == self.barrier_va.end {
while self.level < self.guard_level {
self.pop_level();
}
self.va = va;
return Ok(());
}
debug_assert!(self.barrier_va.contains(&self.va));
loop {
let node_size = page_size::<C>(self.level + 1);
let node_start = self.va.align_down(node_size);
// If the address is within the current node, we can jump directly.
if node_start <= va && va < node_start + node_size {
self.va = va;
return Ok(());
}
self.pop_level();
}
}
/// Traverses forward to the end of [`Self::cur_va_range`].
///
/// If it has reached the end of the current page table node, it (recursively)
/// moves up to the parent node and continues from the parent's next entry.
fn move_forward(&mut self) {
let next_va = self.cur_va_range().end;
while self.level < self.guard_level && pte_index::<C>(next_va, self.level) == 0 {
self.pop_level();
}
self.va = next_va;
}
/// Goes up a level.
fn pop_level(&mut self) {
let taken = self.path[self.level as usize - 1]
.take()
.expect("Popping a level without a lock");
let _ = ManuallyDrop::new(taken);
debug_assert!(self.level < self.guard_level);
self.level += 1;
}
/// Goes down a level to a child page table.
fn push_level(&mut self, child_pt: PageTableGuard<'rcu, C>) {
self.level -= 1;
debug_assert_eq!(self.level, child_pt.level());
let old = self.path[self.level as usize - 1].replace(child_pt);
debug_assert!(old.is_none());
}
fn cur_entry(&mut self) -> Entry<'_, 'rcu, C> {
let node = self.path[self.level as usize - 1].as_mut().unwrap();
node.entry(pte_index::<C>(self.va, self.level))
}
/// Gets the virtual address range that the current entry covers.
fn cur_va_range(&self) -> Range<Vaddr> {
let entry_size = page_size::<C>(self.level);
let entry_start = self.va.align_down(entry_size);
entry_start..entry_start + entry_size
}
}
impl<C: PageTableConfig> Drop for Cursor<'_, C> {
fn drop(&mut self) {
locking::unlock_range(self);
}
}
/// The state of virtual pages represented by a page table.
///
/// This is the return type of the [`Cursor::query`] method.
pub type PagesState<'a, C> = (Range<Vaddr>, Option<<C as PageTableConfig>::ItemRef<'a>>);
/// The cursor of a page table that is capable of map, unmap or protect pages.
///
/// It has all the capabilities of a [`Cursor`], which can navigate over the
/// page table corresponding to the address range. A virtual address range
/// in a page table can only be accessed by one cursor, regardless of the
/// mutability of the cursor.
#[derive(Debug)]
pub(crate) struct CursorMut<'rcu, C: PageTableConfig>(Cursor<'rcu, C>);
impl<'rcu, C: PageTableConfig> CursorMut<'rcu, C> {
/// Creates a cursor claiming exclusive access over the given range.
///
/// The cursor created will only be able to map, query or jump within the given
/// range. Out-of-bounds accesses will result in panics or returned errors,
/// depending on the access method.
pub(super) fn new(
pt: &'rcu PageTable<C>,
guard: &'rcu dyn InAtomicMode,
va: &Range<Vaddr>,
) -> Result<Self, PageTableError> {
Cursor::new(pt, guard, va).map(|inner| Self(inner))
}
/// Moves the cursor forward to the next mapped virtual address.
///
/// This is the same as [`Cursor::find_next`].
pub fn find_next(&mut self, len: usize) -> Option<Vaddr> {
self.0.find_next(len)
}
/// Jumps to the given virtual address.
///
/// This is the same as [`Cursor::jump`].
///
/// # Panics
///
/// This method panics if the address has bad alignment. An out-of-range
/// address results in an `Err` instead of a panic.
pub fn jump(&mut self, va: Vaddr) -> Result<(), PageTableError> {
self.0.jump(va)
}
/// Gets the current virtual address.
pub fn virt_addr(&self) -> Vaddr {
self.0.virt_addr()
}
/// Queries the mapping at the current virtual address.
///
/// If the cursor is pointing to a valid virtual address that is locked,
/// it will return the virtual address range and the item at that slot.
pub fn query(&mut self) -> Result<PagesState<'rcu, C>, PageTableError> {
self.0.query()
}
/// Maps the item starting from the current address to a physical address range.
///
/// The current virtual address should not be mapped.
///
/// # Panics
///
/// This function will panic if
/// - the virtual address range to be mapped is out of the locked range;
/// - the current virtual address is not aligned to the page size of the
/// item to be mapped;
/// - the virtual address range contains mappings that conflict with the item.
///
/// # Safety
///
/// The caller should ensure that
/// - the range being mapped does not affect kernel's memory safety;
/// - the physical address to be mapped is valid and safe to use.
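///
/// A sketch of mapping a single item (illustrative; constructing `item` and
/// discharging the safety requirements depend on the concrete
/// [`PageTableConfig`]):
///
/// ```ignore
/// cursor.jump(va)?;
/// // SAFETY: the caller has ensured that mapping `item` at `va` does not
/// // affect kernel memory safety and that its physical range is valid.
/// unsafe { cursor.map(item) };
/// // The cursor now points to the address right after the new mapping.
/// ```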
pub unsafe fn map(&mut self, item: C::Item) {
assert!(self.0.va < self.0.barrier_va.end);
let (_, level, _) = C::item_raw_info(&item);
assert!(level <= C::HIGHEST_TRANSLATION_LEVEL);
let size = page_size::<C>(level);
assert_eq!(self.0.va % size, 0);
let end = self.0.va + size;
assert!(end <= self.0.barrier_va.end);
let rcu_guard = self.0.rcu_guard;
// Adjust ourselves to the level of the item.
while self.0.level != level {
if self.0.level < level {
self.0.pop_level();
continue;
}
// We are at a higher level, go down.
let mut cur_entry = self.0.cur_entry();
match cur_entry.to_ref() {
PteStateRef::PageTable(pt) => {
// SAFETY: The `pt` must be locked and no other guards exist.
let pt_guard = unsafe { pt.make_guard_unchecked(rcu_guard) };
self.0.push_level(pt_guard);
}
PteStateRef::Absent => {
let child_guard = cur_entry.alloc_if_none(rcu_guard).unwrap();
self.0.push_level(child_guard);
}
PteStateRef::Mapped(_) => {
let split_child = cur_entry.split_if_mapped_huge(rcu_guard).unwrap();
self.0.push_level(split_child);
}
}
}
if !matches!(self.0.cur_entry().to_ref(), PteStateRef::Absent) {
panic!("Mapping over an already mapped page at {:#x}", self.0.va);
}
let _ = self.replace_cur_entry(PteState::Mapped(RcuDrop::new(item)));
self.0.move_forward();
}
/// Finds and removes the first page table fragment in the following range.
///
/// The range to be found in is the current virtual address with the
/// provided length.
///
/// The function stops and yields the fragment once it has actually removed
/// one, regardless of whether the following pages also need to be unmapped.
/// The virtual address in the returned fragment refers to the page(s) that
/// existed before the removal but have just been unmapped.
///
/// It also moves the cursor forward to the next page after the
/// removed one, when an actual page is removed. If no mapped pages exist
/// in the following range, the cursor will stop at the end of the range
/// and return [`None`].
///
/// The caller should handle TLB coherence if necessary, using the returned
/// virtual address range.
///
/// # Safety
///
/// The caller should ensure that:
/// - the range being unmapped does not affect kernel's memory safety.
/// - the items mapped in `PageTableFrag` must outlive any TLB entries
/// that cache the mappings.
///
/// # Panics
///
/// Panics if:
/// - the length is longer than the remaining range of the cursor;
/// - the length is not page-aligned.
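///
/// A sketch of unmapping a whole range (illustrative; `end` is the
/// page-aligned end of the range, and TLB shootdown is only hinted at):
///
/// ```ignore
/// // SAFETY: the caller has ensured that unmapping this range does not
/// // affect kernel memory safety.
/// while let Some(frag) = unsafe { cursor.take_next(end - cursor.virt_addr()) } {
///     match frag {
///         PageTableFrag::Mapped { va, item } => {
///             // Flush TLB entries covering `va` before dropping `item`.
///         }
///         PageTableFrag::StrayPageTable { va, len, .. } => {
///             // Flush TLB entries covering `va..va + len` before dropping
///             // the detached sub-tree.
///         }
///     }
/// }
/// ```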
pub unsafe fn take_next(&mut self, len: usize) -> Option<PageTableFrag<C>> {
self.0.find_next_impl(len, true, true)?;
let frag = self.replace_cur_entry(PteState::Absent);
self.0.move_forward();
frag
}
/// Applies the operation to the next slot of mapping within the range.
///
/// The range to be found in is the current virtual address with the
/// provided length.
///
/// The function stops and yields the actually protected range once it has
/// protected a page, regardless of whether the following pages also need
/// to be protected.
///
/// It also moves the cursor forward to the next page after the
/// protected one. If no mapped pages exist in the following range, the
/// cursor will stop at the end of the range and return [`None`].
///
/// # Safety
///
/// The caller should ensure that:
/// - the range being protected with the operation does not affect
/// kernel's memory safety;
/// - the privileged flag `AVAIL1` should not be altered, since this flag
/// is reserved for all page tables.
///
/// # Panics
///
/// Panics if:
/// - the length is longer than the remaining range of the cursor;
/// - the length is not page-aligned.
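///
/// A sketch of removing write permission over a range (illustrative; `end`
/// is the page-aligned end of the range, and the exact flag type depends on
/// the [`PageProperty`] definition):
///
/// ```ignore
/// // SAFETY: the caller has ensured that revoking write access over this
/// // range does not affect kernel memory safety.
/// while let Some(_range) = unsafe {
///     cursor.protect_next(end - cursor.virt_addr(), &mut |prop| {
///         prop.flags -= PageFlags::W;
///     })
/// } {
///     // Each iteration protects exactly one mapped slot; the caller is
///     // responsible for flushing the TLB entries covering `_range`.
/// }
/// ```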
pub unsafe fn protect_next(
&mut self,
len: usize,
op: &mut impl FnMut(&mut PageProperty),
) -> Option<Range<Vaddr>> {
self.0.find_next_impl(len, false, true)?;
self.0.cur_entry().protect(op);
let protected_va = self.0.cur_va_range();
self.0.move_forward();
Some(protected_va)
}
fn replace_cur_entry(&mut self, new_child: PteState<C>) -> Option<PageTableFrag<C>> {
let rcu_guard = self.0.rcu_guard;
let va = self.0.va;
let level = self.0.level;
let old = self.0.cur_entry().replace(new_child);
match old {
PteState::Absent => None,
PteState::Mapped(item) => Some(PageTableFrag::Mapped { va, item }),
PteState::PageTable(pt) => {
debug_assert_eq!(pt.level(), level - 1);
if !C::TOP_LEVEL_CAN_UNMAP && level == C::NR_LEVELS {
let _ = ManuallyDrop::new(pt); // leak it to make shared PTs stay `'static`.
panic!("Unmapping shared kernel page table nodes");
}
// SAFETY: We must have locked this node.
let locked_pt = unsafe { pt.borrow().make_guard_unchecked(rcu_guard) };
// SAFETY:
// - We checked that we are not unmapping shared kernel page table nodes.
// - We must have locked the entire sub-tree since the range is locked.
let num_frames =
unsafe { locking::dfs_mark_stray_and_unlock(rcu_guard, locked_pt) };
Some(PageTableFrag::StrayPageTable {
pt,
va,
len: page_size::<C>(self.0.level),
num_frames,
})
}
}
}
}