Add pivot_root syscall

This commit is contained in:
Chen Chengjun 2026-02-11 06:14:59 +00:00
parent b0407dd517
commit 4b0fe3d283
10 changed files with 222 additions and 12 deletions

View File

@ -175,7 +175,7 @@ which are summarized in the table below.
| 152 | munlockall | ❌ | N/A |
| 153 | vhangup | ❌ | N/A |
| 154 | modify_ldt | ❌ | N/A |
| 155 | pivot_root | ❌ | N/A |
| 155 | pivot_root | ✅ | 💯 |
| 156 | _sysctl | ❌ | N/A |
| 157 | prctl | ✅ | [⚠️](syscall-flag-coverage/namespaces-cgroups-and-security/#prctl) |
| 158 | arch_prctl | ✅ | [⚠️](syscall-flag-coverage/system-information-and-misc/#arch_prctl) |

View File

@ -14,3 +14,6 @@ chroot(path);
// Remove a watch from an inotify instance
inotify_rm_watch(fd, wd);
// Change the root mount in the mount namespace of the calling thread
pivot_root(new_root, put_old);

View File

@ -159,7 +159,7 @@ impl Path {
/// For example, first `mount /dev/sda1 /mnt` and then `mount /dev/sda2 /mnt`.
/// After the second mount is completed, the content of the first mount will be overridden.
/// We need to recursively obtain the top `Path`.
pub(super) fn get_top_path(mut self) -> Self {
fn get_top_path(mut self) -> Self {
while self.dentry.is_mountpoint() {
if let Some(child_mount) = self.mount.get(&self.dentry) {
let inner = child_mount.root_dentry().clone();
@ -173,13 +173,47 @@ impl Path {
}
/// Finds the corresponding `Path` in the given mount namespace.
pub(super) fn find_corresponding_mount(&self, mnt_ns: &Arc<MountNamespace>) -> Option<Self> {
fn find_corresponding_mount(&self, mnt_ns: &Arc<MountNamespace>) -> Option<Self> {
let corresponding_mount = self.mount.find_corresponding_mount(mnt_ns)?;
let corresponding_path = Self::new(corresponding_mount, self.dentry.clone());
Some(corresponding_path)
}
/// Tells whether `root` is this path itself or one of its ancestors.
///
/// The walk starts at this path. While the path under inspection lives on
/// a mount other than `root`'s mount, it is replaced by the mountpoint of
/// that mount inside its parent mount. If there is no parent mount, or the
/// parent can no longer be upgraded, `root` is unreachable. Once the mounts
/// match, the result is whether the dentry reached equals or descends from
/// `root`'s dentry.
fn is_reachable_from(&self, root: &Path) -> bool {
    // Storage for the path produced by each hop into a parent mount.
    let mut hop;
    let mut cursor = self;

    while cursor.mount.id() != root.mount.id() {
        let Some(parent_mount) = cursor.mount.parent().and_then(|weak| weak.upgrade()) else {
            return false;
        };
        let mountpoint = cursor
            .mount
            .mountpoint()
            .expect("Mounts with parents must have a mount point")
            .clone();
        hop = Path::new(parent_mount, mountpoint);
        cursor = &hop;
    }

    cursor.dentry.is_equal_or_descendant_of(&root.dentry)
}
/// Returns true if the `Path` represents a pseudo file.
fn is_pseudo(&self) -> bool {
self.dentry.is_pseudo()
@ -333,7 +367,7 @@ impl Path {
}
let new_mount = self.mount.clone_mount_tree(&self.dentry, None, recursive);
new_mount.graft_mount_tree(dst_path)?;
new_mount.graft_mount_tree(dst_path);
Ok(())
}
@ -369,7 +403,9 @@ impl Path {
);
}
self.mount.graft_mount_tree(dst_path)
self.mount.graft_mount_tree(dst_path);
Ok(())
}
/// Sets the propagation type of the mount of this `Path`.

View File

@ -420,10 +420,9 @@ impl Mount {
}
/// Grafts the mount node tree to the mountpoint.
pub(super) fn graft_mount_tree(&self, target_path: &Path) -> Result<()> {
pub(super) fn graft_mount_tree(&self, target_path: &Path) {
self.detach_from_parent();
self.attach_to_path(target_path);
Ok(())
}
/// Gets a child mount node from the mountpoint if any.

View File

@ -12,7 +12,7 @@ use crate::{
utils::{InodeType, NAME_MAX, PATH_MAX, Permission, SYMLINKS_MAX, SymbolicLink},
},
prelude::*,
process::posix_thread::AsThreadLocal,
process::posix_thread::{AsPosixThread, AsThreadLocal, thread_table::ThreadTable},
};
/// The file descriptor of the current working directory.
@ -228,6 +228,121 @@ impl PathResolver {
Ok(())
}
/// Changes the root mount in the mount namespace of the calling thread.
///
/// This function moves the original root mount of the calling thread to `put_old_path` and makes
/// `new_root_path` the new root mount. For other threads in the current mount namespace, a root
/// directory or a current working directory that equals the calling thread's old root is switched
/// to `new_root_path`; the two are checked and updated independently of each other.
///
/// # Errors
///
/// Errors from resolving either path with `lookup` are propagated. In addition:
///
/// - `ENOTDIR`: `new_root_path` or `put_old_path` is not a directory.
/// - `EBUSY`: `new_root_path` or `put_old_path` is on the current root mount.
/// - `EINVAL`: `new_root_path` or the current root is not the root of its mount, or its mount
///   has no parent mount; `new_root_path` or `put_old_path` is not owned by the caller's mount
///   namespace; `put_old_path` is not reachable from `new_root_path`; or `new_root_path` is not
///   reachable from the current root.
//
// TODO: this method should only iterate threads in the current PID namespace instead of
// the whole thread table.
pub fn pivot_root(
&mut self,
new_root_path: FsPath,
put_old_path: FsPath,
thread_table: &ThreadTable,
ctx: &Context,
) -> Result<()> {
// Resolve both user-supplied paths relative to this resolver before any validation.
let new_root_path = self.lookup(&new_root_path)?;
let put_old_path = self.lookup(&put_old_path)?;
if new_root_path.type_() != InodeType::Dir || put_old_path.type_() != InodeType::Dir {
return_errno_with_message!(
Errno::ENOTDIR,
"`new_root` or `put_old` is not a directory"
);
}
// Neither path may live on the mount that currently serves as the root.
if self.root.mount.id() == new_root_path.mount.id()
|| self.root.mount.id() == put_old_path.mount.id()
{
return_errno_with_message!(
Errno::EBUSY,
"`new_root` or `put_old` is on the current root mount"
);
}
if !new_root_path.is_mount_root() || !self.root.is_mount_root() {
return_errno_with_message!(
Errno::EINVAL,
"`new_root` or the current root is not a mount point"
);
}
// Both mounts must have a parent mount; the re-grafting below relies on it.
if new_root_path.mount.parent().is_none() || self.root.mount.parent().is_none() {
return_errno_with_message!(
Errno::EINVAL,
"`new_root` or the current root is on the rootfs mount"
);
}
let current_ns_proxy = ctx.thread_local.borrow_ns_proxy();
// NOTE(review): this `unwrap` assumes the calling thread always has a namespace proxy
// while executing a syscall — confirm.
let current_mnt_ns = current_ns_proxy.unwrap().mnt_ns();
if !current_mnt_ns.owns(&new_root_path.mount) || !current_mnt_ns.owns(&put_old_path.mount) {
return_errno_with_message!(
Errno::EINVAL,
"`new_root` or `put_old` is not in the current mount namespace"
);
}
if !put_old_path.is_reachable_from(&new_root_path) {
return_errno_with_message!(
Errno::EINVAL,
"`put_old` is not at or underneath `new_root`"
);
}
if !new_root_path.is_reachable_from(&self.root) {
return_errno_with_message!(
Errno::EINVAL,
"`new_root` is not underneath the current root"
);
}
// TODO: Check the following once we support `MS_SHARED`:
// "The propagation type of the parent mount of `new_root` and the
// parent mount of the current root directory must not be
// `MS_SHARED`; similarly, if `put_old` is an existing mount point,
// its propagation type must not be `MS_SHARED`."
// Capture where the old root mount is attached *before* re-grafting it: grafting detaches
// the mount from its parent, after which this location could no longer be read from it.
let parent_path = {
// `parent()` was checked to be `Some` above.
// NOTE(review): `upgrade()` assumes the parent mount is still alive — confirm.
let parent_mount = self.root.mount.parent().unwrap().upgrade().unwrap();
let mountpoint = self.root.mount.mountpoint().unwrap();
Path::new(parent_mount, mountpoint)
};
// Move the old root mount to `put_old`, then attach the new root mount at the location
// the old root mount used to occupy.
self.root.mount.graft_mount_tree(&put_old_path);
new_root_path.mount.graft_mount_tree(&parent_path);
// Update other threads. `self.root` still holds the old root throughout this loop because
// it is only reassigned at the very end of the function.
for thread in thread_table.values() {
let posix_thread = thread.as_posix_thread().unwrap();
let ns_proxy = posix_thread.ns_proxy().lock();
// Threads without a namespace proxy are skipped.
let Some(ns_proxy) = ns_proxy.as_ref() else {
continue;
};
// Only threads in the caller's mount namespace are affected.
let mnt_ns = ns_proxy.mnt_ns();
if !Arc::ptr_eq(mnt_ns, current_mnt_ns) {
continue;
}
let fs = posix_thread.read_fs();
// Threads sharing the caller's filesystem information are skipped here.
// NOTE(review): presumably `self` is that shared resolver, already write-locked by the
// caller, so locking it again below would deadlock; it is updated through `self` after
// the loop instead — confirm.
if Arc::ptr_eq(&fs, &ctx.thread_local.borrow_fs()) {
continue;
}
let mut fs_resolver = fs.resolver().write();
// Root and working directory are compared against the old root independently.
if fs_resolver.root() == &self.root {
fs_resolver.set_root(new_root_path.clone());
}
if fs_resolver.cwd() == &self.root {
fs_resolver.set_cwd(new_root_path.clone());
}
}
// Update this resolver last. The working directory must be compared against the old root
// before `self.root` is overwritten.
if self.cwd == self.root {
self.cwd = new_root_path.clone();
}
self.root = new_root_path;
Ok(())
}
}
/// The result of resolving an absolute path name.

View File

@ -3,20 +3,22 @@
use super::{Thread, Tid};
use crate::{prelude::*, process::posix_thread::AsPosixThread};
static THREAD_TABLE: SpinLock<BTreeMap<Tid, Arc<Thread>>> = SpinLock::new(BTreeMap::new());
pub type ThreadTable = BTreeMap<Tid, Arc<Thread>>;
/// Adds a posix thread to global thread table
static THREAD_TABLE: Mutex<ThreadTable> = Mutex::new(BTreeMap::new());
/// Registers `thread` in the global thread table under the key `tid`.
///
/// In debug builds, this asserts that `tid` equals the TID reported by the
/// thread's POSIX-thread data.
pub fn add_thread(tid: Tid, thread: Arc<Thread>) {
    debug_assert_eq!(tid, thread.as_posix_thread().unwrap().tid());

    let mut table = THREAD_TABLE.lock();
    table.insert(tid, thread);
}
/// Removes a posix thread to global thread table
/// Drops the entry keyed by `tid` from the global thread table, if present.
pub fn remove_thread(tid: Tid) {
    let mut table = THREAD_TABLE.lock();
    table.remove(&tid);
}
/// Gets a posix thread from the global thread table
/// Looks up `tid` in the global thread table.
///
/// Returns a new reference to the thread, or `None` if no entry exists for `tid`.
pub fn get_thread(tid: Tid) -> Option<Arc<Thread>> {
    let table = THREAD_TABLE.lock();
    table.get(&tid).map(Arc::clone)
}
@ -47,3 +49,12 @@ pub(in crate::process) fn make_current_main_thread(ctx: &Context) {
let thread = thread_table.remove(&old_tid).unwrap();
thread_table.insert(pid, thread);
}
/// Runs `f` with shared access to the global thread table and returns its result.
///
/// The table's lock is acquired before `f` is called and is held for the
/// whole duration of the call.
pub fn with_global_threads<F, R>(f: F) -> R
where
    F: FnOnce(&ThreadTable) -> R,
{
    let guard = THREAD_TABLE.lock();
    let threads: &ThreadTable = &guard;
    f(threads)
}

View File

@ -83,6 +83,7 @@ macro_rules! import_generic_syscall_entries {
pidfd_getfd::sys_pidfd_getfd,
pidfd_open::sys_pidfd_open,
pipe::sys_pipe2,
pivot_root::sys_pivot_root,
ppoll::sys_ppoll,
prctl::sys_prctl,
pread64::sys_pread64,
@ -233,6 +234,7 @@ macro_rules! define_syscalls_with_generic_syscall_table {
SYS_LINKAT = 37 => sys_linkat(args[..5]);
SYS_UMOUNT = 39 => sys_umount(args[..2]);
SYS_MOUNT = 40 => sys_mount(args[..5]);
SYS_PIVOT_ROOT = 41 => sys_pivot_root(args[..2]);
SYS_STATFS = 43 => sys_statfs(args[..2]);
SYS_FSTATFS = 44 => sys_fstatfs(args[..2]);
SYS_TRUNCATE = 45 => sys_truncate(args[..2]);

View File

@ -84,6 +84,7 @@ use super::{
pidfd_getfd::sys_pidfd_getfd,
pidfd_open::sys_pidfd_open,
pipe::{sys_pipe, sys_pipe2},
pivot_root::sys_pivot_root,
poll::sys_poll,
ppoll::sys_ppoll,
prctl::sys_prctl,
@ -308,6 +309,7 @@ impl_syscall_nums_and_dispatch_fn! {
SYS_SCHED_GETSCHEDULER = 145 => sys_sched_getscheduler(args[..1]);
SYS_SCHED_GET_PRIORITY_MAX = 146 => sys_sched_get_priority_max(args[..1]);
SYS_SCHED_GET_PRIORITY_MIN = 147 => sys_sched_get_priority_min(args[..1]);
SYS_PIVOT_ROOT = 155 => sys_pivot_root(args[..2]);
SYS_PRCTL = 157 => sys_prctl(args[..5]);
SYS_ARCH_PRCTL = 158 => sys_arch_prctl(args[..2], &mut user_ctx);
SYS_SETRLIMIT = 160 => sys_setrlimit(args[..2]);

View File

@ -96,6 +96,7 @@ mod pause;
mod pidfd_getfd;
mod pidfd_open;
mod pipe;
mod pivot_root;
mod poll;
mod ppoll;
mod prctl;

View File

@ -0,0 +1,41 @@
// SPDX-License-Identifier: MPL-2.0
use super::SyscallReturn;
use crate::{
fs::path::FsPath, prelude::*, process::posix_thread::thread_table::with_global_threads,
syscall::constants::MAX_FILENAME_LEN,
};
/// Entry point of the `pivot_root` system call.
///
/// `new_root_ptr` and `put_old_ptr` are user-space addresses of C strings, each read with a
/// length limit of `MAX_FILENAME_LEN`. Both strings are converted (lossily) to `FsPath`s and
/// handed to the calling thread's path resolver, which performs the pivot while the global
/// thread table is locked. Returns `0` on success.
//
// NOTE(review): no privilege/capability check is visible in this function — confirm whether
// one is performed elsewhere or is still missing.
pub fn sys_pivot_root(
    new_root_ptr: Vaddr,
    put_old_ptr: Vaddr,
    ctx: &Context,
) -> Result<SyscallReturn> {
    let raw_new_root = ctx
        .user_space()
        .read_cstring(new_root_ptr, MAX_FILENAME_LEN)?;
    let raw_put_old = ctx
        .user_space()
        .read_cstring(put_old_ptr, MAX_FILENAME_LEN)?;
    debug!(
        "pivot_root: new_root = {:?}, put_old = {:?}",
        raw_new_root, raw_put_old
    );

    // The lossy strings must outlive the `FsPath`s built from them.
    let new_root_str = raw_new_root.to_string_lossy();
    let new_root = FsPath::try_from(new_root_str.as_ref())?;
    let put_old_str = raw_put_old.to_string_lossy();
    let put_old = FsPath::try_from(put_old_str.as_ref())?;

    // TODO: Locking the global thread table here is a workaround. We need to use a more
    // suitable lock (i.e. the global mount lock or the namespace lock) to avoid deadlock.
    with_global_threads(move |threads| {
        // The thread table lock is taken first, then the caller's resolver write lock.
        let current_fs = ctx.thread_local.borrow_fs();
        let mut resolver = current_fs.resolver().write();
        resolver.pivot_root(new_root, put_old, threads, ctx)
    })?;

    Ok(SyscallReturn::Return(0))
}