From 4b0fe3d283ea49cc4caf64cc263ffe881d6afb6d Mon Sep 17 00:00:00 2001 From: Chen Chengjun Date: Wed, 11 Feb 2026 06:14:59 +0000 Subject: [PATCH] Add pivot_root syscall --- book/src/kernel/linux-compatibility/README.md | 2 +- .../fully_covered.scml | 3 + kernel/src/fs/path/mod.rs | 44 ++++++- kernel/src/fs/path/mount.rs | 3 +- kernel/src/fs/path/resolver.rs | 117 +++++++++++++++++- .../src/process/posix_thread/thread_table.rs | 19 ++- kernel/src/syscall/arch/generic.rs | 2 + kernel/src/syscall/arch/x86.rs | 2 + kernel/src/syscall/mod.rs | 1 + kernel/src/syscall/pivot_root.rs | 41 ++++++ 10 files changed, 222 insertions(+), 12 deletions(-) create mode 100644 kernel/src/syscall/pivot_root.rs diff --git a/book/src/kernel/linux-compatibility/README.md b/book/src/kernel/linux-compatibility/README.md index f4d44dda7..227135e53 100644 --- a/book/src/kernel/linux-compatibility/README.md +++ b/book/src/kernel/linux-compatibility/README.md @@ -175,7 +175,7 @@ which are summarized in the table below. 
| 152 | munlockall | ❌ | N/A | | 153 | vhangup | ❌ | N/A | | 154 | modify_ldt | ❌ | N/A | -| 155 | pivot_root | ❌ | N/A | +| 155 | pivot_root | ✅ | 💯 | | 156 | _sysctl | ❌ | N/A | | 157 | prctl | ✅ | [⚠️](syscall-flag-coverage/namespaces-cgroups-and-security/#prctl) | | 158 | arch_prctl | ✅ | [⚠️](syscall-flag-coverage/system-information-and-misc/#arch_prctl) | diff --git a/book/src/kernel/linux-compatibility/syscall-flag-coverage/file-systems-and-mount-control/fully_covered.scml b/book/src/kernel/linux-compatibility/syscall-flag-coverage/file-systems-and-mount-control/fully_covered.scml index 3ca04832f..c71dfc052 100644 --- a/book/src/kernel/linux-compatibility/syscall-flag-coverage/file-systems-and-mount-control/fully_covered.scml +++ b/book/src/kernel/linux-compatibility/syscall-flag-coverage/file-systems-and-mount-control/fully_covered.scml @@ -14,3 +14,6 @@ chroot(path); // Remove a watch from an inotify instance inotify_rm_watch(fd, wd); + +// Change the root mount in the mount namespace of the calling thread +pivot_root(new_root, put_old); diff --git a/kernel/src/fs/path/mod.rs b/kernel/src/fs/path/mod.rs index ea28eb5c0..499b5b784 100644 --- a/kernel/src/fs/path/mod.rs +++ b/kernel/src/fs/path/mod.rs @@ -159,7 +159,7 @@ impl Path { /// For example, first `mount /dev/sda1 /mnt` and then `mount /dev/sda2 /mnt`. /// After the second mount is completed, the content of the first mount will be overridden. /// We need to recursively obtain the top `Path`. - pub(super) fn get_top_path(mut self) -> Self { + fn get_top_path(mut self) -> Self { while self.dentry.is_mountpoint() { if let Some(child_mount) = self.mount.get(&self.dentry) { let inner = child_mount.root_dentry().clone(); @@ -173,13 +173,47 @@ impl Path { } /// Finds the corresponding `Path` in the given mount namespace. 
- pub(super) fn find_corresponding_mount(&self, mnt_ns: &Arc) -> Option { + fn find_corresponding_mount(&self, mnt_ns: &Arc) -> Option { let corresponding_mount = self.mount.find_corresponding_mount(mnt_ns)?; let corresponding_path = Self::new(corresponding_mount, self.dentry.clone()); Some(corresponding_path) } + /// Checks if this path is reachable from the given `root` path. + /// + /// A path is considered reachable if it is the same as or a descendant + /// of the `root` path. The check traverses upwards from the current path, + /// crossing mount point boundaries as necessary, until it either reaches + /// the mount of `root` or runs out of parent mounts (not reachable). + fn is_reachable_from(&self, root: &Path) -> bool { + let mut owned; + let mut current = self; + + loop { + if current.mount.id() != root.mount.id() { + let Some(parent_mount) = current.mount.parent().and_then(|mount| mount.upgrade()) + else { + return false; + }; + + owned = Path::new( + parent_mount, + current + .mount + .mountpoint() + .expect("Mounts with parents must have a mount point") + .clone(), + ); + current = &owned; + + continue; + } + + return current.dentry.is_equal_or_descendant_of(&root.dentry); + } + } + /// Returns true if the `Path` represents a pseudo file. fn is_pseudo(&self) -> bool { self.dentry.is_pseudo() @@ -333,7 +367,7 @@ impl Path { } let new_mount = self.mount.clone_mount_tree(&self.dentry, None, recursive); - new_mount.graft_mount_tree(dst_path)?; + new_mount.graft_mount_tree(dst_path); Ok(()) } @@ -369,7 +403,9 @@ impl Path { ); } - self.mount.graft_mount_tree(dst_path) + self.mount.graft_mount_tree(dst_path); + + Ok(()) } /// Sets the propagation type of the mount of this `Path`. diff --git a/kernel/src/fs/path/mount.rs b/kernel/src/fs/path/mount.rs index 73e10d7c7..c6c6cae86 100644 --- a/kernel/src/fs/path/mount.rs +++ b/kernel/src/fs/path/mount.rs @@ -420,10 +420,9 @@ impl Mount { } /// Grafts the mount node tree to the mountpoint. 
- pub(super) fn graft_mount_tree(&self, target_path: &Path) -> Result<()> { + pub(super) fn graft_mount_tree(&self, target_path: &Path) { self.detach_from_parent(); self.attach_to_path(target_path); - Ok(()) } /// Gets a child mount node from the mountpoint if any. diff --git a/kernel/src/fs/path/resolver.rs b/kernel/src/fs/path/resolver.rs index c36ef2096..1e4dfca2b 100644 --- a/kernel/src/fs/path/resolver.rs +++ b/kernel/src/fs/path/resolver.rs @@ -12,7 +12,7 @@ use crate::{ utils::{InodeType, NAME_MAX, PATH_MAX, Permission, SYMLINKS_MAX, SymbolicLink}, }, prelude::*, - process::posix_thread::AsThreadLocal, + process::posix_thread::{AsPosixThread, AsThreadLocal, thread_table::ThreadTable}, }; /// The file descriptor of the current working directory. @@ -228,6 +228,121 @@ impl PathResolver { Ok(()) } + + /// Changes the root mount in the mount namespace of the calling thread. + /// + /// This function moves the original root mount of the calling thread to `put_old_path` and makes + /// `new_root_path` the new root mount. For other threads in the current mount namespace, if their + /// root directory or current working directory is the same as the current thread's root directory, + /// that directory will also be changed to `new_root_path`. + // + // TODO: this method should only iterate threads in the current PID namespace instead of + // the whole thread table. 
+ pub fn pivot_root( + &mut self, + new_root_path: FsPath, + put_old_path: FsPath, + thread_table: &ThreadTable, + ctx: &Context, + ) -> Result<()> { + let new_root_path = self.lookup(&new_root_path)?; + let put_old_path = self.lookup(&put_old_path)?; + + if new_root_path.type_() != InodeType::Dir || put_old_path.type_() != InodeType::Dir { + return_errno_with_message!( + Errno::ENOTDIR, + "`new_root` or `put_old` is not a directory" + ); + } + if self.root.mount.id() == new_root_path.mount.id() + || self.root.mount.id() == put_old_path.mount.id() + { + return_errno_with_message!( + Errno::EBUSY, + "`new_root` or `put_old` is on the current root mount" + ); + } + if !new_root_path.is_mount_root() || !self.root.is_mount_root() { + return_errno_with_message!( + Errno::EINVAL, + "`new_root` or the current root is not a mount point" + ); + } + if new_root_path.mount.parent().is_none() || self.root.mount.parent().is_none() { + return_errno_with_message!( + Errno::EINVAL, + "`new_root` or the current root is on the rootfs mount" + ); + } + + let current_ns_proxy = ctx.thread_local.borrow_ns_proxy(); + let current_mnt_ns = current_ns_proxy.unwrap().mnt_ns(); + if !current_mnt_ns.owns(&new_root_path.mount) || !current_mnt_ns.owns(&put_old_path.mount) { + return_errno_with_message!( + Errno::EINVAL, + "`new_root` or `put_old` is not in the current mount namespace" + ); + } + + if !put_old_path.is_reachable_from(&new_root_path) { + return_errno_with_message!( + Errno::EINVAL, + "`put_old` is not at or underneath `new_root`" + ); + } + if !new_root_path.is_reachable_from(&self.root) { + return_errno_with_message!( + Errno::EINVAL, + "`new_root` is not underneath the current root" + ); + } + + // TODO: Check the following once we support `MS_SHARED`: + // "The propagation type of the parent mount of `new_root` and the + // parent mount of the current root directory must not be + // `MS_SHARED`; similarly, if `put_old` is an existing mount point, + // its propagation type must 
not be `MS_SHARED`." + + let parent_path = { + let parent_mount = self.root.mount.parent().unwrap().upgrade().unwrap(); + let mountpoint = self.root.mount.mountpoint().unwrap(); + Path::new(parent_mount, mountpoint) + }; + + self.root.mount.graft_mount_tree(&put_old_path); + new_root_path.mount.graft_mount_tree(&parent_path); + + for thread in thread_table.values() { + let posix_thread = thread.as_posix_thread().unwrap(); + let ns_proxy = posix_thread.ns_proxy().lock(); + let Some(ns_proxy) = ns_proxy.as_ref() else { + continue; + }; + let mnt_ns = ns_proxy.mnt_ns(); + if !Arc::ptr_eq(mnt_ns, current_mnt_ns) { + continue; + } + let fs = posix_thread.read_fs(); + if Arc::ptr_eq(&fs, &ctx.thread_local.borrow_fs()) { + continue; + } + + let mut fs_resolver = fs.resolver().write(); + if fs_resolver.root() == &self.root { + fs_resolver.set_root(new_root_path.clone()); + } + if fs_resolver.cwd() == &self.root { + fs_resolver.set_cwd(new_root_path.clone()); + } + } + + if self.cwd == self.root { + self.cwd = new_root_path.clone(); + } + self.root = new_root_path; + + Ok(()) + } } /// The result of resolving an absolute path name. diff --git a/kernel/src/process/posix_thread/thread_table.rs b/kernel/src/process/posix_thread/thread_table.rs index 61d4be387..4069dfb81 100644 --- a/kernel/src/process/posix_thread/thread_table.rs +++ b/kernel/src/process/posix_thread/thread_table.rs @@ -3,20 +3,22 @@ use super::{Thread, Tid}; use crate::{prelude::*, process::posix_thread::AsPosixThread}; -static THREAD_TABLE: SpinLock>> = SpinLock::new(BTreeMap::new()); +pub type ThreadTable = BTreeMap>; -/// Adds a posix thread to global thread table +static THREAD_TABLE: Mutex = Mutex::new(BTreeMap::new()); + +/// Adds a POSIX thread to the global thread table. 
pub fn add_thread(tid: Tid, thread: Arc) { debug_assert_eq!(tid, thread.as_posix_thread().unwrap().tid()); THREAD_TABLE.lock().insert(tid, thread); } -/// Removes a posix thread to global thread table +/// Removes a POSIX thread from the global thread table. pub fn remove_thread(tid: Tid) { THREAD_TABLE.lock().remove(&tid); } -/// Gets a posix thread from the global thread table +/// Gets a POSIX thread from the global thread table. pub fn get_thread(tid: Tid) -> Option> { THREAD_TABLE.lock().get(&tid).cloned() } @@ -47,3 +49,12 @@ pub(in crate::process) fn make_current_main_thread(ctx: &Context) { let thread = thread_table.remove(&old_tid).unwrap(); thread_table.insert(pid, thread); } + +/// Applies the given function to the global thread table while holding the table lock. +pub fn with_global_threads(f: F) -> R +where + F: FnOnce(&ThreadTable) -> R, +{ + let table = THREAD_TABLE.lock(); + f(&table) +} diff --git a/kernel/src/syscall/arch/generic.rs b/kernel/src/syscall/arch/generic.rs index e01389fdd..57fd06393 100644 --- a/kernel/src/syscall/arch/generic.rs +++ b/kernel/src/syscall/arch/generic.rs @@ -83,6 +83,7 @@ macro_rules! import_generic_syscall_entries { pidfd_getfd::sys_pidfd_getfd, pidfd_open::sys_pidfd_open, pipe::sys_pipe2, + pivot_root::sys_pivot_root, ppoll::sys_ppoll, prctl::sys_prctl, pread64::sys_pread64, @@ -233,6 +234,7 @@ macro_rules! 
define_syscalls_with_generic_syscall_table { SYS_LINKAT = 37 => sys_linkat(args[..5]); SYS_UMOUNT = 39 => sys_umount(args[..2]); SYS_MOUNT = 40 => sys_mount(args[..5]); + SYS_PIVOT_ROOT = 41 => sys_pivot_root(args[..2]); SYS_STATFS = 43 => sys_statfs(args[..2]); SYS_FSTATFS = 44 => sys_fstatfs(args[..2]); SYS_TRUNCATE = 45 => sys_truncate(args[..2]); diff --git a/kernel/src/syscall/arch/x86.rs b/kernel/src/syscall/arch/x86.rs index 4157bca91..61574eb6b 100644 --- a/kernel/src/syscall/arch/x86.rs +++ b/kernel/src/syscall/arch/x86.rs @@ -84,6 +84,7 @@ use super::{ pidfd_getfd::sys_pidfd_getfd, pidfd_open::sys_pidfd_open, pipe::{sys_pipe, sys_pipe2}, + pivot_root::sys_pivot_root, poll::sys_poll, ppoll::sys_ppoll, prctl::sys_prctl, @@ -308,6 +309,7 @@ impl_syscall_nums_and_dispatch_fn! { SYS_SCHED_GETSCHEDULER = 145 => sys_sched_getscheduler(args[..1]); SYS_SCHED_GET_PRIORITY_MAX = 146 => sys_sched_get_priority_max(args[..1]); SYS_SCHED_GET_PRIORITY_MIN = 147 => sys_sched_get_priority_min(args[..1]); + SYS_PIVOT_ROOT = 155 => sys_pivot_root(args[..2]); SYS_PRCTL = 157 => sys_prctl(args[..5]); SYS_ARCH_PRCTL = 158 => sys_arch_prctl(args[..2], &mut user_ctx); SYS_SETRLIMIT = 160 => sys_setrlimit(args[..2]); diff --git a/kernel/src/syscall/mod.rs b/kernel/src/syscall/mod.rs index b6b57b2d3..192940956 100644 --- a/kernel/src/syscall/mod.rs +++ b/kernel/src/syscall/mod.rs @@ -96,6 +96,7 @@ mod pause; mod pidfd_getfd; mod pidfd_open; mod pipe; +mod pivot_root; mod poll; mod ppoll; mod prctl; diff --git a/kernel/src/syscall/pivot_root.rs b/kernel/src/syscall/pivot_root.rs new file mode 100644 index 000000000..94384623b --- /dev/null +++ b/kernel/src/syscall/pivot_root.rs @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MPL-2.0 + +use super::SyscallReturn; +use crate::{ + fs::path::FsPath, prelude::*, process::posix_thread::thread_table::with_global_threads, + syscall::constants::MAX_FILENAME_LEN, +}; + +pub fn sys_pivot_root( + new_root_ptr: Vaddr, + put_old_ptr: Vaddr, + ctx: 
&Context, +) -> Result { + let new_root_name = ctx + .user_space() + .read_cstring(new_root_ptr, MAX_FILENAME_LEN)?; + let put_old_name = ctx + .user_space() + .read_cstring(put_old_ptr, MAX_FILENAME_LEN)?; + + debug!( + "pivot_root: new_root = {:?}, put_old = {:?}", + new_root_name, put_old_name + ); + + let new_root_name = new_root_name.to_string_lossy(); + let new_root_path = FsPath::try_from(new_root_name.as_ref())?; + let put_old_name = put_old_name.to_string_lossy(); + let put_old_path = FsPath::try_from(put_old_name.as_ref())?; + + // TODO: Locking the global thread table here is a workaround. We need to use a more + // suitable lock (e.g., the global mount lock or the namespace lock) to avoid deadlock. + with_global_threads(move |table| { + let fs_ref = ctx.thread_local.borrow_fs(); + let mut fs_resolver = fs_ref.resolver().write(); + + fs_resolver.pivot_root(new_root_path, put_old_path, table, ctx) + })?; + + Ok(SyscallReturn::Return(0)) +}