Enable the process management for cgroup

This commit is contained in:
Chen Chengjun 2025-11-06 12:24:19 +00:00 committed by Ruihan Li
parent cda06613f0
commit 77fb73bdd7
9 changed files with 500 additions and 29 deletions

View File

@ -39,7 +39,9 @@ use spin::Once;
pub use self::{
attr::{SysAttr, SysAttrSet, SysAttrSetBuilder},
node::{SysBranchNode, SysNode, SysNodeId, SysNodeType, SysObj, SysPerms, SysSymlink},
node::{
SysBranchNode, SysNode, SysNodeId, SysNodeType, SysObj, SysPerms, SysSymlink, MAX_ATTR_SIZE,
},
tree::SysTree,
utils::{
AttrLessBranchNodeFields, BranchNodeFields, EmptyNode, NormalNodeFields, ObjFields,
@ -95,6 +97,8 @@ pub enum Error {
Overflow,
/// Page fault occurred during memory access
PageFault,
/// The current systree item is dead
IsDead,
}
impl core::fmt::Display for Error {
@ -108,6 +112,7 @@ impl core::fmt::Display for Error {
Error::AlreadyExists => write!(f, "The systree item already exists"),
Error::Overflow => write!(f, "Numerical overflow occurred"),
Error::PageFault => write!(f, "Page fault occurred during memory access"),
Error::IsDead => write!(f, "The current systree item is dead"),
}
}
}

View File

@ -347,6 +347,7 @@ impl From<aster_systree::Error> for Error {
AlreadyExists => Error::new(Errno::EEXIST),
Overflow => Error::new(Errno::EOVERFLOW),
PageFault => Error::new(Errno::EFAULT),
IsDead => Error::new(Errno::ENODEV),
}
}
}

View File

@ -6,10 +6,15 @@ use ostd::sync::RwLock;
use super::fs::CgroupFs;
use crate::{
fs::utils::{
systree_inode::{SysTreeInodeTy, SysTreeNodeKind},
FileSystem, Inode, InodeMode, Metadata,
fs::{
cgroupfs::CgroupNode,
path::{is_dot, is_dotdot},
utils::{
systree_inode::{SysTreeInodeTy, SysTreeNodeKind},
FileSystem, Inode, InodeMode, Metadata,
},
},
prelude::*,
Result,
};
@ -78,4 +83,33 @@ impl Inode for CgroupInode {
fn fs(&self) -> Arc<dyn FileSystem> {
CgroupFs::singleton().clone()
}
fn rmdir(&self, name: &str) -> Result<()> {
if is_dot(name) {
return_errno_with_message!(Errno::EINVAL, "rmdir on .");
}
if is_dotdot(name) {
return_errno_with_message!(Errno::ENOTEMPTY, "rmdir on ..");
}
let SysTreeNodeKind::Branch(branch_node) = self.node_kind() else {
return_errno_with_message!(Errno::ENOTDIR, "the current node is not a branch node");
};
let Some(child) = branch_node.child(name) else {
return_errno_with_message!(Errno::ENOENT, "the child node does not exist");
};
let target_node = child.as_any().downcast_ref::<CgroupNode>().unwrap();
// This will succeed only if the child is empty and has not been removed.
target_node.mark_as_dead()?;
// This is guaranteed to remove `child` because the dentry lock prevents
// concurrent modification to the children, and there are no races because
// `mark_as_dead` can succeed at most once.
branch_node.remove_child(name).unwrap();
Ok(())
}
}

View File

@ -1,6 +1,7 @@
// SPDX-License-Identifier: MPL-2.0
use fs::CgroupFsType;
pub use systree_node::{CgroupMembership, CgroupNode};
mod fs;
mod inode;

View File

@ -4,16 +4,127 @@ use alloc::{
string::ToString,
sync::{Arc, Weak},
};
use core::fmt::Debug;
use core::{
fmt::Debug,
sync::atomic::{AtomicUsize, Ordering},
};
use aster_systree::{
inherit_sys_branch_node, BranchNodeFields, Error, Result, SysAttrSetBuilder, SysBranchNode,
SysObj, SysPerms, SysStr,
SysObj, SysPerms, SysStr, MAX_ATTR_SIZE,
};
use aster_util::printer::VmPrinter;
use inherit_methods_macro::inherit_methods;
use ostd::mm::{VmReader, VmWriter};
use spin::Once;
use crate::{
prelude::*,
process::{process_table, Pid, Process},
};
/// A type that provides exclusive, synchronized access to modify cgroup membership.
///
/// This struct encapsulates the logic for moving processes between cgroups.
/// By calling `CgroupMembership::lock()`, a thread can attempt to acquire a lock
/// on the global instance. Upon success, it returns a guard that provides mutable
/// access, allowing for safe cgroup membership modifications.
///
/// # Usage
///
/// ```rust,ignore
/// // Acquire the lock. The guard must be bound mutably because the
/// // membership-changing methods take `&mut self`.
/// let mut membership = CgroupMembership::lock();
///
/// // Move a process to a new cgroup node. This returns an error if the
/// // target cgroup node is dead.
/// membership.move_process_to_node(process, &new_cgroup)?;
///
/// // The lock is automatically released when `membership` is dropped.
/// ```
pub struct CgroupMembership {
// Private unit field: the only value of this type in view is built inside `lock()`.
_private: (),
}
impl CgroupMembership {
/// Acquires the lock on the global instance.
///
/// Returns a guard that provides mutable access to modify cgroup membership.
pub fn lock() -> MutexGuard<'static, Self> {
static CGROUP_MEMBERSHIP: Mutex<CgroupMembership> =
Mutex::new(CgroupMembership { _private: () });
CGROUP_MEMBERSHIP.lock()
}
/// Moves a process to the new cgroup node.
///
/// A process can only belong to one cgroup at a time.
/// When moved to a new cgroup, it's automatically removed from the
/// previous one.
pub fn move_process_to_node(
&mut self,
process: Arc<Process>,
new_cgroup: &CgroupNode,
) -> Result<()> {
if let Some(old_cgroup) = process.cgroup().get() {
// Fast path: If the process is already in this cgroup, do nothing.
if new_cgroup.id() == old_cgroup.id() {
return Ok(());
}
old_cgroup
.with_inner_mut(|old_cgroup_processes| {
old_cgroup_processes.remove(&process.pid()).unwrap();
if old_cgroup_processes.is_empty() {
let old_count = old_cgroup.populated_count.fetch_sub(1, Ordering::Relaxed);
if old_count == 1 {
old_cgroup.propagate_sub_populated();
}
}
})
.unwrap();
};
new_cgroup
.with_inner_mut(|current_processes| {
if current_processes.is_empty() {
let old_count = new_cgroup.populated_count.fetch_add(1, Ordering::Relaxed);
if old_count == 0 {
new_cgroup.propagate_add_populated();
}
}
current_processes.insert(process.pid(), Arc::downgrade(&process));
})
.ok_or(Error::IsDead)?;
process.set_cgroup(Some(new_cgroup.fields.weak_self().upgrade().unwrap()));
Ok(())
}
/// Moves a process to the root cgroup.
pub fn move_process_to_root(&mut self, process: &Process) {
let process_cgroup = process.cgroup();
let Some(old_cgroup) = process_cgroup.get() else {
return;
};
old_cgroup
.with_inner_mut(|old_cgroup_processes| {
old_cgroup_processes.remove(&process.pid()).unwrap();
if old_cgroup_processes.is_empty() {
let old_count = old_cgroup.populated_count.fetch_sub(1, Ordering::Relaxed);
if old_count == 1 {
old_cgroup.propagate_sub_populated();
}
}
})
.unwrap();
process.set_cgroup(None);
}
}
/// The root of a cgroup hierarchy, serving as the entry point to
/// the entire cgroup control system.
///
@ -29,9 +140,39 @@ pub(super) struct CgroupSystem {
/// Each node can bind a group of processes together for purpose of resource
/// management. Except for the root node, all nodes in the cgroup tree are of
/// this type.
#[derive(Debug)]
struct CgroupNode {
pub struct CgroupNode {
fields: BranchNodeFields<CgroupNode, Self>,
/// The inner data. If it is `None`, then the cgroup node is dead.
inner: RwMutex<Option<Inner>>,
/// The depth of the node in the cgroupfs [`SysTree`], where the child of
/// the root node has a depth of 1.
depth: usize,
/// Tracks the "populated" status of this node and its direct children.
///
/// The count is the sum of:
/// - The number of its direct children that are populated.
/// - A value of 1 if this node itself contains processes.
///
/// "populated": A node is considered populated if it has bound processes
/// either on itself or in any of its descendant nodes. Consequently,
/// a count > 0 indicates that this node is populated.
populated_count: AtomicUsize,
}
impl Debug for CgroupNode {
    /// Formats the node's systree fields, populated counter, and depth.
    ///
    /// The lock-protected inner state is left out, so the output is marked
    /// as non-exhaustive.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut repr = f.debug_struct("CgroupNode");
        repr.field("fields", &self.fields);
        repr.field("populated_count", &self.populated_count);
        repr.field("depth", &self.depth);
        repr.finish_non_exhaustive()
    }
}
/// The state of a live cgroup node.
///
/// `CgroupNode` stores this inside an `Option`; `None` there means the
/// node is dead.
#[derive(Default)]
struct Inner {
/// Processes bound to the cgroup node, keyed by PID.
///
/// Only weak references are stored here.
processes: BTreeMap<Pid, Weak<Process>>,
}
#[inherit_methods(from = "self.fields")]
@ -67,6 +208,10 @@ impl CgroupSystem {
SysStr::from("cgroup.max.depth"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(
SysStr::from("cgroup.procs"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(
SysStr::from("cgroup.threads"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
@ -86,7 +231,7 @@ impl CgroupSystem {
}
impl CgroupNode {
pub(self) fn new(name: SysStr) -> Arc<Self> {
pub(self) fn new(name: SysStr, depth: usize) -> Arc<Self> {
let mut builder = SysAttrSetBuilder::new();
// TODO: Add more attributes as needed. The normal cgroup node may have
// more attributes than the unified one.
@ -98,6 +243,10 @@ impl CgroupNode {
SysStr::from("cgroup.max.depth"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(
SysStr::from("cgroup.procs"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(
SysStr::from("cgroup.threads"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
@ -107,15 +256,125 @@ impl CgroupNode {
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(SysStr::from("cpu.stat"), SysPerms::DEFAULT_RO_ATTR_PERMS);
builder.add(
SysStr::from("cgroup.events"),
SysPerms::DEFAULT_RO_ATTR_PERMS,
);
let attrs = builder.build().expect("Failed to build attribute set");
Arc::new_cyclic(|weak_self| {
let fields = BranchNodeFields::new(name, attrs, weak_self.clone());
CgroupNode { fields }
CgroupNode {
fields,
inner: RwMutex::new(Some(Inner::default())),
depth,
populated_count: AtomicUsize::new(0),
}
})
}
}
// For process management
impl CgroupNode {
fn propagate_add_populated(&self) {
if self.depth <= 1 {
return;
}
let mut current_parent = Arc::downcast::<CgroupNode>(self.parent().unwrap()).unwrap();
loop {
let old_count = current_parent
.populated_count
.fetch_add(1, Ordering::Relaxed);
if old_count > 0 {
break;
}
if current_parent.depth == 1 {
break;
}
current_parent = Arc::downcast::<CgroupNode>(current_parent.parent().unwrap()).unwrap();
}
}
fn propagate_sub_populated(&self) {
if self.depth <= 1 {
return;
}
let mut current_parent = Arc::downcast::<CgroupNode>(self.parent().unwrap()).unwrap();
loop {
let old_count = current_parent
.populated_count
.fetch_sub(1, Ordering::Relaxed);
if old_count != 1 {
break;
}
if current_parent.depth == 1 {
break;
}
current_parent = Arc::downcast::<CgroupNode>(current_parent.parent().unwrap()).unwrap();
}
}
/// Performs a read-only operation on the inner data.
///
/// If the cgroup node is dead, returns `None`.
fn with_inner<F, R>(&self, op: F) -> Option<R>
where
F: FnOnce(&BTreeMap<Pid, Weak<Process>>) -> R,
{
let inner = self.inner.read();
let inner_ref = inner.as_ref()?;
Some(op(&inner_ref.processes))
}
/// Performs a mutable operation on the inner data.
///
/// If the cgroup node is dead, returns `None`.
fn with_inner_mut<F, R>(&self, op: F) -> Option<R>
where
F: FnOnce(&mut BTreeMap<Pid, Weak<Process>>) -> R,
{
let mut inner = self.inner.write();
let inner_ref = inner.as_mut()?;
Some(op(&mut inner_ref.processes))
}
/// Marks this cgroup node as dead.
///
/// This will succeed only if the cgroup node is empty and is alive.
/// Here, a cgroup node is considered empty if it has no child nodes and no
/// processes bound to it.
pub(super) fn mark_as_dead(&self) -> crate::Result<()> {
let mut inner = self.inner.write();
let Some(inner_ref) = inner.as_ref() else {
return_errno_with_message!(Errno::ENOENT, "the cgroup node is already dead");
};
if !inner_ref.processes.is_empty() {
return_errno_with_message!(Errno::EBUSY, "the cgroup hierarchy still has processes");
}
let children = self.fields.children_ref().read();
if !children.is_empty() {
return_errno_with_message!(
Errno::ENOTEMPTY,
"only an empty cgroup hierarchy can be removed"
);
}
*inner = None;
Ok(())
}
}
inherit_sys_branch_node!(CgroupSystem, fields, {
fn is_root(&self) -> bool {
true
@ -125,14 +384,50 @@ inherit_sys_branch_node!(CgroupSystem, fields, {
// This method should be a no-op for `RootNode`.
}
fn read_attr_at(&self, _name: &str, _offset: usize, _writer: &mut VmWriter) -> Result<usize> {
// TODO: Add support for reading attributes.
Err(Error::AttributeError)
/// Reads the attribute `name` of the root cgroup, starting at `offset`.
///
/// Only `cgroup.procs` is supported: it prints the PID of every process
/// that is not bound to a cgroup node, one per line. Any other name
/// yields `Error::AttributeError`.
fn read_attr_at(&self, name: &str, offset: usize, writer: &mut VmWriter) -> Result<usize> {
    let mut printer = VmPrinter::new_skip(writer, offset);

    // TODO: Add support for reading other attributes.
    if name != "cgroup.procs" {
        return Err(Error::AttributeError);
    }

    let table = process_table::process_table_mut();
    for process in table.iter() {
        // Processes without a bound cgroup node belong to the root.
        if !process.cgroup().is_none() {
            continue;
        }
        writeln!(printer, "{}", process.pid())?;
    }

    Ok(printer.bytes_written())
}
fn write_attr(&self, _name: &str, _reader: &mut VmReader) -> Result<usize> {
// TODO: Add support for writing attributes.
Err(Error::AttributeError)
fn write_attr(&self, name: &str, reader: &mut VmReader) -> Result<usize> {
match name {
"cgroup.procs" => {
let (content, len) = reader
.read_cstring_until_end(MAX_ATTR_SIZE)
.map_err(|_| Error::PageFault)?;
let pid = content
.to_str()
.ok()
.and_then(|string| string.trim().parse::<Pid>().ok())
.ok_or(Error::InvalidOperation)?;
with_process_cgroup_locked(pid, |process, cgroup_membership| {
cgroup_membership.move_process_to_root(&process);
Ok(())
})?;
Ok(len)
}
_ => {
// TODO: Add support for writing other attributes.
Err(Error::AttributeError)
}
}
}
fn perms(&self) -> SysPerms {
@ -140,21 +435,72 @@ inherit_sys_branch_node!(CgroupSystem, fields, {
}
fn create_child(&self, name: &str) -> Result<Arc<dyn SysObj>> {
let new_child = CgroupNode::new(name.to_string().into());
let new_child = CgroupNode::new(name.to_string().into(), 1);
self.add_child(new_child.clone())?;
Ok(new_child)
}
});
inherit_sys_branch_node!(CgroupNode, fields, {
fn read_attr_at(&self, _name: &str, _offset: usize, _writer: &mut VmWriter) -> Result<usize> {
// TODO: Add support for reading attributes.
Err(Error::AttributeError)
fn read_attr_at(&self, name: &str, offset: usize, writer: &mut VmWriter) -> Result<usize> {
self.with_inner(|processes| {
let mut printer = VmPrinter::new_skip(writer, offset);
match name {
"cgroup.procs" => {
for pid in processes.keys() {
writeln!(printer, "{}", pid)?;
}
}
"cgroup.events" => {
let res = if self.populated_count.load(Ordering::Relaxed) > 0 {
1
} else {
0
};
writeln!(printer, "populated {}", res)?;
// Currently we have not enabled the "frozen" attribute
// so the "frozen" field is always zero.
writeln!(printer, "frozen {}", 0)?;
}
_ => {
// TODO: Add support for reading other attributes.
return Err(Error::AttributeError);
}
}
Ok(printer.bytes_written())
})
.ok_or(Error::IsDead)?
}
fn write_attr(&self, _name: &str, _reader: &mut VmReader) -> Result<usize> {
// TODO: Add support for writing attributes.
Err(Error::AttributeError)
fn write_attr(&self, name: &str, reader: &mut VmReader) -> Result<usize> {
match name {
"cgroup.procs" => {
let (content, len) = reader
.read_cstring_until_end(MAX_ATTR_SIZE)
.map_err(|_| Error::PageFault)?;
let pid = content
.to_str()
.ok()
.and_then(|string| string.trim().parse::<Pid>().ok())
.ok_or(Error::InvalidOperation)?;
with_process_cgroup_locked(pid, |target_process, cgroup_membership| {
// TODO: According to the "no internal processes" rule of cgroupv2
// (Ref: https://man7.org/linux/man-pages/man7/cgroups.7.html),
// if the cgroup node has enabled some controllers like "memory", "io",
// it is forbidden to bind a process to an internal cgroup node.
cgroup_membership.move_process_to_node(target_process, self)
})?;
Ok(len)
}
_ => {
// TODO: Add support for writing other attributes.
Err(Error::AttributeError)
}
}
}
fn perms(&self) -> SysPerms {
@ -162,8 +508,37 @@ inherit_sys_branch_node!(CgroupNode, fields, {
}
fn create_child(&self, name: &str) -> Result<Arc<dyn SysObj>> {
let new_child = CgroupNode::new(name.to_string().into());
self.add_child(new_child.clone())?;
Ok(new_child)
self.with_inner(|_| {
let new_child = CgroupNode::new(name.to_string().into(), self.depth + 1);
self.add_child(new_child.clone())?;
Ok(new_child as _)
})
// TODO: This should be checked at upper layers.
.ok_or(Error::NotFound)?
}
});
/// A helper function to safely perform an operation on a process's cgroup.
///
/// The given `pid` means the PID of the target process. A PID of 0 refers to the
/// current process.
///
/// Returns `Error::InvalidOperation` if the PID is not found or if the target
/// process is a zombie.
fn with_process_cgroup_locked<F>(pid: Pid, op: F) -> Result<()>
where
F: FnOnce(Arc<Process>, &mut CgroupMembership) -> Result<()>,
{
let process = if pid == 0 {
current!()
} else {
process_table::get_process(pid).ok_or(Error::InvalidOperation)?
};
let mut cgroup_guard = CgroupMembership::lock();
if process.status().is_zombie() {
return Err(Error::InvalidOperation);
}
op(process, &mut cgroup_guard)
}

View File

@ -307,7 +307,9 @@ impl<KInode: SysTreeInodeTy + Send + Sync + 'static> Inode for KInode {
}
default fn resize(&self, _new_size: usize) -> Result<()> {
Err(Error::new(Errno::EPERM))
// The `resize` operation should be ignored by kernelfs inodes,
// and should not incur an error.
Ok(())
}
default fn atime(&self) -> Duration {
@ -434,6 +436,10 @@ impl<KInode: SysTreeInodeTy + Send + Sync + 'static> Inode for KInode {
Err(Error::new(Errno::EPERM))
}
// Default `rmdir` for systree-backed inodes: always fails with `EPERM`.
// It is declared `default` so a concrete inode type can provide its own.
default fn rmdir(&self, _name: &str) -> Result<()> {
Err(Error::new(Errno::EPERM))
}
default fn rename(
&self,
_old_name: &str,

View File

@ -18,6 +18,7 @@ use crate::{
cpu::LinuxAbi,
current_userspace,
fs::{
cgroupfs::CgroupMembership,
file_table::{FdFlags, FileTable},
thread_info::ThreadFsInfo,
},
@ -288,6 +289,15 @@ pub fn clone_child(
Ok(child_tid)
} else {
let child_process = clone_child_process(ctx, parent_context, clone_args)?;
let mut cgroup_guard = CgroupMembership::lock();
if let Some(cgroup) = ctx.process.cgroup().get() {
cgroup_guard
.move_process_to_node(child_process.clone(), &cgroup)
.unwrap();
}
drop(cgroup_guard);
if clone_args.flags.contains(CloneFlags::CLONE_VFORK) {
child_process.status().set_vfork_child(true);
}

View File

@ -3,7 +3,10 @@
use core::sync::atomic::Ordering;
use super::{process_table, Pid, Process};
use crate::{events::IoEvents, prelude::*, process::signal::signals::kernel::KernelSignal};
use crate::{
events::IoEvents, fs::cgroupfs::CgroupMembership, prelude::*,
process::signal::signals::kernel::KernelSignal,
};
/// Exits the current POSIX process.
///
@ -26,6 +29,11 @@ pub(super) fn exit_process(current_process: &Process) {
move_children_to_reaper_process(current_process);
send_child_death_signal(current_process);
// Remove the process from the cgroup.
let mut cgroup_guard = CgroupMembership::lock();
cgroup_guard.move_process_to_root(current_process);
drop(cgroup_guard);
}
/// Sends parent-death signals to the children.

View File

@ -20,6 +20,7 @@ use super::{
task_set::TaskSet,
};
use crate::{
fs::cgroupfs::CgroupNode,
prelude::*,
process::{
signal::{sig_queues::SigQueues, Pollee},
@ -42,7 +43,10 @@ mod timer_manager;
use atomic_integer_wrapper::define_atomic_version_of_integer_like_type;
pub use init_proc::spawn_init_process;
pub use job_control::JobControl;
use ostd::{sync::WaitQueue, task::Task};
use ostd::{
sync::{RcuOption, RcuOptionReadGuard, WaitQueue},
task::Task,
};
pub use process_group::ProcessGroup;
pub use session::Session;
pub use terminal::Terminal;
@ -98,6 +102,10 @@ pub struct Process {
reaped_children_stats: Mutex<ReapedChildrenStats>,
/// resource limits
resource_limits: ResourceLimits,
/// The bound cgroup of the process.
///
/// If this field is `None`, the process is bound to the root cgroup.
cgroup: RcuOption<Arc<CgroupNode>>,
/// Scheduling priority nice value
/// According to POSIX.1, the nice value is a per-process attribute,
/// the threads in a process should share a nice value.
@ -243,6 +251,7 @@ impl Process {
parent_death_signal: AtomicSigNum::new_empty(),
exit_signal: AtomicSigNum::new_empty(),
resource_limits,
cgroup: RcuOption::new(None),
nice: AtomicNice::new(nice),
oom_score_adj: AtomicI16::new(oom_score_adj),
timer_manager: PosixTimerManager::new(&prof_clock, process_ref),
@ -786,6 +795,28 @@ impl Process {
pub fn user_ns(&self) -> &Mutex<Arc<UserNamespace>> {
&self.user_ns
}
// ******************* cgroup ********************
/// Returns an RCU read guard to the cgroup of the process.
///
/// The returned cgroup is not a stable snapshot. It may be changed by other threads
/// and encounter race conditions. Users can call [`CgroupMembership::lock`] to obtain
/// a lock to prevent the cgroup from being changed.
///
/// [`CgroupMembership::lock`]: crate::fs::cgroupfs::CgroupMembership::lock
pub fn cgroup(&self) -> RcuOptionReadGuard<Arc<CgroupNode>> {
self.cgroup.read()
}
/// Sets the cgroup for this process.
///
/// Passing `None` binds the process to the root cgroup.
///
/// Note: This function should only be called within the cgroup module.
/// Arbitrary calls may likely cause race conditions.
#[doc(hidden)]
pub fn set_cgroup(&self, cgroup: Option<Arc<CgroupNode>>) {
self.cgroup.update(cgroup);
}
}
/// Enqueues a process-directed kernel signal asynchronously.