diff --git a/kernel/comps/systree/src/lib.rs b/kernel/comps/systree/src/lib.rs index 0b001439e..0d54f784d 100644 --- a/kernel/comps/systree/src/lib.rs +++ b/kernel/comps/systree/src/lib.rs @@ -85,6 +85,8 @@ pub enum Error { NotFound, /// Invalid operation occurred InvalidOperation, + /// Resource is unavailable + ResourceUnavailable, /// Attribute operation failed AttributeError, /// Permission denied for operation @@ -106,6 +108,7 @@ impl core::fmt::Display for Error { match self { Error::NotFound => write!(f, "Attempted to access a non-existent systree item"), Error::InvalidOperation => write!(f, "Invalid operation occurred"), + Error::ResourceUnavailable => write!(f, "Resource is unavailable"), Error::AttributeError => write!(f, "Attribute error"), Error::PermissionDenied => write!(f, "Permission denied for operation"), Error::InternalError(msg) => write!(f, "Internal error: {}", msg), diff --git a/kernel/src/error.rs b/kernel/src/error.rs index ef4d3fdfe..40e4f52e2 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -341,6 +341,7 @@ impl From for Error { match err { NotFound => Error::new(Errno::ENOENT), InvalidOperation => Error::new(Errno::EINVAL), + ResourceUnavailable => Error::new(Errno::EBUSY), AttributeError => Error::new(Errno::EIO), PermissionDenied => Error::new(Errno::EACCES), InternalError(msg) => Error::with_message(Errno::EIO, msg), diff --git a/kernel/src/fs/cgroupfs/controller/cpuset.rs b/kernel/src/fs/cgroupfs/controller/cpuset.rs new file mode 100644 index 000000000..d81f04574 --- /dev/null +++ b/kernel/src/fs/cgroupfs/controller/cpuset.rs @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MPL-2.0 + +use alloc::sync::Arc; + +use aster_systree::{Error, Result, SysAttrSetBuilder, SysPerms, SysStr}; +use ostd::mm::{VmReader, VmWriter}; + +/// A sub-controller responsible for CPU resource management in the cgroup subsystem. 
+pub struct CpuSetController {
+    _private: (),
+}
+
+impl CpuSetController {
+    pub(super) fn init_attr_set(builder: &mut SysAttrSetBuilder, is_root: bool) {
+        if !is_root {
+            builder.add(SysStr::from("cpuset.cpus"), SysPerms::DEFAULT_RW_ATTR_PERMS);
+            builder.add(SysStr::from("cpuset.mems"), SysPerms::DEFAULT_RW_ATTR_PERMS);
+        }
+
+        builder.add(
+            SysStr::from("cpuset.cpus.effective"),
+            SysPerms::DEFAULT_RO_ATTR_PERMS,
+        );
+        builder.add(
+            SysStr::from("cpuset.mems.effective"),
+            SysPerms::DEFAULT_RO_ATTR_PERMS,
+        );
+    }
+}
+
+impl super::SubControl for CpuSetController {
+    fn read_attr_at(&self, _name: &str, _offset: usize, _writer: &mut VmWriter) -> Result<usize> {
+        Err(Error::AttributeError)
+    }
+
+    fn write_attr(&self, _name: &str, _reader: &mut VmReader) -> Result<usize> {
+        Err(Error::AttributeError)
+    }
+}
+
+impl super::SubControlStatic for CpuSetController {
+    fn new(_is_root: bool) -> Self {
+        Self { _private: () }
+    }
+
+    fn type_() -> super::SubCtrlType {
+        super::SubCtrlType::CpuSet
+    }
+
+    fn read_from(controller: &super::Controller) -> Arc<super::SubController<Self>> {
+        controller.cpuset.read().get().clone()
+    }
+}
diff --git a/kernel/src/fs/cgroupfs/controller/memory.rs b/kernel/src/fs/cgroupfs/controller/memory.rs
new file mode 100644
index 000000000..70aff21e2
--- /dev/null
+++ b/kernel/src/fs/cgroupfs/controller/memory.rs
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: MPL-2.0
+
+use alloc::sync::Arc;
+
+use aster_systree::{Error, Result, SysAttrSetBuilder, SysPerms, SysStr};
+use ostd::mm::{VmReader, VmWriter};
+
+/// A sub-controller responsible for memory resource management in the cgroup subsystem.
+///
+/// Note that even if the controller is inactive, it still provides some interfaces
+/// like "memory.pressure" for usage.
+pub struct MemoryController {
+    _private: (),
+}
+
+impl MemoryController {
+    pub(super) fn init_attr_set(builder: &mut SysAttrSetBuilder, is_root: bool) {
+        // These attributes only exist on the non-root cgroup nodes.
+        // However, it seems that the `memory.stat` attribute is also present on the root node in practice.
+        // Currently the implementation follows the documentation strictly.
+        //
+        // Reference: <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
+        if !is_root {
+            builder.add(SysStr::from("memory.stat"), SysPerms::DEFAULT_RO_ATTR_PERMS);
+            builder.add(SysStr::from("memory.max"), SysPerms::DEFAULT_RW_ATTR_PERMS);
+            builder.add(
+                SysStr::from("memory.events"),
+                SysPerms::DEFAULT_RO_ATTR_PERMS,
+            );
+        }
+    }
+}
+
+impl super::SubControl for MemoryController {
+    fn read_attr_at(&self, _name: &str, _offset: usize, _writer: &mut VmWriter) -> Result<usize> {
+        Err(Error::AttributeError)
+    }
+
+    fn write_attr(&self, _name: &str, _reader: &mut VmReader) -> Result<usize> {
+        Err(Error::AttributeError)
+    }
+}
+
+impl super::SubControlStatic for MemoryController {
+    fn new(_is_root: bool) -> Self {
+        Self { _private: () }
+    }
+
+    fn type_() -> super::SubCtrlType {
+        super::SubCtrlType::Memory
+    }
+
+    fn read_from(controller: &super::Controller) -> Arc<super::SubController<Self>> {
+        controller.memory.read().get().clone()
+    }
+}
diff --git a/kernel/src/fs/cgroupfs/controller/mod.rs b/kernel/src/fs/cgroupfs/controller/mod.rs
new file mode 100644
index 000000000..233cd3f8f
--- /dev/null
+++ b/kernel/src/fs/cgroupfs/controller/mod.rs
@@ -0,0 +1,413 @@
+// SPDX-License-Identifier: MPL-2.0
+
+use alloc::{collections::vec_deque::VecDeque, sync::Arc};
+use core::{fmt::Display, str::FromStr};
+
+use aster_systree::{Error, Result, SysAttrSetBuilder, SysBranchNode, SysObj};
+use bitflags::bitflags;
+use ostd::{
+    mm::{VmReader, VmWriter},
+    sync::{Mutex, MutexGuard, Rcu},
+};
+
+use crate::fs::cgroupfs::{
+    controller::{cpuset::CpuSetController, memory::MemoryController, pids::PidsController},
+    systree_node::CgroupSysNode,
+    CgroupNode,
+};
+
+mod cpuset;
+mod memory;
+mod pids;
+
+/// A trait to abstract all individual cgroup sub-controllers.
+trait SubControl {
+    fn read_attr_at(&self, name: &str, offset: usize, writer: &mut VmWriter) -> Result<usize>;
+
+    fn write_attr(&self, name: &str, reader: &mut VmReader) -> Result<usize>;
+}
+
+/// Defines the static properties and behaviors of a specific cgroup sub-controller.
+trait SubControlStatic: SubControl + Sized + 'static {
+    /// Creates a new instance of the sub-controller.
+    fn new(is_root: bool) -> Self;
+
+    /// Returns the `SubCtrlType` enum variant corresponding to this sub-controller.
+    fn type_() -> SubCtrlType;
+
+    /// Reads and clones the `Arc` of this sub-controller in the given `Controller`.
+    fn read_from(controller: &Controller) -> Arc<SubController<Self>>;
+}
+
+/// The type of a sub-controller in the cgroup subsystem.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) enum SubCtrlType {
+    Memory,
+    CpuSet,
+    Pids,
+}
+
+impl SubCtrlType {
+    const ALL: [Self; 3] = [Self::Memory, Self::CpuSet, Self::Pids];
+}
+
+impl FromStr for SubCtrlType {
+    type Err = aster_systree::Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        match s {
+            "memory" => Ok(SubCtrlType::Memory),
+            "cpuset" => Ok(SubCtrlType::CpuSet),
+            "pids" => Ok(SubCtrlType::Pids),
+            _ => Err(Error::NotFound),
+        }
+    }
+}
+
+bitflags! {
+    /// A set of sub-controller types, represented as bitflags.
+    pub(super) struct SubCtrlSet: u8 {
+        const MEMORY = 1 << 0;
+        const CPUSET = 1 << 1;
+        const PIDS = 1 << 2;
+    }
+}
+
+impl SubCtrlSet {
+    /// Checks whether a sub-control is active in the current set.
+    pub(super) fn contains_type(&self, ctrl_type: SubCtrlType) -> bool {
+        self.contains(ctrl_type.into())
+    }
+
+    /// Adds a sub-control type to the current set.
+    pub(super) fn add_type(&mut self, ctrl_type: SubCtrlType) {
+        *self |= ctrl_type.into()
+    }
+
+    /// Removes a sub-control type from the current set.
+    pub(super) fn remove_type(&mut self, ctrl_type: SubCtrlType) {
+        *self -= ctrl_type.into()
+    }
+
+    /// Returns an iterator over the sub-controller types in the current set.
+    pub(super) fn iter_types(&self) -> impl Iterator<Item = SubCtrlType> + '_ {
+        SubCtrlType::ALL
+            .into_iter()
+            .filter(|&ctrl_type| self.contains_type(ctrl_type))
+    }
+}
+
+impl Display for SubCtrlSet {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        // Separate the names with single spaces and emit no trailing separator.
+        for (index, ctrl_type) in self.iter_types().enumerate() {
+            let name = match ctrl_type {
+                SubCtrlType::Memory => "memory",
+                SubCtrlType::CpuSet => "cpuset",
+                SubCtrlType::Pids => "pids",
+            };
+            write!(f, "{}{}", if index == 0 { "" } else { " " }, name)?;
+        }
+
+        Ok(())
+    }
+}
+
+impl From<SubCtrlType> for SubCtrlSet {
+    fn from(ctrl_type: SubCtrlType) -> Self {
+        match ctrl_type {
+            SubCtrlType::Memory => Self::MEMORY,
+            SubCtrlType::CpuSet => Self::CPUSET,
+            SubCtrlType::Pids => Self::PIDS,
+        }
+    }
+}
+
+/// The sub-controller for a specific cgroup controller type.
+///
+/// If the sub-controller is inactive, the `inner` field will be `None`.
+struct SubController<T: SubControlStatic> {
+    inner: Option<T>,
+    /// The parent sub-controller in the hierarchy.
+    ///
+    /// This field is used to traverse the controller hierarchy.
+    #[expect(dead_code)]
+    parent: Option<Arc<SubController<T>>>,
+}
+
+impl<T: SubControlStatic> SubController<T> {
+    fn new(parent_controller: Option<&LockedController>) -> Arc<Self> {
+        let is_active = if let Some(parent) = parent_controller {
+            parent.active_set.contains_type(T::type_())
+        } else {
+            true
+        };
+
+        let inner = if is_active {
+            Some(T::new(parent_controller.is_none()))
+        } else {
+            None
+        };
+
+        let parent = parent_controller.map(|controller| T::read_from(controller.controller));
+
+        Arc::new(Self { inner, parent })
+    }
+}
+
+trait TryGetSubControl {
+    fn try_get(&self) -> Option<&dyn SubControl>;
+}
+
+impl<T: SubControlStatic> TryGetSubControl for SubController<T> {
+    fn try_get(&self) -> Option<&dyn SubControl> {
+        self.inner.as_ref().map(|sub_ctrl| sub_ctrl as _)
+    }
+}
+
+/// The controller for a single cgroup.
+///
+/// This struct can manage the activation state of each sub-control, and dispatches read/write
+/// operations to the appropriate sub-controllers.
+///
+/// The following is an explanation of the activation for sub-controls and sub-controllers. When a
+/// cgroup activates a specific sub-control (e.g., memory, io), it means this control capability is
+/// being delegated to its children. Consequently, the corresponding sub-controller within the
+/// child nodes will be activated.
+///
+/// The root node serves as the origin for all these control capabilities, so the sub-controllers
+/// it possesses are always active. For any other node, a sub-controller is active only if the
+/// parent node has enabled the corresponding sub-control.
+pub(super) struct Controller {
+    /// A set of types of active sub-controllers.
+    active_set: Mutex<SubCtrlSet>,
+
+    memory: Rcu<Arc<SubController<MemoryController>>>,
+    cpuset: Rcu<Arc<SubController<CpuSetController>>>,
+    pids: Rcu<Arc<SubController<PidsController>>>,
+}
+
+impl Controller {
+    /// Creates a new controller manager for a cgroup.
+    pub(super) fn new(locked_parent_controller: Option<&LockedController>) -> Self {
+        let memory_controller = SubController::new(locked_parent_controller);
+        let cpuset_controller = SubController::new(locked_parent_controller);
+        let pids_controller = SubController::new(locked_parent_controller);
+
+        Self {
+            active_set: Mutex::new(SubCtrlSet::empty()),
+            memory: Rcu::new(memory_controller),
+            cpuset: Rcu::new(cpuset_controller),
+            pids: Rcu::new(pids_controller),
+        }
+    }
+
+    pub(super) fn init_attr_set(builder: &mut SysAttrSetBuilder, is_root: bool) {
+        MemoryController::init_attr_set(builder, is_root);
+        CpuSetController::init_attr_set(builder, is_root);
+        PidsController::init_attr_set(builder, is_root);
+    }
+
+    pub(super) fn lock(&self) -> LockedController {
+        LockedController {
+            active_set: self.active_set.lock(),
+            controller: self,
+        }
+    }
+
+    fn read_sub(&self, ctrl_type: SubCtrlType) -> Arc<dyn TryGetSubControl> {
+        match ctrl_type {
+            SubCtrlType::Memory => MemoryController::read_from(self),
+            SubCtrlType::CpuSet => CpuSetController::read_from(self),
+            SubCtrlType::Pids => PidsController::read_from(self),
+        }
+    }
+
+    /// Returns whether the attribute with the given name is absent in this controller.
+    pub(super) fn is_attr_absent(&self, name: &str) -> bool {
+        let Some((subsys, _)) = name.split_once('.') else {
+            return false;
+        };
+        let Ok(ctrl_type) = SubCtrlType::from_str(subsys) else {
+            return false;
+        };
+
+        // If the sub-controller is not active, all its attributes are considered absent.
+        // Names without a `.` separator or with an unknown sub-controller prefix were
+        // already reported as present by the early returns above.
+        //
+        // `read_sub` returns a clone of the currently published `Arc`, so the check
+        // reflects the activation state at the time of the call.
+        self.read_sub(ctrl_type).try_get().is_none()
+    }
+
+    pub(super) fn read_attr_at(
+        &self,
+        name: &str,
+        offset: usize,
+        writer: &mut VmWriter,
+    ) -> Result<usize> {
+        let Some((subsys, _)) = name.split_once('.') else {
+            return Err(Error::NotFound);
+        };
+        let ctrl_type = SubCtrlType::from_str(subsys)?;
+
+        let sub_controller = self.read_sub(ctrl_type);
+        let Some(controller) = sub_controller.try_get() else {
+            return Err(Error::IsDead);
+        };
+
+        controller.read_attr_at(name, offset, writer)
+    }
+
+    pub(super) fn write_attr(&self, name: &str, reader: &mut VmReader) -> Result<usize> {
+        let Some((subsys, _)) = name.split_once('.') else {
+            return Err(Error::NotFound);
+        };
+        let ctrl_type = SubCtrlType::from_str(subsys)?;
+
+        let sub_controller = self.read_sub(ctrl_type);
+        let Some(controller) = sub_controller.try_get() else {
+            return Err(Error::IsDead);
+        };
+
+        controller.write_attr(name, reader)
+    }
+}
+
+/// A locked controller for a cgroup.
+///
+/// Holding this lock indicates exclusive access to modify the sub-control state.
+pub(super) struct LockedController<'a> {
+    active_set: MutexGuard<'a, SubCtrlSet>,
+    controller: &'a Controller,
+}
+
+impl LockedController<'_> {
+    /// Activates a sub-control of the specified type.
+    pub(super) fn activate(
+        &mut self,
+        ctrl_type: SubCtrlType,
+        current_node: &dyn CgroupSysNode,
+        parent_controller: Option<&LockedController>,
+    ) -> Result<()> {
+        if self.active_set.contains_type(ctrl_type) {
+            return Ok(());
+        }
+
+        // A cgroup can activate the sub-control only if this
+        // sub-control has been activated in its parent cgroup.
+        if parent_controller
+            .is_some_and(|controller| !controller.active_set.contains_type(ctrl_type))
+        {
+            return Err(Error::NotFound);
+        }
+
+        self.active_set.add_type(ctrl_type);
+        self.update_sub_controllers_for_descendants(ctrl_type, current_node);
+
+        Ok(())
+    }
+
+    /// Deactivates a sub-control of the specified type.
+    pub(super) fn deactivate(
+        &mut self,
+        ctrl_type: SubCtrlType,
+        current_node: &dyn CgroupSysNode,
+    ) -> Result<()> {
+        if !self.active_set.contains_type(ctrl_type) {
+            return Ok(());
+        }
+
+        // If any child node has activated this sub-control,
+        // the deactivation operation will be rejected.
+        for child in current_node.children() {
+            let cgroup_child = child.as_any().downcast_ref::<CgroupNode>().unwrap();
+            let child_controller = cgroup_child.controller().lock();
+            // This is race-free because if a child wants to activate a sub-controller, it should
+            // first acquire the lock of the parent controller, which is held here.
+            if child_controller.active_set().contains_type(ctrl_type) {
+                return Err(Error::InvalidOperation);
+            }
+        }
+
+        self.active_set.remove_type(ctrl_type);
+        self.update_sub_controllers_for_descendants(ctrl_type, current_node);
+
+        Ok(())
+    }
+
+    fn update_sub_controllers_for_descendants(
+        &self,
+        ctrl_type: SubCtrlType,
+        current_node: &dyn CgroupSysNode,
+    ) {
+        fn update_sub_controller_for_one_child(
+            child: &Arc<dyn SysObj>,
+            ctrl_type: SubCtrlType,
+            parent_controller: &LockedController,
+        ) {
+            let child_node = child.as_any().downcast_ref::<CgroupNode>().unwrap();
+            match ctrl_type {
+                SubCtrlType::Memory => {
+                    let new_controller = SubController::new(Some(parent_controller));
+                    child_node.controller().memory.update(new_controller);
+                }
+                SubCtrlType::CpuSet => {
+                    let new_controller = SubController::new(Some(parent_controller));
+                    child_node.controller().cpuset.update(new_controller);
+                }
+                SubCtrlType::Pids => {
+                    let new_controller = SubController::new(Some(parent_controller));
+                    child_node.controller().pids.update(new_controller);
+                }
+            }
+        }
+
+        let mut descendants = VecDeque::new();
+
+        // The following update logic is race-free due to the following reasons:
+        //
+        // 1. **No Concurrent Controller Activation/Deactivation**:
+        //    At this point, we hold the controller lock for the current node and we know that the
+        //    sub-controllers for the direct children are inactive. Then, no sub-controllers for
+        //    any of the descendants can be activated before we release the lock.
+        //
+        // 2. **Concurrent Child Addition/Deletion is Fine**:
+        //    We do need to consider that children may be added or removed concurrently. However,
+        //    this is handled correctly:
+        //    - If a child is added, it will attempt to hold its parent's controller lock, which is
+        //      synchronized with the code below. If this happens after us, the up-to-date
+        //      sub-controllers will be seen. If it happens before us, we will update the
+        //      sub-controllers for it; due to race conditions, the sub-controllers may already be
+        //      up to date, but updating them twice is harmless since they must not be activated.
+        //    - If a child is removed, we may update a sub-controller that's about to be destroyed,
+        //      which is harmless.
+
+        // Update the direct children first.
+        current_node.visit_children_with(0, &mut |child_node| {
+            descendants.push_back(child_node.clone());
+            update_sub_controller_for_one_child(child_node, ctrl_type, self);
+
+            Some(())
+        });
+
+        // Then update all the other descendant nodes.
+        while let Some(node) = descendants.pop_front() {
+            let current_node = node.as_any().downcast_ref::<CgroupNode>().unwrap();
+            // For descendant nodes, the sub-control must be inactive. But taking the controller
+            // lock is necessary for synchronization purposes (see the explanation above).
+            let locked_controller = current_node.controller().lock();
+            current_node.visit_children_with(0, &mut |child_node| {
+                descendants.push_back(child_node.clone());
+                update_sub_controller_for_one_child(child_node, ctrl_type, &locked_controller);
+
+                Some(())
+            });
+        }
+    }
+
+    pub(super) fn active_set(&self) -> SubCtrlSet {
+        *self.active_set
+    }
+}
diff --git a/kernel/src/fs/cgroupfs/controller/pids.rs b/kernel/src/fs/cgroupfs/controller/pids.rs
new file mode 100644
index 000000000..a105e4616
--- /dev/null
+++ b/kernel/src/fs/cgroupfs/controller/pids.rs
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MPL-2.0
+
+use alloc::sync::Arc;
+
+use aster_systree::{Error, Result, SysAttrSetBuilder, SysPerms, SysStr};
+use ostd::mm::{VmReader, VmWriter};
+
+/// A sub-controller responsible for PID resource management in the cgroup subsystem.
+///
+/// This controller will only provide interfaces in non-root cgroup nodes.
+pub struct PidsController {
+    _private: (),
+}
+
+impl PidsController {
+    pub(super) fn init_attr_set(builder: &mut SysAttrSetBuilder, is_root: bool) {
+        if !is_root {
+            builder.add(SysStr::from("pids.max"), SysPerms::DEFAULT_RW_ATTR_PERMS);
+        }
+    }
+}
+
+impl super::SubControl for PidsController {
+    fn read_attr_at(&self, _name: &str, _offset: usize, _writer: &mut VmWriter) -> Result<usize> {
+        Err(Error::AttributeError)
+    }
+
+    fn write_attr(&self, _name: &str, _reader: &mut VmReader) -> Result<usize> {
+        Err(Error::AttributeError)
+    }
+}
+
+impl super::SubControlStatic for PidsController {
+    fn new(_is_root: bool) -> Self {
+        Self { _private: () }
+    }
+
+    fn type_() -> super::SubCtrlType {
+        super::SubCtrlType::Pids
+    }
+
+    fn read_from(controller: &super::Controller) -> Arc<super::SubController<Self>> {
+        controller.pids.read().get().clone()
+    }
+}
diff --git a/kernel/src/fs/cgroupfs/inode.rs b/kernel/src/fs/cgroupfs/inode.rs
index 45175451f..b833c8b1d 100644
--- a/kernel/src/fs/cgroupfs/inode.rs
+++ b/kernel/src/fs/cgroupfs/inode.rs
@@ -119,4 +119,11 @@ impl Inode for CgroupInode {

         Ok(())
     }
+
+    fn is_dentry_cacheable(&self) -> bool {
+        // Attribute nodes should not be cached because they may be dynamically
+        // created or removed based on the state of the cgroup controller.
+        // Caching them could result in stale or incorrect entries.
+        !matches!(self.node_kind, SysTreeNodeKind::Attr(..))
+    }
 }
diff --git a/kernel/src/fs/cgroupfs/mod.rs b/kernel/src/fs/cgroupfs/mod.rs
index 5bba889ef..75f09c60d 100644
--- a/kernel/src/fs/cgroupfs/mod.rs
+++ b/kernel/src/fs/cgroupfs/mod.rs
@@ -3,6 +3,7 @@
 use fs::CgroupFsType;
 pub use systree_node::{CgroupMembership, CgroupNode};

+mod controller;
 mod fs;
 mod inode;
 mod systree_node;
diff --git a/kernel/src/fs/cgroupfs/systree_node.rs b/kernel/src/fs/cgroupfs/systree_node.rs
index 953a6306b..185ac786c 100644
--- a/kernel/src/fs/cgroupfs/systree_node.rs
+++ b/kernel/src/fs/cgroupfs/systree_node.rs
@@ -1,11 +1,57 @@
 // SPDX-License-Identifier: MPL-2.0

+//!
Implements the cgroup nodes for the unified cgroup hierarchy (cgroup v2). +//! +//! This module defines the structures for cgroup nodes ([`CgroupNode`]) and the cgroup +//! root ([`CgroupSystem`]), integrating them into the `systree`. It handles process +//! management within cgroups and the logic for reading and writing cgroup attributes. +//! +//! ## Locks and Lock Ordering +//! +//! To ensure thread safety during concurrent operations, this module uses several +//! locks within the cgroup nodes. Adhering to the correct lock ordering is crucial +//! to prevent deadlocks. +//! +//! ### Lock Types +//! +//! 1. **Controller Lock**: Each cgroup node (including the root) has a [`Controller`] +//! that contains a `Mutex`. This lock protects the activation state of the sub-controllers +//! for its children (e.g., `memory`, `pids`). +//! +//! 2. **Inner Lock**: Each non-root [`CgroupNode`] has an `RwMutex` that protects its +//! `inner` data. +//! +//! 3. **Children Lock**: Each cgroup node ([`CgroupNode`] and [`CgroupSystem`]) inherits +//! an `RwLock` from `BranchNodeFields`. This lock protects access to and +//! modification of the list of child cgroup nodes. +//! +//! 4. **Cgroup Membership Lock**: A global `Mutex` managed by [`CgroupMembership`] that +//! serializes modifications to process cgroup memberships across the entire system. +//! +//! ### Locking Rules +//! +//! To avoid deadlocks, the following lock ordering must be strictly followed: +//! +//! 1. **Parent Before Child**: +//! When operating on both a parent and a child node, the lock on the parent +//! node must be acquired before the lock on the child node. +//! +//! 2. **Order Within a Single Node**: +//! When multiple locks are needed on the same cgroup node, they must be +//! acquired in this specific order: +//! `Controller Lock` -> `Inner Lock` -> `Children Lock` +//! +//! 3. **Global Lock First**: +//! When acquiring the `Cgroup Membership Lock` along with any other cgroup locks, +//! 
the `Cgroup Membership Lock` must be acquired first. + use alloc::{ string::ToString, sync::{Arc, Weak}, }; use core::{ fmt::Debug, + str::FromStr, sync::atomic::{AtomicUsize, Ordering}, }; @@ -19,6 +65,7 @@ use ostd::mm::{VmReader, VmWriter}; use spin::Once; use crate::{ + fs::cgroupfs::controller::{Controller, LockedController, SubCtrlSet, SubCtrlType}, prelude::*, process::{process_table, Pid, Process}, }; @@ -66,12 +113,44 @@ impl CgroupMembership { process: Arc, new_cgroup: &CgroupNode, ) -> Result<()> { - if let Some(old_cgroup) = process.cgroup().get() { + let old_cgroup = if let Some(old_cgroup) = process.cgroup().get() { // Fast path: If the process is already in this cgroup, do nothing. if new_cgroup.id() == old_cgroup.id() { return Ok(()); } + Some(old_cgroup.clone()) + } else { + None + }; + + // Try to add the process to the new cgroup first. + + let controller = new_cgroup.controller.lock(); + // According to "no internal processes" rule of cgroupv2, if a non-root + // cgroup node has activated some sub-controls, it cannot bind any process. + // + // Reference: + if !controller.active_set().is_empty() { + return Err(Error::ResourceUnavailable); + } + new_cgroup + .with_inner_mut(|current_processes| { + if current_processes.is_empty() { + let old_count = new_cgroup.populated_count.fetch_add(1, Ordering::Relaxed); + if old_count == 0 { + new_cgroup.propagate_add_populated(); + } + } + + current_processes.insert(process.pid(), Arc::downgrade(&process)); + process.set_cgroup(Some(new_cgroup.fields.weak_self().upgrade().unwrap())); + }) + .ok_or(Error::IsDead)?; + drop(controller); + + // Remove the process from the old cgroup second. 
+ if let Some(old_cgroup) = old_cgroup { old_cgroup .with_inner_mut(|old_cgroup_processes| { old_cgroup_processes.remove(&process.pid()).unwrap(); @@ -83,32 +162,22 @@ impl CgroupMembership { } }) .unwrap(); - }; - - new_cgroup - .with_inner_mut(|current_processes| { - if current_processes.is_empty() { - let old_count = new_cgroup.populated_count.fetch_add(1, Ordering::Relaxed); - if old_count == 0 { - new_cgroup.propagate_add_populated(); - } - } - current_processes.insert(process.pid(), Arc::downgrade(&process)); - }) - .ok_or(Error::IsDead)?; - - process.set_cgroup(Some(new_cgroup.fields.weak_self().upgrade().unwrap())); + } Ok(()) } /// Moves a process to the root cgroup. pub fn move_process_to_root(&mut self, process: &Process) { - let process_cgroup = process.cgroup(); - let Some(old_cgroup) = process_cgroup.get() else { + let old_cgroup = if let Some(old_cgroup) = process.cgroup().get() { + old_cgroup.clone() + } else { + // The process is already in the root cgroup. Do nothing. return; }; + process.set_cgroup(None); + old_cgroup .with_inner_mut(|old_cgroup_processes| { old_cgroup_processes.remove(&process.pid()).unwrap(); @@ -120,8 +189,6 @@ impl CgroupMembership { } }) .unwrap(); - - process.set_cgroup(None); } } @@ -130,9 +197,17 @@ impl CgroupMembership { /// /// The cgroup system provides v2 unified hierarchy, and is also used as a root /// node in the cgroup systree. -#[derive(Debug)] pub(super) struct CgroupSystem { fields: BranchNodeFields, + controller: Controller, +} + +impl Debug for CgroupSystem { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("CgroupSystem") + .field("fields", &self.fields) + .finish_non_exhaustive() + } } /// A control group node in the cgroup systree. @@ -142,6 +217,8 @@ pub(super) struct CgroupSystem { /// this type. pub struct CgroupNode { fields: BranchNodeFields, + /// The controller of this cgroup node. + controller: Controller, /// The inner data. 
If it is `None`, then the cgroup node is dead. inner: RwMutex>, /// The depth of the node in the cgroupfs [`SysTree`], where the child of @@ -204,6 +281,10 @@ impl CgroupSystem { SysStr::from("cgroup.controllers"), SysPerms::DEFAULT_RO_ATTR_PERMS, ); + builder.add( + SysStr::from("cgroup.subtree_control"), + SysPerms::DEFAULT_RW_ATTR_PERMS, + ); builder.add( SysStr::from("cgroup.max.depth"), SysPerms::DEFAULT_RW_ATTR_PERMS, @@ -216,22 +297,32 @@ impl CgroupSystem { SysStr::from("cgroup.threads"), SysPerms::DEFAULT_RW_ATTR_PERMS, ); - builder.add( - SysStr::from("cpu.pressure"), - SysPerms::DEFAULT_RW_ATTR_PERMS, - ); - builder.add(SysStr::from("cpu.stat"), SysPerms::DEFAULT_RO_ATTR_PERMS); + + Controller::init_attr_set(&mut builder, true); let attrs = builder.build().expect("Failed to build attribute set"); Arc::new_cyclic(|weak_self| { let fields = BranchNodeFields::new(name, attrs, weak_self.clone()); - CgroupSystem { fields } + CgroupSystem { + fields, + controller: Controller::new(None), + } }) } } +impl CgroupSysNode for CgroupSystem { + fn controller(&self) -> &Controller { + &self.controller + } +} + impl CgroupNode { - pub(self) fn new(name: SysStr, depth: usize) -> Arc { + pub(self) fn new( + name: SysStr, + depth: usize, + locked_parent_controller: &LockedController, + ) -> Arc { let mut builder = SysAttrSetBuilder::new(); // TODO: Add more attributes as needed. The normal cgroup node may have // more attributes than the unified one. 
@@ -239,6 +330,10 @@ impl CgroupNode { SysStr::from("cgroup.controllers"), SysPerms::DEFAULT_RO_ATTR_PERMS, ); + builder.add( + SysStr::from("cgroup.subtree_control"), + SysPerms::DEFAULT_RW_ATTR_PERMS, + ); builder.add( SysStr::from("cgroup.max.depth"), SysPerms::DEFAULT_RW_ATTR_PERMS, @@ -251,21 +346,19 @@ impl CgroupNode { SysStr::from("cgroup.threads"), SysPerms::DEFAULT_RW_ATTR_PERMS, ); - builder.add( - SysStr::from("cpu.pressure"), - SysPerms::DEFAULT_RW_ATTR_PERMS, - ); - builder.add(SysStr::from("cpu.stat"), SysPerms::DEFAULT_RO_ATTR_PERMS); builder.add( SysStr::from("cgroup.events"), SysPerms::DEFAULT_RO_ATTR_PERMS, ); + Controller::init_attr_set(&mut builder, false); + let attrs = builder.build().expect("Failed to build attribute set"); Arc::new_cyclic(|weak_self| { let fields = BranchNodeFields::new(name, attrs, weak_self.clone()); CgroupNode { fields, + controller: Controller::new(Some(locked_parent_controller)), inner: RwMutex::new(Some(Inner::default())), depth, populated_count: AtomicUsize::new(0), @@ -274,6 +367,12 @@ impl CgroupNode { } } +impl CgroupSysNode for CgroupNode { + fn controller(&self) -> &Controller { + &self.controller + } +} + // For process management impl CgroupNode { fn propagate_add_populated(&self) { @@ -323,6 +422,7 @@ impl CgroupNode { /// Performs a read-only operation on the inner data. /// /// If the cgroup node is dead, returns `None`. + #[must_use] fn with_inner(&self, op: F) -> Option where F: FnOnce(&BTreeMap>) -> R, @@ -336,6 +436,7 @@ impl CgroupNode { /// Performs a mutable operation on the inner data. /// /// If the cgroup node is dead, returns `None`. + #[must_use] fn with_inner_mut(&self, op: F) -> Option where F: FnOnce(&mut BTreeMap>) -> R, @@ -384,6 +485,14 @@ inherit_sys_branch_node!(CgroupSystem, fields, { // This method should be a no-op for `RootNode`. 
} + fn is_attr_absent(&self, name: &str) -> bool { + if name.starts_with("cgroup.") { + false + } else { + self.controller.is_attr_absent(name) + } + } + fn read_attr_at(&self, name: &str, offset: usize, writer: &mut VmWriter) -> Result { let mut printer = VmPrinter::new_skip(writer, offset); match name { @@ -395,10 +504,15 @@ inherit_sys_branch_node!(CgroupSystem, fields, { } } } - _ => { - // TODO: Add support for reading other attributes. - return Err(Error::AttributeError); + "cgroup.controllers" => { + writeln!(printer, "{}", SubCtrlSet::all())?; } + "cgroup.subtree_control" => { + let active_set = self.controller.lock().active_set(); + writeln!(printer, "{}", active_set)?; + } + // TODO: Add support for reading other attributes. + _ => return self.controller.read_attr_at(name, offset, writer), } Ok(printer.bytes_written()) @@ -423,10 +537,21 @@ inherit_sys_branch_node!(CgroupSystem, fields, { Ok(len) } - _ => { - // TODO: Add support for writing other attributes. - Err(Error::AttributeError) + "cgroup.subtree_control" => { + let (activate_set, deactivate_set, len) = read_subtree_control_from_reader(reader)?; + + let mut controller = self.controller.lock(); + for ctrl_type in activate_set.iter_types() { + controller.activate(ctrl_type, self, None)?; + } + for ctrl_type in deactivate_set.iter_types() { + controller.deactivate(ctrl_type, self)?; + } + + Ok(len) } + // TODO: Add support for writing other attributes. 
+ _ => self.controller.write_attr(name, reader), } } @@ -435,23 +560,36 @@ inherit_sys_branch_node!(CgroupSystem, fields, { } fn create_child(&self, name: &str) -> Result> { - let new_child = CgroupNode::new(name.to_string().into(), 1); + let controller = self.controller.lock(); + let new_child = CgroupNode::new(name.to_string().into(), 1, &controller); self.add_child(new_child.clone())?; Ok(new_child) } }); inherit_sys_branch_node!(CgroupNode, fields, { + fn is_attr_absent(&self, name: &str) -> bool { + if name.starts_with("cgroup.") { + false + } else { + self.controller.is_attr_absent(name) + } + } + fn read_attr_at(&self, name: &str, offset: usize, writer: &mut VmWriter) -> Result { - self.with_inner(|processes| { - let mut printer = VmPrinter::new_skip(writer, offset); - match name { - "cgroup.procs" => { + let mut printer = VmPrinter::new_skip(writer, offset); + match name { + "cgroup.procs" => self + .with_inner(|processes| { for pid in processes.keys() { writeln!(printer, "{}", pid)?; } - } - "cgroup.events" => { + + Ok::(printer.bytes_written()) + }) + .ok_or(Error::IsDead)?, + "cgroup.events" => self + .with_inner(|_| { let res = if self.populated_count.load(Ordering::Relaxed) > 0 { 1 } else { @@ -462,16 +600,41 @@ inherit_sys_branch_node!(CgroupNode, fields, { // Currently we have not enabled the "frozen" attribute // so the "frozen" field is always zero. writeln!(printer, "frozen {}", 0)?; - } - _ => { - // TODO: Add support for reading other attributes. - return Err(Error::AttributeError); - } - } - Ok(printer.bytes_written()) - }) - .ok_or(Error::IsDead)? + Ok::(printer.bytes_written()) + }) + .ok_or(Error::IsDead)?, + "cgroup.controllers" => { + let active_set = self + .cgroup_parent() + .ok_or(Error::IsDead)? + .controller() + .lock() + .active_set(); + self.with_inner(|_| { + writeln!(printer, "{}", active_set)?; + + Ok::(printer.bytes_written()) + }) + .ok_or(Error::IsDead)? 
+ } + "cgroup.subtree_control" => { + let active_set = self.controller.lock().active_set(); + self.with_inner(|_| { + writeln!(printer, "{}", active_set)?; + + Ok::(printer.bytes_written()) + }) + .ok_or(Error::IsDead)? + } + // TODO: Add support for reading other attributes. + _ => self + // This read may target a stale controller if the cgroup's sub-controllers + // are being concurrently updated. It is the duty of user-space programs + // to use proper synchronization to avoid such races. + .with_inner(|_| self.controller.read_attr_at(name, offset, writer)) + .ok_or(Error::IsDead)?, + } } fn write_attr(&self, name: &str, reader: &mut VmReader) -> Result { @@ -487,19 +650,45 @@ inherit_sys_branch_node!(CgroupNode, fields, { .ok_or(Error::InvalidOperation)?; with_process_cgroup_locked(pid, |target_process, cgroup_membership| { - // TODO: According to the "no internal processes" rule of cgroupv2 - // (Ref: https://man7.org/linux/man-pages/man7/cgroups.7.html), - // if the cgroup node has enabled some controllers like "memory", "io", - // it is forbidden to bind a process to an internal cgroup node. cgroup_membership.move_process_to_node(target_process, self) })?; Ok(len) } - _ => { - // TODO: Add support for writing other attributes. - Err(Error::AttributeError) + "cgroup.subtree_control" => { + let (activate_set, deactivate_set, len) = read_subtree_control_from_reader(reader)?; + + let parent_node = self.cgroup_parent().ok_or(Error::IsDead)?; + let parent_controller = parent_node.controller().lock(); + let mut current_controller = self.controller.lock(); + + self.with_inner(|processes| { + // According to "no internal processes" rule of cgroupv2, if a non-root + // cgroup node has bound processes, it cannot activate any sub-control. 
+ // + // Reference: + if !processes.is_empty() { + return Err(Error::ResourceUnavailable); + } + + for ctrl_type in activate_set.iter_types() { + current_controller.activate(ctrl_type, self, Some(&parent_controller))?; + } + for ctrl_type in deactivate_set.iter_types() { + current_controller.deactivate(ctrl_type, self)?; + } + + Ok(len) + }) + .ok_or(Error::IsDead)? } + // TODO: Add support for writing other attributes. + _ => self + // This write may target a stale controller if the cgroup's sub-controllers + // are being concurrently updated. It is the duty of user-space programs + // to use proper synchronization to avoid such races. + .with_inner(|_| self.controller.write_attr(name, reader)) + .ok_or(Error::IsDead)?, } } @@ -508,8 +697,9 @@ inherit_sys_branch_node!(CgroupNode, fields, { } fn create_child(&self, name: &str) -> Result> { + let controller = self.controller.lock(); self.with_inner(|_| { - let new_child = CgroupNode::new(name.to_string().into(), self.depth + 1); + let new_child = CgroupNode::new(name.to_string().into(), self.depth + 1, &controller); self.add_child(new_child.clone())?; Ok(new_child as _) }) @@ -542,3 +732,57 @@ where op(process, &mut cgroup_guard) } + +/// Reads the actions for sub-control from the given reader. +/// +/// Returns the sets of controllers to be activated and deactivated, +/// along with the number of bytes read. The two sets will not overlap. 
+fn read_subtree_control_from_reader( + reader: &mut VmReader, +) -> Result<(SubCtrlSet, SubCtrlSet, usize)> { + let (content, len) = reader + .read_cstring_until_end(MAX_ATTR_SIZE) + .map_err(|_| Error::PageFault)?; + let content = content.to_str().map_err(|_| Error::InvalidOperation)?; + + let mut activate_set = SubCtrlSet::empty(); + let mut deactivate_set = SubCtrlSet::empty(); + + let actions = content.split_whitespace(); + for action in actions { + if action.len() < 2 { + return Err(Error::InvalidOperation); + } + + match action.chars().next() { + Some('+') => { + let ctrl_type = SubCtrlType::from_str(&action[1..])?; + activate_set.add_type(ctrl_type); + deactivate_set.remove_type(ctrl_type); + } + Some('-') => { + let ctrl_type = SubCtrlType::from_str(&action[1..])?; + deactivate_set.add_type(ctrl_type); + activate_set.remove_type(ctrl_type); + } + _ => return Err(Error::InvalidOperation), + }; + } + + Ok((activate_set, deactivate_set, len)) +} + +/// A trait that abstracts over different types of cgroup nodes (`CgroupNode`, `CgroupSystem`) +/// to provide a common API for controller logics. +pub(super) trait CgroupSysNode: SysBranchNode { + fn controller(&self) -> &Controller; + + fn cgroup_parent(&self) -> Option> { + let parent = self.parent()?; + if parent.is_root() { + Some(Arc::downcast::(parent).unwrap()) + } else { + Some(Arc::downcast::(parent).unwrap()) + } + } +}