Implement controller framework for cgroup subsystem

This commit is contained in:
Chen Chengjun 2025-12-03 02:27:22 +00:00 committed by Ruihan Li
parent dda8161bef
commit d58ae3a782
9 changed files with 886 additions and 63 deletions

View File

@ -85,6 +85,8 @@ pub enum Error {
NotFound,
/// Invalid operation occurred
InvalidOperation,
/// Resource is unavailable
ResourceUnavailable,
/// Attribute operation failed
AttributeError,
/// Permission denied for operation
@ -106,6 +108,7 @@ impl core::fmt::Display for Error {
match self {
Error::NotFound => write!(f, "Attempted to access a non-existent systree item"),
Error::InvalidOperation => write!(f, "Invalid operation occurred"),
Error::ResourceUnavailable => write!(f, "Resource is unavailable"),
Error::AttributeError => write!(f, "Attribute error"),
Error::PermissionDenied => write!(f, "Permission denied for operation"),
Error::InternalError(msg) => write!(f, "Internal error: {}", msg),

View File

@ -341,6 +341,7 @@ impl From<aster_systree::Error> for Error {
match err {
NotFound => Error::new(Errno::ENOENT),
InvalidOperation => Error::new(Errno::EINVAL),
ResourceUnavailable => Error::new(Errno::EBUSY),
AttributeError => Error::new(Errno::EIO),
PermissionDenied => Error::new(Errno::EACCES),
InternalError(msg) => Error::with_message(Errno::EIO, msg),

View File

@ -0,0 +1,53 @@
// SPDX-License-Identifier: MPL-2.0
use alloc::sync::Arc;
use aster_systree::{Error, Result, SysAttrSetBuilder, SysPerms, SysStr};
use ostd::mm::{VmReader, VmWriter};
/// A sub-controller for the `cpuset` resources in the cgroup subsystem.
///
/// It owns the `cpuset.cpus` and `cpuset.mems` attributes together with their
/// `*.effective` counterparts.
pub struct CpuSetController {
    // Zero-sized private field: the controller carries no state yet, and the
    // private field keeps struct-literal construction inside this module.
    _private: (),
}
impl CpuSetController {
    /// Registers the `cpuset.*` attributes on `builder`.
    ///
    /// The writable `cpuset.cpus` and `cpuset.mems` attributes are only added for
    /// non-root nodes (`is_root == false`); the read-only `*.effective` attributes
    /// are added for every node.
    pub(super) fn init_attr_set(builder: &mut SysAttrSetBuilder, is_root: bool) {
        if !is_root {
            for name in ["cpuset.cpus", "cpuset.mems"] {
                builder.add(SysStr::from(name), SysPerms::DEFAULT_RW_ATTR_PERMS);
            }
        }

        for name in ["cpuset.cpus.effective", "cpuset.mems.effective"] {
            builder.add(SysStr::from(name), SysPerms::DEFAULT_RO_ATTR_PERMS);
        }
    }
}
impl super::SubControl for CpuSetController {
    /// Not implemented yet: every read fails with [`Error::AttributeError`].
    fn read_attr_at(&self, _name: &str, _offset: usize, _writer: &mut VmWriter) -> Result<usize> {
        Err(Error::AttributeError)
    }

    /// Not implemented yet: every write fails with [`Error::AttributeError`].
    fn write_attr(&self, _name: &str, _reader: &mut VmReader) -> Result<usize> {
        Err(Error::AttributeError)
    }
}
impl super::SubControlStatic for CpuSetController {
    /// Creates a stateless cpuset sub-controller; `_is_root` is currently unused.
    fn new(_is_root: bool) -> Self {
        Self { _private: () }
    }

    /// Identifies this sub-controller as [`super::SubCtrlType::CpuSet`].
    fn type_() -> super::SubCtrlType {
        super::SubCtrlType::CpuSet
    }

    /// Clones the `Arc` currently stored in the `cpuset` slot of `controller`.
    fn read_from(controller: &super::Controller) -> Arc<super::SubController<Self>> {
        controller.cpuset.read().get().clone()
    }
}

View File

@ -0,0 +1,56 @@
// SPDX-License-Identifier: MPL-2.0
use alloc::sync::Arc;
use aster_systree::{Error, Result, SysAttrSetBuilder, SysPerms, SysStr};
use ostd::mm::{VmReader, VmWriter};
/// A sub-controller responsible for memory resource management in the cgroup subsystem.
///
/// Note that even if the controller is inactive, it still provides some interfaces
/// like "memory.pressure" for usage.
pub struct MemoryController {
    // Zero-sized private field: the controller carries no state yet, and the
    // private field keeps struct-literal construction inside this module.
    _private: (),
}
impl MemoryController {
    /// Registers the `memory.*` attributes on `builder`.
    ///
    /// Nothing is registered for the root node (`is_root == true`).
    pub(super) fn init_attr_set(builder: &mut SysAttrSetBuilder, is_root: bool) {
        // These attributes only exist on the non-root cgroup nodes.
        // However, it seems that the `memory.stat` attribute is also present on the root node in practice.
        // Currently the implementation follows the documentation strictly.
        //
        // Reference: <https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#memory-interface-files>
        if !is_root {
            builder.add(SysStr::from("memory.stat"), SysPerms::DEFAULT_RO_ATTR_PERMS);
            // `memory.max` is documented as a read-write single value file, so it
            // must be writable like the other limit knobs (e.g. `pids.max`).
            builder.add(SysStr::from("memory.max"), SysPerms::DEFAULT_RW_ATTR_PERMS);
            builder.add(
                SysStr::from("memory.events"),
                SysPerms::DEFAULT_RO_ATTR_PERMS,
            );
        }
    }
}
impl super::SubControl for MemoryController {
    /// Not implemented yet: every read fails with [`Error::AttributeError`].
    fn read_attr_at(&self, _name: &str, _offset: usize, _writer: &mut VmWriter) -> Result<usize> {
        Err(Error::AttributeError)
    }

    /// Not implemented yet: every write fails with [`Error::AttributeError`].
    fn write_attr(&self, _name: &str, _reader: &mut VmReader) -> Result<usize> {
        Err(Error::AttributeError)
    }
}
impl super::SubControlStatic for MemoryController {
    /// Creates a stateless memory sub-controller; `_is_root` is currently unused.
    fn new(_is_root: bool) -> Self {
        Self { _private: () }
    }

    /// Identifies this sub-controller as [`super::SubCtrlType::Memory`].
    fn type_() -> super::SubCtrlType {
        super::SubCtrlType::Memory
    }

    /// Clones the `Arc` currently stored in the `memory` slot of `controller`.
    fn read_from(controller: &super::Controller) -> Arc<super::SubController<Self>> {
        controller.memory.read().get().clone()
    }
}

View File

@ -0,0 +1,413 @@
// SPDX-License-Identifier: MPL-2.0
use alloc::{collections::vec_deque::VecDeque, sync::Arc};
use core::{fmt::Display, str::FromStr};
use aster_systree::{Error, Result, SysAttrSetBuilder, SysBranchNode, SysObj};
use bitflags::bitflags;
use ostd::{
mm::{VmReader, VmWriter},
sync::{Mutex, MutexGuard, Rcu},
};
use crate::fs::cgroupfs::{
controller::{cpuset::CpuSetController, memory::MemoryController, pids::PidsController},
systree_node::CgroupSysNode,
CgroupNode,
};
mod cpuset;
mod memory;
mod pids;
/// A trait to abstract all individual cgroup sub-controllers.
///
/// [`Controller`] dispatches attribute accesses to an implementation of this trait
/// after selecting the sub-controller from the attribute name's prefix.
trait SubControl {
    /// Reads the attribute `name` at `offset` into `writer`.
    ///
    /// `name` is the full attribute name, including the subsystem prefix.
    // NOTE(review): the returned `usize` is presumably the number of bytes written — confirm.
    fn read_attr_at(&self, name: &str, offset: usize, writer: &mut VmWriter) -> Result<usize>;

    /// Writes the attribute `name` with the data supplied by `reader`.
    ///
    /// `name` is the full attribute name, including the subsystem prefix.
    // NOTE(review): the returned `usize` is presumably the number of bytes consumed — confirm.
    fn write_attr(&self, name: &str, reader: &mut VmReader) -> Result<usize>;
}
/// Defines the static properties and behaviors of a specific cgroup sub-controller.
trait SubControlStatic: SubControl + Sized + 'static {
    /// Creates a new instance of the sub-controller.
    ///
    /// `is_root` is `true` when the instance is created for a cgroup that has no
    /// parent controller.
    fn new(is_root: bool) -> Self;

    /// Returns the `SubCtrlType` enum variant corresponding to this sub-controller.
    fn type_() -> SubCtrlType;

    /// Reads and clones the `Arc` of this sub-controller in the given `Controller`.
    fn read_from(controller: &Controller) -> Arc<SubController<Self>>;
}
/// The type of a sub-controller in the cgroup subsystem.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum SubCtrlType {
    /// The sub-controller named `memory`.
    Memory,
    /// The sub-controller named `cpuset`.
    CpuSet,
    /// The sub-controller named `pids`.
    Pids,
}
impl SubCtrlType {
    /// Every sub-controller type, in the order used when iterating over a set.
    const ALL: [Self; 3] = [Self::Memory, Self::CpuSet, Self::Pids];
}
impl FromStr for SubCtrlType {
type Err = aster_systree::Error;
fn from_str(s: &str) -> Result<Self> {
match s {
"memory" => Ok(SubCtrlType::Memory),
"cpuset" => Ok(SubCtrlType::CpuSet),
"pids" => Ok(SubCtrlType::Pids),
_ => Err(Error::NotFound),
}
}
}
bitflags! {
    /// A set of sub-controller types, represented as bitflags.
    pub(super) struct SubCtrlSet: u8 {
        /// Bit for the `memory` sub-controller.
        const MEMORY = 1 << 0;
        /// Bit for the `cpuset` sub-controller.
        const CPUSET = 1 << 1;
        /// Bit for the `pids` sub-controller.
        const PIDS = 1 << 2;
    }
}
impl SubCtrlSet {
    /// Returns `true` if the flag for `ctrl_type` is present in this set.
    pub(super) fn contains_type(&self, ctrl_type: SubCtrlType) -> bool {
        let flag = Self::from(ctrl_type);
        self.contains(flag)
    }

    /// Inserts the flag for `ctrl_type` into this set.
    pub(super) fn add_type(&mut self, ctrl_type: SubCtrlType) {
        self.insert(Self::from(ctrl_type));
    }

    /// Clears the flag for `ctrl_type` from this set.
    pub(super) fn remove_type(&mut self, ctrl_type: SubCtrlType) {
        self.remove(Self::from(ctrl_type));
    }

    /// Iterates over the sub-controller types whose flags are present in this set,
    /// following the order of `SubCtrlType::ALL`.
    pub(super) fn iter_types(&self) -> impl Iterator<Item = SubCtrlType> + '_ {
        SubCtrlType::ALL
            .into_iter()
            .filter(|ctrl_type| self.contains_type(*ctrl_type))
    }
}
impl Display for SubCtrlSet {
    /// Formats the set as the names of its members separated by single spaces,
    /// in the order `memory`, `cpuset`, `pids`.
    ///
    /// No leading or trailing separator is emitted, so a set holding only
    /// `memory` prints as `memory` and an empty set prints nothing.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let names = [
            (Self::MEMORY, "memory"),
            (Self::CPUSET, "cpuset"),
            (Self::PIDS, "pids"),
        ];

        let mut is_first = true;
        for (flag, name) in names {
            if !self.contains(flag) {
                continue;
            }
            // Only put a separator *between* two names.
            if !is_first {
                f.write_str(" ")?;
            }
            f.write_str(name)?;
            is_first = false;
        }
        Ok(())
    }
}
impl From<SubCtrlType> for SubCtrlSet {
    /// Converts a single sub-controller type into the set containing only its flag.
    fn from(ctrl_type: SubCtrlType) -> Self {
        match ctrl_type {
            SubCtrlType::Memory => Self::MEMORY,
            SubCtrlType::CpuSet => Self::CPUSET,
            SubCtrlType::Pids => Self::PIDS,
        }
    }
}
/// The sub-controller for a specific cgroup controller type.
///
/// If the sub-controller is inactive, the `inner` field will be `None`.
struct SubController<T: SubControlStatic> {
    /// The concrete sub-controller, present only while it is active.
    inner: Option<T>,
    /// The parent sub-controller in the hierarchy.
    ///
    /// This field is used to traverse the controller hierarchy.
    /// It is `None` when the sub-controller was created without a parent controller.
    #[expect(dead_code)]
    parent: Option<Arc<SubController<T>>>,
}
impl<T: SubControlStatic> SubController<T> {
    /// Creates the sub-controller of type `T` for a cgroup whose (locked) parent
    /// controller is `parent_controller`.
    ///
    /// Without a parent the sub-controller is always active; otherwise it is active
    /// only if the parent's active set contains `T::type_()`. The new instance also
    /// records the parent's current sub-controller of the same type.
    fn new(parent_controller: Option<&LockedController>) -> Arc<Self> {
        let is_root = parent_controller.is_none();
        let is_active = match parent_controller {
            Some(parent) => parent.active_set.contains_type(T::type_()),
            None => true,
        };

        let inner = is_active.then(|| T::new(is_root));
        let parent = parent_controller.map(|locked| T::read_from(locked.controller));

        Arc::new(Self { inner, parent })
    }
}
/// A type-erased view of a [`SubController`], used to dispatch attribute accesses
/// without knowing the concrete sub-controller type.
trait TryGetSubControl {
    /// Returns the underlying sub-control, or `None` if there is none to dispatch to.
    fn try_get(&self) -> Option<&dyn SubControl>;
}
impl<T: SubControlStatic> TryGetSubControl for SubController<T> {
    /// Returns the inner sub-controller as a trait object, or `None` when `inner`
    /// is absent (i.e., the sub-controller is inactive).
    fn try_get(&self) -> Option<&dyn SubControl> {
        match &self.inner {
            Some(sub_ctrl) => Some(sub_ctrl as &dyn SubControl),
            None => None,
        }
    }
}
/// The controller for a single cgroup.
///
/// This struct can manage the activation state of each sub-control, and dispatches read/write
/// operations to the appropriate sub-controllers.
///
/// The following is an explanation of the activation for sub-controls and sub-controllers. When a
/// cgroup activates a specific sub-control (e.g., memory, io), it means this control capability is
/// being delegated to its children. Consequently, the corresponding sub-controller within the
/// child nodes will be activated.
///
/// The root node serves as the origin for all these control capabilities, so the sub-controllers
/// it possesses are always active. For any other node, only if its parent node first enables a
/// sub-control, its corresponding sub-controller will be activated.
pub(super) struct Controller {
    /// A set of types of active sub-controllers.
    ///
    /// The mutex guarding this set is the one taken by [`Controller::lock`].
    active_set: Mutex<SubCtrlSet>,
    /// The memory sub-controller; the `Arc` is replaced when the parent's activation state changes.
    memory: Rcu<Arc<SubController<MemoryController>>>,
    /// The cpuset sub-controller; the `Arc` is replaced when the parent's activation state changes.
    cpuset: Rcu<Arc<SubController<CpuSetController>>>,
    /// The pids sub-controller; the `Arc` is replaced when the parent's activation state changes.
    pids: Rcu<Arc<SubController<PidsController>>>,
}
impl Controller {
/// Creates a new controller manager for a cgroup.
pub(super) fn new(locked_parent_controller: Option<&LockedController>) -> Self {
let memory_controller = SubController::new(locked_parent_controller);
let cpuset_controller = SubController::new(locked_parent_controller);
let pids_controller = SubController::new(locked_parent_controller);
Self {
active_set: Mutex::new(SubCtrlSet::empty()),
memory: Rcu::new(memory_controller),
cpuset: Rcu::new(cpuset_controller),
pids: Rcu::new(pids_controller),
}
}
pub(super) fn init_attr_set(builder: &mut SysAttrSetBuilder, is_root: bool) {
MemoryController::init_attr_set(builder, is_root);
CpuSetController::init_attr_set(builder, is_root);
PidsController::init_attr_set(builder, is_root);
}
pub(super) fn lock(&self) -> LockedController {
LockedController {
active_set: self.active_set.lock(),
controller: self,
}
}
fn read_sub(&self, ctrl_type: SubCtrlType) -> Arc<dyn TryGetSubControl> {
match ctrl_type {
SubCtrlType::Memory => MemoryController::read_from(self),
SubCtrlType::CpuSet => CpuSetController::read_from(self),
SubCtrlType::Pids => PidsController::read_from(self),
}
}
/// Returns whether the attribute with the given name is absent in this controller.
pub(super) fn is_attr_absent(&self, name: &str) -> bool {
let Some((subsys, _)) = name.split_once('.') else {
return false;
};
let Ok(ctrl_type) = SubCtrlType::from_str(subsys) else {
return false;
};
let sub_controller = self.read_sub(ctrl_type);
if sub_controller.try_get().is_none() {
// If the sub-controller is not active, all its attributes are considered absent.
true
} else {
false
}
}
pub(super) fn read_attr_at(
&self,
name: &str,
offset: usize,
writer: &mut VmWriter,
) -> Result<usize> {
let Some((subsys, _)) = name.split_once('.') else {
return Err(Error::NotFound);
};
let ctrl_type = SubCtrlType::from_str(subsys)?;
let sub_controller = self.read_sub(ctrl_type);
let Some(controller) = sub_controller.try_get() else {
return Err(Error::IsDead);
};
controller.read_attr_at(name, offset, writer)
}
pub(super) fn write_attr(&self, name: &str, reader: &mut VmReader) -> Result<usize> {
let Some((subsys, _)) = name.split_once('.') else {
return Err(Error::NotFound);
};
let ctrl_type = SubCtrlType::from_str(subsys)?;
let sub_controller = self.read_sub(ctrl_type);
let Some(controller) = sub_controller.try_get() else {
return Err(Error::IsDead);
};
controller.write_attr(name, reader)
}
}
/// A locked controller for a cgroup.
///
/// Holding this lock indicates exclusive access to modify the sub-control state.
pub(super) struct LockedController<'a> {
    /// The guard over the controller's set of active sub-controls.
    active_set: MutexGuard<'a, SubCtrlSet>,
    /// The controller whose active set is locked.
    controller: &'a Controller,
}
impl LockedController<'_> {
    /// Activates a sub-control of the specified type.
    ///
    /// `current_node` is the cgroup node owning this controller and
    /// `parent_controller` is the locked controller of its parent (`None` if there
    /// is no parent). Activating an already active sub-control is a no-op.
    ///
    /// # Errors
    ///
    /// Returns [`Error::NotFound`] if the parent has not activated `ctrl_type`.
    pub(super) fn activate(
        &mut self,
        ctrl_type: SubCtrlType,
        current_node: &dyn CgroupSysNode,
        parent_controller: Option<&LockedController>,
    ) -> Result<()> {
        if self.active_set.contains_type(ctrl_type) {
            return Ok(());
        }

        // A cgroup can activate the sub-control only if this
        // sub-control has been activated in its parent cgroup.
        if parent_controller
            .is_some_and(|controller| !controller.active_set.contains_type(ctrl_type))
        {
            return Err(Error::NotFound);
        }

        self.active_set.add_type(ctrl_type);
        self.update_sub_controllers_for_descents(ctrl_type, current_node);

        Ok(())
    }

    /// Deactivates a sub-control of the specified type.
    ///
    /// Deactivating an inactive sub-control is a no-op.
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidOperation`] if any child of `current_node` has
    /// activated `ctrl_type` itself.
    pub(super) fn deactivate(
        &mut self,
        ctrl_type: SubCtrlType,
        current_node: &dyn CgroupSysNode,
    ) -> Result<()> {
        if !self.active_set.contains_type(ctrl_type) {
            return Ok(());
        }

        // If any child node has activated this sub-control,
        // the deactivation operation will be rejected.
        for child in current_node.children() {
            let cgroup_child = child.as_any().downcast_ref::<CgroupNode>().unwrap();
            let child_controller = cgroup_child.controller().lock();
            // This is race-free because if a child wants to activate a sub-controller, it should
            // first acquire the lock of the parent controller, which is held here.
            if child_controller.active_set().contains_type(ctrl_type) {
                // NOTE(review): Linux appears to report EBUSY in this situation, whereas
                // `InvalidOperation` is converted to EINVAL. Confirm whether
                // `Error::ResourceUnavailable` is the intended error here.
                return Err(Error::InvalidOperation);
            }
        }

        self.active_set.remove_type(ctrl_type);
        self.update_sub_controllers_for_descents(ctrl_type, current_node);

        Ok(())
    }

    /// Rebuilds the sub-controllers of type `ctrl_type` for all descendants of
    /// `current_node`, walking the subtree breadth-first.
    ///
    /// Each descendant's sub-controller is recreated from the (locked) controller of
    /// its own parent, so the new activation state propagates level by level.
    fn update_sub_controllers_for_descents(
        &self,
        ctrl_type: SubCtrlType,
        current_node: &dyn CgroupSysNode,
    ) {
        // Replaces the sub-controller of type `ctrl_type` in `child` with a fresh one
        // derived from `parent_controller`.
        fn update_sub_controller_for_one_child(
            child: &Arc<dyn SysObj>,
            ctrl_type: SubCtrlType,
            parent_controller: &LockedController,
        ) {
            let child_node = child.as_any().downcast_ref::<CgroupNode>().unwrap();
            match ctrl_type {
                SubCtrlType::Memory => {
                    let new_controller = SubController::new(Some(parent_controller));
                    child_node.controller().memory.update(new_controller);
                }
                SubCtrlType::CpuSet => {
                    let new_controller = SubController::new(Some(parent_controller));
                    child_node.controller().cpuset.update(new_controller);
                }
                SubCtrlType::Pids => {
                    let new_controller = SubController::new(Some(parent_controller));
                    child_node.controller().pids.update(new_controller);
                }
            }
        }

        // The queue of descendant nodes whose children still need to be updated.
        let mut descents = VecDeque::new();

        // The following update logic is race-free due to the following reasons:
        //
        // 1. **No Concurrent Controller Activation/Deactivation**:
        //    At this point, we hold the controller lock for the current node and we know that the
        //    sub-controllers for the direct children are inactive. Then, no sub-controllers for
        //    any of the descendants can be activated before we release the lock.
        //
        // 2. **Concurrent Child Addition/Deletion is Fine**:
        //    We do need to consider that children may be added or removed concurrently. However,
        //    this is handled correctly:
        //    - If a child is added, it will attempt to hold its parent's controller lock, which is
        //      synchronized with the code below. If this happens after us, the up-to-date
        //      sub-controllers will be seen. If it happens before us, we will update the
        //      sub-controllers for it; due to race conditions, the sub-controllers may already be
        //      up to date, but updating them twice is harmless since they must not be activated.
        //    - If a child is removed, we may update a sub-controller that's about to be destroyed,
        //      which is harmless.

        // Update the direct children first.
        current_node.visit_children_with(0, &mut |child_node| {
            descents.push_back(child_node.clone());
            update_sub_controller_for_one_child(child_node, ctrl_type, self);
            Some(())
        });

        // Then update all the other descendant nodes.
        while let Some(node) = descents.pop_front() {
            let current_node = node.as_any().downcast_ref::<CgroupNode>().unwrap();
            // For descendant nodes, the sub-control must be inactive. But taking the controller
            // lock is necessary for synchronization purposes (see the explanation above).
            let locked_controller = current_node.controller().lock();
            current_node.visit_children_with(0, &mut |child_node| {
                descents.push_back(child_node.clone());
                update_sub_controller_for_one_child(child_node, ctrl_type, &locked_controller);
                Some(())
            });
        }
    }

    /// Returns a copy of the currently active set of sub-controls.
    pub(super) fn active_set(&self) -> SubCtrlSet {
        *self.active_set
    }
}

View File

@ -0,0 +1,45 @@
// SPDX-License-Identifier: MPL-2.0
use alloc::sync::Arc;
use aster_systree::{Error, Result, SysAttrSetBuilder, SysPerms, SysStr};
use ostd::mm::{VmReader, VmWriter};
/// A sub-controller responsible for PID resource management in the cgroup subsystem.
///
/// This controller will only provide interfaces in non-root cgroup nodes.
pub struct PidsController {
    // Zero-sized private field: the controller carries no state yet, and the
    // private field keeps struct-literal construction inside this module.
    _private: (),
}
impl PidsController {
    /// Registers the `pids.*` attributes on `builder`.
    ///
    /// The root node (`is_root == true`) gets no attributes; every other node gets
    /// a writable `pids.max`.
    pub(super) fn init_attr_set(builder: &mut SysAttrSetBuilder, is_root: bool) {
        if is_root {
            return;
        }

        let name = SysStr::from("pids.max");
        builder.add(name, SysPerms::DEFAULT_RW_ATTR_PERMS);
    }
}
impl super::SubControl for PidsController {
    /// Not implemented yet: every read fails with [`Error::AttributeError`].
    fn read_attr_at(&self, _name: &str, _offset: usize, _writer: &mut VmWriter) -> Result<usize> {
        Err(Error::AttributeError)
    }

    /// Not implemented yet: every write fails with [`Error::AttributeError`].
    fn write_attr(&self, _name: &str, _reader: &mut VmReader) -> Result<usize> {
        Err(Error::AttributeError)
    }
}
impl super::SubControlStatic for PidsController {
    /// Creates a stateless pids sub-controller; `_is_root` is currently unused.
    fn new(_is_root: bool) -> Self {
        Self { _private: () }
    }

    /// Identifies this sub-controller as [`super::SubCtrlType::Pids`].
    fn type_() -> super::SubCtrlType {
        super::SubCtrlType::Pids
    }

    /// Clones the `Arc` currently stored in the `pids` slot of `controller`.
    fn read_from(controller: &super::Controller) -> Arc<super::SubController<Self>> {
        controller.pids.read().get().clone()
    }
}

View File

@ -119,4 +119,11 @@ impl Inode for CgroupInode {
Ok(())
}
fn is_dentry_cacheable(&self) -> bool {
    match self.node_kind {
        // Attribute nodes may appear or disappear depending on the state of the
        // cgroup controller, so a cached dentry for them could become stale.
        SysTreeNodeKind::Attr(..) => false,
        // Every other kind of node can be cached.
        _ => true,
    }
}
}

View File

@ -3,6 +3,7 @@
use fs::CgroupFsType;
pub use systree_node::{CgroupMembership, CgroupNode};
mod controller;
mod fs;
mod inode;
mod systree_node;

View File

@ -1,11 +1,57 @@
// SPDX-License-Identifier: MPL-2.0
//! Implements the cgroup nodes for the unified cgroup hierarchy (cgroup v2).
//!
//! This module defines the structures for cgroup nodes ([`CgroupNode`]) and the cgroup
//! root ([`CgroupSystem`]), integrating them into the `systree`. It handles process
//! management within cgroups and the logic for reading and writing cgroup attributes.
//!
//! ## Locks and Lock Ordering
//!
//! To ensure thread safety during concurrent operations, this module uses several
//! locks within the cgroup nodes. Adhering to the correct lock ordering is crucial
//! to prevent deadlocks.
//!
//! ### Lock Types
//!
//! 1. **Controller Lock**: Each cgroup node (including the root) has a [`Controller`]
//! that contains a `Mutex`. This lock protects the activation state of the sub-controllers
//! for its children (e.g., `memory`, `pids`).
//!
//! 2. **Inner Lock**: Each non-root [`CgroupNode`] has an `RwMutex` that protects its
//! `inner` data.
//!
//! 3. **Children Lock**: Each cgroup node ([`CgroupNode`] and [`CgroupSystem`]) inherits
//! an `RwLock` from `BranchNodeFields`. This lock protects access to and
//! modification of the list of child cgroup nodes.
//!
//! 4. **Cgroup Membership Lock**: A global `Mutex` managed by [`CgroupMembership`] that
//! serializes modifications to process cgroup memberships across the entire system.
//!
//! ### Locking Rules
//!
//! To avoid deadlocks, the following lock ordering must be strictly followed:
//!
//! 1. **Parent Before Child**:
//! When operating on both a parent and a child node, the lock on the parent
//! node must be acquired before the lock on the child node.
//!
//! 2. **Order Within a Single Node**:
//! When multiple locks are needed on the same cgroup node, they must be
//! acquired in this specific order:
//! `Controller Lock` -> `Inner Lock` -> `Children Lock`
//!
//! 3. **Global Lock First**:
//! When acquiring the `Cgroup Membership Lock` along with any other cgroup locks,
//! the `Cgroup Membership Lock` must be acquired first.
use alloc::{
string::ToString,
sync::{Arc, Weak},
};
use core::{
fmt::Debug,
str::FromStr,
sync::atomic::{AtomicUsize, Ordering},
};
@ -19,6 +65,7 @@ use ostd::mm::{VmReader, VmWriter};
use spin::Once;
use crate::{
fs::cgroupfs::controller::{Controller, LockedController, SubCtrlSet, SubCtrlType},
prelude::*,
process::{process_table, Pid, Process},
};
@ -66,12 +113,44 @@ impl CgroupMembership {
process: Arc<Process>,
new_cgroup: &CgroupNode,
) -> Result<()> {
if let Some(old_cgroup) = process.cgroup().get() {
let old_cgroup = if let Some(old_cgroup) = process.cgroup().get() {
// Fast path: If the process is already in this cgroup, do nothing.
if new_cgroup.id() == old_cgroup.id() {
return Ok(());
}
Some(old_cgroup.clone())
} else {
None
};
// Try to add the process to the new cgroup first.
let controller = new_cgroup.controller.lock();
// According to "no internal processes" rule of cgroupv2, if a non-root
// cgroup node has activated some sub-controls, it cannot bind any process.
//
// Reference: <https://man7.org/linux/man-pages/man7/cgroups.7.html>
if !controller.active_set().is_empty() {
return Err(Error::ResourceUnavailable);
}
new_cgroup
.with_inner_mut(|current_processes| {
if current_processes.is_empty() {
let old_count = new_cgroup.populated_count.fetch_add(1, Ordering::Relaxed);
if old_count == 0 {
new_cgroup.propagate_add_populated();
}
}
current_processes.insert(process.pid(), Arc::downgrade(&process));
process.set_cgroup(Some(new_cgroup.fields.weak_self().upgrade().unwrap()));
})
.ok_or(Error::IsDead)?;
drop(controller);
// Remove the process from the old cgroup second.
if let Some(old_cgroup) = old_cgroup {
old_cgroup
.with_inner_mut(|old_cgroup_processes| {
old_cgroup_processes.remove(&process.pid()).unwrap();
@ -83,32 +162,22 @@ impl CgroupMembership {
}
})
.unwrap();
};
new_cgroup
.with_inner_mut(|current_processes| {
if current_processes.is_empty() {
let old_count = new_cgroup.populated_count.fetch_add(1, Ordering::Relaxed);
if old_count == 0 {
new_cgroup.propagate_add_populated();
}
}
current_processes.insert(process.pid(), Arc::downgrade(&process));
})
.ok_or(Error::IsDead)?;
process.set_cgroup(Some(new_cgroup.fields.weak_self().upgrade().unwrap()));
}
Ok(())
}
/// Moves a process to the root cgroup.
pub fn move_process_to_root(&mut self, process: &Process) {
let process_cgroup = process.cgroup();
let Some(old_cgroup) = process_cgroup.get() else {
let old_cgroup = if let Some(old_cgroup) = process.cgroup().get() {
old_cgroup.clone()
} else {
// The process is already in the root cgroup. Do nothing.
return;
};
process.set_cgroup(None);
old_cgroup
.with_inner_mut(|old_cgroup_processes| {
old_cgroup_processes.remove(&process.pid()).unwrap();
@ -120,8 +189,6 @@ impl CgroupMembership {
}
})
.unwrap();
process.set_cgroup(None);
}
}
@ -130,9 +197,17 @@ impl CgroupMembership {
///
/// The cgroup system provides v2 unified hierarchy, and is also used as a root
/// node in the cgroup systree.
#[derive(Debug)]
pub(super) struct CgroupSystem {
fields: BranchNodeFields<CgroupNode, Self>,
controller: Controller,
}
impl Debug for CgroupSystem {
    /// Formats only the `fields` member; the output is marked as non-exhaustive
    /// because the `controller` member is left out.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("CgroupSystem")
            .field("fields", &self.fields)
            .finish_non_exhaustive()
    }
}
/// A control group node in the cgroup systree.
@ -142,6 +217,8 @@ pub(super) struct CgroupSystem {
/// this type.
pub struct CgroupNode {
fields: BranchNodeFields<CgroupNode, Self>,
/// The controller of this cgroup node.
controller: Controller,
/// The inner data. If it is `None`, then the cgroup node is dead.
inner: RwMutex<Option<Inner>>,
/// The depth of the node in the cgroupfs [`SysTree`], where the child of
@ -204,6 +281,10 @@ impl CgroupSystem {
SysStr::from("cgroup.controllers"),
SysPerms::DEFAULT_RO_ATTR_PERMS,
);
builder.add(
SysStr::from("cgroup.subtree_control"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(
SysStr::from("cgroup.max.depth"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
@ -216,22 +297,32 @@ impl CgroupSystem {
SysStr::from("cgroup.threads"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(
SysStr::from("cpu.pressure"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(SysStr::from("cpu.stat"), SysPerms::DEFAULT_RO_ATTR_PERMS);
Controller::init_attr_set(&mut builder, true);
let attrs = builder.build().expect("Failed to build attribute set");
Arc::new_cyclic(|weak_self| {
let fields = BranchNodeFields::new(name, attrs, weak_self.clone());
CgroupSystem { fields }
CgroupSystem {
fields,
controller: Controller::new(None),
}
})
}
}
impl CgroupSysNode for CgroupSystem {
/// Returns the controller owned by the root cgroup.
fn controller(&self) -> &Controller {
    &self.controller
}
}
impl CgroupNode {
pub(self) fn new(name: SysStr, depth: usize) -> Arc<Self> {
pub(self) fn new(
name: SysStr,
depth: usize,
locked_parent_controller: &LockedController,
) -> Arc<Self> {
let mut builder = SysAttrSetBuilder::new();
// TODO: Add more attributes as needed. The normal cgroup node may have
// more attributes than the unified one.
@ -239,6 +330,10 @@ impl CgroupNode {
SysStr::from("cgroup.controllers"),
SysPerms::DEFAULT_RO_ATTR_PERMS,
);
builder.add(
SysStr::from("cgroup.subtree_control"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(
SysStr::from("cgroup.max.depth"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
@ -251,21 +346,19 @@ impl CgroupNode {
SysStr::from("cgroup.threads"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(
SysStr::from("cpu.pressure"),
SysPerms::DEFAULT_RW_ATTR_PERMS,
);
builder.add(SysStr::from("cpu.stat"), SysPerms::DEFAULT_RO_ATTR_PERMS);
builder.add(
SysStr::from("cgroup.events"),
SysPerms::DEFAULT_RO_ATTR_PERMS,
);
Controller::init_attr_set(&mut builder, false);
let attrs = builder.build().expect("Failed to build attribute set");
Arc::new_cyclic(|weak_self| {
let fields = BranchNodeFields::new(name, attrs, weak_self.clone());
CgroupNode {
fields,
controller: Controller::new(Some(locked_parent_controller)),
inner: RwMutex::new(Some(Inner::default())),
depth,
populated_count: AtomicUsize::new(0),
@ -274,6 +367,12 @@ impl CgroupNode {
}
}
impl CgroupSysNode for CgroupNode {
    /// Returns the controller owned by this cgroup node.
    fn controller(&self) -> &Controller {
        &self.controller
    }
}
// For process management
impl CgroupNode {
fn propagate_add_populated(&self) {
@ -323,6 +422,7 @@ impl CgroupNode {
/// Performs a read-only operation on the inner data.
///
/// If the cgroup node is dead, returns `None`.
#[must_use]
fn with_inner<F, R>(&self, op: F) -> Option<R>
where
F: FnOnce(&BTreeMap<Pid, Weak<Process>>) -> R,
@ -336,6 +436,7 @@ impl CgroupNode {
/// Performs a mutable operation on the inner data.
///
/// If the cgroup node is dead, returns `None`.
#[must_use]
fn with_inner_mut<F, R>(&self, op: F) -> Option<R>
where
F: FnOnce(&mut BTreeMap<Pid, Weak<Process>>) -> R,
@ -384,6 +485,14 @@ inherit_sys_branch_node!(CgroupSystem, fields, {
// This method should be a no-op for `RootNode`.
}
fn is_attr_absent(&self, name: &str) -> bool {
    // Attributes prefixed with `cgroup.` are never reported as absent; for all
    // other attributes the controller decides.
    !name.starts_with("cgroup.") && self.controller.is_attr_absent(name)
}
fn read_attr_at(&self, name: &str, offset: usize, writer: &mut VmWriter) -> Result<usize> {
let mut printer = VmPrinter::new_skip(writer, offset);
match name {
@ -395,10 +504,15 @@ inherit_sys_branch_node!(CgroupSystem, fields, {
}
}
}
_ => {
// TODO: Add support for reading other attributes.
return Err(Error::AttributeError);
"cgroup.controllers" => {
writeln!(printer, "{}", SubCtrlSet::all())?;
}
"cgroup.subtree_control" => {
let active_set = self.controller.lock().active_set();
writeln!(printer, "{}", active_set)?;
}
// TODO: Add support for reading other attributes.
_ => return self.controller.read_attr_at(name, offset, writer),
}
Ok(printer.bytes_written())
@ -423,10 +537,21 @@ inherit_sys_branch_node!(CgroupSystem, fields, {
Ok(len)
}
_ => {
// TODO: Add support for writing other attributes.
Err(Error::AttributeError)
"cgroup.subtree_control" => {
let (activate_set, deactivate_set, len) = read_subtree_control_from_reader(reader)?;
let mut controller = self.controller.lock();
for ctrl_type in activate_set.iter_types() {
controller.activate(ctrl_type, self, None)?;
}
for ctrl_type in deactivate_set.iter_types() {
controller.deactivate(ctrl_type, self)?;
}
Ok(len)
}
// TODO: Add support for writing other attributes.
_ => self.controller.write_attr(name, reader),
}
}
@ -435,23 +560,36 @@ inherit_sys_branch_node!(CgroupSystem, fields, {
}
fn create_child(&self, name: &str) -> Result<Arc<dyn SysObj>> {
let new_child = CgroupNode::new(name.to_string().into(), 1);
let controller = self.controller.lock();
let new_child = CgroupNode::new(name.to_string().into(), 1, &controller);
self.add_child(new_child.clone())?;
Ok(new_child)
}
});
inherit_sys_branch_node!(CgroupNode, fields, {
fn is_attr_absent(&self, name: &str) -> bool {
    // Attributes prefixed with `cgroup.` are never reported as absent; for all
    // other attributes the controller decides.
    !name.starts_with("cgroup.") && self.controller.is_attr_absent(name)
}
fn read_attr_at(&self, name: &str, offset: usize, writer: &mut VmWriter) -> Result<usize> {
self.with_inner(|processes| {
let mut printer = VmPrinter::new_skip(writer, offset);
match name {
"cgroup.procs" => {
let mut printer = VmPrinter::new_skip(writer, offset);
match name {
"cgroup.procs" => self
.with_inner(|processes| {
for pid in processes.keys() {
writeln!(printer, "{}", pid)?;
}
}
"cgroup.events" => {
Ok::<usize, Error>(printer.bytes_written())
})
.ok_or(Error::IsDead)?,
"cgroup.events" => self
.with_inner(|_| {
let res = if self.populated_count.load(Ordering::Relaxed) > 0 {
1
} else {
@ -462,16 +600,41 @@ inherit_sys_branch_node!(CgroupNode, fields, {
// Currently we have not enabled the "frozen" attribute
// so the "frozen" field is always zero.
writeln!(printer, "frozen {}", 0)?;
}
_ => {
// TODO: Add support for reading other attributes.
return Err(Error::AttributeError);
}
}
Ok(printer.bytes_written())
})
.ok_or(Error::IsDead)?
Ok::<usize, Error>(printer.bytes_written())
})
.ok_or(Error::IsDead)?,
"cgroup.controllers" => {
let active_set = self
.cgroup_parent()
.ok_or(Error::IsDead)?
.controller()
.lock()
.active_set();
self.with_inner(|_| {
writeln!(printer, "{}", active_set)?;
Ok::<usize, Error>(printer.bytes_written())
})
.ok_or(Error::IsDead)?
}
"cgroup.subtree_control" => {
let active_set = self.controller.lock().active_set();
self.with_inner(|_| {
writeln!(printer, "{}", active_set)?;
Ok::<usize, Error>(printer.bytes_written())
})
.ok_or(Error::IsDead)?
}
// TODO: Add support for reading other attributes.
_ => self
// This read may target a stale controller if the cgroup's sub-controllers
// are being concurrently updated. It is the duty of user-space programs
// to use proper synchronization to avoid such races.
.with_inner(|_| self.controller.read_attr_at(name, offset, writer))
.ok_or(Error::IsDead)?,
}
}
fn write_attr(&self, name: &str, reader: &mut VmReader) -> Result<usize> {
@ -487,19 +650,45 @@ inherit_sys_branch_node!(CgroupNode, fields, {
.ok_or(Error::InvalidOperation)?;
with_process_cgroup_locked(pid, |target_process, cgroup_membership| {
// TODO: According to the "no internal processes" rule of cgroupv2
// (Ref: https://man7.org/linux/man-pages/man7/cgroups.7.html),
// if the cgroup node has enabled some controllers like "memory", "io",
// it is forbidden to bind a process to an internal cgroup node.
cgroup_membership.move_process_to_node(target_process, self)
})?;
Ok(len)
}
_ => {
// TODO: Add support for writing other attributes.
Err(Error::AttributeError)
"cgroup.subtree_control" => {
let (activate_set, deactivate_set, len) = read_subtree_control_from_reader(reader)?;
let parent_node = self.cgroup_parent().ok_or(Error::IsDead)?;
let parent_controller = parent_node.controller().lock();
let mut current_controller = self.controller.lock();
self.with_inner(|processes| {
// According to "no internal processes" rule of cgroupv2, if a non-root
// cgroup node has bound processes, it cannot activate any sub-control.
//
// Reference: <https://man7.org/linux/man-pages/man7/cgroups.7.html>
if !processes.is_empty() {
return Err(Error::ResourceUnavailable);
}
for ctrl_type in activate_set.iter_types() {
current_controller.activate(ctrl_type, self, Some(&parent_controller))?;
}
for ctrl_type in deactivate_set.iter_types() {
current_controller.deactivate(ctrl_type, self)?;
}
Ok(len)
})
.ok_or(Error::IsDead)?
}
// TODO: Add support for writing other attributes.
_ => self
// This write may target a stale controller if the cgroup's sub-controllers
// are being concurrently updated. It is the duty of user-space programs
// to use proper synchronization to avoid such races.
.with_inner(|_| self.controller.write_attr(name, reader))
.ok_or(Error::IsDead)?,
}
}
@ -508,8 +697,9 @@ inherit_sys_branch_node!(CgroupNode, fields, {
}
fn create_child(&self, name: &str) -> Result<Arc<dyn SysObj>> {
let controller = self.controller.lock();
self.with_inner(|_| {
let new_child = CgroupNode::new(name.to_string().into(), self.depth + 1);
let new_child = CgroupNode::new(name.to_string().into(), self.depth + 1, &controller);
self.add_child(new_child.clone())?;
Ok(new_child as _)
})
@ -542,3 +732,57 @@ where
op(process, &mut cgroup_guard)
}
/// Parses a `cgroup.subtree_control` write from the given reader.
///
/// The content is a whitespace-separated list of actions, each being `+<name>`
/// (activate) or `-<name>` (deactivate). When the same sub-control appears more
/// than once, the last action wins, so the two returned sets never overlap.
///
/// Returns the set to activate, the set to deactivate, and the number of bytes read.
///
/// # Errors
///
/// - [`Error::PageFault`] if the content cannot be read from `reader`.
/// - [`Error::InvalidOperation`] if the content is not valid UTF-8, or an action is
///   shorter than two bytes or does not start with `+` or `-`.
/// - The error of `SubCtrlType::from_str` if a sub-control name is unknown.
fn read_subtree_control_from_reader(
    reader: &mut VmReader,
) -> Result<(SubCtrlSet, SubCtrlSet, usize)> {
    let (content, len) = reader
        .read_cstring_until_end(MAX_ATTR_SIZE)
        .map_err(|_| Error::PageFault)?;
    let content = content.to_str().map_err(|_| Error::InvalidOperation)?;

    let mut activate_set = SubCtrlSet::empty();
    let mut deactivate_set = SubCtrlSet::empty();

    for action in content.split_whitespace() {
        // A valid action consists of a sign and a non-empty name.
        if action.len() < 2 {
            return Err(Error::InvalidOperation);
        }

        if let Some(ctrl_name) = action.strip_prefix('+') {
            let ctrl_type = SubCtrlType::from_str(ctrl_name)?;
            activate_set.add_type(ctrl_type);
            deactivate_set.remove_type(ctrl_type);
        } else if let Some(ctrl_name) = action.strip_prefix('-') {
            let ctrl_type = SubCtrlType::from_str(ctrl_name)?;
            deactivate_set.add_type(ctrl_type);
            activate_set.remove_type(ctrl_type);
        } else {
            return Err(Error::InvalidOperation);
        }
    }

    Ok((activate_set, deactivate_set, len))
}
/// A common interface over the two kinds of cgroup nodes (`CgroupNode` and
/// `CgroupSystem`) that the controller logic operates on.
pub(super) trait CgroupSysNode: SysBranchNode {
    /// Returns the controller owned by this node.
    fn controller(&self) -> &Controller;

    /// Returns the parent of this node as a cgroup node, or `None` if the node has
    /// no parent.
    ///
    /// A parent that reports itself as the root is downcast to `CgroupSystem`; any
    /// other parent is downcast to `CgroupNode`.
    fn cgroup_parent(&self) -> Option<Arc<dyn CgroupSysNode>> {
        let parent = self.parent()?;

        let cgroup_parent: Arc<dyn CgroupSysNode> = if parent.is_root() {
            Arc::downcast::<CgroupSystem>(parent).unwrap()
        } else {
            Arc::downcast::<CgroupNode>(parent).unwrap()
        };

        Some(cgroup_parent)
    }
}