Revert "Revert "Merge: cgroup: Backport upstream cgroup commits up to v6.8""
JIRA: https://issues.redhat.com/browse/RHEL-36683
Upstream Status: RHEL only
This reverts commit 08637d76a2
which is a
revert of "Merge: cgroup: Backport upstream cgroup commits up to v6.8"
Signed-off-by: Waiman Long <longman@redhat.com>
This commit is contained in:
parent
5b4807d262
commit
6d0328a7cf
@@ -983,6 +983,23 @@ All cgroup core files are prefixed with "cgroup."
	killing cgroups is a process directed operation, i.e. it affects
	the whole thread-group.

  cgroup.pressure
	A read-write single value file that allowed values are "0" and "1".
	The default is "1".

	Writing "0" to the file will disable the cgroup PSI accounting.
	Writing "1" to the file will re-enable the cgroup PSI accounting.

	This control attribute is not hierarchical, so disable or enable PSI
	accounting in a cgroup does not affect PSI accounting in descendants
	and doesn't need pass enablement via ancestors from root.

	The reason this control attribute exists is that PSI accounts stalls for
	each cgroup separately and aggregates it at each level of the hierarchy.
	This may cause non-negligible overhead for some workloads when under
	deep level of the hierarchy, in which case this control attribute can
	be used to disable PSI accounting in the non-leaf cgroups.

  irq.pressure
	A read-write nested-keyed file.

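The new cgroup.pressure knob documented above is toggled purely by writing "0" or "1" from user space. As a minimal illustration (not part of this patch), assuming cgroup2 is mounted at /sys/fs/cgroup and a child group named "workers" already exists, a small C program could disable PSI accounting for that group like this:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Assumed path: cgroup2 mounted at /sys/fs/cgroup, child group "workers". */
#define PRESSURE_FILE "/sys/fs/cgroup/workers/cgroup.pressure"

static int set_psi_accounting(int enable)
{
        int fd = open(PRESSURE_FILE, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return -1;
        }
        /* "0" disables PSI accounting for this cgroup, "1" re-enables it. */
        if (write(fd, enable ? "1" : "0", 1) != 1) {
                perror("write");
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        return set_psi_accounting(0) ? 1 : 0;
}
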
@@ -14,27 +14,28 @@ architectures).
II. How does it work?
=====================

There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have
PF_NOFREEZE unset (all user space processes and some kernel threads) are
regarded as 'freezable' and treated in a special way before the system enters a
suspend state as well as before a hibernation image is created (in what follows
we only consider hibernation, but the description also applies to suspend).
There is one per-task flag (PF_NOFREEZE) and three per-task states
(TASK_FROZEN, TASK_FREEZABLE and __TASK_FREEZABLE_UNSAFE) used for that.
The tasks that have PF_NOFREEZE unset (all user space tasks and some kernel
threads) are regarded as 'freezable' and treated in a special way before the
system enters a sleep state as well as before a hibernation image is created
(hibernation is directly covered by what follows, but the description applies
to system-wide suspend too).

Namely, as the first step of the hibernation procedure the function
freeze_processes() (defined in kernel/power/process.c) is called. A system-wide
variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate
whether the system is to undergo a freezing operation. And freeze_processes()
sets this variable. After this, it executes try_to_freeze_tasks() that sends a
fake signal to all user space processes, and wakes up all the kernel threads.
All freezable tasks must react to that by calling try_to_freeze(), which
results in a call to __refrigerator() (defined in kernel/freezer.c), which sets
the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes
it loop until PF_FROZEN is cleared for it. Then, we say that the task is
'frozen' and therefore the set of functions handling this mechanism is referred
to as 'the freezer' (these functions are defined in kernel/power/process.c,
kernel/freezer.c & include/linux/freezer.h). User space processes are generally
frozen before kernel threads.
static key freezer_active (as opposed to a per-task flag or state) is used to
indicate whether the system is to undergo a freezing operation. And
freeze_processes() sets this static key. After this, it executes
try_to_freeze_tasks() that sends a fake signal to all user space processes, and
wakes up all the kernel threads. All freezable tasks must react to that by
calling try_to_freeze(), which results in a call to __refrigerator() (defined
in kernel/freezer.c), which changes the task's state to TASK_FROZEN, and makes
it loop until it is woken by an explicit TASK_FROZEN wakeup. Then, that task
is regarded as 'frozen' and so the set of functions handling this mechanism is
referred to as 'the freezer' (these functions are defined in
kernel/power/process.c, kernel/freezer.c & include/linux/freezer.h). User space
tasks are generally frozen before kernel threads.

__refrigerator() must not be called directly. Instead, use the
try_to_freeze() function (defined in include/linux/freezer.h), that checks
@@ -43,31 +44,40 @@ if the task is to be frozen and makes the task enter __refrigerator().
For user space processes try_to_freeze() is called automatically from the
signal-handling code, but the freezable kernel threads need to call it
explicitly in suitable places or use the wait_event_freezable() or
wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
that combine interruptible sleep with checking if the task is to be frozen and
calling try_to_freeze(). The main loop of a freezable kernel thread may look
wait_event_freezable_timeout() macros (defined in include/linux/wait.h)
that put the task to sleep (TASK_INTERRUPTIBLE) or freeze it (TASK_FROZEN) if
freezer_active is set. The main loop of a freezable kernel thread may look
like the following one::

	set_freezable();
	do {
		hub_events();
		wait_event_freezable(khubd_wait,
				!list_empty(&hub_event_list) ||
				kthread_should_stop());
	} while (!kthread_should_stop() || !list_empty(&hub_event_list));

(from drivers/usb/core/hub.c::hub_thread()).
	while (true) {
		struct task_struct *tsk = NULL;

If a freezable kernel thread fails to call try_to_freeze() after the freezer has
initiated a freezing operation, the freezing of tasks will fail and the entire
hibernation operation will be cancelled. For this reason, freezable kernel
threads must call try_to_freeze() somewhere or use one of the
		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
		spin_lock_irq(&oom_reaper_lock);
		if (oom_reaper_list != NULL) {
			tsk = oom_reaper_list;
			oom_reaper_list = tsk->oom_reaper_list;
		}
		spin_unlock_irq(&oom_reaper_lock);

		if (tsk)
			oom_reap_task(tsk);
	}

(from mm/oom_kill.c::oom_reaper()).

If a freezable kernel thread is not put to the frozen state after the freezer
has initiated a freezing operation, the freezing of tasks will fail and the
entire system-wide transition will be cancelled. For this reason, freezable
kernel threads must call try_to_freeze() somewhere or use one of the
wait_event_freezable() and wait_event_freezable_timeout() macros.

After the system memory state has been restored from a hibernation image and
devices have been reinitialized, the function thaw_processes() is called in
order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that
have been frozen leave __refrigerator() and continue running.
order to wake up each frozen task. Then, the tasks that have been frozen leave
__refrigerator() and continue running.


Rationale behind the functions dealing with freezing and thawing of tasks
@@ -96,7 +106,8 @@ III. Which kernel threads are freezable?
Kernel threads are not freezable by default. However, a kernel thread may clear
PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
directly is not allowed). From this point it is regarded as freezable
and must call try_to_freeze() in a suitable place.
and must call try_to_freeze() or variants of wait_event_freezable() in a
suitable place.

IV. Why do we do that?
======================

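Putting the pieces of this document together, a freezable kernel thread built on the new API typically clears PF_NOFREEZE with set_freezable() and then sleeps through wait_event_freezable(). The following is only an illustrative sketch; the thread, wait queue and work flag names are invented for this example and do not come from the patch:

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);
static bool example_has_work;   /* set by an imaginary producer */

static int example_thread(void *data)
{
        set_freezable();        /* clear PF_NOFREEZE for this kthread */

        while (!kthread_should_stop()) {
                /*
                 * Sleeps in TASK_INTERRUPTIBLE|TASK_FREEZABLE; once
                 * freezer_active is set the freezer may park us in
                 * TASK_FROZEN right here.
                 */
                if (wait_event_freezable(example_wait,
                                         example_has_work ||
                                         kthread_should_stop()))
                        continue;       /* interrupted, re-check stop */

                if (example_has_work) {
                        example_has_work = false;
                        /* ... do the actual work here ... */
                }
        }
        return 0;
}
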
@@ -3714,10 +3714,9 @@ static int binder_wait_for_work(struct binder_thread *thread,
	struct binder_proc *proc = thread->proc;
	int ret = 0;

	freezer_do_not_count();
	binder_inner_proc_lock(proc);
	for (;;) {
		prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE);
		prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE);
		if (binder_has_work_ilocked(thread, do_proc_work))
			break;
		if (do_proc_work)
@@ -3734,7 +3733,6 @@ static int binder_wait_for_work(struct binder_thread *thread,
	}
	finish_wait(&thread->wait, &wait);
	binder_inner_proc_unlock(proc);
	freezer_count();

	return ret;
}

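The binder hunk above shows the general conversion this series performs: instead of bracketing a sleep with freezer_do_not_count()/freezer_count(), the sleeping state itself is tagged with TASK_FREEZABLE so the freezer can freeze the task while it waits. A condensed sketch of that wait pattern, with an invented wait queue and condition (not taken from binder), might look like:

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
static bool demo_condition;     /* invented condition for the example */

static void demo_wait_for_event(void)
{
        DEFINE_WAIT(wait);

        for (;;) {
                /* TASK_FREEZABLE lets the freezer freeze us while asleep. */
                prepare_to_wait(&demo_waitq, &wait,
                                TASK_INTERRUPTIBLE | TASK_FREEZABLE);
                if (demo_condition || signal_pending(current))
                        break;
                schedule();
        }
        finish_wait(&demo_waitq, &wait);
}
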
@ -445,8 +445,8 @@ static int pt3_fetch_thread(void *data)
|
|||
pt3_proc_dma(adap);
|
||||
|
||||
delay = ktime_set(0, PT3_FETCH_DELAY * NSEC_PER_MSEC);
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
freezable_schedule_hrtimeout_range(&delay,
|
||||
set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
|
||||
schedule_hrtimeout_range(&delay,
|
||||
PT3_FETCH_DELAY_DELTA * NSEC_PER_MSEC,
|
||||
HRTIMER_MODE_REL);
|
||||
}
|
||||
|
|
|
@ -403,9 +403,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
|
|||
if (core_waiters > 0) {
|
||||
struct core_thread *ptr;
|
||||
|
||||
freezer_do_not_count();
|
||||
wait_for_completion(&core_state->startup);
|
||||
freezer_count();
|
||||
wait_for_completion_state(&core_state->startup,
|
||||
TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
|
||||
/*
|
||||
* Wait for all the threads to become inactive, so that
|
||||
* all the thread context (extended register state, like
|
||||
|
|
|
@ -578,7 +578,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
|
|||
}
|
||||
|
||||
wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
|
||||
nfs_wait_bit_killable, TASK_KILLABLE);
|
||||
nfs_wait_bit_killable,
|
||||
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
|
||||
|
||||
folio_lock(folio);
|
||||
mapping = folio_file_mapping(folio);
|
||||
|
|
|
@ -72,18 +72,13 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
|
|||
return nfs_fileid_to_ino_t(fattr->fileid);
|
||||
}
|
||||
|
||||
static int nfs_wait_killable(int mode)
|
||||
int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
|
||||
{
|
||||
freezable_schedule_unsafe();
|
||||
schedule();
|
||||
if (signal_pending_state(mode, current))
|
||||
return -ERESTARTSYS;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
|
||||
{
|
||||
return nfs_wait_killable(mode);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
|
||||
|
||||
/**
|
||||
|
@ -1343,7 +1338,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping)
|
|||
*/
|
||||
for (;;) {
|
||||
ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
|
||||
nfs_wait_bit_killable, TASK_KILLABLE);
|
||||
nfs_wait_bit_killable,
|
||||
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
|
||||
if (ret)
|
||||
goto out;
|
||||
spin_lock(&inode->i_lock);
|
||||
|
|
|
@ -36,7 +36,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
|
|||
res = rpc_call_sync(clnt, msg, flags);
|
||||
if (res != -EJUKEBOX)
|
||||
break;
|
||||
freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
|
||||
__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
|
||||
schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
|
||||
res = -ERESTARTSYS;
|
||||
} while (!fatal_signal_pending(current));
|
||||
return res;
|
||||
|
|
|
@ -421,8 +421,8 @@ static int nfs4_delay_killable(long *timeout)
|
|||
{
|
||||
might_sleep();
|
||||
|
||||
freezable_schedule_timeout_killable_unsafe(
|
||||
nfs4_update_delay(timeout));
|
||||
__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
|
||||
schedule_timeout(nfs4_update_delay(timeout));
|
||||
if (!__fatal_signal_pending(current))
|
||||
return 0;
|
||||
return -EINTR;
|
||||
|
@ -432,7 +432,8 @@ static int nfs4_delay_interruptible(long *timeout)
|
|||
{
|
||||
might_sleep();
|
||||
|
||||
freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout));
|
||||
__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
|
||||
schedule_timeout(nfs4_update_delay(timeout));
|
||||
if (!signal_pending(current))
|
||||
return 0;
|
||||
return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS;
|
||||
|
@ -7427,7 +7428,8 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
|
|||
status = nfs4_proc_setlk(state, cmd, request);
|
||||
if ((status != -EAGAIN) || IS_SETLK(cmd))
|
||||
break;
|
||||
freezable_schedule_timeout_interruptible(timeout);
|
||||
__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
|
||||
schedule_timeout(timeout);
|
||||
timeout *= 2;
|
||||
timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
|
||||
status = -ERESTARTSYS;
|
||||
|
@ -7495,10 +7497,8 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
|
|||
break;
|
||||
|
||||
status = -ERESTARTSYS;
|
||||
freezer_do_not_count();
|
||||
wait_woken(&waiter.wait, TASK_INTERRUPTIBLE,
|
||||
wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE,
|
||||
NFS4_LOCK_MAXTIMEOUT);
|
||||
freezer_count();
|
||||
} while (!signalled());
|
||||
|
||||
remove_wait_queue(q, &waiter.wait);
|
||||
|
|
|
@ -1317,7 +1317,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp)
|
|||
|
||||
refcount_inc(&clp->cl_count);
|
||||
res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
|
||||
nfs_wait_bit_killable, TASK_KILLABLE);
|
||||
nfs_wait_bit_killable,
|
||||
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
|
||||
if (res)
|
||||
goto out;
|
||||
if (clp->cl_cons_state < 0)
|
||||
|
|
|
@ -1911,7 +1911,7 @@ static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
|
|||
pnfs_layoutcommit_inode(lo->plh_inode, false);
|
||||
return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
|
||||
nfs_wait_bit_killable,
|
||||
TASK_KILLABLE);
|
||||
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
|
||||
}
|
||||
|
||||
static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
|
||||
|
@ -3210,7 +3210,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
|
|||
status = wait_on_bit_lock_action(&nfsi->flags,
|
||||
NFS_INO_LAYOUTCOMMITTING,
|
||||
nfs_wait_bit_killable,
|
||||
TASK_KILLABLE);
|
||||
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
|
||||
if (status)
|
||||
goto out;
|
||||
}
|
||||
|
|
|
@ -2459,7 +2459,7 @@ cifs_invalidate_mapping(struct inode *inode)
|
|||
static int
|
||||
cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
|
||||
{
|
||||
freezable_schedule_unsafe();
|
||||
schedule();
|
||||
if (signal_pending_state(mode, current))
|
||||
return -ERESTARTSYS;
|
||||
return 0;
|
||||
|
@ -2477,7 +2477,7 @@ cifs_revalidate_mapping(struct inode *inode)
|
|||
return 0;
|
||||
|
||||
rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
|
||||
TASK_KILLABLE);
|
||||
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
|
|
|
@ -764,9 +764,10 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
|
|||
{
|
||||
int error;
|
||||
|
||||
error = wait_event_freezekillable_unsafe(server->response_q,
|
||||
midQ->mid_state != MID_REQUEST_SUBMITTED &&
|
||||
midQ->mid_state != MID_RESPONSE_RECEIVED);
|
||||
error = wait_event_state(server->response_q,
|
||||
midQ->mid_state != MID_REQUEST_SUBMITTED &&
|
||||
midQ->mid_state != MID_RESPONSE_RECEIVED,
|
||||
(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE));
|
||||
if (error < 0)
|
||||
return -ERESTARTSYS;
|
||||
|
||||
|
|
|
@ -604,9 +604,9 @@ xfsaild(
|
|||
|
||||
while (1) {
|
||||
if (tout && tout <= 20)
|
||||
set_current_state(TASK_KILLABLE);
|
||||
set_current_state(TASK_KILLABLE|TASK_FREEZABLE);
|
||||
else
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
|
||||
|
||||
/*
|
||||
* Check kthread_should_stop() after we set the task state to
|
||||
|
@ -655,14 +655,14 @@ xfsaild(
|
|||
ailp->ail_target == ailp->ail_target_prev &&
|
||||
list_empty(&ailp->ail_buf_list)) {
|
||||
spin_unlock(&ailp->ail_lock);
|
||||
freezable_schedule();
|
||||
schedule();
|
||||
tout = 0;
|
||||
continue;
|
||||
}
|
||||
spin_unlock(&ailp->ail_lock);
|
||||
|
||||
if (tout)
|
||||
freezable_schedule_timeout(msecs_to_jiffies(tout));
|
||||
schedule_timeout(msecs_to_jiffies(tout));
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
|
|
|
@ -343,6 +343,20 @@ struct cgroup_rstat_cpu {
|
|||
*/
|
||||
struct cgroup_base_stat last_bstat;
|
||||
|
||||
/*
|
||||
* This field is used to record the cumulative per-cpu time of
|
||||
* the cgroup and its descendants. Currently it can be read via
|
||||
* eBPF/drgn etc, and we are still trying to determine how to
|
||||
* expose it in the cgroupfs interface.
|
||||
*/
|
||||
struct cgroup_base_stat subtree_bstat;
|
||||
|
||||
/*
|
||||
* Snapshots at the last reading. These are used to calculate the
|
||||
* deltas to propagate to the per-cpu subtree_bstat.
|
||||
*/
|
||||
struct cgroup_base_stat last_subtree_bstat;
|
||||
|
||||
/*
|
||||
* Child cgroups with stat updates on this cpu since the last read
|
||||
* are linked on the parent's ->updated_children through
|
||||
|
@ -430,6 +444,9 @@ struct cgroup {
|
|||
struct cgroup_file procs_file; /* handle for "cgroup.procs" */
|
||||
struct cgroup_file events_file; /* handle for "cgroup.events" */
|
||||
|
||||
/* handles for "{cpu,memory,io,irq}.pressure" */
|
||||
struct cgroup_file psi_files[NR_PSI_RESOURCES];
|
||||
|
||||
/*
|
||||
* The bitmask of subsystems enabled on the child cgroups.
|
||||
* ->subtree_control is the one configured through
|
||||
|
@ -542,6 +559,10 @@ struct cgroup_root {
|
|||
/* Unique id for this hierarchy. */
|
||||
int hierarchy_id;
|
||||
|
||||
/* A list running through the active hierarchies */
|
||||
struct list_head root_list;
|
||||
struct rcu_head rcu; /* Must be near the top */
|
||||
|
||||
/*
|
||||
* The root cgroup. The containing cgroup_root will be destroyed on its
|
||||
* release. cgrp->ancestors[0] will be used overflowing into the
|
||||
|
@ -555,9 +576,6 @@ struct cgroup_root {
|
|||
/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
|
||||
atomic_t nr_cgrps;
|
||||
|
||||
/* A list running through the active hierarchies */
|
||||
struct list_head root_list;
|
||||
|
||||
/* Hierarchy-specific flags */
|
||||
unsigned int flags;
|
||||
|
||||
|
|
|
@ -69,6 +69,7 @@ struct css_task_iter {
|
|||
extern struct file_system_type cgroup_fs_type;
|
||||
extern struct cgroup_root cgrp_dfl_root;
|
||||
extern struct css_set init_css_set;
|
||||
extern spinlock_t css_set_lock;
|
||||
|
||||
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
|
||||
#include <linux/cgroup_subsys.h>
|
||||
|
@ -386,7 +387,6 @@ static inline void cgroup_unlock(void)
|
|||
* as locks used during the cgroup_subsys::attach() methods.
|
||||
*/
|
||||
#ifdef CONFIG_PROVE_RCU
|
||||
extern spinlock_t css_set_lock;
|
||||
#define task_css_set_check(task, __c) \
|
||||
rcu_dereference_check((task)->cgroups, \
|
||||
rcu_read_lock_sched_held() || \
|
||||
|
@ -859,4 +859,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
|
|||
|
||||
#endif /* CONFIG_CGROUP_BPF */
|
||||
|
||||
struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
|
||||
|
||||
#endif /* _LINUX_CGROUP_H */
|
||||
|
|
|
@ -8,9 +8,11 @@
|
|||
#include <linux/sched.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/jump_label.h>
|
||||
|
||||
#ifdef CONFIG_FREEZER
|
||||
extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */
|
||||
DECLARE_STATIC_KEY_FALSE(freezer_active);
|
||||
|
||||
extern bool pm_freezing; /* PM freezing in effect */
|
||||
extern bool pm_nosig_freezing; /* PM nosig freezing in effect */
|
||||
|
||||
|
@ -22,10 +24,7 @@ extern unsigned int freeze_timeout_msecs;
|
|||
/*
|
||||
* Check if a process has been frozen
|
||||
*/
|
||||
static inline bool frozen(struct task_struct *p)
|
||||
{
|
||||
return p->flags & PF_FROZEN;
|
||||
}
|
||||
extern bool frozen(struct task_struct *p);
|
||||
|
||||
extern bool freezing_slow_path(struct task_struct *p);
|
||||
|
||||
|
@ -34,9 +33,10 @@ extern bool freezing_slow_path(struct task_struct *p);
|
|||
*/
|
||||
static inline bool freezing(struct task_struct *p)
|
||||
{
|
||||
if (likely(!atomic_read(&system_freezing_cnt)))
|
||||
return false;
|
||||
return freezing_slow_path(p);
|
||||
if (static_branch_unlikely(&freezer_active))
|
||||
return freezing_slow_path(p);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Takes and releases task alloc lock using task_lock() */
|
||||
|
@ -48,23 +48,14 @@ extern int freeze_kernel_threads(void);
|
|||
extern void thaw_processes(void);
|
||||
extern void thaw_kernel_threads(void);
|
||||
|
||||
/*
|
||||
* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
|
||||
* If try_to_freeze causes a lockdep warning it means the caller may deadlock
|
||||
*/
|
||||
static inline bool try_to_freeze_unsafe(void)
|
||||
static inline bool try_to_freeze(void)
|
||||
{
|
||||
might_sleep();
|
||||
if (likely(!freezing(current)))
|
||||
return false;
|
||||
return __refrigerator(false);
|
||||
}
|
||||
|
||||
static inline bool try_to_freeze(void)
|
||||
{
|
||||
if (!(current->flags & PF_NOFREEZE))
|
||||
debug_check_no_locks_held();
|
||||
return try_to_freeze_unsafe();
|
||||
return __refrigerator(false);
|
||||
}
|
||||
|
||||
extern bool freeze_task(struct task_struct *p);
|
||||
|
@ -79,195 +70,6 @@ static inline bool cgroup_freezing(struct task_struct *task)
|
|||
}
|
||||
#endif /* !CONFIG_CGROUP_FREEZER */
|
||||
|
||||
/*
|
||||
* The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
|
||||
* calls wait_for_completion(&vfork) and reset right after it returns from this
|
||||
* function. Next, the parent should call try_to_freeze() to freeze itself
|
||||
* appropriately in case the child has exited before the freezing of tasks is
|
||||
* complete. However, we don't want kernel threads to be frozen in unexpected
|
||||
* places, so we allow them to block freeze_processes() instead or to set
|
||||
* PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the
|
||||
* parent won't really block freeze_processes(), since ____call_usermodehelper()
|
||||
* (the child) does a little before exec/exit and it can't be frozen before
|
||||
* waking up the parent.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* freezer_do_not_count - tell freezer to ignore %current
|
||||
*
|
||||
* Tell freezers to ignore the current task when determining whether the
|
||||
* target frozen state is reached. IOW, the current task will be
|
||||
* considered frozen enough by freezers.
|
||||
*
|
||||
* The caller shouldn't do anything which isn't allowed for a frozen task
|
||||
* until freezer_cont() is called. Usually, freezer[_do_not]_count() pair
|
||||
* wrap a scheduling operation and nothing much else.
|
||||
*/
|
||||
static inline void freezer_do_not_count(void)
|
||||
{
|
||||
current->flags |= PF_FREEZER_SKIP;
|
||||
}
|
||||
|
||||
/**
|
||||
* freezer_count - tell freezer to stop ignoring %current
|
||||
*
|
||||
* Undo freezer_do_not_count(). It tells freezers that %current should be
|
||||
* considered again and tries to freeze if freezing condition is already in
|
||||
* effect.
|
||||
*/
|
||||
static inline void freezer_count(void)
|
||||
{
|
||||
current->flags &= ~PF_FREEZER_SKIP;
|
||||
/*
|
||||
* If freezing is in progress, the following paired with smp_mb()
|
||||
* in freezer_should_skip() ensures that either we see %true
|
||||
* freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
|
||||
*/
|
||||
smp_mb();
|
||||
try_to_freeze();
|
||||
}
|
||||
|
||||
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
|
||||
static inline void freezer_count_unsafe(void)
|
||||
{
|
||||
current->flags &= ~PF_FREEZER_SKIP;
|
||||
smp_mb();
|
||||
try_to_freeze_unsafe();
|
||||
}
|
||||
|
||||
/**
|
||||
* freezer_should_skip - whether to skip a task when determining frozen
|
||||
* state is reached
|
||||
* @p: task in quesion
|
||||
*
|
||||
* This function is used by freezers after establishing %true freezing() to
|
||||
* test whether a task should be skipped when determining the target frozen
|
||||
* state is reached. IOW, if this function returns %true, @p is considered
|
||||
* frozen enough.
|
||||
*/
|
||||
static inline bool freezer_should_skip(struct task_struct *p)
|
||||
{
|
||||
/*
|
||||
* The following smp_mb() paired with the one in freezer_count()
|
||||
* ensures that either freezer_count() sees %true freezing() or we
|
||||
* see cleared %PF_FREEZER_SKIP and return %false. This makes it
|
||||
* impossible for a task to slip frozen state testing after
|
||||
* clearing %PF_FREEZER_SKIP.
|
||||
*/
|
||||
smp_mb();
|
||||
return p->flags & PF_FREEZER_SKIP;
|
||||
}
|
||||
|
||||
/*
|
||||
* These functions are intended to be used whenever you want allow a sleeping
|
||||
* task to be frozen. Note that neither return any clear indication of
|
||||
* whether a freeze event happened while in this function.
|
||||
*/
|
||||
|
||||
/* Like schedule(), but should not block the freezer. */
|
||||
static inline void freezable_schedule(void)
|
||||
{
|
||||
freezer_do_not_count();
|
||||
schedule();
|
||||
freezer_count();
|
||||
}
|
||||
|
||||
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
|
||||
static inline void freezable_schedule_unsafe(void)
|
||||
{
|
||||
freezer_do_not_count();
|
||||
schedule();
|
||||
freezer_count_unsafe();
|
||||
}
|
||||
|
||||
/*
|
||||
* Like schedule_timeout(), but should not block the freezer. Do not
|
||||
* call this with locks held.
|
||||
*/
|
||||
static inline long freezable_schedule_timeout(long timeout)
|
||||
{
|
||||
long __retval;
|
||||
freezer_do_not_count();
|
||||
__retval = schedule_timeout(timeout);
|
||||
freezer_count();
|
||||
return __retval;
|
||||
}
|
||||
|
||||
/*
|
||||
* Like schedule_timeout_interruptible(), but should not block the freezer. Do not
|
||||
* call this with locks held.
|
||||
*/
|
||||
static inline long freezable_schedule_timeout_interruptible(long timeout)
|
||||
{
|
||||
long __retval;
|
||||
freezer_do_not_count();
|
||||
__retval = schedule_timeout_interruptible(timeout);
|
||||
freezer_count();
|
||||
return __retval;
|
||||
}
|
||||
|
||||
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
|
||||
static inline long freezable_schedule_timeout_interruptible_unsafe(long timeout)
|
||||
{
|
||||
long __retval;
|
||||
|
||||
freezer_do_not_count();
|
||||
__retval = schedule_timeout_interruptible(timeout);
|
||||
freezer_count_unsafe();
|
||||
return __retval;
|
||||
}
|
||||
|
||||
/* Like schedule_timeout_killable(), but should not block the freezer. */
|
||||
static inline long freezable_schedule_timeout_killable(long timeout)
|
||||
{
|
||||
long __retval;
|
||||
freezer_do_not_count();
|
||||
__retval = schedule_timeout_killable(timeout);
|
||||
freezer_count();
|
||||
return __retval;
|
||||
}
|
||||
|
||||
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
|
||||
static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
|
||||
{
|
||||
long __retval;
|
||||
freezer_do_not_count();
|
||||
__retval = schedule_timeout_killable(timeout);
|
||||
freezer_count_unsafe();
|
||||
return __retval;
|
||||
}
|
||||
|
||||
/*
|
||||
* Like schedule_hrtimeout_range(), but should not block the freezer. Do not
|
||||
* call this with locks held.
|
||||
*/
|
||||
static inline int freezable_schedule_hrtimeout_range(ktime_t *expires,
|
||||
u64 delta, const enum hrtimer_mode mode)
|
||||
{
|
||||
int __retval;
|
||||
freezer_do_not_count();
|
||||
__retval = schedule_hrtimeout_range(expires, delta, mode);
|
||||
freezer_count();
|
||||
return __retval;
|
||||
}
|
||||
|
||||
/*
|
||||
* Freezer-friendly wrappers around wait_event_interruptible(),
|
||||
* wait_event_killable() and wait_event_interruptible_timeout(), originally
|
||||
* defined in <linux/wait.h>
|
||||
*/
|
||||
|
||||
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
|
||||
#define wait_event_freezekillable_unsafe(wq, condition) \
|
||||
({ \
|
||||
int __retval; \
|
||||
freezer_do_not_count(); \
|
||||
__retval = wait_event_killable(wq, (condition)); \
|
||||
freezer_count_unsafe(); \
|
||||
__retval; \
|
||||
})
|
||||
|
||||
#else /* !CONFIG_FREEZER */
|
||||
static inline bool frozen(struct task_struct *p) { return false; }
|
||||
static inline bool freezing(struct task_struct *p) { return false; }
|
||||
|
@ -281,35 +83,8 @@ static inline void thaw_kernel_threads(void) {}
|
|||
|
||||
static inline bool try_to_freeze(void) { return false; }
|
||||
|
||||
static inline void freezer_do_not_count(void) {}
|
||||
static inline void freezer_count(void) {}
|
||||
static inline int freezer_should_skip(struct task_struct *p) { return 0; }
|
||||
static inline void set_freezable(void) {}
|
||||
|
||||
#define freezable_schedule() schedule()
|
||||
|
||||
#define freezable_schedule_unsafe() schedule()
|
||||
|
||||
#define freezable_schedule_timeout(timeout) schedule_timeout(timeout)
|
||||
|
||||
#define freezable_schedule_timeout_interruptible(timeout) \
|
||||
schedule_timeout_interruptible(timeout)
|
||||
|
||||
#define freezable_schedule_timeout_interruptible_unsafe(timeout) \
|
||||
schedule_timeout_interruptible(timeout)
|
||||
|
||||
#define freezable_schedule_timeout_killable(timeout) \
|
||||
schedule_timeout_killable(timeout)
|
||||
|
||||
#define freezable_schedule_timeout_killable_unsafe(timeout) \
|
||||
schedule_timeout_killable(timeout)
|
||||
|
||||
#define freezable_schedule_hrtimeout_range(expires, delta, mode) \
|
||||
schedule_hrtimeout_range(expires, delta, mode)
|
||||
|
||||
#define wait_event_freezekillable_unsafe(wq, condition) \
|
||||
wait_event_killable(wq, condition)
|
||||
|
||||
#endif /* !CONFIG_FREEZER */
|
||||
|
||||
#endif /* FREEZER_H_INCLUDED */
|
||||
|
|
|
@ -552,6 +552,10 @@ static inline int kernfs_setattr(struct kernfs_node *kn,
|
|||
const struct iattr *iattr)
|
||||
{ return -ENOSYS; }
|
||||
|
||||
static inline __poll_t kernfs_generic_poll(struct kernfs_open_file *of,
|
||||
struct poll_table_struct *pt)
|
||||
{ return -ENOSYS; }
|
||||
|
||||
static inline void kernfs_notify(struct kernfs_node *kn) { }
|
||||
|
||||
static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
|
||||
|
|
|
@@ -31,17 +31,18 @@ struct misc_cg;
 * struct misc_res: Per cgroup per misc type resource
 * @max: Maximum limit on the resource.
 * @usage: Current usage of the resource.
 * @failed: True if charged failed for the resource in a cgroup.
 * @events: Number of times, the resource limit exceeded.
 */
struct misc_res {
	unsigned long max;
	atomic_long_t usage;
	atomic_long_t events;
	u64 max;
	atomic64_t usage;
	atomic64_t events;
};

/**
 * struct misc_cg - Miscellaneous controller's cgroup structure.
 * @css: cgroup subsys state object.
 * @events_file: Handle for the misc resources events file.
 * @res: Array of misc resources usage in the cgroup.
 */
struct misc_cg {
@@ -53,12 +54,10 @@ struct misc_cg {
	struct misc_res res[MISC_CG_RES_TYPES];
};

unsigned long misc_cg_res_total_usage(enum misc_res_type type);
int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity);
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
		       unsigned long amount);
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
		      unsigned long amount);
u64 misc_cg_res_total_usage(enum misc_res_type type);
int misc_cg_set_capacity(enum misc_res_type type, u64 capacity);
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount);

/**
 * css_misc() - Get misc cgroup from the css.
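The prototypes above show the misc controller interface being widened from unsigned long to u64. A rough sketch of the charge/uncharge pairing a client of this API follows; the helper name and the caller-supplied resource type are placeholders, not code from this patch:

#include <linux/misc_cgroup.h>

/*
 * Illustrative helper (not from this patch): charge @amount units of a
 * caller-chosen resource @type against the current task's misc cgroup,
 * do the work, then uncharge and drop the reference.
 */
static int demo_charge_and_use(enum misc_res_type type, u64 amount)
{
        struct misc_cg *cg = get_current_misc_cg();
        int ret;

        ret = misc_cg_try_charge(type, cg, amount);
        if (ret) {
                put_misc_cg(cg);
                return ret;     /* over the configured max for this cgroup */
        }

        /* ... consume the resource ... */

        misc_cg_uncharge(type, cg, amount);
        put_misc_cg(cg);
        return 0;
}
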
@ -99,27 +98,26 @@ static inline void put_misc_cg(struct misc_cg *cg)
|
|||
|
||||
#else /* !CONFIG_CGROUP_MISC */
|
||||
|
||||
static inline unsigned long misc_cg_res_total_usage(enum misc_res_type type)
|
||||
static inline u64 misc_cg_res_total_usage(enum misc_res_type type)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int misc_cg_set_capacity(enum misc_res_type type,
|
||||
unsigned long capacity)
|
||||
static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int misc_cg_try_charge(enum misc_res_type type,
|
||||
struct misc_cg *cg,
|
||||
unsigned long amount)
|
||||
u64 amount)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void misc_cg_uncharge(enum misc_res_type type,
|
||||
struct misc_cg *cg,
|
||||
unsigned long amount)
|
||||
u64 amount)
|
||||
{
|
||||
}
|
||||
|
||||
|
|
|
@ -22,8 +22,9 @@ void psi_memstall_enter(unsigned long *flags);
|
|||
void psi_memstall_leave(unsigned long *flags);
|
||||
|
||||
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
|
||||
struct psi_trigger *psi_trigger_create(struct psi_group *group,
|
||||
char *buf, enum psi_res res, struct file *file);
|
||||
struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
|
||||
enum psi_res res, struct file *file,
|
||||
struct kernfs_open_file *of);
|
||||
void psi_trigger_destroy(struct psi_trigger *t);
|
||||
|
||||
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
|
||||
|
@ -38,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
|
|||
int psi_cgroup_alloc(struct cgroup *cgrp);
|
||||
void psi_cgroup_free(struct cgroup *cgrp);
|
||||
void cgroup_move_task(struct task_struct *p, struct css_set *to);
|
||||
void psi_cgroup_restart(struct psi_group *group);
|
||||
#endif
|
||||
|
||||
#else /* CONFIG_PSI */
|
||||
|
@ -59,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
|
|||
{
|
||||
rcu_assign_pointer(p->cgroups, to);
|
||||
}
|
||||
static inline void psi_cgroup_restart(struct psi_group *group) {}
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_PSI */
|
||||
|
|
|
@ -136,6 +136,9 @@ struct psi_trigger {
|
|||
/* Wait queue for polling */
|
||||
wait_queue_head_t event_wait;
|
||||
|
||||
/* Kernfs file for cgroup triggers */
|
||||
struct kernfs_open_file *of;
|
||||
|
||||
/* Pending event flag */
|
||||
int event;
|
||||
|
||||
|
@ -157,6 +160,7 @@ struct psi_trigger {
|
|||
|
||||
struct psi_group {
|
||||
struct psi_group *parent;
|
||||
bool enabled;
|
||||
|
||||
/* Protects data used by the aggregator */
|
||||
struct mutex avgs_lock;
|
||||
|
@ -204,6 +208,8 @@ struct psi_group {
|
|||
|
||||
#else /* CONFIG_PSI */
|
||||
|
||||
#define NR_PSI_RESOURCES 0
|
||||
|
||||
struct psi_group { };
|
||||
|
||||
#endif /* CONFIG_PSI */
|
||||
|
|
|
@ -102,12 +102,19 @@ struct task_struct;
|
|||
#define TASK_WAKING 0x00000200
|
||||
#define TASK_NOLOAD 0x00000400
|
||||
#define TASK_NEW 0x00000800
|
||||
/* RT specific auxilliary flag to mark RT lock waiters */
|
||||
#define TASK_RTLOCK_WAIT 0x00001000
|
||||
#define TASK_STATE_MAX 0x00002000
|
||||
#define TASK_FREEZABLE 0x00002000
|
||||
#define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP))
|
||||
#define TASK_FROZEN 0x00008000
|
||||
#define TASK_STATE_MAX 0x00010000
|
||||
|
||||
#define TASK_ANY (TASK_STATE_MAX-1)
|
||||
|
||||
/*
|
||||
* DO NOT ADD ANY NEW USERS !
|
||||
*/
|
||||
#define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE)
|
||||
|
||||
/* Convenience macros for the sake of set_current_state: */
|
||||
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
|
||||
#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
|
||||
|
@ -1747,7 +1754,6 @@ extern struct pid *cad_pid;
|
|||
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
|
||||
#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */
|
||||
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
|
||||
#define PF_FROZEN 0x00010000 /* Frozen for system suspend */
|
||||
#define PF_KSWAPD 0x00020000 /* I am kswapd */
|
||||
#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
|
||||
#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
|
||||
|
@ -1758,7 +1764,6 @@ extern struct pid *cad_pid;
|
|||
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
|
||||
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
|
||||
#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
|
||||
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
|
||||
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
|
||||
|
||||
/*
|
||||
|
|
|
@ -257,7 +257,7 @@ int rpc_malloc(struct rpc_task *);
|
|||
void rpc_free(struct rpc_task *);
|
||||
int rpciod_up(void);
|
||||
void rpciod_down(void);
|
||||
int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *);
|
||||
int rpc_wait_for_completion_task(struct rpc_task *task);
|
||||
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
|
||||
struct net;
|
||||
void rpc_show_tasks(struct net *);
|
||||
|
@ -269,11 +269,6 @@ extern struct workqueue_struct *xprtiod_workqueue;
|
|||
void rpc_prepare_task(struct rpc_task *task);
|
||||
gfp_t rpc_task_gfp_mask(void);
|
||||
|
||||
static inline int rpc_wait_for_completion_task(struct rpc_task *task)
|
||||
{
|
||||
return __rpc_wait_for_completion_task(task, NULL);
|
||||
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS)
|
||||
static inline const char * rpc_qname(const struct rpc_wait_queue *q)
|
||||
{
|
||||
|
|
|
@ -278,7 +278,7 @@ static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
|
|||
|
||||
#define ___wait_is_interruptible(state) \
|
||||
(!__builtin_constant_p(state) || \
|
||||
state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \
|
||||
(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
|
||||
|
||||
extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);
|
||||
|
||||
|
@ -358,8 +358,8 @@ do { \
|
|||
} while (0)
|
||||
|
||||
#define __wait_event_freezable(wq_head, condition) \
|
||||
___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \
|
||||
freezable_schedule())
|
||||
___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \
|
||||
0, 0, schedule())
|
||||
|
||||
/**
|
||||
* wait_event_freezable - sleep (or freeze) until a condition gets true
|
||||
|
@ -417,8 +417,8 @@ do { \
|
|||
|
||||
#define __wait_event_freezable_timeout(wq_head, condition, timeout) \
|
||||
___wait_event(wq_head, ___wait_cond_timeout(condition), \
|
||||
TASK_INTERRUPTIBLE, 0, timeout, \
|
||||
__ret = freezable_schedule_timeout(__ret))
|
||||
(TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \
|
||||
__ret = schedule_timeout(__ret))
|
||||
|
||||
/*
|
||||
* like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
|
||||
|
@ -639,8 +639,8 @@ do { \
|
|||
|
||||
|
||||
#define __wait_event_freezable_exclusive(wq, condition) \
|
||||
___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
|
||||
freezable_schedule())
|
||||
___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\
|
||||
schedule())
|
||||
|
||||
#define wait_event_freezable_exclusive(wq, condition) \
|
||||
({ \
|
||||
|
|
|
@ -619,6 +619,7 @@ config TASK_IO_ACCOUNTING
|
|||
|
||||
config PSI
|
||||
bool "Pressure stall information tracking"
|
||||
select KERNFS
|
||||
help
|
||||
Collect metrics that indicate how overcommitted the CPU, memory,
|
||||
and IO capacity are in the system.
|
||||
|
|
|
@ -164,13 +164,13 @@ struct cgroup_mgctx {
|
|||
#define DEFINE_CGROUP_MGCTX(name) \
|
||||
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
|
||||
|
||||
extern spinlock_t css_set_lock;
|
||||
extern struct cgroup_subsys *cgroup_subsys[];
|
||||
extern struct list_head cgroup_roots;
|
||||
|
||||
/* iterate across the hierarchies */
|
||||
#define for_each_root(root) \
|
||||
list_for_each_entry((root), &cgroup_roots, root_list)
|
||||
list_for_each_entry_rcu((root), &cgroup_roots, root_list, \
|
||||
lockdep_is_held(&cgroup_mutex))
|
||||
|
||||
/**
|
||||
* for_each_subsys - iterate all enabled cgroup subsystems
|
||||
|
|
|
@ -360,10 +360,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
|
|||
}
|
||||
css_task_iter_end(&it);
|
||||
length = n;
|
||||
/* now sort & (if procs) strip out duplicates */
|
||||
/* now sort & strip out duplicates (tgids or recycled thread PIDs) */
|
||||
sort(array, length, sizeof(pid_t), cmppid, NULL);
|
||||
if (type == CGROUP_FILE_PROCS)
|
||||
length = pidlist_uniq(array, length);
|
||||
length = pidlist_uniq(array, length);
|
||||
|
||||
l = cgroup_pidlist_find_create(cgrp, type);
|
||||
if (!l) {
|
||||
|
@ -431,7 +430,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
|
|||
if (l->list[mid] == pid) {
|
||||
index = mid;
|
||||
break;
|
||||
} else if (l->list[mid] <= pid)
|
||||
} else if (l->list[mid] < pid)
|
||||
index = mid + 1;
|
||||
else
|
||||
end = mid;
|
||||
|
@@ -1263,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc)
	return ret;
}

/**
 * task_get_cgroup1 - Acquires the associated cgroup of a task within a
 * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
 * hierarchy ID.
 * @tsk: The target task
 * @hierarchy_id: The ID of a cgroup1 hierarchy
 *
 * On success, the cgroup is returned. On failure, ERR_PTR is returned.
 * We limit it to cgroup1 only.
 */
struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
{
	struct cgroup *cgrp = ERR_PTR(-ENOENT);
	struct cgroup_root *root;
	unsigned long flags;

	rcu_read_lock();
	for_each_root(root) {
		/* cgroup1 only*/
		if (root == &cgrp_dfl_root)
			continue;
		if (root->hierarchy_id != hierarchy_id)
			continue;
		spin_lock_irqsave(&css_set_lock, flags);
		cgrp = task_cgroup_from_root(tsk, root);
		if (!cgrp || !cgroup_tryget(cgrp))
			cgrp = ERR_PTR(-ENOENT);
		spin_unlock_irqrestore(&css_set_lock, flags);
		break;
	}
	rcu_read_unlock();
	return cgrp;
}

static int __init cgroup1_wq_init(void)
{
	/*
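A hedged sketch of how a caller might use the helper added above; the hierarchy ID, the surrounding function and the printed message are assumptions made for illustration. The reference returned on success comes from cgroup_tryget() and must be dropped with cgroup_put():

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/printk.h>

/* Illustrative only: look up current's cgroup in an assumed hierarchy ID 2. */
static void demo_report_cgroup1(void)
{
        struct cgroup *cgrp;

        cgrp = task_get_cgroup1(current, 2);
        if (IS_ERR(cgrp))
                return;         /* no such hierarchy, or task not attached */

        pr_info("pid %d is in cgroup id %llu\n", current->pid, cgroup_id(cgrp));
        cgroup_put(cgrp);       /* drop the reference from cgroup_tryget() */
}
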
@ -494,28 +494,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
|
|||
return &cgrp->self;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
|
||||
* @cgrp: the cgroup of interest
|
||||
* @ss: the subsystem of interest
|
||||
*
|
||||
* Find and get @cgrp's css associated with @ss. If the css doesn't exist
|
||||
* or is offline, %NULL is returned.
|
||||
*/
|
||||
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
|
||||
struct cgroup_subsys *ss)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
rcu_read_lock();
|
||||
css = cgroup_css(cgrp, ss);
|
||||
if (css && !css_tryget_online(css))
|
||||
css = NULL;
|
||||
rcu_read_unlock();
|
||||
|
||||
return css;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
|
||||
* @cgrp: the cgroup of interest
|
||||
|
@ -681,7 +659,7 @@ EXPORT_SYMBOL_GPL(of_css);
|
|||
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
|
||||
* @cgrp: the target cgroup to iterate css's of
|
||||
*
|
||||
* Should be called under cgroup_[tree_]mutex.
|
||||
* Should be called under cgroup_mutex.
|
||||
*/
|
||||
#define for_each_css(css, ssid, cgrp) \
|
||||
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
|
||||
|
@ -931,7 +909,7 @@ static void css_set_move_task(struct task_struct *task,
|
|||
#define CSS_SET_HASH_BITS 7
|
||||
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
|
||||
|
||||
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
|
||||
static unsigned long css_set_hash(struct cgroup_subsys_state **css)
|
||||
{
|
||||
unsigned long key = 0UL;
|
||||
struct cgroup_subsys *ss;
|
||||
|
@ -1072,7 +1050,7 @@ static bool compare_css_sets(struct css_set *cset,
|
|||
*/
|
||||
static struct css_set *find_existing_css_set(struct css_set *old_cset,
|
||||
struct cgroup *cgrp,
|
||||
struct cgroup_subsys_state *template[])
|
||||
struct cgroup_subsys_state **template)
|
||||
{
|
||||
struct cgroup_root *root = cgrp->root;
|
||||
struct cgroup_subsys *ss;
|
||||
|
@ -1337,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
|
|||
|
||||
void cgroup_free_root(struct cgroup_root *root)
|
||||
{
|
||||
kfree(root);
|
||||
kfree_rcu(root, rcu);
|
||||
}
|
||||
|
||||
static void cgroup_destroy_root(struct cgroup_root *root)
|
||||
|
@ -1369,10 +1347,9 @@ static void cgroup_destroy_root(struct cgroup_root *root)
|
|||
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
|
||||
if (!list_empty(&root->root_list)) {
|
||||
list_del(&root->root_list);
|
||||
cgroup_root_count--;
|
||||
}
|
||||
WARN_ON_ONCE(list_empty(&root->root_list));
|
||||
list_del_rcu(&root->root_list);
|
||||
cgroup_root_count--;
|
||||
|
||||
if (!have_favordynmods)
|
||||
cgroup_favor_dynmods(root, false);
|
||||
|
@ -1412,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
|
|||
}
|
||||
}
|
||||
|
||||
BUG_ON(!res_cgroup);
|
||||
/*
|
||||
* If cgroup_mutex is not held, the cgrp_cset_link will be freed
|
||||
* before we remove the cgroup root from the root_list. Consequently,
|
||||
* when accessing a cgroup root, the cset_link may have already been
|
||||
* freed, resulting in a NULL res_cgroup. However, by holding the
|
||||
* cgroup_mutex, we ensure that res_cgroup can't be NULL.
|
||||
* If we don't hold cgroup_mutex in the caller, we must do the NULL
|
||||
* check.
|
||||
*/
|
||||
return res_cgroup;
|
||||
}
|
||||
|
||||
|
@ -1435,6 +1420,11 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
|
|||
|
||||
rcu_read_unlock();
|
||||
|
||||
/*
|
||||
* The namespace_sem is held by current, so the root cgroup can't
|
||||
* be umounted. Therefore, we can ensure that the res is non-NULL.
|
||||
*/
|
||||
WARN_ON_ONCE(!res);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -1471,7 +1461,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
|
|||
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
|
||||
struct cgroup_root *root)
|
||||
{
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
lockdep_assert_held(&css_set_lock);
|
||||
|
||||
return __cset_cgroup_from_root(cset, root);
|
||||
|
@ -1479,7 +1468,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
|
|||
|
||||
/*
|
||||
* Return the cgroup for "task" from the given hierarchy. Must be
|
||||
* called with cgroup_mutex and css_set_lock held.
|
||||
* called with css_set_lock held to prevent task's groups from being modified.
|
||||
* Must be called with either cgroup_mutex or rcu read lock to prevent the
|
||||
* cgroup root from being destroyed.
|
||||
*/
|
||||
struct cgroup *task_cgroup_from_root(struct task_struct *task,
|
||||
struct cgroup_root *root)
|
||||
|
@ -1740,25 +1731,27 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
|
|||
struct cftype *cfts, *failed_cfts;
|
||||
int ret;
|
||||
|
||||
if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
|
||||
if (css->flags & CSS_VISIBLE)
|
||||
return 0;
|
||||
|
||||
if (!css->ss) {
|
||||
if (cgroup_on_dfl(cgrp)) {
|
||||
ret = cgroup_addrm_files(&cgrp->self, cgrp,
|
||||
ret = cgroup_addrm_files(css, cgrp,
|
||||
cgroup_base_files, true);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (cgroup_psi_enabled()) {
|
||||
ret = cgroup_addrm_files(&cgrp->self, cgrp,
|
||||
ret = cgroup_addrm_files(css, cgrp,
|
||||
cgroup_psi_files, true);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
cgroup_addrm_files(css, cgrp,
|
||||
cgroup1_base_files, true);
|
||||
ret = cgroup_addrm_files(css, cgrp,
|
||||
cgroup1_base_files, true);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
list_for_each_entry(cfts, &css->ss->cfts, node) {
|
||||
|
@ -2040,7 +2033,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
|
|||
struct cgroup_root *root = ctx->root;
|
||||
struct cgroup *cgrp = &root->cgrp;
|
||||
|
||||
INIT_LIST_HEAD(&root->root_list);
|
||||
INIT_LIST_HEAD_RCU(&root->root_list);
|
||||
atomic_set(&root->nr_cgrps, 1);
|
||||
cgrp->root = root;
|
||||
init_cgroup_housekeeping(cgrp);
|
||||
|
@ -2123,7 +2116,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
|
|||
* care of subsystems' refcounts, which are explicitly dropped in
|
||||
* the failure exit path.
|
||||
*/
|
||||
list_add(&root->root_list, &cgroup_roots);
|
||||
list_add_rcu(&root->root_list, &cgroup_roots);
|
||||
cgroup_root_count++;
|
||||
|
||||
/*
|
||||
|
@ -2503,7 +2496,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
|
|||
|
||||
/*
|
||||
* This function may be called both before and
|
||||
* after cgroup_taskset_migrate(). The two cases
|
||||
* after cgroup_migrate_execute(). The two cases
|
||||
* can be distinguished by looking at whether @cset
|
||||
* has its ->mg_dst_cset set.
|
||||
*/
|
||||
|
@ -2648,10 +2641,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
|
|||
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* mixables don't care */
|
||||
if (cgroup_is_mixable(dst_cgrp))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* If @dst_cgrp is already or can become a thread root or is
|
||||
* threaded, it doesn't matter.
|
||||
|
@ -3662,9 +3651,32 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
|
||||
struct cgroup *cgrp, int ssid)
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
/**
|
||||
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
|
||||
* @cgrp: the cgroup of interest
|
||||
* @ss: the subsystem of interest
|
||||
*
|
||||
* Find and get @cgrp's css associated with @ss. If the css doesn't exist
|
||||
* or is offline, %NULL is returned.
|
||||
*/
|
||||
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
|
||||
struct cgroup_subsys *ss)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
rcu_read_lock();
|
||||
css = cgroup_css(cgrp, ss);
|
||||
if (css && !css_tryget_online(css))
|
||||
css = NULL;
|
||||
rcu_read_unlock();
|
||||
|
||||
return css;
|
||||
}
|
||||
|
||||
static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
|
||||
{
|
||||
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
||||
struct cgroup_subsys *ss = cgroup_subsys[ssid];
|
||||
struct cgroup_subsys_state *css;
|
||||
int ret;
|
||||
|
@ -3681,14 +3693,44 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int cgroup_local_stat_show(struct seq_file *seq,
|
||||
struct cgroup *cgrp, int ssid)
|
||||
{
|
||||
struct cgroup_subsys *ss = cgroup_subsys[ssid];
|
||||
struct cgroup_subsys_state *css;
|
||||
int ret;
|
||||
|
||||
if (!ss->css_local_stat_show)
|
||||
return 0;
|
||||
|
||||
css = cgroup_tryget_css(cgrp, ss);
|
||||
if (!css)
|
||||
return 0;
|
||||
|
||||
ret = ss->css_local_stat_show(seq, css);
|
||||
css_put(css);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int cpu_stat_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
|
||||
int ret = 0;
|
||||
|
||||
cgroup_base_stat_cputime_show(seq);
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
|
||||
ret = cgroup_extra_stat_show(seq, cpu_cgrp_id);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cpu_local_stat_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
|
||||
int ret = 0;
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
@ -3746,8 +3788,8 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
|
|||
return psi_show(seq, psi, PSI_CPU);
|
||||
}
|
||||
|
||||
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
|
||||
size_t nbytes, enum psi_res res)
|
||||
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
|
||||
size_t nbytes, enum psi_res res)
|
||||
{
|
||||
struct cgroup_file_ctx *ctx = of->priv;
|
||||
struct psi_trigger *new;
|
||||
|
@ -3768,7 +3810,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
|
|||
}
|
||||
|
||||
psi = cgroup_psi(cgrp);
|
||||
new = psi_trigger_create(psi, buf, res, of->file);
|
||||
new = psi_trigger_create(psi, buf, res, of->file, of);
|
||||
if (IS_ERR(new)) {
|
||||
cgroup_put(cgrp);
|
||||
return PTR_ERR(new);
|
||||
|
@ -3784,21 +3826,21 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
|
|||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
|
||||
return pressure_write(of, buf, nbytes, PSI_IO);
|
||||
}
|
||||
|
||||
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
|
||||
return pressure_write(of, buf, nbytes, PSI_MEM);
|
||||
}
|
||||
|
||||
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
|
||||
return pressure_write(of, buf, nbytes, PSI_CPU);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
|
@ -3814,10 +3856,58 @@ static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
|
|||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
|
||||
return pressure_write(of, buf, nbytes, PSI_IRQ);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int cgroup_pressure_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
||||
struct psi_group *psi = cgroup_psi(cgrp);
|
||||
|
||||
seq_printf(seq, "%d\n", psi->enabled);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes,
|
||||
loff_t off)
|
||||
{
|
||||
ssize_t ret;
|
||||
int enable;
|
||||
struct cgroup *cgrp;
|
||||
struct psi_group *psi;
|
||||
|
||||
ret = kstrtoint(strstrip(buf), 0, &enable);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (enable < 0 || enable > 1)
|
||||
return -ERANGE;
|
||||
|
||||
cgrp = cgroup_kn_lock_live(of->kn, false);
|
||||
if (!cgrp)
|
||||
return -ENOENT;
|
||||
|
||||
psi = cgroup_psi(cgrp);
|
||||
if (psi->enabled != enable) {
|
||||
int i;
|
||||
|
||||
/* show or hide {cpu,memory,io,irq}.pressure files */
|
||||
for (i = 0; i < NR_PSI_RESOURCES; i++)
|
||||
cgroup_file_show(&cgrp->psi_files[i], enable);
|
||||
|
||||
psi->enabled = enable;
|
||||
if (enable)
|
||||
psi_cgroup_restart(psi);
|
||||
}
|
||||
|
||||
cgroup_kn_unlock(of->kn);
|
||||
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
|
||||
poll_table *pt)
|
||||
{
|
||||
|
@ -4110,20 +4200,6 @@ static struct kernfs_ops cgroup_kf_ops = {
|
|||
.seq_show = cgroup_seqfile_show,
|
||||
};
|
||||
|
||||
/* set uid and gid of cgroup dirs and files to that of the creator */
|
||||
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
|
||||
{
|
||||
struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
|
||||
.ia_uid = current_fsuid(),
|
||||
.ia_gid = current_fsgid(), };
|
||||
|
||||
if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
|
||||
gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
|
||||
return 0;
|
||||
|
||||
return kernfs_setattr(kn, &iattr);
|
||||
}
|
||||
|
||||
static void cgroup_file_notify_timer(struct timer_list *timer)
|
||||
{
|
||||
cgroup_file_notify(container_of(timer, struct cgroup_file,
|
||||
|
@ -4136,25 +4212,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
|
|||
char name[CGROUP_FILE_NAME_MAX];
|
||||
struct kernfs_node *kn;
|
||||
struct lock_class_key *key = NULL;
|
||||
int ret;
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
key = &cft->lockdep_key;
|
||||
#endif
|
||||
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
|
||||
cgroup_file_mode(cft),
|
||||
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
|
||||
current_fsuid(), current_fsgid(),
|
||||
0, cft->kf_ops, cft,
|
||||
NULL, key);
|
||||
if (IS_ERR(kn))
|
||||
return PTR_ERR(kn);
|
||||
|
||||
ret = cgroup_kn_set_ugid(kn);
|
||||
if (ret) {
|
||||
kernfs_remove(kn);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (cft->file_offset) {
|
||||
struct cgroup_file *cfile = (void *)css + cft->file_offset;
|
||||
|
||||
|
@ -4302,14 +4371,13 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
|
||||
static void cgroup_rm_cftypes_locked(struct cftype *cfts)
|
||||
{
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
|
||||
list_del(&cfts->node);
|
||||
cgroup_apply_cftypes(cfts, false);
|
||||
cgroup_exit_cftypes(cfts);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -4325,8 +4393,6 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts)
|
|||
*/
|
||||
int cgroup_rm_cftypes(struct cftype *cfts)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (!cfts || cfts[0].name[0] == '\0')
|
||||
return 0;
|
||||
|
||||
|
@ -4334,9 +4400,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
|
|||
return -ENOENT;
|
||||
|
||||
cgroup_lock();
|
||||
ret = cgroup_rm_cftypes_locked(cfts);
|
||||
cgroup_rm_cftypes_locked(cfts);
|
||||
cgroup_unlock();
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -5234,6 +5300,7 @@ static struct cftype cgroup_psi_files[] = {
|
|||
#ifdef CONFIG_PSI
|
||||
{
|
||||
.name = "io.pressure",
|
||||
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
|
||||
.seq_show = cgroup_io_pressure_show,
|
||||
.write = cgroup_io_pressure_write,
|
||||
.poll = cgroup_pressure_poll,
|
||||
|
@ -5241,6 +5308,7 @@ static struct cftype cgroup_psi_files[] = {
|
|||
},
|
||||
{
|
||||
.name = "memory.pressure",
|
||||
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
|
||||
.seq_show = cgroup_memory_pressure_show,
|
||||
.write = cgroup_memory_pressure_write,
|
||||
.poll = cgroup_pressure_poll,
|
||||
|
@ -5248,6 +5316,7 @@ static struct cftype cgroup_psi_files[] = {
|
|||
},
|
||||
{
|
||||
.name = "cpu.pressure",
|
||||
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
|
||||
.seq_show = cgroup_cpu_pressure_show,
|
||||
.write = cgroup_cpu_pressure_write,
|
||||
.poll = cgroup_pressure_poll,
|
||||
|
@ -5256,12 +5325,18 @@ static struct cftype cgroup_psi_files[] = {
|
|||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
{
|
||||
.name = "irq.pressure",
|
||||
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
|
||||
.seq_show = cgroup_irq_pressure_show,
|
||||
.write = cgroup_irq_pressure_write,
|
||||
.poll = cgroup_pressure_poll,
|
||||
.release = cgroup_pressure_release,
|
||||
},
|
||||
#endif
|
||||
{
|
||||
.name = "cgroup.pressure",
|
||||
.seq_show = cgroup_pressure_show,
|
||||
.write = cgroup_pressure_write,
|
||||
},
|
||||
#endif /* CONFIG_PSI */
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
@ -5282,7 +5357,7 @@ static struct cftype cgroup_psi_files[] = {
|
|||
* RCU callback.
|
||||
*
|
||||
* 4. After the grace period, the css can be freed. Implemented in
|
||||
* css_free_work_fn().
|
||||
* css_free_rwork_fn().
|
||||
*
|
||||
* It is actually hairier because both step 2 and 4 require process context
|
||||
* and thus involve punting to css->destroy_work adding two additional
|
||||
|
@ -5526,8 +5601,7 @@ err_free_css:
|
|||
|
||||
/*
|
||||
* The returned cgroup is fully initialized including its control mask, but
|
||||
* it isn't associated with its kernfs_node and doesn't have the control
|
||||
* mask applied.
|
||||
* it doesn't have the control mask applied.
|
||||
*/
|
||||
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
|
||||
umode_t mode)
|
||||
|
@ -5552,7 +5626,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
|
|||
goto out_cancel_ref;
|
||||
|
||||
/* create the directory */
|
||||
kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
|
||||
kn = kernfs_create_dir_ns(parent->kn, name, mode,
|
||||
current_fsuid(), current_fsgid(),
|
||||
cgrp, NULL);
|
||||
if (IS_ERR(kn)) {
|
||||
ret = PTR_ERR(kn);
|
||||
goto out_stat_exit;
|
||||
|
@ -5697,10 +5773,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
|
|||
*/
|
||||
kernfs_get(cgrp->kn);
|
||||
|
||||
ret = cgroup_kn_set_ugid(cgrp->kn);
|
||||
if (ret)
|
||||
goto out_destroy;
|
||||
|
||||
ret = css_populate_dir(&cgrp->self);
|
||||
if (ret)
|
||||
goto out_destroy;
|
||||
|
@ -5853,7 +5925,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
|
|||
/*
|
||||
* Mark @cgrp and the associated csets dead. The former prevents
|
||||
* further task migration and child creation by disabling
|
||||
* cgroup_lock_live_group(). The latter makes the csets ignored by
|
||||
* cgroup_kn_lock_live(). The latter makes the csets ignored by
|
||||
* the migration path.
|
||||
*/
|
||||
cgrp->self.flags &= ~CSS_ONLINE;
|
||||
|
@ -5875,7 +5947,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
|
|||
parent->nr_threaded_children--;
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
|
||||
for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
|
||||
tcgrp->nr_descendants--;
|
||||
tcgrp->nr_dying_descendants++;
|
||||
/*
|
||||
|
@ -6068,8 +6140,8 @@ int __init cgroup_init(void)
|
|||
continue;
|
||||
|
||||
if (cgroup1_ssid_disabled(ssid))
|
||||
printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
|
||||
ss->name);
|
||||
pr_info("Disabling %s control group subsystem in v1 mounts\n",
|
||||
ss->legacy_name);
|
||||
|
||||
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
|
||||
|
||||
|
@ -6201,7 +6273,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
|||
if (!buf)
|
||||
goto out;
|
||||
|
||||
cgroup_lock();
|
||||
rcu_read_lock();
|
||||
spin_lock_irq(&css_set_lock);
|
||||
|
||||
for_each_root(root) {
|
||||
|
@ -6212,6 +6284,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
|||
if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
|
||||
continue;
|
||||
|
||||
cgrp = task_cgroup_from_root(tsk, root);
|
||||
/* The root has already been unmounted. */
|
||||
if (!cgrp)
|
||||
continue;
|
||||
|
||||
seq_printf(m, "%d:", root->hierarchy_id);
|
||||
if (root != &cgrp_dfl_root)
|
||||
for_each_subsys(ss, ssid)
|
||||
|
@ -6222,9 +6299,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
|||
seq_printf(m, "%sname=%s", count ? "," : "",
|
||||
root->name);
|
||||
seq_putc(m, ':');
|
||||
|
||||
cgrp = task_cgroup_from_root(tsk, root);
|
||||
|
||||
/*
|
||||
* On traditional hierarchies, all zombie tasks show up as
|
||||
* belonging to the root cgroup. On the default hierarchy,
|
||||
|
@ -6256,7 +6330,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
|||
retval = 0;
|
||||
out_unlock:
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
cgroup_unlock();
|
||||
rcu_read_unlock();
|
||||
kfree(buf);
|
||||
out:
|
||||
return retval;
|
||||
|
|
|
@ -2562,7 +2562,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
|||
update_partition_sd_lb(cs, old_prs);
|
||||
out_free:
|
||||
free_cpumasks(NULL, &tmp);
|
||||
return 0;
|
||||
return retval;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -2598,9 +2598,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
|||
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
|
||||
return 0;
|
||||
|
||||
if (alloc_cpumasks(NULL, &tmp))
|
||||
return -ENOMEM;
|
||||
|
||||
if (*buf)
|
||||
compute_effective_exclusive_cpumask(trialcs, NULL);
|
||||
|
||||
|
@ -2615,6 +2612,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
|||
if (retval)
|
||||
return retval;
|
||||
|
||||
if (alloc_cpumasks(NULL, &tmp))
|
||||
return -ENOMEM;
|
||||
|
||||
if (old_prs) {
|
||||
if (cpumask_empty(trialcs->effective_xcpus)) {
|
||||
invalidate = true;
|
||||
|
@ -4379,17 +4379,16 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
|
|||
is_empty = cpumask_empty(cs->cpus_allowed) ||
|
||||
nodes_empty(cs->mems_allowed);
|
||||
|
||||
mutex_unlock(&cpuset_mutex);
|
||||
|
||||
/*
|
||||
* Move tasks to the nearest ancestor with execution resources,
|
||||
* This is full cgroup operation which will also call back into
|
||||
* cpuset. Should be done outside any lock.
|
||||
*/
|
||||
if (is_empty)
|
||||
if (is_empty) {
|
||||
mutex_unlock(&cpuset_mutex);
|
||||
remove_tasks_in_empty_cpuset(cs);
|
||||
|
||||
mutex_lock(&cpuset_mutex);
|
||||
mutex_lock(&cpuset_mutex);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -4559,6 +4558,7 @@ unlock:
|
|||
|
||||
/**
|
||||
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
|
||||
* @work: unused
|
||||
*
|
||||
* This function is called after either CPU or memory configuration has
|
||||
* changed and updates cpuset accordingly. The top_cpuset is always
|
||||
|
@ -4941,6 +4941,7 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask)
|
|||
|
||||
/**
|
||||
* cpuset_spread_node() - On which node to begin search for a page
|
||||
* @rotor: round robin rotor
|
||||
*
|
||||
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
|
||||
* tasks in a cpuset with is_spread_page or is_spread_slab set),
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <linux/freezer.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/cpu.h>
|
||||
|
||||
/*
|
||||
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
|
||||
|
@ -65,9 +66,15 @@ static struct freezer *parent_freezer(struct freezer *freezer)
|
|||
bool cgroup_freezing(struct task_struct *task)
|
||||
{
|
||||
bool ret;
|
||||
unsigned int state;
|
||||
|
||||
rcu_read_lock();
|
||||
ret = task_freezer(task)->state & CGROUP_FREEZING;
|
||||
/* Check if the cgroup is still FREEZING, but not FROZEN. The extra
|
||||
* !FROZEN check is required, because the FREEZING bit is not cleared
|
||||
* when the state FROZEN is reached.
|
||||
*/
|
||||
state = task_freezer(task)->state;
|
||||
ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN);
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
|
@ -107,16 +114,18 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
|
|||
struct freezer *freezer = css_freezer(css);
|
||||
struct freezer *parent = parent_freezer(freezer);
|
||||
|
||||
cpus_read_lock();
|
||||
mutex_lock(&freezer_mutex);
|
||||
|
||||
freezer->state |= CGROUP_FREEZER_ONLINE;
|
||||
|
||||
if (parent && (parent->state & CGROUP_FREEZING)) {
|
||||
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
|
||||
atomic_inc(&system_freezing_cnt);
|
||||
static_branch_inc_cpuslocked(&freezer_active);
|
||||
}
|
||||
|
||||
mutex_unlock(&freezer_mutex);
|
||||
cpus_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -131,14 +140,16 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
|
|||
{
|
||||
struct freezer *freezer = css_freezer(css);
|
||||
|
||||
cpus_read_lock();
|
||||
mutex_lock(&freezer_mutex);
|
||||
|
||||
if (freezer->state & CGROUP_FREEZING)
|
||||
atomic_dec(&system_freezing_cnt);
|
||||
static_branch_dec_cpuslocked(&freezer_active);
|
||||
|
||||
freezer->state = 0;
|
||||
|
||||
mutex_unlock(&freezer_mutex);
|
||||
cpus_read_unlock();
|
||||
}
|
||||
|
||||
static void freezer_css_free(struct cgroup_subsys_state *css)
|
||||
|
@ -179,6 +190,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
|
|||
__thaw_task(task);
|
||||
} else {
|
||||
freeze_task(task);
|
||||
|
||||
/* clear FROZEN and propagate upwards */
|
||||
while (freezer && (freezer->state & CGROUP_FROZEN)) {
|
||||
freezer->state &= ~CGROUP_FROZEN;
|
||||
|
@ -271,16 +283,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
|
|||
css_task_iter_start(css, 0, &it);
|
||||
|
||||
while ((task = css_task_iter_next(&it))) {
|
||||
if (freezing(task)) {
|
||||
/*
|
||||
* freezer_should_skip() indicates that the task
|
||||
* should be skipped when determining freezing
|
||||
* completion. Consider it frozen in addition to
|
||||
* the usual frozen condition.
|
||||
*/
|
||||
if (!frozen(task) && !freezer_should_skip(task))
|
||||
goto out_iter_end;
|
||||
}
|
||||
if (freezing(task) && !frozen(task))
|
||||
goto out_iter_end;
|
||||
}
|
||||
|
||||
freezer->state |= CGROUP_FROZEN;
|
||||
|
@ -357,7 +361,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
|
|||
|
||||
if (freeze) {
|
||||
if (!(freezer->state & CGROUP_FREEZING))
|
||||
atomic_inc(&system_freezing_cnt);
|
||||
static_branch_inc_cpuslocked(&freezer_active);
|
||||
freezer->state |= state;
|
||||
freeze_cgroup(freezer);
|
||||
} else {
|
||||
|
@ -366,9 +370,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
|
|||
freezer->state &= ~state;
|
||||
|
||||
if (!(freezer->state & CGROUP_FREEZING)) {
|
||||
if (was_freezing)
|
||||
atomic_dec(&system_freezing_cnt);
|
||||
freezer->state &= ~CGROUP_FROZEN;
|
||||
if (was_freezing)
|
||||
static_branch_dec_cpuslocked(&freezer_active);
|
||||
unfreeze_cgroup(freezer);
|
||||
}
|
||||
}
|
||||
|
@ -386,6 +390,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
|
|||
{
|
||||
struct cgroup_subsys_state *pos;
|
||||
|
||||
cpus_read_lock();
|
||||
/*
|
||||
* Update all its descendants in pre-order traversal. Each
|
||||
* descendant will try to inherit its parent's FREEZING state as
|
||||
|
@ -414,6 +419,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
|
|||
}
|
||||
rcu_read_unlock();
|
||||
mutex_unlock(&freezer_mutex);
|
||||
cpus_read_unlock();
|
||||
}
|
||||
|
||||
static ssize_t freezer_write(struct kernfs_open_file *of,
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
#include <linux/misc_cgroup.h>
|
||||
|
||||
#define MAX_STR "max"
|
||||
#define MAX_NUM ULONG_MAX
|
||||
#define MAX_NUM U64_MAX
|
||||
|
||||
/* Miscellaneous res name, keep it in sync with enum misc_res_type */
|
||||
static const char *const misc_res_name[] = {
|
||||
|
@ -37,7 +37,7 @@ static struct misc_cg root_cg;
|
|||
* more than the actual capacity. We are using Limits resource distribution
|
||||
* model of cgroup for miscellaneous controller.
|
||||
*/
|
||||
static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
|
||||
static u64 misc_res_capacity[MISC_CG_RES_TYPES];
|
||||
|
||||
/**
|
||||
* parent_misc() - Get the parent of the passed misc cgroup.
|
||||
|
@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type)
|
|||
* Context: Any context.
|
||||
* Return: Current total usage of the resource.
|
||||
*/
|
||||
unsigned long misc_cg_res_total_usage(enum misc_res_type type)
|
||||
u64 misc_cg_res_total_usage(enum misc_res_type type)
|
||||
{
|
||||
if (valid_type(type))
|
||||
return atomic_long_read(&root_cg.res[type].usage);
|
||||
return atomic64_read(&root_cg.res[type].usage);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
|
|||
* * %0 - Successfully registered the capacity.
|
||||
* * %-EINVAL - If @type is invalid.
|
||||
*/
|
||||
int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
|
||||
int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
|
||||
{
|
||||
if (!valid_type(type))
|
||||
return -EINVAL;
|
||||
|
@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
|
|||
* Context: Any context.
|
||||
*/
|
||||
static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
|
||||
unsigned long amount)
|
||||
u64 amount)
|
||||
{
|
||||
WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
|
||||
WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage),
|
||||
"misc cgroup resource %s became less than 0",
|
||||
misc_res_name[type]);
|
||||
}
|
||||
|
@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
|
|||
* * -EBUSY - If max limit will be crossed or total usage will be more than the
|
||||
* capacity.
|
||||
*/
|
||||
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
|
||||
unsigned long amount)
|
||||
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
|
||||
{
|
||||
struct misc_cg *i, *j;
|
||||
int ret;
|
||||
struct misc_res *res;
|
||||
int new_usage;
|
||||
u64 new_usage;
|
||||
|
||||
if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
|
||||
return -EINVAL;
|
||||
|
@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
|
|||
for (i = cg; i; i = parent_misc(i)) {
|
||||
res = &i->res[type];
|
||||
|
||||
new_usage = atomic_long_add_return(amount, &res->usage);
|
||||
new_usage = atomic64_add_return(amount, &res->usage);
|
||||
if (new_usage > READ_ONCE(res->max) ||
|
||||
new_usage > READ_ONCE(misc_res_capacity[type])) {
|
||||
ret = -EBUSY;
|
||||
|
@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
|
|||
|
||||
err_charge:
|
||||
for (j = i; j; j = parent_misc(j)) {
|
||||
atomic_long_inc(&j->res[type].events);
|
||||
atomic64_inc(&j->res[type].events);
|
||||
cgroup_file_notify(&j->events_file);
|
||||
}
|
||||
|
||||
|
@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge);
|
|||
*
|
||||
* Context: Any context.
|
||||
*/
|
||||
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
|
||||
unsigned long amount)
|
||||
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
|
||||
{
|
||||
struct misc_cg *i;
|
||||
|
||||
|
@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
|
|||
{
|
||||
int i;
|
||||
struct misc_cg *cg = css_misc(seq_css(sf));
|
||||
unsigned long max;
|
||||
u64 max;
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
if (READ_ONCE(misc_res_capacity[i])) {
|
||||
|
@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
|
|||
if (max == MAX_NUM)
|
||||
seq_printf(sf, "%s max\n", misc_res_name[i]);
|
||||
else
|
||||
seq_printf(sf, "%s %lu\n", misc_res_name[i],
|
||||
seq_printf(sf, "%s %llu\n", misc_res_name[i],
|
||||
max);
|
||||
}
|
||||
}
|
||||
|
@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
|
|||
* Return:
|
||||
* * >= 0 - Number of bytes processed in the input.
|
||||
* * -EINVAL - If buf is not valid.
|
||||
* * -ERANGE - If number is bigger than the unsigned long capacity.
|
||||
* * -ERANGE - If number is bigger than the u64 capacity.
|
||||
*/
|
||||
static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
|
||||
size_t nbytes, loff_t off)
|
||||
{
|
||||
struct misc_cg *cg;
|
||||
unsigned long max;
|
||||
u64 max;
|
||||
int ret = 0, i;
|
||||
enum misc_res_type type = MISC_CG_RES_TYPES;
|
||||
char *token;
|
||||
|
@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
|
|||
if (!strcmp(MAX_STR, buf)) {
|
||||
max = MAX_NUM;
|
||||
} else {
|
||||
ret = kstrtoul(buf, 0, &max);
|
||||
ret = kstrtou64(buf, 0, &max);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
|
|||
static int misc_cg_current_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
int i;
|
||||
unsigned long usage;
|
||||
u64 usage;
|
||||
struct misc_cg *cg = css_misc(seq_css(sf));
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
usage = atomic_long_read(&cg->res[i].usage);
|
||||
usage = atomic64_read(&cg->res[i].usage);
|
||||
if (READ_ONCE(misc_res_capacity[i]) || usage)
|
||||
seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
|
||||
seq_printf(sf, "%s %llu\n", misc_res_name[i], usage);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
|
|||
static int misc_cg_capacity_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
int i;
|
||||
unsigned long cap;
|
||||
u64 cap;
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
cap = READ_ONCE(misc_res_capacity[i]);
|
||||
if (cap)
|
||||
seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
|
||||
seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
|
|||
static int misc_events_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct misc_cg *cg = css_misc(seq_css(sf));
|
||||
unsigned long events, i;
|
||||
u64 events;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
events = atomic_long_read(&cg->res[i].events);
|
||||
events = atomic64_read(&cg->res[i].events);
|
||||
if (READ_ONCE(misc_res_capacity[i]) || events)
|
||||
seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
|
||||
seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -397,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
|
|||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
WRITE_ONCE(cg->res[i].max, MAX_NUM);
|
||||
atomic_long_set(&cg->res[i].usage, 0);
|
||||
atomic64_set(&cg->res[i].usage, 0);
|
||||
}
|
||||
|
||||
return &cg->css;
|
||||
|
|
|
@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = {
|
|||
.install = cgroupns_install,
|
||||
.owner = cgroupns_owner,
|
||||
};
|
||||
|
||||
static __init int cgroup_namespaces_init(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(cgroup_namespaces_init);
|
||||
|
|
|
@ -399,6 +399,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
|
|||
{
|
||||
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
|
||||
struct cgroup *parent = cgroup_parent(cgrp);
|
||||
struct cgroup_rstat_cpu *prstatc;
|
||||
struct cgroup_base_stat delta;
|
||||
unsigned seq;
|
||||
|
||||
|
@ -412,17 +413,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
|
|||
delta = rstatc->bstat;
|
||||
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
|
||||
|
||||
/* propagate percpu delta to global */
|
||||
/* propagate per-cpu delta to cgroup and per-cpu global statistics */
|
||||
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
|
||||
cgroup_base_stat_add(&cgrp->bstat, &delta);
|
||||
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
|
||||
cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
|
||||
|
||||
/* propagate global delta to parent (unless that's root) */
|
||||
/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
|
||||
if (cgroup_parent(parent)) {
|
||||
delta = cgrp->bstat;
|
||||
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
|
||||
cgroup_base_stat_add(&parent->bstat, &delta);
|
||||
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
|
||||
|
||||
delta = rstatc->subtree_bstat;
|
||||
prstatc = cgroup_rstat_cpu(parent, cpu);
|
||||
cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
|
||||
cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
|
||||
cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -405,10 +405,10 @@ static void coredump_task_exit(struct task_struct *tsk)
|
|||
complete(&core_state->startup);
|
||||
|
||||
for (;;) {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
|
||||
if (!self.task) /* see coredump_finish() */
|
||||
break;
|
||||
freezable_schedule();
|
||||
schedule();
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
}
|
||||
|
|
|
@ -1527,13 +1527,12 @@ static void complete_vfork_done(struct task_struct *tsk)
|
|||
static int wait_for_vfork_done(struct task_struct *child,
|
||||
struct completion *vfork)
|
||||
{
|
||||
unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
|
||||
int killed;
|
||||
|
||||
freezer_do_not_count();
|
||||
cgroup_enter_frozen();
|
||||
killed = wait_for_completion_killable(vfork);
|
||||
killed = wait_for_completion_state(vfork, state);
|
||||
cgroup_leave_frozen(false);
|
||||
freezer_count();
|
||||
|
||||
if (killed) {
|
||||
task_lock(child);
|
||||
|
|
133
kernel/freezer.c
133
kernel/freezer.c
|
@ -13,10 +13,11 @@
|
|||
#include <linux/kthread.h>
|
||||
|
||||
/* total number of freezing conditions in effect */
|
||||
atomic_t system_freezing_cnt = ATOMIC_INIT(0);
|
||||
EXPORT_SYMBOL(system_freezing_cnt);
|
||||
DEFINE_STATIC_KEY_FALSE(freezer_active);
|
||||
EXPORT_SYMBOL(freezer_active);
|
||||
|
||||
/* indicate whether PM freezing is in effect, protected by
|
||||
/*
|
||||
* indicate whether PM freezing is in effect, protected by
|
||||
* system_transition_mutex
|
||||
*/
|
||||
bool pm_freezing;
|
||||
|
@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock);
|
|||
* freezing_slow_path - slow path for testing whether a task needs to be frozen
|
||||
* @p: task to be tested
|
||||
*
|
||||
* This function is called by freezing() if system_freezing_cnt isn't zero
|
||||
* This function is called by freezing() if freezer_active isn't zero
|
||||
* and tests whether @p needs to enter and stay in frozen state. Can be
|
||||
* called under any context. The freezers are responsible for ensuring the
|
||||
* target tasks see the updated state.
|
||||
|
@ -52,41 +53,40 @@ bool freezing_slow_path(struct task_struct *p)
|
|||
}
|
||||
EXPORT_SYMBOL(freezing_slow_path);
|
||||
|
||||
bool frozen(struct task_struct *p)
|
||||
{
|
||||
return READ_ONCE(p->__state) & TASK_FROZEN;
|
||||
}
|
||||
|
||||
/* Refrigerator is place where frozen processes are stored :-). */
|
||||
bool __refrigerator(bool check_kthr_stop)
|
||||
{
|
||||
/* Hmm, should we be allowed to suspend when there are realtime
|
||||
processes around? */
|
||||
unsigned int state = get_current_state();
|
||||
bool was_frozen = false;
|
||||
unsigned int save = get_current_state();
|
||||
|
||||
pr_debug("%s entered refrigerator\n", current->comm);
|
||||
|
||||
WARN_ON_ONCE(state && !(state & TASK_NORMAL));
|
||||
|
||||
for (;;) {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
bool freeze;
|
||||
|
||||
set_current_state(TASK_FROZEN);
|
||||
|
||||
spin_lock_irq(&freezer_lock);
|
||||
current->flags |= PF_FROZEN;
|
||||
if (!freezing(current) ||
|
||||
(check_kthr_stop && kthread_should_stop()))
|
||||
current->flags &= ~PF_FROZEN;
|
||||
freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
|
||||
spin_unlock_irq(&freezer_lock);
|
||||
|
||||
if (!(current->flags & PF_FROZEN))
|
||||
if (!freeze)
|
||||
break;
|
||||
|
||||
was_frozen = true;
|
||||
schedule();
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
pr_debug("%s left refrigerator\n", current->comm);
|
||||
|
||||
/*
|
||||
* Restore saved task state before returning. The mb'd version
|
||||
* needs to be used; otherwise, it might silently break
|
||||
* synchronization which depends on ordered task state change.
|
||||
*/
|
||||
set_current_state(save);
|
||||
|
||||
return was_frozen;
|
||||
}
|
||||
EXPORT_SYMBOL(__refrigerator);
|
||||
|
@ -101,6 +101,44 @@ static void fake_signal_wake_up(struct task_struct *p)
|
|||
}
|
||||
}
|
||||
|
||||
static int __set_task_frozen(struct task_struct *p, void *arg)
|
||||
{
|
||||
unsigned int state = READ_ONCE(p->__state);
|
||||
|
||||
if (p->on_rq)
|
||||
return 0;
|
||||
|
||||
if (p != current && task_curr(p))
|
||||
return 0;
|
||||
|
||||
if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
|
||||
* can suffer spurious wakeups.
|
||||
*/
|
||||
if (state & TASK_FREEZABLE)
|
||||
WARN_ON_ONCE(!(state & TASK_NORMAL));
|
||||
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
/*
|
||||
* It's dangerous to freeze with locks held; there be dragons there.
|
||||
*/
|
||||
if (!(state & __TASK_FREEZABLE_UNSAFE))
|
||||
WARN_ON_ONCE(debug_locks && p->lockdep_depth);
|
||||
#endif
|
||||
|
||||
WRITE_ONCE(p->__state, TASK_FROZEN);
|
||||
return TASK_FROZEN;
|
||||
}
|
||||
|
||||
static bool __freeze_task(struct task_struct *p)
|
||||
{
|
||||
/* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */
|
||||
return task_call_func(p, __set_task_frozen, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* freeze_task - send a freeze request to given task
|
||||
* @p: task to send the request to
|
||||
|
@ -116,20 +154,8 @@ bool freeze_task(struct task_struct *p)
|
|||
{
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* This check can race with freezer_do_not_count, but worst case that
|
||||
* will result in an extra wakeup being sent to the task. It does not
|
||||
* race with freezer_count(), the barriers in freezer_count() and
|
||||
* freezer_should_skip() ensure that either freezer_count() sees
|
||||
* freezing == true in try_to_freeze() and freezes, or
|
||||
* freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
|
||||
* normally.
|
||||
*/
|
||||
if (freezer_should_skip(p))
|
||||
return false;
|
||||
|
||||
spin_lock_irqsave(&freezer_lock, flags);
|
||||
if (!freezing(p) || frozen(p)) {
|
||||
if (!freezing(p) || frozen(p) || __freeze_task(p)) {
|
||||
spin_unlock_irqrestore(&freezer_lock, flags);
|
||||
return false;
|
||||
}
|
||||
|
@ -137,19 +163,52 @@ bool freeze_task(struct task_struct *p)
|
|||
if (!(p->flags & PF_KTHREAD))
|
||||
fake_signal_wake_up(p);
|
||||
else
|
||||
wake_up_state(p, TASK_INTERRUPTIBLE);
|
||||
wake_up_state(p, TASK_NORMAL);
|
||||
|
||||
spin_unlock_irqrestore(&freezer_lock, flags);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
|
||||
* state in p->jobctl. If either of them got a wakeup that was missed because
|
||||
* TASK_FROZEN, then their canonical state reflects that and the below will
|
||||
* refuse to restore the special state and instead issue the wakeup.
|
||||
*/
|
||||
static int __set_task_special(struct task_struct *p, void *arg)
|
||||
{
|
||||
unsigned int state = 0;
|
||||
|
||||
if (p->jobctl & JOBCTL_TRACED)
|
||||
state = TASK_TRACED;
|
||||
|
||||
else if (p->jobctl & JOBCTL_STOPPED)
|
||||
state = TASK_STOPPED;
|
||||
|
||||
if (state)
|
||||
WRITE_ONCE(p->__state, state);
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
void __thaw_task(struct task_struct *p)
|
||||
{
|
||||
unsigned long flags;
|
||||
unsigned long flags, flags2;
|
||||
|
||||
spin_lock_irqsave(&freezer_lock, flags);
|
||||
if (frozen(p))
|
||||
wake_up_process(p);
|
||||
if (WARN_ON_ONCE(freezing(p)))
|
||||
goto unlock;
|
||||
|
||||
if (lock_task_sighand(p, &flags2)) {
|
||||
/* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
|
||||
bool ret = task_call_func(p, __set_task_special, NULL);
|
||||
unlock_task_sighand(p, &flags2);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
wake_up_state(p, TASK_FROZEN);
|
||||
unlock:
|
||||
spin_unlock_irqrestore(&freezer_lock, flags);
|
||||
}
|
||||
|
||||
|
|
|
@@ -348,7 +348,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
	 * futex_queue() calls spin_unlock() upon completion, both serializing
	 * access to the hash list and forcing another memory barrier.
	 */
	set_current_state(TASK_INTERRUPTIBLE);
	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
	futex_queue(q, hb);

	/* Arm the timer */
@@ -366,7 +366,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
		 * is no timeout, or if it has yet to expire.
		 */
		if (!timeout || timeout->task)
			freezable_schedule();
			schedule();
	}
	__set_current_state(TASK_RUNNING);
}
@@ -444,7 +444,7 @@ retry:
			return ret;
	}

	set_current_state(TASK_INTERRUPTIBLE);
	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);

	for (i = 0; i < count; i++) {
		u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
@@ -518,7 +518,7 @@ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
			return;
		}

		freezable_schedule();
		schedule();
	}

/**

@@ -95,8 +95,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
	 * Ensure the task is not frozen.
	 * Also, skip vfork and any other user process that freezer should skip.
	 */
	if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
		return;
	if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
		return;

	/*
	 * When a freshly created task is scheduled once, changes its state to
@@ -191,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
	hung_task_show_lock = false;
	rcu_read_lock();
	for_each_process_thread(g, t) {
		unsigned int state;

		if (!max_count--)
			goto unlock;
		if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
@@ -198,8 +200,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
				goto unlock;
			last_break = jiffies;
		}
		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
		if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE)
		/*
		 * skip the TASK_KILLABLE tasks -- these can be killed
		 * skip the TASK_IDLE tasks -- those are genuinely idle
		 */
		state = READ_ONCE(t->__state);
		if ((state & TASK_UNINTERRUPTIBLE) &&
		    !(state & TASK_WAKEKILL) &&
		    !(state & TASK_NOLOAD))
			check_hung_task(t, timeout);
	}
 unlock:
|
|
@ -25,7 +25,7 @@
|
|||
unsigned int lock_system_sleep(void)
|
||||
{
|
||||
unsigned int flags = current->flags;
|
||||
current->flags |= PF_FREEZER_SKIP;
|
||||
current->flags |= PF_NOFREEZE;
|
||||
mutex_lock(&system_transition_mutex);
|
||||
return flags;
|
||||
}
|
||||
|
@ -33,24 +33,8 @@ EXPORT_SYMBOL_GPL(lock_system_sleep);
|
|||
|
||||
void unlock_system_sleep(unsigned int flags)
|
||||
{
|
||||
/*
|
||||
* Don't use freezer_count() because we don't want the call to
|
||||
* try_to_freeze() here.
|
||||
*
|
||||
* Reason:
|
||||
* Fundamentally, we just don't need it, because freezing condition
|
||||
* doesn't come into effect until we release the
|
||||
* system_transition_mutex lock, since the freezer always works with
|
||||
* system_transition_mutex held.
|
||||
*
|
||||
* More importantly, in the case of hibernation,
|
||||
* unlock_system_sleep() gets called in snapshot_read() and
|
||||
* snapshot_write() when the freezing condition is still in effect.
|
||||
* Which means, if we use try_to_freeze() here, it would make them
|
||||
* enter the refrigerator, thus causing hibernation to lockup.
|
||||
*/
|
||||
if (!(flags & PF_FREEZER_SKIP))
|
||||
current->flags &= ~PF_FREEZER_SKIP;
|
||||
if (!(flags & PF_NOFREEZE))
|
||||
current->flags &= ~PF_NOFREEZE;
|
||||
mutex_unlock(&system_transition_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unlock_system_sleep);
|
||||
|
|
|
@ -53,8 +53,7 @@ static int try_to_freeze_tasks(bool user_only)
|
|||
if (p == current || !freeze_task(p))
|
||||
continue;
|
||||
|
||||
if (!freezer_should_skip(p))
|
||||
todo++;
|
||||
todo++;
|
||||
}
|
||||
read_unlock(&tasklist_lock);
|
||||
|
||||
|
@ -99,8 +98,7 @@ static int try_to_freeze_tasks(bool user_only)
|
|||
if (!wakeup || pm_debug_messages_on) {
|
||||
read_lock(&tasklist_lock);
|
||||
for_each_process_thread(g, p) {
|
||||
if (p != current && !freezer_should_skip(p)
|
||||
&& freezing(p) && !frozen(p))
|
||||
if (p != current && freezing(p) && !frozen(p))
|
||||
sched_show_task(p);
|
||||
}
|
||||
read_unlock(&tasklist_lock);
|
||||
|
@ -132,7 +130,7 @@ int freeze_processes(void)
|
|||
current->flags |= PF_SUSPEND_TASK;
|
||||
|
||||
if (!pm_freezing)
|
||||
atomic_inc(&system_freezing_cnt);
|
||||
static_branch_inc(&freezer_active);
|
||||
|
||||
pm_wakeup_clear(0);
|
||||
pr_info("Freezing user space processes ... ");
|
||||
|
@ -193,7 +191,7 @@ void thaw_processes(void)
|
|||
|
||||
trace_suspend_resume(TPS("thaw_processes"), 0, true);
|
||||
if (pm_freezing)
|
||||
atomic_dec(&system_freezing_cnt);
|
||||
static_branch_dec(&freezer_active);
|
||||
pm_freezing = false;
|
||||
pm_nosig_freezing = false;
|
||||
|
||||
|
|
|
@ -269,7 +269,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
|
|||
read_unlock(&tasklist_lock);
|
||||
|
||||
if (!ret && !ignore_state &&
|
||||
WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
|
||||
WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN)))
|
||||
ret = -ESRCH;
|
||||
|
||||
return ret;
|
||||
|
|
|
@ -4309,7 +4309,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
|||
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
|
||||
* __schedule(). See the comment for smp_mb__after_spinlock().
|
||||
*
|
||||
* A similar smb_rmb() lives in try_invoke_on_locked_down_task().
|
||||
* A similar smp_rmb() lives in __task_needs_rq_lock().
|
||||
*/
|
||||
smp_rmb();
|
||||
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
|
||||
|
@ -4407,6 +4407,40 @@ out:
|
|||
return success;
|
||||
}
|
||||
|
||||
static bool __task_needs_rq_lock(struct task_struct *p)
|
||||
{
|
||||
unsigned int state = READ_ONCE(p->__state);
|
||||
|
||||
/*
|
||||
* Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
|
||||
* the task is blocked. Make sure to check @state since ttwu() can drop
|
||||
* locks at the end, see ttwu_queue_wakelist().
|
||||
*/
|
||||
if (state == TASK_RUNNING || state == TASK_WAKING)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* Ensure we load p->on_rq after p->__state, otherwise it would be
|
||||
* possible to, falsely, observe p->on_rq == 0.
|
||||
*
|
||||
* See try_to_wake_up() for a longer comment.
|
||||
*/
|
||||
smp_rmb();
|
||||
if (p->on_rq)
|
||||
return true;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* Ensure the task has finished __schedule() and will not be referenced
|
||||
* anymore. Again, see try_to_wake_up() for a longer comment.
|
||||
*/
|
||||
smp_rmb();
|
||||
smp_cond_load_acquire(&p->on_cpu, !VAL);
|
||||
#endif
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* task_call_func - Invoke a function on task in fixed state
|
||||
* @p: Process for which the function is to be invoked, can be @current.
|
||||
|
@ -4424,28 +4458,12 @@ out:
|
|||
int task_call_func(struct task_struct *p, task_call_f func, void *arg)
|
||||
{
|
||||
struct rq *rq = NULL;
|
||||
unsigned int state;
|
||||
struct rq_flags rf;
|
||||
int ret;
|
||||
|
||||
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
|
||||
|
||||
state = READ_ONCE(p->__state);
|
||||
|
||||
/*
|
||||
* Ensure we load p->on_rq after p->__state, otherwise it would be
|
||||
* possible to, falsely, observe p->on_rq == 0.
|
||||
*
|
||||
* See try_to_wake_up() for a longer comment.
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
/*
|
||||
* Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
|
||||
* the task is blocked. Make sure to check @state since ttwu() can drop
|
||||
* locks at the end, see ttwu_queue_wakelist().
|
||||
*/
|
||||
if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq)
|
||||
if (__task_needs_rq_lock(p))
|
||||
rq = __task_rq_lock(p, &rf);
|
||||
|
||||
/*
|
||||
|
@ -6654,7 +6672,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
|
|||
prev->sched_contributes_to_load =
|
||||
(prev_state & TASK_UNINTERRUPTIBLE) &&
|
||||
!(prev_state & TASK_NOLOAD) &&
|
||||
!(prev->flags & PF_FROZEN);
|
||||
!(prev_state & TASK_FROZEN);
|
||||
|
||||
if (prev->sched_contributes_to_load)
|
||||
rq->nr_uninterruptible++;
|
||||
|
@ -9270,7 +9288,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p)
|
|||
* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
|
||||
* TASK_KILLABLE).
|
||||
*/
|
||||
if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
|
||||
if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
|
|
|
@ -159,7 +159,6 @@ __setup("psi=", setup_psi);
|
|||
#define EXP_300s 2034 /* 1/exp(2s/300s) */
|
||||
|
||||
/* PSI trigger definitions */
|
||||
#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
|
||||
#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
|
||||
#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
|
||||
|
||||
|
@ -180,6 +179,7 @@ static void group_init(struct psi_group *group)
|
|||
{
|
||||
int cpu;
|
||||
|
||||
group->enabled = true;
|
||||
for_each_possible_cpu(cpu)
|
||||
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
|
||||
group->avg_last_update = sched_clock();
|
||||
|
@ -483,8 +483,12 @@ static void update_triggers(struct psi_group *group, u64 now,
|
|||
continue;
|
||||
|
||||
/* Generate an event */
|
||||
if (cmpxchg(&t->event, 0, 1) == 0)
|
||||
wake_up_interruptible(&t->event_wait);
|
||||
if (cmpxchg(&t->event, 0, 1) == 0) {
|
||||
if (t->of)
|
||||
kernfs_notify(t->of->kn);
|
||||
else
|
||||
wake_up_interruptible(&t->event_wait);
|
||||
}
|
||||
t->last_event_time = now;
|
||||
/* Reset threshold breach flag once event got generated */
|
||||
t->pending_event = false;
|
||||
|
@ -771,17 +775,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
|||
groupc = per_cpu_ptr(group->pcpu, cpu);
|
||||
|
||||
/*
|
||||
* First we assess the aggregate resource states this CPU's
|
||||
* tasks have been in since the last change, and account any
|
||||
* SOME and FULL time these may have resulted in.
|
||||
*
|
||||
* Then we update the task counts according to the state
|
||||
* First we update the task counts according to the state
|
||||
* change requested through the @clear and @set bits.
|
||||
*
|
||||
* Then if the cgroup PSI stats accounting enabled, we
|
||||
* assess the aggregate resource states this CPU's tasks
|
||||
* have been in since the last change, and account any
|
||||
* SOME and FULL time these may have resulted in.
|
||||
*/
|
||||
write_seqcount_begin(&groupc->seq);
|
||||
|
||||
record_times(groupc, now);
|
||||
|
||||
/*
|
||||
* Start with TSK_ONCPU, which doesn't have a corresponding
|
||||
* task count - it's just a boolean flag directly encoded in
|
||||
|
@ -820,6 +823,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
|||
if (set & (1 << t))
|
||||
groupc->tasks[t]++;
|
||||
|
||||
if (!group->enabled) {
|
||||
/*
|
||||
* On the first group change after disabling PSI, conclude
|
||||
* the current state and flush its time. This is unlikely
|
||||
* to matter to the user, but aggregation (get_recent_times)
|
||||
* may have already incorporated the live state into times_prev;
|
||||
* avoid a delta sample underflow when PSI is later re-enabled.
|
||||
*/
|
||||
if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
|
||||
record_times(groupc, now);
|
||||
|
||||
groupc->state_mask = state_mask;
|
||||
|
||||
write_seqcount_end(&groupc->seq);
|
||||
return;
|
||||
}
|
||||
|
||||
for (s = 0; s < NR_PSI_STATES; s++) {
|
||||
if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
|
||||
state_mask |= (1 << s);
|
||||
|
@ -836,6 +856,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
|||
if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
|
||||
state_mask |= (1 << PSI_MEM_FULL);
|
||||
|
||||
record_times(groupc, now);
|
||||
|
||||
groupc->state_mask = state_mask;
|
||||
|
||||
write_seqcount_end(&groupc->seq);
|
||||
|
@ -985,6 +1007,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
|
|||
|
||||
group = task_psi_group(task);
|
||||
do {
|
||||
if (!group->enabled)
|
||||
continue;
|
||||
|
||||
groupc = per_cpu_ptr(group->pcpu, cpu);
|
||||
|
||||
write_seqcount_begin(&groupc->seq);
|
||||
|
@ -1160,6 +1185,40 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
|
|||
|
||||
task_rq_unlock(rq, task, &rf);
|
||||
}
|
||||
|
||||
void psi_cgroup_restart(struct psi_group *group)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
/*
|
||||
* After we disable psi_group->enabled, we don't actually
|
||||
* stop percpu tasks accounting in each psi_group_cpu,
|
||||
* instead only stop test_state() loop, record_times()
|
||||
* and averaging worker, see psi_group_change() for details.
|
||||
*
|
||||
* When disable cgroup PSI, this function has nothing to sync
|
||||
* since cgroup pressure files are hidden and percpu psi_group_cpu
|
||||
* would see !psi_group->enabled and only do task accounting.
|
||||
*
|
||||
* When re-enable cgroup PSI, this function use psi_group_change()
|
||||
* to get correct state mask from test_state() loop on tasks[],
|
||||
* and restart groupc->state_start from now, use .clear = .set = 0
|
||||
* here since no task status really changed.
|
||||
*/
|
||||
if (!group->enabled)
|
||||
return;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
struct rq_flags rf;
|
||||
u64 now;
|
||||
|
||||
rq_lock_irq(rq, &rf);
|
||||
now = cpu_clock(cpu);
|
||||
psi_group_change(group, cpu, 0, 0, now, true);
|
||||
rq_unlock_irq(rq, &rf);
|
||||
}
|
||||
}
|
||||
#endif /* CONFIG_CGROUPS */
|
||||
|
||||
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
|
||||
|
@ -1237,8 +1296,9 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
|
|||
return single_open(file, psi_cpu_show, NULL);
|
||||
}
|
||||
|
||||
struct psi_trigger *psi_trigger_create(struct psi_group *group,
|
||||
char *buf, enum psi_res res, struct file *file)
|
||||
struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
|
||||
enum psi_res res, struct file *file,
|
||||
struct kernfs_open_file *of)
|
||||
{
|
||||
struct psi_trigger *t;
|
||||
enum psi_states state;
|
||||
|
@ -1270,8 +1330,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
|
|||
if (state >= PSI_NONIDLE)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
if (window_us < WINDOW_MIN_US ||
|
||||
window_us > WINDOW_MAX_US)
|
||||
if (window_us == 0 || window_us > WINDOW_MAX_US)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
/*
|
||||
|
@ -1297,7 +1356,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
|
|||
|
||||
t->event = 0;
|
||||
t->last_event_time = 0;
|
||||
init_waitqueue_head(&t->event_wait);
|
||||
t->of = of;
|
||||
if (!of)
|
||||
init_waitqueue_head(&t->event_wait);
|
||||
t->pending_event = false;
|
||||
t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
|
||||
|
||||
|
@ -1354,7 +1415,10 @@ void psi_trigger_destroy(struct psi_trigger *t)
|
|||
* being accessed later. Can happen if cgroup is deleted from under a
|
||||
* polling process.
|
||||
*/
|
||||
wake_up_pollfree(&t->event_wait);
|
||||
if (t->of)
|
||||
kernfs_notify(t->of->kn);
|
||||
else
|
||||
wake_up_interruptible(&t->event_wait);
|
||||
|
||||
if (t->aggregator == PSI_AVGS) {
|
||||
mutex_lock(&group->avgs_lock);
|
||||
|
@ -1426,7 +1490,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
|
|||
if (!t)
|
||||
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
|
||||
|
||||
poll_wait(file, &t->event_wait, wait);
|
||||
if (t->of)
|
||||
kernfs_generic_poll(t->of, wait);
|
||||
else
|
||||
poll_wait(file, &t->event_wait, wait);
|
||||
|
||||
if (cmpxchg(&t->event, 1, 0) == 1)
|
||||
ret |= EPOLLPRI;
|
||||
|
@ -1465,7 +1532,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
|
|||
return -EBUSY;
|
||||
}
|
||||
|
||||
new = psi_trigger_create(&psi_system, buf, res, file);
|
||||
new = psi_trigger_create(&psi_system, buf, res, file, NULL);
|
||||
if (IS_ERR(new)) {
|
||||
mutex_unlock(&seq->lock);
|
||||
return PTR_ERR(new);
|
||||
|
|
|
@ -2294,7 +2294,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
|
|||
cgroup_enter_frozen();
|
||||
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
|
||||
preempt_enable_no_resched();
|
||||
freezable_schedule();
|
||||
schedule();
|
||||
cgroup_leave_frozen(true);
|
||||
|
||||
/*
|
||||
|
@ -2463,7 +2463,7 @@ static bool do_signal_stop(int signr)
|
|||
|
||||
/* Now we don't run again until woken by SIGCONT or SIGKILL */
|
||||
cgroup_enter_frozen();
|
||||
freezable_schedule();
|
||||
schedule();
|
||||
return true;
|
||||
} else {
|
||||
/*
|
||||
|
@ -2538,11 +2538,11 @@ static void do_freezer_trap(void)
|
|||
* immediately (if there is a non-fatal signal pending), and
|
||||
* put the task into sleep.
|
||||
*/
|
||||
__set_current_state(TASK_INTERRUPTIBLE);
|
||||
__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
|
||||
clear_thread_flag(TIF_SIGPENDING);
|
||||
	spin_unlock_irq(&current->sighand->siglock);
|
||||
cgroup_enter_frozen();
|
||||
freezable_schedule();
|
||||
schedule();
|
||||
}
|
||||
|
||||
static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
|
||||
|
@ -3587,9 +3587,9 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
|
|||
recalc_sigpending();
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
|
||||
__set_current_state(TASK_INTERRUPTIBLE);
|
||||
ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
|
||||
HRTIMER_MODE_REL);
|
||||
__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
|
||||
ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns,
|
||||
HRTIMER_MODE_REL);
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
__set_task_blocked(tsk, &tsk->real_blocked);
|
||||
sigemptyset(&tsk->real_blocked);
|
||||
|
|
|
@@ -2038,11 +2038,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t)
	struct restart_block *restart;

	do {
		set_current_state(TASK_INTERRUPTIBLE);
		set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
		hrtimer_sleeper_start_expires(t, t->mode);

		if (likely(t->task))
			freezable_schedule();
			schedule();

		hrtimer_cancel(&t->timer);
		t->mode = HRTIMER_MODE_ABS;

kernel/umh.c

@@ -404,6 +404,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
 */
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
	unsigned int state = TASK_UNINTERRUPTIBLE;
	DECLARE_COMPLETION_ONSTACK(done);
	int retval = 0;

@@ -438,23 +439,26 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
		goto unlock;

	if (wait & UMH_FREEZABLE)
		freezer_do_not_count();
		state |= TASK_FREEZABLE;

	if (wait & UMH_KILLABLE) {
		retval = wait_for_completion_killable(&done);
		retval = wait_for_completion_state(&done, state | TASK_KILLABLE);
		if (!retval)
			goto wait_done;

		/* umh_complete() will see NULL and free sub_info */
		if (xchg(&sub_info->complete, NULL))
			goto unlock;
		/* fallthrough, umh_complete() was already called */

		/*
		 * fallthrough; in case of -ERESTARTSYS now do uninterruptible
		 * wait_for_completion_state(). Since umh_complete() shall call
		 * complete() in a moment if xchg() above returned NULL, this
		 * uninterruptible wait_for_completion_state() will not block
		 * SIGKILL'ed processes for long.
		 */
	}

	wait_for_completion(&done);

	if (wait & UMH_FREEZABLE)
		freezer_count();
	wait_for_completion_state(&done, state);

wait_done:
	retval = sub_info->retval;

@@ -826,8 +826,8 @@ static void khugepaged_alloc_sleep(void)
	DEFINE_WAIT(wait);

	add_wait_queue(&khugepaged_wait, &wait);
	freezable_schedule_timeout_interruptible(
		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
	__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
	schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
	remove_wait_queue(&khugepaged_wait, &wait);
}

@@ -276,7 +276,7 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);

static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
{
	freezable_schedule_unsafe();
	schedule();
	if (signal_pending_state(mode, current))
		return -ERESTARTSYS;
	return 0;
@@ -340,14 +340,12 @@ static int rpc_complete_task(struct rpc_task *task)
 * to enforce taking of the wq->lock and hence avoid races with
 * rpc_complete_task().
 */
int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action)
int rpc_wait_for_completion_task(struct rpc_task *task)
{
	if (action == NULL)
		action = rpc_wait_bit_killable;
	return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
			action, TASK_KILLABLE);
			rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
}
EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
EXPORT_SYMBOL_GPL(rpc_wait_for_completion_task);

/*
 * Make an RPC task runnable.
@@ -986,7 +984,7 @@ static void __rpc_execute(struct rpc_task *task)
			trace_rpc_task_sync_sleep(task, task->tk_action);
			status = out_of_line_wait_on_bit(&task->tk_runstate,
					RPC_TASK_QUEUED, rpc_wait_bit_killable,
					TASK_KILLABLE);
					TASK_KILLABLE|TASK_FREEZABLE);
			if (status < 0) {
				/*
				 * When a sync task receives a signal, it exits with

@@ -2448,13 +2448,14 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
@@ -2467,10 +2468,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		if (freezable)
			timeo = freezable_schedule_timeout(timeo);
		else
			timeo = schedule_timeout(timeo);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))