Compare commits

...

43 Commits

Author SHA1 Message Date
Linus Torvalds 8b789f2b76 15 hotfixes. 11 are cc:stable and the remainder address post-6.16 issues
or aren't considered necessary for -stable kernels.  13 of these fixes are
 for MM.
 
 The usual shower of singletons, plus
 
 - A 5 patch series from Hugh which addresses various misbehaviors in
   get_user_pages()
 
 - A 2 patch series from SeongJae which addresses a quite severe issue in
   DAMON
 
 - A 3 patch series also from SeongJae which completes some fixes for a
   DAMON startup issue
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCaMuGSgAKCRDdBJ7gKXxA
 junjAP9b9pqZ+xh/MhDWObiRilS8wRDF76NDj237x2oqKTTnmAEA1Rxnqf9nQotP
 XyuXfMZnHDcAHLc1EnsG7OjtMd7QDgU=
 =lifP
 -----END PGP SIGNATURE-----

Merge tag 'mm-hotfixes-stable-2025-09-17-21-10' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "15 hotfixes. 11 are cc:stable and the remainder address post-6.16
  issues or aren't considered necessary for -stable kernels. 13 of these
  fixes are for MM.

  The usual shower of singletons, plus

   - fixes from Hugh to address various misbehaviors in get_user_pages()

   - patches from SeongJae to address a quite severe issue in DAMON

   - another series also from SeongJae which completes some fixes for a
     DAMON startup issue"

* tag 'mm-hotfixes-stable-2025-09-17-21-10' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  zram: fix slot write race condition
  nilfs2: fix CFI failure when accessing /sys/fs/nilfs2/features/*
  samples/damon/mtier: avoid starting DAMON before initialization
  samples/damon/prcl: avoid starting DAMON before initialization
  samples/damon/wsse: avoid starting DAMON before initialization
  MAINTAINERS: add Lance Yang as a THP reviewer
  MAINTAINERS: add Jann Horn as rmap reviewer
  mm/damon/sysfs: use dynamically allocated repeat mode damon_call_control
  mm/damon/core: introduce damon_call_control->dealloc_on_cancel
  mm: folio_may_be_lru_cached() unless folio_test_large()
  mm: revert "mm: vmscan.c: fix OOM on swap stress test"
  mm: revert "mm/gup: clear the LRU flag of a page before adding to LRU batch"
  mm/gup: local lru_add_drain() to avoid lru_add_drain_all()
  mm/gup: check ref_count instead of lru before migration
2025-09-17 21:34:26 -07:00
Linus Torvalds 592a93fea1 three ksmbd fixes, all for stable
-----BEGIN PGP SIGNATURE-----
 
 iQGzBAABCgAdFiEE6fsu8pdIjtWE/DpLiiy9cAdyT1EFAmjK9LIACgkQiiy9cAdy
 T1FOqAv+Nh5n1lHWf8sehusS2Ly4Ef9oI2KZyk9m176u33n4/YorAR+HheLUtuo7
 makGggFtDos5F9CyBkLJHCPXMNLLIB+xu3GNJQV2qE0Dr0Sc2BPQgdv5xJBAW70O
 bL30YI5/H+E6NK1cc7gXrhT/yCg7jJUYN3XpVbT5MbVLy3p4gZOxsdZe7HEBBLEO
 ET+Jbf0+WSFo7VYOe7FFZGUO7LyNarSY8aJNz+KgCYPHwobhmPMMSTkAcw4rjkl/
 YRRjuo4445TepRr3l/x46ygbWA1TxQwvmq2gZZaAgAZCMJIbhVxmPUup2yqWn9dM
 MOZca8PpL9D/p9CWZgyIhFnUxIE2HKr9OZ1Z7/c6UMaZl7oym5Y+krPzblO6RvL+
 JLRn+qkp9VjtcIzO/TqORoWpu7C0aW9FrzGnCMAHaWnaipFo1/NQ4Ewehu9Ev1Sz
 bBPvDwnRivkBhRHbnKM+0w0og+AyDx8J0LzsIF0e2Qc3NuD2f3TPJ598FVNYj5V+
 A1Vhyinw
 =mah2
 -----END PGP SIGNATURE-----

Merge tag '6.17-rc6-ksmbd-fixes' of git://git.samba.org/ksmbd

Pull smb server fixes from Steve French:

 - Two fixes for remaining_data_length and offset checks in receive path

 - Don't go over max SGEs which caused smbdirect send to fail (and
   trigger disconnect)

* tag '6.17-rc6-ksmbd-fixes' of git://git.samba.org/ksmbd:
  ksmbd: smbdirect: verify remaining_data_length respects max_fragmented_recv_size
  ksmbd: smbdirect: validate data_offset and data_length field of smb_direct_data_transfer
  smb: server: let smb_direct_writev() respect SMB_DIRECT_MAX_SEND_SGES
2025-09-17 18:23:01 -07:00
Linus Torvalds 992d4e481e Probes fixes for v6.17-rc6:
- kprobe-event: Fix null-ptr-deref in trace_kprobe_create_internal(),
   which handles NULL return of kmemdup() correctly.
 -----BEGIN PGP SIGNATURE-----
 
 iQFPBAABCgA5FiEEh7BulGwFlgAOi5DV2/sHvwUrPxsFAmjLOcYbHG1hc2FtaS5o
 aXJhbWF0c3VAZ21haWwuY29tAAoJENv7B78FKz8b7PUH/0ao2r/pj/vfKDNWrGlY
 TJ59tabrQQ9AGB27jqL8nZPbie4Jn1UBKMsuvRcOfvbSLtmnrxOtqgx/RmJOVnjC
 JuLEWQt8XTiBatsLsPst/CNnzV9V/oLmZ7Fv8Z1QVqzCfpnyCW4HaHc6XaH8IM3r
 5x6fIrZFKFu7E58t2yo972L+tNIPFwr457VTt2nCdHXlL3mwnK+GtYeNBnWSk40+
 9k16xShDVx3tm+oPEJ2jyJApchR4wWU1vIshMYSu1ygp9UacdJWajKt1qOQOMEih
 H2sNlBwZLTWGfhS9exBPori9mthhH4wxzqEYzpPHw0WgNn0OF9QL5AXX44VGJzG5
 JqM=
 =1NqY
 -----END PGP SIGNATURE-----

Merge tag 'probes-fixes-v6.17-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull probe fix from Masami Hiramatsu:

 - kprobe-event: Fix null-ptr-deref in trace_kprobe_create_internal(),
   by handling NULL return of kmemdup() correctly

* tag 'probes-fixes-v6.17-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  tracing: kprobe-event: Fix null-ptr-deref in trace_kprobe_create_internal()
2025-09-17 16:52:26 -07:00
Wang Liang dc3382fffd tracing: kprobe-event: Fix null-ptr-deref in trace_kprobe_create_internal()
A crash was observed with the following output:

Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN PTI
KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
CPU: 1 UID: 0 PID: 2899 Comm: syz.2.399 Not tainted 6.17.0-rc5+ #5 PREEMPT(none)
RIP: 0010:trace_kprobe_create_internal+0x3fc/0x1440 kernel/trace/trace_kprobe.c:911
Call Trace:
 <TASK>
 trace_kprobe_create_cb+0xa2/0xf0 kernel/trace/trace_kprobe.c:1089
 trace_probe_create+0xf1/0x110 kernel/trace/trace_probe.c:2246
 dyn_event_create+0x45/0x70 kernel/trace/trace_dynevent.c:128
 create_or_delete_trace_kprobe+0x5e/0xc0 kernel/trace/trace_kprobe.c:1107
 trace_parse_run_command+0x1a5/0x330 kernel/trace/trace.c:10785
 vfs_write+0x2b6/0xd00 fs/read_write.c:684
 ksys_write+0x129/0x240 fs/read_write.c:738
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x5d/0x2d0 arch/x86/entry/syscall_64.c:94
 </TASK>

Function kmemdup() may return NULL in trace_kprobe_create_internal(), so
add a check for its return value.
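
A hedged, illustrative sketch of the pattern the fix applies (names are
hypothetical, not the actual trace_kprobe.c variables):

	/* handle kmemdup() failure instead of dereferencing NULL later */
	static int dup_arg(const char *src, size_t len, char **out)
	{
		char *buf = kmemdup(src, len, GFP_KERNEL);	/* may return NULL */

		if (!buf)
			return -ENOMEM;		/* propagate the error */
		*out = buf;
		return 0;
	}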

Link: https://lore.kernel.org/all/20250916075816.3181175-1-wangliang74@huawei.com/

Fixes: 33b4e38baa ("tracing: kprobe-event: Allocate string buffers from heap")
Signed-off-by: Wang Liang <wangliang74@huawei.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-09-18 07:36:41 +09:00
Linus Torvalds 37889ceadd sched_ext: Fixes for v6.17-rc6
This contains 2 sched_ext fixes.
 
 - Fix build failure when !FAIR_GROUP_SCHED && EXT_GROUP_SCHED.
 
 - Revert "sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()" which
   was causing issues with per-CPU task scheduling and reenqueuing behavior.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCaMsN6g4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGRHQAP4kRGIyxCaCnoSHHDyI8R2SLUzDKvvByaVNdbKO
 5VlCaAEAy0wKViyJDojpd5DXMFlFYCm8gXWQ0aD++hhYX1XfawI=
 =b+ar
 -----END PGP SIGNATURE-----

Merge tag 'sched_ext-for-6.17-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Fix build failure when !FAIR_GROUP_SCHED && EXT_GROUP_SCHED

 - Revert "sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()"
   which was causing issues with per-CPU task scheduling and reenqueuing
   behavior

* tag 'sched_ext-for-6.17-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext, sched/core: Fix build failure when !FAIR_GROUP_SCHED && EXT_GROUP_SCHED
  Revert "sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()"
2025-09-17 13:27:31 -07:00
Linus Torvalds 05950213a9 cgroup: Fixes for v6.17-rc6
This contains two cgroup changes. Both are pretty low risk.
 
 - Fix deadlock in cgroup destruction when repeatedly mounting/unmounting
   perf_event and net_prio controllers. The issue occurs because
   cgroup_destroy_wq has max_active=1, causing root destruction to wait for
   CSS offline operations that are queued behind it. The fix splits
   cgroup_destroy_wq into three separate workqueues to eliminate the
   blocking.
 
 - Set of->priv to NULL upon file release so that potential bugs manifest
   as NULL pointer dereferences rather than use-after-free errors.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCaMsIBg4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGUlBAP95OycrYMUu1iIc37YfClvugsBZJpOV/qMQpNTm
 oPjGoQEAv5dzOTo+763ecFUfRjCT469Ke7wFapS1RCVL7hEd5As=
 =Kykt
 -----END PGP SIGNATURE-----

Merge tag 'cgroup-for-6.17-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:
 "This contains two cgroup changes. Both are pretty low risk.

  - Fix deadlock in cgroup destruction when repeatedly
    mounting/unmounting perf_event and net_prio controllers.

    The issue occurs because cgroup_destroy_wq has max_active=1, causing
    root destruction to wait for CSS offline operations that are queued
    behind it.

    The fix splits cgroup_destroy_wq into three separate workqueues to
    eliminate the blocking.

  - Set of->priv to NULL upon file release so that potential bugs
    manifest as NULL pointer dereferences rather than use-after-free
    errors"

* tag 'cgroup-for-6.17-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup/psi: Set of->priv to NULL upon file release
  cgroup: split cgroup_destroy_wq into 3 workqueues
2025-09-17 13:22:08 -07:00
Linus Torvalds d4b779985a - fix integer overflow in dm-stripe
- limit tag size in dm-integrity to 255 bytes
 
 - fix 'alignment inconsistency' warning in dm-raid
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYIADIWIQRnH8MwLyZDhyYfesYTAyx9YGnhbQUCaMrAdRQcbXBhdG9ja2FA
 cmVkaGF0LmNvbQAKCRATAyx9YGnhbYhtAQDsWJHG7dOVuvoY+kyM9SmC2My6l5u2
 awYBsjVT//9KDAD/c6s+ffeseY6VpBuFmPx5i75ieCmUcuKTMUxgdIErPwM=
 =FXwZ
 -----END PGP SIGNATURE-----

Merge tag 'for-6.17/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mikulas Patocka:

 - fix integer overflow in dm-stripe

 - limit tag size in dm-integrity to 255 bytes

 - fix 'alignment inconsistency' warning in dm-raid

* tag 'for-6.17/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm-raid: don't set io_min and io_opt for raid1
  dm-integrity: limit MAX_TAG_SIZE to 255
  dm-stripe: fix a possible integer overflow
2025-09-17 08:07:02 -07:00
Linus Torvalds b6f456a76f for-6.17-rc6-tag
-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmjKPo0ACgkQxWXV+ddt
 WDurFg/9FcP3Sg2BDCAm1T+akEy9nAoR1DjYBlOToNxjJ9uHdKUljVQjPTp3mLrD
 pZDowuNoXkH8ig+/XknhiD51ynGGqCiqXEMj37qFbPJRe6V2iBo6gh4XidHlbvuv
 YDcyrCyDiF3p+0QEGCVjTFNubJuFSWDlcf07K8BpsvcXi3945v0rHY6B1doBiTjh
 TcnPievRxeOdDQiCJ4yja2GkoMEMw8fdNQ1EyfSRxX7EPICCGChY+FHJJCnX4oGI
 4rqe5v4LPA6l0PrGWKZ/crikPNBlzQZ3otD2drdLDEkusHC5vKpmuGL/r3IgP0gB
 OvtMIe70z0abBOOk+Rk/REFaDdVhGyXhuEeqraKK85+2eVkyUTy6fswP+Qj+sVq1
 /AqOr1OaJpVlpnAzP02TbmSZnm8WPOe8mVY7sUI66nGbSHRdVWPxORb2MEwgueP1
 B8G/6s46uLJrH5ipqCBHmFKvKNuUgiYJbtWJrhLsl0PVa5C0yVP1mC8vaQNlTIwz
 B+oPEUFW4SOZL7/uvwn12FhVPRyk15YdEt9CxNtM2ipHKTfTu9ptjvwy6gQ9/6MM
 zltxuQLMBiieBegiwOpocISXAyCB+aj6XP/jlQrpSb7vSGOaJXj3STzJSWNYKJ6w
 /sZbBmF4Mtim+CCgbDbEXpWfM4hW55bM3fSRgukpiianEzvdyZM=
 =0aOx
 -----END PGP SIGNATURE-----

Merge tag 'for-6.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - in zoned mode, turn assertion to proper code when reserving space in
   relocation block group

 - fix search key of extended ref (hardlink) when replaying log

 - fix initialization of file extent tree on filesystems without
   no-holes feature

 - add harmless data race annotation to block group comparator

* tag 'for-6.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: annotate block group access with data_race() when sorting for reclaim
  btrfs: initialize inode::file_extent_tree after i_mode has been set
  btrfs: zoned: fix incorrect ASSERT in btrfs_zoned_reserve_data_reloc_bg()
  btrfs: fix invalid extref key setup when replaying dentry
2025-09-17 07:55:45 -07:00
Mikulas Patocka a865562646 dm-raid: don't set io_min and io_opt for raid1
These commands
 modprobe brd rd_size=1048576
 vgcreate vg /dev/ram*
 lvcreate -m4 -L10 -n lv vg
trigger the following warnings:
device-mapper: table: 252:10: adding target device (start sect 0 len 24576) caused an alignment inconsistency
device-mapper: table: 252:10: adding target device (start sect 0 len 24576) caused an alignment inconsistency

The warnings are caused by the fact that io_min is 512 and physical block
size is 4096.

For chunk-less raid levels, such as raid1, io_min shouldn't be set to
zero, because a zero value would be raised to 512 and would trigger the
warning.
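
A hedged sketch of the intended behavior (helper names assumed; the real
change is in raid_io_hints()):

	if (rs_is_raid1(rs))
		return;	/* no chunks: leave io_min/io_opt alone */

	blk_limits_io_min(limits, chunk_size_bytes);
	blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));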

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Cc: stable@vger.kernel.org
2025-09-17 16:04:29 +02:00
Tejun Heo a1eab4d813 sched_ext, sched/core: Fix build failure when !FAIR_GROUP_SCHED && EXT_GROUP_SCHED
While collecting SCX related fields in struct task_group into struct
scx_task_group, 6e6558a6bc ("sched_ext, sched/core: Factor out struct
scx_task_group") forgot update tg->scx_weight usage in tg_weight(), which
leads to build failure when CONFIG_FAIR_GROUP_SCHED is disabled but
CONFIG_EXT_GROUP_SCHED is enabled. Fix it.
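
A hedged sketch of the shape of the fix (assuming tg_weight() simply needs
to read the weight from the factored-out member; not the exact diff):

	static u64 tg_weight(struct task_group *tg)
	{
	#ifdef CONFIG_FAIR_GROUP_SCHED
		return scale_load_down(tg->shares);
	#else
		return tg->scx.weight;	/* formerly tg->scx_weight */
	#endif
	}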

Fixes: 6e6558a6bc ("sched_ext, sched/core: Factor out struct scx_task_group")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202509170230.MwZsJSWa-lkp@intel.com/
Tested-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-09-16 23:07:27 -10:00
Linus Torvalds 5aca7966d2 perf-tools fixes for v6.17-rc7
A small set of fixes for crashes in different commands and conditions.
 
 Signed-off-by: Namhyung Kim <namhyung@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQSo2x5BnqMqsoHtzsmMstVUGiXMgwUCaMndGgAKCRCMstVUGiXM
 g/IJAP9MynklJNT/wwpusMi9hDFrqiwlhg7W6Z2bHAGsOvS08AEAwORNKs4vMaXY
 +7q25ylQuHIAP1cE3vchcCvLuXQIbQo=
 =hQ4X
 -----END PGP SIGNATURE-----

Merge tag 'perf-tools-fixes-for-v6.17-2025-09-16' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools

Pull perf tools fixes from Namhyung Kim:
 "A small set of fixes for crashes in different commands and conditions"

* tag 'perf-tools-fixes-for-v6.17-2025-09-16' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools:
  perf maps: Ensure kmap is set up for all inserts
  perf lock: Provide a host_env for session new
  perf subcmd: avoid crash in exclude_cmds when excludes is empty
2025-09-16 15:15:54 -07:00
Andrea Righi 0b47b6c354 Revert "sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()"
scx_bpf_reenqueue_local() can be called from ops.cpu_release() when a
CPU is taken by a higher scheduling class to give tasks queued to the
CPU's local DSQ a chance to be migrated somewhere else, instead of
waiting indefinitely for that CPU to become available again.

In doing so, we decided to skip migration-disabled tasks, under the
assumption that they cannot be migrated anyway.

However, when a higher scheduling class preempts a CPU, the running task
is always inserted at the head of the local DSQ as a migration-disabled
task. This means it is always skipped by scx_bpf_reenqueue_local(), and
ends up being confined to the same CPU even if that CPU is heavily
contended by other higher scheduling class tasks.

As an example, let's consider the following scenario:

 $ schedtool -a 0,1, -e yes > /dev/null
 $ sudo schedtool -F -p 99 -a 0, -e \
   stress-ng -c 1 --cpu-load 99 --cpu-load-slice 1000

The first task (SCHED_EXT) can run on CPU0 or CPU1. The second task
(SCHED_FIFO) is pinned to CPU0 and consumes ~99% of it. If the SCHED_EXT
task initially runs on CPU0, it will remain there because it always sees
CPU0 as "idle" in the short gaps left by the RT task, resulting in ~1%
utilization while CPU1 stays idle:

    0[||||||||||||||||||||||100.0%]   8[                        0.0%]
    1[                        0.0%]   9[                        0.0%]
    2[                        0.0%]  10[                        0.0%]
    3[                        0.0%]  11[                        0.0%]
    4[                        0.0%]  12[                        0.0%]
    5[                        0.0%]  13[                        0.0%]
    6[                        0.0%]  14[                        0.0%]
    7[                        0.0%]  15[                        0.0%]
  PID USER       PRI  NI  S CPU  CPU%▽MEM%   TIME+  Command
 1067 root        RT   0  R   0  99.0  0.2  0:31.16 stress-ng-cpu [run]
  975 arighi      20   0  R   0   1.0  0.0  0:26.32 yes

By allowing scx_bpf_reenqueue_local() to re-enqueue migration-disabled
tasks, the scheduler can choose to migrate them to other CPUs (CPU1 in
this case) via ops.enqueue(), leading to better CPU utilization:

    0[||||||||||||||||||||||100.0%]   8[                        0.0%]
    1[||||||||||||||||||||||100.0%]   9[                        0.0%]
    2[                        0.0%]  10[                        0.0%]
    3[                        0.0%]  11[                        0.0%]
    4[                        0.0%]  12[                        0.0%]
    5[                        0.0%]  13[                        0.0%]
    6[                        0.0%]  14[                        0.0%]
    7[                        0.0%]  15[                        0.0%]
  PID USER       PRI  NI  S CPU  CPU%▽MEM%   TIME+  Command
  577 root        RT   0  R   0 100.0  0.2  0:23.17 stress-ng-cpu [run]
  555 arighi      20   0  R   1 100.0  0.0  0:28.67 yes

It's debatable whether per-CPU tasks should be re-enqueued as well, but
doing so is probably safer: the scheduler can recognize re-enqueued
tasks through the %SCX_ENQ_REENQ flag, reassess their placement, and
either put them back at the head of the local DSQ or let another task
attempt to take the CPU.

This also prevents giving per-CPU tasks an implicit priority boost,
which would otherwise make them more likely to reclaim CPUs preempted by
higher scheduling classes.

Fixes: 97e13ecb02 ("sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()")
Cc: stable@vger.kernel.org # v6.15+
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Acked-by: Changwoo Min <changwoo@igalia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-09-16 10:15:23 -10:00
Sergey Senozhatsky ce4be9e430 zram: fix slot write race condition
Parallel concurrent writes to the same zram index result in leaked
zsmalloc handles.  Schematically we can have something like this:

CPU0                              CPU1
zram_slot_lock()
zs_free(handle)
zram_slot_unlock()
				zram_slot_lock()
				zs_free(handle)
				zram_slot_unlock()

compress			compress
handle = zs_malloc()		handle = zs_malloc()
zram_slot_lock()
zram_set_handle(handle)
zram_slot_unlock()
				zram_slot_lock()
				zram_set_handle(handle)
				zram_slot_unlock()

Either CPU0 or CPU1 zsmalloc handle will leak because zs_free() is done
too early.  In fact, we need to reset zram entry right before we set its
new handle, all under the same slot lock scope.
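
A sketch of the corrected ordering, reusing the names from the schematic
above (simplified; zram_free_page() stands in for "reset zram entry"):

	zram_slot_lock(zram, index);
	zram_free_page(zram, index);		/* drop the old handle, if any */
	zram_set_handle(zram, index, handle);	/* publish the new handle */
	zram_slot_unlock(zram, index);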

Link: https://lkml.kernel.org/r/20250909045150.635345-1-senozhatsky@chromium.org
Fixes: 71268035f5 ("zram: free slot memory early during write")
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reported-by: Changhui Zhong <czhong@redhat.com>
Closes: https://lore.kernel.org/all/CAGVVp+UtpGoW5WEdEU7uVTtsSCjPN=ksN6EcvyypAtFDOUf30A@mail.gmail.com/
Tested-by: Changhui Zhong <czhong@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Minchan Kim <minchan@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-15 20:01:45 -07:00
Ian Rogers 20c9ccffcc perf maps: Ensure kmap is set up for all inserts
__maps__fixup_overlap_and_insert may split or directly insert a map;
when doing this the map may need to have a kmap set up for the sake of
the kmaps. The missing kmap setup fails the check_invariants test in
maps, leads to later "Internal error" reports from map__kmap, and
ultimately causes segfaults.

Similar fixes were added in commit e0e4e0b8b7 ("perf maps: Add
missing map__set_kmap_maps() when replacing a kernel map") and commit
25d9c0301d ("perf maps: Set the kmaps for newly created/added kernel
maps") but they missed cases. To try to reduce the risk of this,
update the kmap directly following any manual insert. This identified
another problem in maps__copy_from.

Fixes: e0e4e0b8b7 ("perf maps: Add missing map__set_kmap_maps() when replacing a kernel map")
Fixes: 25d9c0301d ("perf maps: Set the kmaps for newly created/added kernel maps")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
2025-09-15 10:03:23 -07:00
Linus Torvalds 46a51f4f5e Power Supply Fixes for 6.17 cycle
* bq27xxx: avoid spamming the log for missing bq27000 battery
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE72YNB0Y/i3JqeVQT2O7X88g7+poFAmjIJ94ACgkQ2O7X88g7
 +poGYg//YDnvqykimHcuZyohxZsfwX2cq5YAXVYCrbMmr9EanCDjuGVsct0T1uIA
 jlNwxagpvIDrj+0L7yd31cVN9TYeIAA/EMAfK9mK7bOCCKKAKOyn/osdLNbptcXf
 VtBVUcxvmPp5Q04cNpEnTgsUh5Ge5dv7I+aKRNW6A/gjEc3gLGof62rCMV6uNQ62
 wzTmymhwmw+oLqfgA53mTazgDyOSJccWm2o1C2ZQgotsaIBOCjrj7KpbaFfMT1CW
 9MySS8ZmZKEJRGb3qIKQXOhq/W2EcazNfi0q9ECkTMA6QmB2/BqYpZyzrFWFCDJW
 T4/V4Ztflcp7uTcpQon/TssR7Xti+2rb0dRDf+YzEW3eQtBrrdlVFOjVWdr4MluN
 Owqi4/gs9QjtF988aQ5E5NN+SbklEmfeBlB1XXlqoQa/APpcA5QEfZktQPiX88oo
 RUWNAP7w8ivsX/IIdy4PGh4iDYbqULCGDW+3wqRLnc5A1qYkh6e4E+Rq7hD611js
 oRkjyckgZCpP60COC7IsEszxEj0Uhd5VqirroGrTILDiDSkXvxx3KvZ2qSwCSxgj
 5eAtSbVx8FgjE7je9vW5ph/BXP86xNBZNBydPFvk2wRvbAh39Eo6UN3l1UqgmZyp
 4/5jnKaSse/OStUvnd9dukqCmeMN7BDLqkQsqcC1LV3j4M8Ga0g=
 =NTQb
 -----END PGP SIGNATURE-----

Merge tag 'for-v6.17-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply

Pull power supply fixes from Sebastian Reichel:

 - bq27xxx: avoid spamming the log for missing bq27000 battery

* tag 'for-v6.17-rc' of git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-power-supply:
  power: supply: bq27xxx: restrict no-battery detection to bq27000
  power: supply: bq27xxx: fix error return in case of no bq27000 hdq battery
2025-09-15 08:15:11 -07:00
Filipe Manana 80eb65ccf6 btrfs: annotate block group access with data_race() when sorting for reclaim
When sorting the block group list for reclaim we are using a block group's
used bytes counter without taking the block group's spinlock, so we can
race with a concurrent task updating it (at btrfs_update_block_group()),
which makes tools like KCSAN unhappy and report a race.

Since the sorting is not strictly needed from a functional perspective
and such races should rarely cause any ordering changes (only load/store
tearing could cause them), not to mention that after the sorting the
ordering may no longer be accurate due to concurrent allocations and
deallocations of extents in a block group, annotate the accesses to the
used counter with data_race() to silence KCSAN and similar tools.
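
A hedged sketch of the annotation style on the reclaim sort comparator
(shape only, not the verbatim patch):

	static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
				   const struct list_head *b)
	{
		const struct btrfs_block_group *bg1, *bg2;

		bg1 = list_entry(a, struct btrfs_block_group, bg_list);
		bg2 = list_entry(b, struct btrfs_block_group, bg_list);

		/* unlocked reads, annotated so KCSAN tolerates the race */
		return data_race(bg1->used) > data_race(bg2->used);
	}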

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-09-15 05:25:43 +02:00
austinchang 8679d2687c btrfs: initialize inode::file_extent_tree after i_mode has been set
btrfs_init_file_extent_tree() uses S_ISREG() to determine if the file is
a regular file. At the beginning of btrfs_read_locked_inode(), i_mode
hasn't been read from the inode item yet, so the file_extent_tree won't be
used at all on volumes without NO_HOLES.

Fix this by calling btrfs_init_file_extent_tree() after i_mode is
initialized in btrfs_read_locked_inode().

Fixes: 3d7db6e8bd ("btrfs: don't allocate file extent tree for non regular files")
CC: stable@vger.kernel.org # 6.12+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: austinchang <austinchang@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-09-15 05:25:40 +02:00
Johannes Thumshirn 5b8d296475 btrfs: zoned: fix incorrect ASSERT in btrfs_zoned_reserve_data_reloc_bg()
When moving a block-group to the dedicated data relocation space-info in
btrfs_zoned_reserve_data_reloc_bg() it is asserted that the newly
created block group for data relocation does not contain any
zone_unusable bytes.

But on disks with zone_capacity < zone_size, the difference between
zone_size and zone_capacity is accounted as zone_unusable.

Instead of asserting that the block-group does not contain any
zone_unusable bytes, remove them from the block-groups total size.

Reported-by: Yi Zhang <yi.zhang@redhat.com>
Link: https://lore.kernel.org/linux-block/CAHj4cs8-cS2E+-xQ-d2Bj6vMJZ+CwT_cbdWBTju4BV35LsvEYw@mail.gmail.com/
Fixes: daa0fde322 ("btrfs: zoned: fix data relocation block group reservation")
Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
Tested-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-09-15 05:25:37 +02:00
Filipe Manana b62fd63ade btrfs: fix invalid extref key setup when replaying dentry
The offset for an extref item's key is not the object ID of the parent
dir, otherwise we would not need the extref item and would use plain ref
items. Instead the offset is the result of a hash computation that uses
the object ID of the parent dir and the name associated to the entry.
So fix this by setting the key offset at replay_one_name() to be the
result of calling btrfs_extref_hash().
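
A sketch of the corrected key setup described above (variable names are
assumed for illustration):

	struct btrfs_key search_key;

	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_EXTREF_KEY;
	/* offset is a hash of the parent dir objectid and the entry name */
	search_key.offset = btrfs_extref_hash(parent_objectid, name, name_len);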

Fixes: 725af92a62 ("btrfs: Open-code name_in_log_ref in replay_one_name")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-09-15 05:25:31 +02:00
Stefan Metzmacher e1868ba37f ksmbd: smbdirect: verify remaining_data_length respects max_fragmented_recv_size
This is inspired by the check for data_offset + data_length.

Cc: Steve French <smfrench@gmail.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Cc: stable@vger.kernel.org
Fixes: 2ea086e35c ("ksmbd: add buffer validation for smb direct")
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-09-14 22:17:10 -05:00
Namjae Jeon 5282491fc4 ksmbd: smbdirect: validate data_offset and data_length field of smb_direct_data_transfer
If the data_offset and data_length fields of the smb_direct_data_transfer
struct are invalid, an out-of-bounds access could happen.
This patch validates the data_offset and data_length fields in recv_done.

Cc: stable@vger.kernel.org
Fixes: 2ea086e35c ("ksmbd: add buffer validation for smb direct")
Reviewed-by: Stefan Metzmacher <metze@samba.org>
Reported-by: Luigino Camastra, Aisle Research <luigino.camastra@aisle.com>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-09-14 22:17:10 -05:00
Stefan Metzmacher d162694037 smb: server: let smb_direct_writev() respect SMB_DIRECT_MAX_SEND_SGES
We should not use more sges for ib_post_send() than we told the rdma
device in rdma_create_qp()!

Otherwise ib_post_send() will return -EINVAL, so we disconnect the
connection. Or with the current siw.ko we'll get 0 from ib_post_send(),
but will never ever get a completion for the request. I've already sent a
fix for siw.ko...

So we need to make sure smb_direct_writev() limits the number of vectors
we pass to individual smb_direct_post_send_data() calls, so that we
don't go over the queue pair limits.
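
A hedged sketch of the idea with fully hypothetical names (not the ksmbd
code): split the vector so no single send uses more SGEs than the queue
pair was created with.

	static int post_in_sge_chunks(struct kvec *iov, int niov, int max_sges,
				      int (*post)(struct kvec *iov, int niov))
	{
		int off = 0;

		while (off < niov) {
			int n = min(niov - off, max_sges);	/* respect rdma_create_qp() */
			int ret = post(iov + off, n);

			if (ret)
				return ret;
			off += n;
		}
		return 0;
	}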

Commit 621433b7e2 ("ksmbd: smbd: relax the count of sges required")
was very strange and I guess only needed because
SMB_DIRECT_MAX_SEND_SGES was 8 at that time. It basically removed the
check that the rdma device is able to handle the number of sges we try
to use.

While the real problem was added by commit ddbdc861e3 ("ksmbd: smbd:
introduce read/write credits for RDMA read/write") as it used the
minimum of device->attrs.max_send_sge and device->attrs.max_sge_rd, with
the problem that device->attrs.max_sge_rd is always 1 for iWarp. And
that limitation should only apply to RDMA Read operations. For now we
keep that limitation for RDMA Write operations too; fixing that is a
task for another day as it's not really required for this bug fix.

Commit 2b4eeeaa90 ("ksmbd: decrease the number of SMB3 smbdirect
server SGEs") lowered SMB_DIRECT_MAX_SEND_SGES to 6, which is also used
by our client code. And that client code enforces
device->attrs.max_send_sge >= 6 since commit d2e81f92e5 ("Decrease the
number of SMB3 smbdirect client SGEs"), and (briefly looking) only the
i40iw driver provides fewer, namely 3 (see I40IW_MAX_WQ_FRAGMENT_COUNT).
But we'd require 4 anyway, so that driver would not have worked either
way; now it just fails early.

Cc: Steve French <smfrench@gmail.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Hyunchul Lee <hyc.lee@gmail.com>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Cc: linux-rdma@vger.kernel.org
Fixes: 0626e6641f ("cifsd: add server handler for central processing and tranport layers")
Fixes: ddbdc861e3 ("ksmbd: smbd: introduce read/write credits for RDMA read/write")
Fixes: 621433b7e2 ("ksmbd: smbd: relax the count of sges required")
Fixes: 2b4eeeaa90 ("ksmbd: decrease the number of SMB3 smbdirect server SGEs")
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-09-14 22:17:10 -05:00
Nathan Chancellor 025e87f8ea nilfs2: fix CFI failure when accessing /sys/fs/nilfs2/features/*
When accessing one of the files under /sys/fs/nilfs2/features when
CONFIG_CFI_CLANG is enabled, there is a CFI violation:

  CFI failure at kobj_attr_show+0x59/0x80 (target: nilfs_feature_revision_show+0x0/0x30; expected type: 0xfc392c4d)
  ...
  Call Trace:
   <TASK>
   sysfs_kf_seq_show+0x2a6/0x390
   ? __cfi_kobj_attr_show+0x10/0x10
   kernfs_seq_show+0x104/0x15b
   seq_read_iter+0x580/0xe2b
  ...

When the kobject of the kset for /sys/fs/nilfs2 is initialized, its ktype
is set to kset_ktype, which has a ->sysfs_ops of kobj_sysfs_ops.  When
nilfs_feature_attr_group is added to that kobject via
sysfs_create_group(), the kernfs_ops of each file is sysfs_file_kfops_rw,
which will call sysfs_kf_seq_show() when ->seq_show() is called.
sysfs_kf_seq_show() in turn calls kobj_attr_show() through
->sysfs_ops->show().  kobj_attr_show() casts the provided attribute out to
a 'struct kobj_attribute' via container_of() and calls ->show(), resulting
in the CFI violation since neither nilfs_feature_revision_show() nor
nilfs_feature_README_show() match the prototype of ->show() in 'struct
kobj_attribute'.

Resolve the CFI violation by adjusting the second parameter in
nilfs_feature_{revision,README}_show() from 'struct attribute' to 'struct
kobj_attribute' to match the expected prototype.
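
A sketch of the adjusted prototype (simplified; illustrative of
fs/nilfs2/sysfs.c rather than a verbatim copy):

	static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
						   struct kobj_attribute *attr,
						   char *buf)
	{
		/* matches ->show() in struct kobj_attribute, so CFI is satisfied */
		return sysfs_emit(buf, "%d.%d\n",
				  NILFS_CURRENT_REV, NILFS_MINOR_REV);
	}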

Link: https://lkml.kernel.org/r/20250906144410.22511-1-konishi.ryusuke@gmail.com
Fixes: aebe17f684 ("nilfs2: add /sys/fs/nilfs2/features group")
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202509021646.bc78d9ef-lkp@intel.com/
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:38 -07:00
SeongJae Park c62cff4048 samples/damon/mtier: avoid starting DAMON before initialization
Commit 964314344e ("samples/damon/mtier: support boot time enable
setup") somehow applied the original patch [1] incompletely: it is missing
the part that avoids starting DAMON before module initialization, probably
due to a mistake during a merge.  Fix it by applying the missed part
again.

Link: https://lkml.kernel.org/r/20250909022238.2989-4-sj@kernel.org
Link: https://lore.kernel.org/20250706193207.39810-4-sj@kernel.org [1]
Fixes: 964314344e ("samples/damon/mtier: support boot time enable setup")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:38 -07:00
SeongJae Park e6b733ca2f samples/damon/prcl: avoid starting DAMON before initialization
Commit 2780505ec2 ("samples/damon/prcl: fix boot time enable crash")
somehow applied the original patch [1] incompletely: it is missing the
part that avoids starting DAMON before module initialization, probably due
to a mistake during a merge.  Fix it by applying the missed part again.

Link: https://lkml.kernel.org/r/20250909022238.2989-3-sj@kernel.org
Link: https://lore.kernel.org/20250706193207.39810-3-sj@kernel.org [1]
Fixes: 2780505ec2 ("samples/damon/prcl: fix boot time enable crash")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:37 -07:00
SeongJae Park f826edeb88 samples/damon/wsse: avoid starting DAMON before initialization
Patch series "samples/damon: fix boot time enable handling fixup merge
mistakes".

The first three patches of the patch series "mm/damon: fix misc bugs in
DAMON modules" [1] were trying to fix boot-time enabling issues in the
DAMON sample modules.  The issue is that the modules can crash if they are
enabled before DAMON is initialized, for example via boot-time parameter
options.  The three patches fixed this by avoiding starting DAMON before
the module initialization phase.

However, probably due to a mistake during a merge, only half of the
change was merged, and the part that avoids starting DAMON before the
module is initialized was missed.  So the problem is not solved, and the
modules can still crash if enabled before DAMON is initialized.  Fix those
by applying the unmerged parts again.

Note that the broken commits are merged into 6.17-rc1, but also backported
to relevant stable kernels.  So this series also needs to be merged into
the stable kernels.  Hence Cc-ing stable@.


This patch (of 3):

Commit 0ed1165c37 ("samples/damon/wsse: fix boot time enable handling")
somehow applied the original patch [2] incompletely: it is missing the
part that avoids starting DAMON before module initialization, probably due
to a mistake during a merge.  Fix it by applying the missed part again.

Link: https://lkml.kernel.org/r/20250909022238.2989-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20250909022238.2989-2-sj@kernel.org
Link: https://lkml.kernel.org/r/20250706193207.39810-1-sj@kernel.org [1]
Link: https://lore.kernel.org/20250706193207.39810-2-sj@kernel.org [2]
Fixes: 0ed1165c37 ("samples/damon/wsse: fix boot time enable handling")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:37 -07:00
Lance Yang 72291a5a0e MAINTAINERS: add Lance Yang as a THP reviewer
I've been actively digging into the MM/THP subsystem for over a year now,
and there's a real interest in contributing more and getting further
involved.

Well, missing out on any more cool THP things is really a pain ;)

Link: https://lkml.kernel.org/r/20250908104857.35397-1-lance.yang@linux.dev
Signed-off-by: Lance Yang <lance.yang@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: Barry Song <baohua@kernel.org>
Acked-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:37 -07:00
Lorenzo Stoakes 615cd3705d MAINTAINERS: add Jann Horn as rmap reviewer
Jann has been an excellent contributor in all areas of memory management,
and has demonstrated great expertise in the reverse mapping.

It's therefore appropriate for him to become a reviewer.

Link: https://lkml.kernel.org/r/20250908194959.820913-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:37 -07:00
SeongJae Park 04a06b139e mm/damon/sysfs: use dynamically allocated repeat mode damon_call_control
The DAMON sysfs interface uses a single global repeat-mode
damon_call_control variable for refresh_ms handling, shared by all DAMON
contexts.  As a result, when there is more than one context, the single
global damon_call_control is unexpectedly overwritten (corrupted).  In
particular, the ->link field is overwritten by the multiple contexts, and
this can cause a user-space hangup and/or a kernel crash.  Fix it by using
a dynamically allocated damon_call_control object per DAMON context.
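
A hedged sketch of the per-context setup (callback and field names are
assumed for illustration, not the exact sysfs code):

	struct damon_call_control *control;

	control = kzalloc(sizeof(*control), GFP_KERNEL);
	if (!control)
		return -ENOMEM;
	control->fn = damon_sysfs_repeat_call_fn;	/* assumed callback name */
	control->repeat = true;
	control->dealloc_on_cancel = true;	/* see the next patch */
	return damon_call(ctx, control);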

Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org
Link: https://lore.kernel.org/20250904011738.930-1-yunjeong.mun@sk.com [1]
Link: https://lore.kernel.org/20250905035411.39501-1-sj@kernel.org [2]
Fixes: d809a7c64b ("mm/damon/sysfs: implement refresh_ms file internal work")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Yunjeong Mun <yunjeong.mun@sk.com>
Closes: https://lore.kernel.org/20250904011738.930-1-yunjeong.mun@sk.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:36 -07:00
SeongJae Park e6a0deb6fa mm/damon/core: introduce damon_call_control->dealloc_on_cancel
Patch series "mm/damon/sysfs: fix refresh_ms control overwriting on
multi-kdamonds usages".

The automatic essential DAMON/DAMOS status update feature of the DAMON
sysfs interface (refresh_ms) is broken [1] for the multiple-DAMON-contexts
(kdamonds) use case, since it uses a single global damon_call_control
object for all created DAMON contexts.  The fields of the object,
particularly the list field, are overwritten across the contexts, causing
unexpected results including user-space hangups and kernel crashes [2].
Fix it by extending damon_call_control for this use case and updating the
DAMON sysfs interface to use a per-context, dynamically allocated
damon_call_control object.


This patch (of 2):

When damon_call_control->repeat is set, damon_call() is executed
asynchronously, and is eventually canceled when kdamond finishes.  If the
damon_call_control object is dynamically allocated, finding the place to
deallocate the object is difficult.  Introduce a new damon_call_control
field, namely dealloc_on_cancel, to ask kdamond to deallocate such
dynamically allocated objects when they are canceled.
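
A minimal sketch of the cancel-side behavior this field enables (assumed
structure, not the exact core code):

	/* on kdamond exit, a canceled repeat-mode call may now free itself */
	if (control->dealloc_on_cancel)
		kfree(control);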

Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org
Link: https://lkml.kernel.org/r/20250908201513.60802-2-sj@kernel.org
Fixes: d809a7c64b ("mm/damon/sysfs: implement refresh_ms file internal work")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Yunjeong Mun <yunjeong.mun@sk.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:36 -07:00
Hugh Dickins 2da6de30e6 mm: folio_may_be_lru_cached() unless folio_test_large()
mm/swap.c and mm/mlock.c agree to drain any per-CPU batch as soon as a
large folio is added: so collect_longterm_unpinnable_folios() just wastes
effort when calling lru_add_drain[_all]() on a large folio.

But although there is good reason not to batch up PMD-sized folios, we
might well benefit from batching a small number of low-order mTHPs (though
unclear how that "small number" limitation will be implemented).

So ask if folio_may_be_lru_cached() rather than !folio_test_large(), to
insulate those particular checks from future change.  Name preferred to
"folio_is_batchable" because large folios can well be put on a batch: it's
just the per-CPU LRU caches, drained much later, which need care.
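
Presumably the helper is a thin wrapper today, roughly (a sketch, not the
committed definition):

	static inline bool folio_may_be_lru_cached(struct folio *folio)
	{
		/* per-CPU LRU caches drain immediately for large folios */
		return !folio_test_large(folio);
	}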

Marked for stable, to counter the increase in lru_add_drain_all()s from
"mm/gup: check ref_count instead of lru before migration".

Link: https://lkml.kernel.org/r/57d2eaf8-3607-f318-e0c5-be02dce61ad0@google.com
Fixes: 9a4e9f3b2d ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region")
Signed-off-by: Hugh Dickins <hughd@google.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Keir Fraser <keirf@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shivank Garg <shivankg@amd.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: yangge <yangge1116@126.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:36 -07:00
Hugh Dickins 8d79ed36bf mm: revert "mm: vmscan.c: fix OOM on swap stress test"
This reverts commit 0885ef470560: that was a fix to the reverted
33dfe9204f.

Link: https://lkml.kernel.org/r/aa0e9d67-fbcd-9d79-88a1-641dfbe1d9d1@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Keir Fraser <keirf@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shivank Garg <shivankg@amd.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: yangge <yangge1116@126.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:36 -07:00
Hugh Dickins afb99e9f50 mm: revert "mm/gup: clear the LRU flag of a page before adding to LRU batch"
This reverts commit 33dfe9204f29: now that
collect_longterm_unpinnable_folios() is checking ref_count instead of lru,
and mlock/munlock do not participate in the revised LRU flag clearing,
those changes are misleading, and enlarge the window during which
mlock/munlock may miss an mlock_count update.

It is possible (I'd hesitate to claim probable) that the greater
likelihood of missed mlock_count updates would explain the "Realtime
threads delayed due to kcompactd0" observed on 6.12 in the Link below.  If
that is the case, this reversion will help; but a complete solution needs
also a further patch, beyond the scope of this series.

Included some 80-column cleanup around folio_batch_add_and_move().

The role of folio_test_clear_lru() (before taking per-memcg lru_lock) is
questionable since 6.13 removed mem_cgroup_move_account() etc; but perhaps
there are still some races which need it - not examined here.

Link: https://lore.kernel.org/linux-mm/DU0PR01MB10385345F7153F334100981888259A@DU0PR01MB10385.eurprd01.prod.exchangelabs.com/
Link: https://lkml.kernel.org/r/05905d7b-ed14-68b1-79d8-bdec30367eba@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Keir Fraser <keirf@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shivank Garg <shivankg@amd.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: yangge <yangge1116@126.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:35 -07:00
Hugh Dickins a09a8a1fbb mm/gup: local lru_add_drain() to avoid lru_add_drain_all()
In many cases, if collect_longterm_unpinnable_folios() does need to drain
the LRU cache to release a reference, the cache in question is on this
same CPU, and much more efficiently drained by a preliminary local
lru_add_drain(), than the later cross-CPU lru_add_drain_all().
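
Roughly the ordering being described, as a hedged sketch (the re-check is
hypothetical, standing in for the real logic in mm/gup.c):

	lru_add_drain();			/* cheap: this CPU's caches only */
	if (folio_still_looks_cached(folio))	/* hypothetical re-check */
		lru_add_drain_all();		/* expensive: work on every CPU */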

Marked for stable, to counter the increase in lru_add_drain_all()s from
"mm/gup: check ref_count instead of lru before migration".  Note for clean
backports: can take 6.16 commit a03db236ae ("gup: optimize longterm
pin_user_pages() for large folio") first.

Link: https://lkml.kernel.org/r/66f2751f-283e-816d-9530-765db7edc465@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Keir Fraser <keirf@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shivank Garg <shivankg@amd.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: yangge <yangge1116@126.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:35 -07:00
Hugh Dickins 98c6d25931 mm/gup: check ref_count instead of lru before migration
Patch series "mm: better GUP pin lru_add_drain_all()", v2.

Series of lru_add_drain_all()-related patches, arising from recent mm/gup
migration report from Will Deacon.


This patch (of 5):

Will Deacon reports:-

When taking a longterm GUP pin via pin_user_pages(),
__gup_longterm_locked() tries to migrate target folios that should not be
longterm pinned, for example because they reside in a CMA region or
movable zone.  This is done by first pinning all of the target folios
anyway, collecting all of the longterm-unpinnable target folios into a
list, dropping the pins that were just taken and finally handing the list
off to migrate_pages() for the actual migration.

It is critically important that no unexpected references are held on the
folios being migrated, otherwise the migration will fail and
pin_user_pages() will return -ENOMEM to its caller.  Unfortunately, it is
relatively easy to observe migration failures when running pKVM (which
uses pin_user_pages() on crosvm's virtual address space to resolve stage-2
page faults from the guest) on a 6.15-based Pixel 6 device and this
results in the VM terminating prematurely.

In the failure case, 'crosvm' has called mlock(MLOCK_ONFAULT) on its
mapping of guest memory prior to the pinning.  Subsequently, when
pin_user_pages() walks the page-table, the relevant 'pte' is not present
and so the faulting logic allocates a new folio, mlocks it with
mlock_folio() and maps it in the page-table.

Since commit 2fbb0c10d1 ("mm/munlock: mlock_page() munlock_page() batch
by pagevec"), mlock/munlock operations on a folio (formerly page), are
deferred.  For example, mlock_folio() takes an additional reference on the
target folio before placing it into a per-cpu 'folio_batch' for later
processing by mlock_folio_batch(), which drops the refcount once the
operation is complete.  Processing of the batches is coupled with the LRU
batch logic and can be forcefully drained with lru_add_drain_all() but as
long as a folio remains unprocessed on the batch, its refcount will be
elevated.

This deferred batching therefore interacts poorly with the pKVM pinning
scenario as we can find ourselves in a situation where the migration code
fails to migrate a folio due to the elevated refcount from the pending
mlock operation.

Hugh Dickins adds:-

!folio_test_lru() has never been a very reliable way to tell if an
lru_add_drain_all() is worth calling, to remove LRU cache references to
make the folio migratable: the LRU flag may be set even while the folio is
held with an extra reference in a per-CPU LRU cache.

5.18 commit 2fbb0c10d1 may have made it more unreliable.  Then 6.11
commit 33dfe9204f ("mm/gup: clear the LRU flag of a page before adding
to LRU batch") tried to make it reliable, by moving LRU flag clearing; but
missed the mlock/munlock batches, so still unreliable as reported.

And it turns out to be difficult to extend 33dfe9204f29's LRU flag
clearing to the mlock/munlock batches: if they do benefit from batching,
mlock/munlock cannot be so effective when easily suppressed while !LRU.

Instead, switch to an expected ref_count check, which was more reliable
all along: some more false positives (unhelpful drains) than before, and
never a guarantee that the folio will prove migratable, but better.

Note on PG_private_2: ceph and nfs are still using the deprecated
PG_private_2 flag, with the aid of netfs and filemap support functions. 
Although it is consistently matched by an increment of folio ref_count,
folio_expected_ref_count() intentionally does not recognize it, and ceph
folio migration currently depends on that for PG_private_2 folios to be
rejected.  New references to the deprecated flag are discouraged, so do
not add it into the collect_longterm_unpinnable_folios() calculation: but
longterm pinning of transiently PG_private_2 ceph and nfs folios (an
uncommon case) may invoke a redundant lru_add_drain_all().  And this makes
easy the backport to earlier releases: up to and including 6.12, btrfs
also used PG_private_2, but without a ref_count increment.

Note for stable backports: requires 6.16 commit 86ebd50224 ("mm:
add folio_expected_ref_count() for reference count calculation").

Link: https://lkml.kernel.org/r/41395944-b0e3-c3ac-d648-8ddd70451d28@google.com
Link: https://lkml.kernel.org/r/bd1f314a-fca1-8f19-cac0-b936c9614557@google.com
Fixes: 9a4e9f3b2d ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region")
Signed-off-by: Hugh Dickins <hughd@google.com>
Reported-by: Will Deacon <will@kernel.org>
Closes: https://lore.kernel.org/linux-mm/20250815101858.24352-1-will@kernel.org/
Acked-by: Kiryl Shutsemau <kas@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Keir Fraser <keirf@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shivank Garg <shivankg@amd.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: yangge <yangge1116@126.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-09-13 13:05:35 -07:00
Ian Rogers 7947ad1561 perf lock: Provide a host_env for session new
When "perf lock con" is run in a live mode, with no data file, a host
environment must be provided. Testing missed this as a failing assert
was creating the 1 line of expected stderr output.

  $ sudo perf lock con -ab true
  perf: util/session.c:195: __perf_session__new: Assertion `host_env != NULL' failed.
  Aborted

Fixes: 525a599bad ("perf env: Remove global perf_env")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
2025-09-12 17:53:03 -07:00
hupu a5edf3550f perf subcmd: avoid crash in exclude_cmds when excludes is empty
When cross-compiling the perf tool for ARM64, `perf help` may crash
with the following assertion failure:

  help.c:122: exclude_cmds: Assertion `cmds->names[ci] == NULL' failed.

This happens when the perf binary is not named exactly "perf" or when
multiple "perf-*" binaries exist in the same directory. In such cases,
the `excludes` command list can be empty, which leads to the final
assertion in exclude_cmds() being triggered.

Add a simple guard at the beginning of exclude_cmds() to return early
if excludes->cnt is zero, preventing the crash.
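
A sketch of the guard (only the early return is shown; the rest of
exclude_cmds() in tools/lib/subcmd/help.c is unchanged):

	void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
	{
		if (!excludes->cnt)
			return;	/* nothing to exclude; skips the final assert */
		/* ... existing exclusion logic ... */
	}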

Signed-off-by: hupu <hupu.gm@gmail.com>
Reported-by: Guilherme Amadio <amadio@gentoo.org>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20250909094953.106706-1-amadio@gentoo.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
2025-09-12 17:51:35 -07:00
Mikulas Patocka 77b8e6fbf9 dm-integrity: limit MAX_TAG_SIZE to 255
MAX_TAG_SIZE was 0x1a8, a value that may be truncated in the
"bi->metadata_size = ic->tag_size" assignment. We need to limit it to 255.
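
Why 255: the destination of that assignment is assumed here to be an 8-bit
field, so anything larger silently truncates, for example:

	unsigned char metadata_size;	/* 8-bit destination */
	unsigned int tag_size = 0x1a8;	/* 424, the old MAX_TAG_SIZE */

	metadata_size = tag_size;	/* silently becomes 0xa8 == 168 */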

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
2025-09-08 15:57:04 +02:00
H. Nikolaus Schaller 1e451977e1 power: supply: bq27xxx: restrict no-battery detection to bq27000
There are fuel gauges in the bq27xxx series (e.g. bq27z561) which may in
some cases report 0xff as the value of BQ27XXX_REG_FLAGS; this should not
be interpreted as "no battery" the way it is for a disconnected battery on
boards with a built-in bq27000 chip.

So restrict the no-battery detection originally introduced by

    commit 3dd843e1c2 ("bq27000: report missing device better.")

to the bq27000.

There is no need to backport further because this was hidden before

	commit f16d9fb6cf ("power: supply: bq27xxx: Retrieve again when busy")

Fixes: f16d9fb6cf ("power: supply: bq27xxx: Retrieve again when busy")
Suggested-by: Jerry Lv <Jerry.Lv@axis.com>
Cc: stable@vger.kernel.org
Signed-off-by: H. Nikolaus Schaller <hns@goldelico.com>
Link: https://lore.kernel.org/r/dd979fa6855fd051ee5117016c58daaa05966e24.1755945297.git.hns@goldelico.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
2025-09-06 02:11:38 +02:00
H. Nikolaus Schaller 2c334d0384 power: supply: bq27xxx: fix error return in case of no bq27000 hdq battery
Since

	commit f16d9fb6cf ("power: supply: bq27xxx: Retrieve again when busy")

the console log of some devices with hdq enabled but no bq27000 battery
(like e.g. the Pandaboard) is flooded with messages like:

[   34.247833] power_supply bq27000-battery: driver failed to report 'status' property: -1

as soon as user-space is finding a /sys entry and trying to read the
"status" property.

It turns out that the offending commit changes the logic to now return the
value of cache.flags if it is <0. This is likely under the assumption that
it is an error number. In normal errors from bq27xxx_read() this is indeed
the case.

But there is special code that detects when no bq27000 is installed or
accessible through hdq/1wire and wants to report this. In that case, the
cache.flags
are set historically by

	commit 3dd843e1c2 ("bq27000: report missing device better.")

to constant -1 which did make reading properties return -ENODEV. So everything
appeared to be fine before the return value was passed upwards.

Now the -1 is returned as -EPERM instead of -ENODEV, triggering the error
condition in power_supply_format_property() which then floods the console log.

So we change the detection of missing bq27000 battery to simply set

	cache.flags = -ENODEV

instead of -1.

Fixes: f16d9fb6cf ("power: supply: bq27xxx: Retrieve again when busy")
Cc: Jerry Lv <Jerry.Lv@axis.com>
Cc: stable@vger.kernel.org
Signed-off-by: H. Nikolaus Schaller <hns@goldelico.com>
Link: https://lore.kernel.org/r/692f79eb6fd541adb397038ea6e750d4de2deddf.1755945297.git.hns@goldelico.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
2025-09-06 02:11:38 +02:00
Chen Ridong 94a4acfec1 cgroup/psi: Set of->priv to NULL upon file release
Setting of->priv to NULL when the file is released enables earlier bug
detection. This allows potential bugs to manifest as NULL pointer
dereferences rather than use-after-free errors[1], which are generally more
difficult to diagnose.

[1] https://lore.kernel.org/cgroups/38ef3ff9-b380-44f0-9315-8b3714b0948d@huaweicloud.com/T/#m8a3b3f88f0ff3da5925d342e90043394f8b2091b
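
A minimal sketch of the pattern (the release callback and context type are
hypothetical, not the cgroup code):

	static void example_file_release(struct kernfs_open_file *of)
	{
		struct example_ctx *ctx = of->priv;

		kfree(ctx);
		of->priv = NULL;	/* a later stale access now oopses on a
					 * NULL pointer instead of silently
					 * touching freed memory */
	}
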
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-08-22 07:47:43 -10:00
Chen Ridong 79f919a89c cgroup: split cgroup_destroy_wq into 3 workqueues
A hung task can occur during LTP cgroup testing [1] when repeatedly
mounting/unmounting the perf_event and net_prio controllers with
systemd.unified_cgroup_hierarchy=1. The hang manifests in
cgroup_lock_and_drain_offline() during root destruction.

Related case:
cgroup_fj_function_perf_event cgroup_fj_function.sh perf_event
cgroup_fj_function_net_prio cgroup_fj_function.sh net_prio

Call Trace:
	cgroup_lock_and_drain_offline+0x14c/0x1e8
	cgroup_destroy_root+0x3c/0x2c0
	css_free_rwork_fn+0x248/0x338
	process_one_work+0x16c/0x3b8
	worker_thread+0x22c/0x3b0
	kthread+0xec/0x100
	ret_from_fork+0x10/0x20

Root Cause:

CPU0                            CPU1
mount perf_event                umount net_prio
cgroup1_get_tree                cgroup_kill_sb
rebind_subsystems               // root destruction enqueues
				// cgroup_destroy_wq
// kill all perf_event css
                                // one perf_event css A is dying
                                // css A offline enqueues cgroup_destroy_wq
                                // root destruction will be executed first
                                css_free_rwork_fn
                                cgroup_destroy_root
                                cgroup_lock_and_drain_offline
                                // some perf descendants are dying
                                // cgroup_destroy_wq max_active = 1
                                // waiting for css A to die

Problem scenario:
1. CPU0 mounts perf_event (rebind_subsystems)
2. CPU1 unmounts net_prio (cgroup_kill_sb), queuing root destruction work
3. A dying perf_event CSS gets queued for offline after root destruction
4. Root destruction waits for offline completion, but offline work is
   blocked behind root destruction in cgroup_destroy_wq (max_active=1)

Solution:
Split cgroup_destroy_wq into three dedicated workqueues:
cgroup_offline_wq – Handles CSS offline operations
cgroup_release_wq – Manages resource release
cgroup_free_wq – Performs final memory deallocation

This separation eliminates blocking in the CSS free path while waiting for
offline operations to complete.
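
Condensed sketch of the allocation side of the split (the same calls appear in
the cgroup.c hunk later in this view); every queue keeps max_active = 1 so
each class of work remains ordered:

	cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
	cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
	cgroup_free_wq    = alloc_workqueue("cgroup_free", 0, 1);
	BUG_ON(!cgroup_offline_wq || !cgroup_release_wq || !cgroup_free_wq);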

[1] https://github.com/linux-test-project/ltp/blob/master/runtest/controllers
Fixes: 334c3679ec ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends")
Reported-by: Gao Yingjie <gaoyingjie@uniontech.com>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
2025-08-22 07:44:11 -10:00
Mikulas Patocka 1071d560af dm-stripe: fix a possible integer overflow
There's a possible integer overflow in stripe_io_hints if the chunk size is
too large. Test whether the overflow happened, and if it did, don't set
limits->io_min and limits->io_opt.
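
A minimal sketch of the overflow-checked pattern (field names taken from the
dm-stripe.c hunk further down; otherwise illustrative only):

	unsigned int io_min, io_opt;

	if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) &&
	    !check_mul_overflow(io_min, sc->stripes, &io_opt)) {
		limits->io_min = io_min;	/* only set when both results fit */
		limits->io_opt = io_opt;
	}
	/* on overflow, the limits are simply left at their defaults */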

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Suggested-by: Dongsheng Yang <dongsheng.yang@linux.dev>
Cc: stable@vger.kernel.org
2025-08-11 13:17:32 +02:00
32 changed files with 304 additions and 149 deletions

View File

@ -16196,6 +16196,7 @@ R: Rik van Riel <riel@surriel.com>
R: Liam R. Howlett <Liam.Howlett@oracle.com>
R: Vlastimil Babka <vbabka@suse.cz>
R: Harry Yoo <harry.yoo@oracle.com>
R: Jann Horn <jannh@google.com>
L: linux-mm@kvack.org
S: Maintained
F: include/linux/rmap.h
@ -16240,6 +16241,7 @@ R: Nico Pache <npache@redhat.com>
R: Ryan Roberts <ryan.roberts@arm.com>
R: Dev Jain <dev.jain@arm.com>
R: Barry Song <baohua@kernel.org>
R: Lance Yang <lance.yang@linux.dev>
L: linux-mm@kvack.org
S: Maintained
W: http://www.linux-mm.org

View File

@ -1795,6 +1795,7 @@ static int write_same_filled_page(struct zram *zram, unsigned long fill,
u32 index)
{
zram_slot_lock(zram, index);
zram_free_page(zram, index);
zram_set_flag(zram, index, ZRAM_SAME);
zram_set_handle(zram, index, fill);
zram_slot_unlock(zram, index);
@ -1832,6 +1833,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page,
kunmap_local(src);
zram_slot_lock(zram, index);
zram_free_page(zram, index);
zram_set_flag(zram, index, ZRAM_HUGE);
zram_set_handle(zram, index, handle);
zram_set_obj_size(zram, index, PAGE_SIZE);
@ -1855,11 +1857,6 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
unsigned long element;
bool same_filled;
/* First, free memory allocated to this slot (if any) */
zram_slot_lock(zram, index);
zram_free_page(zram, index);
zram_slot_unlock(zram, index);
mem = kmap_local_page(page);
same_filled = page_same_filled(mem, &element);
kunmap_local(mem);
@ -1901,6 +1898,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
zcomp_stream_put(zstrm);
zram_slot_lock(zram, index);
zram_free_page(zram, index);
zram_set_handle(zram, index, handle);
zram_set_obj_size(zram, index, comp_len);
zram_slot_unlock(zram, index);

View File

@ -133,7 +133,7 @@ struct journal_sector {
commit_id_t commit_id;
};
#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
#define MAX_TAG_SIZE 255
#define METADATA_PADDING_SECTORS 8

View File

@ -3813,8 +3813,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
struct raid_set *rs = ti->private;
unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors);
limits->io_min = chunk_size_bytes;
limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs);
if (chunk_size_bytes) {
limits->io_min = chunk_size_bytes;
limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs);
}
}
static void raid_presuspend(struct dm_target *ti)

View File

@ -456,11 +456,15 @@ static void stripe_io_hints(struct dm_target *ti,
struct queue_limits *limits)
{
struct stripe_c *sc = ti->private;
unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT;
unsigned int io_min, io_opt;
limits->chunk_sectors = sc->chunk_size;
limits->io_min = chunk_size;
limits->io_opt = chunk_size * sc->stripes;
if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) &&
!check_mul_overflow(io_min, sc->stripes, &io_opt)) {
limits->io_min = io_min;
limits->io_opt = io_opt;
}
}
static struct target_type stripe_target = {

View File

@ -1919,8 +1919,8 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
bool has_singe_flag = di->opts & BQ27XXX_O_ZERO;
cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag);
if ((cache.flags & 0xff) == 0xff)
cache.flags = -1; /* read error */
if (di->chip == BQ27000 && (cache.flags & 0xff) == 0xff)
cache.flags = -ENODEV; /* bq27000 hdq read error */
if (cache.flags >= 0) {
cache.capacity = bq27xxx_battery_read_soc(di);

View File

@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
return bg1->used > bg2->used;
/*
* Some other task may be updating the ->used field concurrently, but it
* is not serious if we get a stale value or load/store tearing issues,
* as sorting the list of block groups to reclaim is not critical and an
* occasional imperfect order is ok. So silence KCSAN and avoid the
* overhead of locking or any other synchronization.
*/
return data_race(bg1->used > bg2->used);
}
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)

View File

@ -1843,7 +1843,6 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
struct inode *vfs_inode = &inode->vfs_inode;
@ -1864,8 +1863,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
btrfs_inode_set_file_extent_range(inode, 0,
round_up(i_size_read(vfs_inode), fs_info->sectorsize));
vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));

View File

@ -3885,10 +3885,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
bool filled = false;
int first_xattr_slot;
ret = btrfs_init_file_extent_tree(inode);
if (ret)
goto out;
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@ -3920,8 +3916,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
btrfs_inode_set_file_extent_range(inode, 0,
round_up(i_size_read(vfs_inode), fs_info->sectorsize));
inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
@ -3953,6 +3947,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
btrfs_set_inode_mapping_order(inode);
cache_index:
ret = btrfs_init_file_extent_tree(inode);
if (ret)
goto out;
btrfs_inode_set_file_extent_range(inode, 0,
round_up(i_size_read(vfs_inode), fs_info->sectorsize));
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any

View File

@ -1964,7 +1964,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = key->objectid;
search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len);
ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
if (ret < 0) {
goto out;

View File

@ -2582,9 +2582,9 @@ again:
spin_lock(&space_info->lock);
space_info->total_bytes -= bg->length;
space_info->disk_total -= bg->length * factor;
space_info->disk_total -= bg->zone_unusable;
/* There is no allocation ever happened. */
ASSERT(bg->used == 0);
ASSERT(bg->zone_unusable == 0);
/* No super block in a block group on the zoned setup. */
ASSERT(bg->bytes_super == 0);
spin_unlock(&space_info->lock);

View File

@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
************************************************************************/
static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
struct attribute *attr, char *buf)
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d.%d\n",
NILFS_CURRENT_REV, NILFS_MINOR_REV);
@ -1087,7 +1087,7 @@ static const char features_readme_str[] =
"(1) revision\n\tshow current revision of NILFS file system driver.\n";
static ssize_t nilfs_feature_README_show(struct kobject *kobj,
struct attribute *attr,
struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, features_readme_str);

View File

@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups {
struct completion sg_segments_kobj_unregister;
};
#define NILFS_COMMON_ATTR_STRUCT(name) \
#define NILFS_KOBJ_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \
struct attribute attr; \
ssize_t (*show)(struct kobject *, struct attribute *, \
ssize_t (*show)(struct kobject *, struct kobj_attribute *, \
char *); \
ssize_t (*store)(struct kobject *, struct attribute *, \
ssize_t (*store)(struct kobject *, struct kobj_attribute *, \
const char *, size_t); \
}
NILFS_COMMON_ATTR_STRUCT(feature);
NILFS_KOBJ_ATTR_STRUCT(feature);
#define NILFS_DEV_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \

View File

@ -554,7 +554,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
case SMB_DIRECT_MSG_DATA_TRANSFER: {
struct smb_direct_data_transfer *data_transfer =
(struct smb_direct_data_transfer *)recvmsg->packet;
unsigned int data_length;
u32 remaining_data_length, data_offset, data_length;
int avail_recvmsg_count, receive_credits;
if (wc->byte_len <
@ -564,15 +564,25 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
return;
}
remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
data_length = le32_to_cpu(data_transfer->data_length);
if (data_length) {
if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
(u64)data_length) {
put_recvmsg(t, recvmsg);
smb_direct_disconnect_rdma_connection(t);
return;
}
data_offset = le32_to_cpu(data_transfer->data_offset);
if (wc->byte_len < data_offset ||
wc->byte_len < (u64)data_offset + data_length) {
put_recvmsg(t, recvmsg);
smb_direct_disconnect_rdma_connection(t);
return;
}
if (remaining_data_length > t->max_fragmented_recv_size ||
data_length > t->max_fragmented_recv_size ||
(u64)remaining_data_length + (u64)data_length >
(u64)t->max_fragmented_recv_size) {
put_recvmsg(t, recvmsg);
smb_direct_disconnect_rdma_connection(t);
return;
}
if (data_length) {
if (t->full_packet_received)
recvmsg->first_segment = true;
@ -1209,78 +1219,130 @@ static int smb_direct_writev(struct ksmbd_transport *t,
bool need_invalidate, unsigned int remote_key)
{
struct smb_direct_transport *st = smb_trans_direct_transfort(t);
int remaining_data_length;
int start, i, j;
int max_iov_size = st->max_send_size -
size_t remaining_data_length;
size_t iov_idx;
size_t iov_ofs;
size_t max_iov_size = st->max_send_size -
sizeof(struct smb_direct_data_transfer);
int ret;
struct kvec vec;
struct smb_direct_send_ctx send_ctx;
int error = 0;
if (st->status != SMB_DIRECT_CS_CONNECTED)
return -ENOTCONN;
//FIXME: skip RFC1002 header..
if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
return -EINVAL;
buflen -= 4;
iov_idx = 1;
iov_ofs = 0;
remaining_data_length = buflen;
ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key);
start = i = 1;
buflen = 0;
while (true) {
buflen += iov[i].iov_len;
if (buflen > max_iov_size) {
if (i > start) {
remaining_data_length -=
(buflen - iov[i].iov_len);
ret = smb_direct_post_send_data(st, &send_ctx,
&iov[start], i - start,
remaining_data_length);
if (ret)
goto done;
} else {
/* iov[start] is too big, break it */
int nvec = (buflen + max_iov_size - 1) /
max_iov_size;
while (remaining_data_length) {
struct kvec vecs[SMB_DIRECT_MAX_SEND_SGES - 1]; /* minus smbdirect hdr */
size_t possible_bytes = max_iov_size;
size_t possible_vecs;
size_t bytes = 0;
size_t nvecs = 0;
for (j = 0; j < nvec; j++) {
vec.iov_base =
(char *)iov[start].iov_base +
j * max_iov_size;
vec.iov_len =
min_t(int, max_iov_size,
buflen - max_iov_size * j);
remaining_data_length -= vec.iov_len;
ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1,
remaining_data_length);
if (ret)
goto done;
}
i++;
if (i == niovs)
break;
}
start = i;
buflen = 0;
} else {
i++;
if (i == niovs) {
/* send out all remaining vecs */
remaining_data_length -= buflen;
ret = smb_direct_post_send_data(st, &send_ctx,
&iov[start], i - start,
remaining_data_length);
if (ret)
/*
* For the last message remaining_data_length should be
* have been 0 already!
*/
if (WARN_ON_ONCE(iov_idx >= niovs)) {
error = -EINVAL;
goto done;
}
/*
* We have 2 factors which limit the arguments we pass
* to smb_direct_post_send_data():
*
* 1. The number of supported sges for the send,
* while one is reserved for the smbdirect header.
* And we currently need one SGE per page.
* 2. The number of negotiated payload bytes per send.
*/
possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);
while (iov_idx < niovs && possible_vecs && possible_bytes) {
struct kvec *v = &vecs[nvecs];
int page_count;
v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
v->iov_len = min_t(size_t,
iov[iov_idx].iov_len - iov_ofs,
possible_bytes);
page_count = get_buf_page_count(v->iov_base, v->iov_len);
if (page_count > possible_vecs) {
/*
* If the number of pages in the buffer
* is to much (because we currently require
* one SGE per page), we need to limit the
* length.
*
* We know possible_vecs is at least 1,
* so we always keep the first page.
*
* We need to calculate the number extra
* pages (epages) we can also keep.
*
* We calculate the number of bytes in the
* first page (fplen), this should never be
* larger than v->iov_len because page_count is
* at least 2, but adding a limitation feels
* better.
*
* Then we calculate the number of bytes (elen)
* we can keep for the extra pages.
*/
size_t epages = possible_vecs - 1;
size_t fpofs = offset_in_page(v->iov_base);
size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);
v->iov_len = fplen + elen;
page_count = get_buf_page_count(v->iov_base, v->iov_len);
if (WARN_ON_ONCE(page_count > possible_vecs)) {
/*
* Something went wrong in the above
* logic...
*/
error = -EINVAL;
goto done;
break;
}
}
possible_vecs -= page_count;
nvecs += 1;
possible_bytes -= v->iov_len;
bytes += v->iov_len;
iov_ofs += v->iov_len;
if (iov_ofs >= iov[iov_idx].iov_len) {
iov_idx += 1;
iov_ofs = 0;
}
}
remaining_data_length -= bytes;
ret = smb_direct_post_send_data(st, &send_ctx,
vecs, nvecs,
remaining_data_length);
if (unlikely(ret)) {
error = ret;
goto done;
}
}
done:
ret = smb_direct_flush_send_list(st, &send_ctx, true);
if (unlikely(!ret && error))
ret = error;
/*
* As an optimization, we don't wait for individual I/O to finish
@ -1744,6 +1806,11 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
return -EINVAL;
}
if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) {
pr_err("warning: device max_send_sge = %d too small\n",
device->attrs.max_send_sge);
return -EINVAL;
}
if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
pr_err("warning: device max_recv_sge = %d too small\n",
device->attrs.max_recv_sge);
@ -1767,7 +1834,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
cap->max_send_wr = max_send_wrs;
cap->max_recv_wr = t->recv_credit_max;
cap->max_send_sge = max_sge_per_wr;
cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
cap->max_inline_data = 0;
cap->max_rdma_ctxs = t->max_rw_credits;

View File

@ -636,6 +636,7 @@ struct damon_operations {
* @data: Data that will be passed to @fn.
* @repeat: Repeat invocations.
* @return_code: Return code from @fn invocation.
* @dealloc_on_cancel: De-allocate when canceled.
*
* Control damon_call(), which requests specific kdamond to invoke a given
* function. Refer to damon_call() for more details.
@ -645,6 +646,7 @@ struct damon_call_control {
void *data;
bool repeat;
int return_code;
bool dealloc_on_cancel;
/* private: internal use only */
/* informs if the kdamond finished handling of the request */
struct completion completion;

View File

@ -385,6 +385,16 @@ void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);
static inline bool folio_may_be_lru_cached(struct folio *folio)
{
/*
* Holding PMD-sized folios in per-CPU LRU cache unbalances accounting.
* Holding small numbers of low-order mTHP folios in per-CPU LRU cache
* will be sensible, but nobody has implemented and tested that yet.
*/
return !folio_test_large(folio);
}
extern atomic_t lru_disable_count;
static inline bool lru_cache_disabled(void)

View File

@ -126,8 +126,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
* of concurrent destructions. Use a separate workqueue so that cgroup
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
*
* A cgroup destruction should enqueue work sequentially to:
* cgroup_offline_wq: use for css offline work
* cgroup_release_wq: use for css release work
* cgroup_free_wq: use for free work
*
* Rationale for using separate workqueues:
* The cgroup root free work may depend on completion of other css offline
* operations. If all tasks were enqueued to a single workqueue, this could
* create a deadlock scenario where:
* - Free work waits for other css offline work to complete.
* - But other css offline work is queued after free work in the same queue.
*
* Example deadlock scenario with single workqueue (cgroup_destroy_wq):
* 1. umount net_prio
* 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
* 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
* 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
* 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
* which can never complete as it's behind in the same queue and
* workqueue's max_active is 1.
*/
static struct workqueue_struct *cgroup_destroy_wq;
static struct workqueue_struct *cgroup_offline_wq;
static struct workqueue_struct *cgroup_release_wq;
static struct workqueue_struct *cgroup_free_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@ -4159,6 +4182,7 @@ static void cgroup_file_release(struct kernfs_open_file *of)
cft->release(of);
put_cgroup_ns(ctx->ns);
kfree(ctx);
of->priv = NULL;
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
@ -5558,7 +5582,7 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@ -5567,7 +5591,7 @@ static void css_release(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
queue_work(cgroup_release_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@ -5701,7 +5725,7 @@ err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
@ -5939,7 +5963,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
queue_work(cgroup_offline_wq, &css->destroy_work);
}
}
@ -6325,8 +6349,14 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
BUG_ON(!cgroup_offline_wq);
cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
BUG_ON(!cgroup_release_wq);
cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
BUG_ON(!cgroup_free_wq);
return 0;
}
core_initcall(cgroup_wq_init);

View File

@ -9551,7 +9551,7 @@ static unsigned long tg_weight(struct task_group *tg)
#ifdef CONFIG_FAIR_GROUP_SCHED
return scale_load_down(tg->shares);
#else
return sched_weight_from_cgroup(tg->scx_weight);
return sched_weight_from_cgroup(tg->scx.weight);
#endif
}

View File

@ -6788,12 +6788,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
* CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
* the current local DSQ for running tasks and thus are not
* visible to the BPF scheduler.
*
* Also skip re-enqueueing tasks that can only run on this
* CPU, as they would just be re-added to the same local
* DSQ without any benefit.
*/
if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
if (p->migration_pending)
continue;
dispatch_dequeue(rq, p);

View File

@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
return -EINVAL;
}
buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {

View File

@ -2479,10 +2479,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel)
mutex_lock(&ctx->call_controls_lock);
list_del(&control->list);
mutex_unlock(&ctx->call_controls_lock);
if (!control->repeat)
if (!control->repeat) {
complete(&control->completion);
else
} else if (control->canceled && control->dealloc_on_cancel) {
kfree(control);
continue;
} else {
list_add(&control->list, &repeat_controls);
}
}
control = list_first_entry_or_null(&repeat_controls,
struct damon_call_control, list);

View File

@ -1534,14 +1534,10 @@ static int damon_sysfs_repeat_call_fn(void *data)
return 0;
}
static struct damon_call_control damon_sysfs_repeat_call_control = {
.fn = damon_sysfs_repeat_call_fn,
.repeat = true,
};
static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
{
struct damon_ctx *ctx;
struct damon_call_control *repeat_call_control;
int err;
if (damon_sysfs_kdamond_running(kdamond))
@ -1554,18 +1550,29 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
damon_destroy_ctx(kdamond->damon_ctx);
kdamond->damon_ctx = NULL;
repeat_call_control = kmalloc(sizeof(*repeat_call_control),
GFP_KERNEL);
if (!repeat_call_control)
return -ENOMEM;
ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
if (IS_ERR(ctx))
if (IS_ERR(ctx)) {
kfree(repeat_call_control);
return PTR_ERR(ctx);
}
err = damon_start(&ctx, 1, false);
if (err) {
kfree(repeat_call_control);
damon_destroy_ctx(ctx);
return err;
}
kdamond->damon_ctx = ctx;
damon_sysfs_repeat_call_control.data = kdamond;
damon_call(ctx, &damon_sysfs_repeat_call_control);
repeat_call_control->fn = damon_sysfs_repeat_call_fn;
repeat_call_control->data = kdamond;
repeat_call_control->repeat = true;
repeat_call_control->dealloc_on_cancel = true;
damon_call(ctx, repeat_call_control);
return err;
}

View File

@ -2287,8 +2287,8 @@ static unsigned long collect_longterm_unpinnable_folios(
struct pages_or_folios *pofs)
{
unsigned long collected = 0;
bool drain_allow = true;
struct folio *folio;
int drained = 0;
long i = 0;
for (folio = pofs_get_folio(pofs, i); folio;
@ -2307,9 +2307,17 @@ static unsigned long collect_longterm_unpinnable_folios(
continue;
}
if (!folio_test_lru(folio) && drain_allow) {
if (drained == 0 && folio_may_be_lru_cached(folio) &&
folio_ref_count(folio) !=
folio_expected_ref_count(folio) + 1) {
lru_add_drain();
drained = 1;
}
if (drained == 1 && folio_may_be_lru_cached(folio) &&
folio_ref_count(folio) !=
folio_expected_ref_count(folio) + 1) {
lru_add_drain_all();
drain_allow = false;
drained = 2;
}
if (!folio_isolate_lru(folio))

View File

@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
folio_test_large(folio) || lru_cache_disabled())
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_new(folio)) ||
folio_test_large(folio) || lru_cache_disabled())
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio)
*/
folio_get(folio);
if (!folio_batch_add(fbatch, folio) ||
folio_test_large(folio) || lru_cache_disabled())
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}

View File

@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
/* block memcg migration while the folio moves between lru */
if (move_fn != lru_add && !folio_test_clear_lru(folio))
continue;
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
}
static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
struct folio *folio, move_fn_t move_fn,
bool on_lru, bool disable_irq)
struct folio *folio, move_fn_t move_fn, bool disable_irq)
{
unsigned long flags;
if (on_lru && !folio_test_clear_lru(folio))
return;
folio_get(folio);
if (disable_irq)
@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
else
local_lock(&cpu_fbatches.lock);
if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) ||
lru_cache_disabled())
if (!folio_batch_add(this_cpu_ptr(fbatch), folio) ||
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
if (disable_irq)
@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
local_unlock(&cpu_fbatches.lock);
}
#define folio_batch_add_and_move(folio, op, on_lru) \
__folio_batch_add_and_move( \
&cpu_fbatches.op, \
folio, \
op, \
on_lru, \
offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \
#define folio_batch_add_and_move(folio, op) \
__folio_batch_add_and_move( \
&cpu_fbatches.op, \
folio, \
op, \
offsetof(struct cpu_fbatches, op) >= \
offsetof(struct cpu_fbatches, lock_irq) \
)
static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
@ -231,10 +231,10 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
void folio_rotate_reclaimable(struct folio *folio)
{
if (folio_test_locked(folio) || folio_test_dirty(folio) ||
folio_test_unevictable(folio))
folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
folio_batch_add_and_move(folio, lru_move_tail, true);
folio_batch_add_and_move(folio, lru_move_tail);
}
void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
@ -328,10 +328,11 @@ static void folio_activate_drain(int cpu)
void folio_activate(struct folio *folio)
{
if (folio_test_active(folio) || folio_test_unevictable(folio))
if (folio_test_active(folio) || folio_test_unevictable(folio) ||
!folio_test_lru(folio))
return;
folio_batch_add_and_move(folio, lru_activate, true);
folio_batch_add_and_move(folio, lru_activate);
}
#else
@ -507,7 +508,7 @@ void folio_add_lru(struct folio *folio)
lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
folio_set_active(folio);
folio_batch_add_and_move(folio, lru_add, false);
folio_batch_add_and_move(folio, lru_add);
}
EXPORT_SYMBOL(folio_add_lru);
@ -685,13 +686,13 @@ void lru_add_drain_cpu(int cpu)
void deactivate_file_folio(struct folio *folio)
{
/* Deactivating an unevictable folio will not accelerate reclaim */
if (folio_test_unevictable(folio))
if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
if (lru_gen_enabled() && lru_gen_clear_refs(folio))
return;
folio_batch_add_and_move(folio, lru_deactivate_file, true);
folio_batch_add_and_move(folio, lru_deactivate_file);
}
/*
@ -704,13 +705,13 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
if (folio_test_unevictable(folio))
if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio))
return;
folio_batch_add_and_move(folio, lru_deactivate, true);
folio_batch_add_and_move(folio, lru_deactivate);
}
/**
@ -723,10 +724,11 @@ void folio_deactivate(struct folio *folio)
void folio_mark_lazyfree(struct folio *folio)
{
if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
!folio_test_lru(folio) ||
folio_test_swapcache(folio) || folio_test_unevictable(folio))
return;
folio_batch_add_and_move(folio, lru_lazyfree, true);
folio_batch_add_and_move(folio, lru_lazyfree);
}
void lru_add_drain(void)

View File

@ -4507,7 +4507,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* ineligible */
if (!folio_test_lru(folio) || zone > sc->reclaim_idx) {
if (zone > sc->reclaim_idx) {
gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;

View File

@ -208,6 +208,9 @@ static int damon_sample_mtier_enable_store(
if (enabled == is_enabled)
return 0;
if (!init_called)
return 0;
if (enabled) {
err = damon_sample_mtier_start();
if (err)

View File

@ -137,6 +137,9 @@ static int damon_sample_prcl_enable_store(
if (enabled == is_enabled)
return 0;
if (!init_called)
return 0;
if (enabled) {
err = damon_sample_prcl_start();
if (err)

View File

@ -118,6 +118,9 @@ static int damon_sample_wsse_enable_store(
return 0;
if (enabled) {
if (!init_called)
return 0;
err = damon_sample_wsse_start();
if (err)
enabled = false;

View File

@ -75,6 +75,9 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
size_t ci, cj, ei;
int cmp;
if (!excludes->cnt)
return;
ci = cj = ei = 0;
while (ci < cmds->cnt && ei < excludes->cnt) {
cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);

View File

@ -2009,6 +2009,7 @@ static int __cmd_contention(int argc, const char **argv)
.owner = show_lock_owner,
.cgroups = RB_ROOT,
};
struct perf_env host_env;
lockhash_table = calloc(LOCKHASH_SIZE, sizeof(*lockhash_table));
if (!lockhash_table)
@ -2024,7 +2025,10 @@ static int __cmd_contention(int argc, const char **argv)
eops.mmap = perf_event__process_mmap;
eops.tracing_data = perf_event__process_tracing_data;
session = perf_session__new(use_bpf ? NULL : &data, &eops);
perf_env__init(&host_env);
session = __perf_session__new(use_bpf ? NULL : &data, &eops,
/*trace_event_repipe=*/false, &host_env);
if (IS_ERR(session)) {
pr_err("Initializing perf session failed\n");
err = PTR_ERR(session);
@ -2142,6 +2146,7 @@ out_delete:
evlist__delete(con.evlist);
lock_contention_finish(&con);
perf_session__delete(session);
perf_env__exit(&host_env);
zfree(&lockhash_table);
return err;
}

View File

@ -477,6 +477,7 @@ static int __maps__insert(struct maps *maps, struct map *new)
}
/* Insert the value at the end. */
maps_by_address[nr_maps] = map__get(new);
map__set_kmap_maps(new, maps);
if (maps_by_name)
maps_by_name[nr_maps] = map__get(new);
@ -502,8 +503,6 @@ static int __maps__insert(struct maps *maps, struct map *new)
if (map__end(new) < map__start(new))
RC_CHK_ACCESS(maps)->ends_broken = true;
map__set_kmap_maps(new, maps);
return 0;
}
@ -891,6 +890,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
if (before) {
map__put(maps_by_address[i]);
maps_by_address[i] = before;
map__set_kmap_maps(before, maps);
if (maps_by_name) {
map__put(maps_by_name[ni]);
@ -918,6 +918,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
*/
map__put(maps_by_address[i]);
maps_by_address[i] = map__get(new);
map__set_kmap_maps(new, maps);
if (maps_by_name) {
map__put(maps_by_name[ni]);
@ -942,14 +943,13 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
*/
map__put(maps_by_address[i]);
maps_by_address[i] = map__get(new);
map__set_kmap_maps(new, maps);
if (maps_by_name) {
map__put(maps_by_name[ni]);
maps_by_name[ni] = map__get(new);
}
map__set_kmap_maps(new, maps);
check_invariants(maps);
return err;
}
@ -1019,6 +1019,7 @@ int maps__copy_from(struct maps *dest, struct maps *parent)
err = unwind__prepare_access(dest, new, NULL);
if (!err) {
dest_maps_by_address[i] = new;
map__set_kmap_maps(new, dest);
if (dest_maps_by_name)
dest_maps_by_name[i] = map__get(new);
RC_CHK_ACCESS(dest)->nr_maps = i + 1;